Usage examples for org.apache.lucene.index.IndexWriterConfig.setOpenMode
public IndexWriterConfig setOpenMode(OpenMode openMode)
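setOpenMode controls how the IndexWriter treats an index already present in the target directory, and it returns the config itself, so it can be chained with the other setters seen in the examples below. A minimal sketch of the three modes (the index path here is a hypothetical placeholder):

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class OpenModeDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(Paths.get("/tmp/demo-index")); // hypothetical path

    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer())
        // CREATE: start a fresh index, discarding anything already at this path.
        // APPEND: open an existing index; construction fails if none exists.
        // CREATE_OR_APPEND: create if absent, otherwise append (the default).
        .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    try (IndexWriter writer = new IndexWriter(dir, config)) {
      writer.commit(); // the index now exists (or is reused) according to the mode
    }
  }
}

Every example below picks one of these modes: the batch indexers use CREATE, while the two index updaters use APPEND and CREATE_OR_APPEND.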
From source file: io.anserini.embeddings.search.IndexW2V.java
License: Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
  LOG.info("Starting indexer...");

  final Directory dir = FSDirectory.open(indexPath);
  final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setUseCompoundFile(false);
  config.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, config);

  BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
  String line = null;
  bRdr.readLine(); // skip the header line of the embeddings file
  while ((line = bRdr.readLine()) != null) {
    String[] termEmbedding = line.trim().split("\t");
    // one document per term/vector pair
    Document document = new Document();
    document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES));
    document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1]));
    writer.addDocument(document);
  }
  bRdr.close();

  writer.commit();
  writer.close();
}
From source file: io.anserini.index.IndexClueWeb09b.java
License: Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
  System.out.println(
      "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

  final Directory dir = FSDirectory.open(indexPath);
  final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

  iwc.setSimilarity(new BM25Similarity());
  iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  iwc.setRAMBufferSizeMB(256.0);
  iwc.setUseCompoundFile(false);
  iwc.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, iwc);

  final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

  List<Path> warcFiles = discoverWarcFiles(docDir);
  // keep at most doclimit files when a document limit is set
  if (doclimit > 0 && warcFiles.size() > doclimit)
    warcFiles = warcFiles.subList(0, doclimit);

  for (Path f : warcFiles)
    executor.execute(new IndexerThread(writer, f));

  // add some delay to let the scheduler spawn some threads
  Thread.sleep(30000);
  executor.shutdown(); // Disable new tasks from being submitted

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
      Thread.sleep(1000);
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  int numIndexed = writer.maxDoc();

  try {
    writer.commit();
    if (optimize)
      writer.forceMerge(1);
  } finally {
    writer.close();
  }

  return numIndexed;
}
From source file: io.anserini.index.IndexCollection.java
License: Apache License
public void run() throws IOException, InterruptedException {
  final long start = System.nanoTime();
  LOG.info("Starting indexer...");

  int numThreads = args.threads;

  final Directory dir = FSDirectory.open(indexPath);
  final EnglishAnalyzer analyzer = args.keepStopwords
      ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setSimilarity(new BM25Similarity());
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setRAMBufferSizeMB(args.memorybufferSize);
  config.setUseCompoundFile(false);
  config.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, config);

  final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
  final List<Path> segmentPaths = collection.getFileSegmentPaths();

  final int segmentCnt = segmentPaths.size();
  LOG.info(segmentCnt + " files found in " + collectionPath.toString());
  for (int i = 0; i < segmentCnt; i++) {
    executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
  }

  executor.shutdown();

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
      LOG.info(String.format("%.2f percent completed",
          (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  if (segmentCnt != executor.getCompletedTaskCount()) {
    throw new RuntimeException("totalFiles = " + segmentCnt
        + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
  }

  int numIndexed = writer.maxDoc();

  try {
    writer.commit();
    if (args.optimize)
      writer.forceMerge(1);
  } finally {
    try {
      writer.close();
    } catch (IOException e) {
      // It is possible that this happens... but nothing much we can do at this point,
      // so just log the error and move on.
      LOG.error(e);
    }
  }

  LOG.info("Indexed documents: " + counters.indexedDocuments.get());
  LOG.info("Empty documents: " + counters.emptyDocuments.get());
  LOG.info("Errors: " + counters.errors.get());

  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + numIndexed + " documents indexed in "
      + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
From source file: io.anserini.index.IndexGov2.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { IndexArgs indexArgs = new IndexArgs(); CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90)); try {/*from w w w. j av a2s . c o m*/ parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: IndexGov2" + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } final String dirPath = indexArgs.index; final String dataDir = indexArgs.input; final int docCountLimit = indexArgs.doclimit; final int numThreads = indexArgs.threads; final boolean positions = indexArgs.positions; final boolean optimize = indexArgs.optimize; final Analyzer a = new EnglishAnalyzer(); final TrecContentSource trecSource = createGov2Source(dataDir); final Directory dir = FSDirectory.open(Paths.get(dirPath)); LOG.info("Index path: " + dirPath); LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit)); LOG.info("Threads: " + numThreads); LOG.info("Positions: " + positions); LOG.info("Optimize (merge segments): " + optimize); final IndexWriterConfig config = new IndexWriterConfig(a); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); final IndexWriter writer = new IndexWriter(dir, config); Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit); LOG.info("Indexer: start"); final long t0 = System.currentTimeMillis(); threads.start(); while (!threads.done()) { Thread.sleep(100); } threads.stop(); final long t1 = System.currentTimeMillis(); LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs"); if (docCountLimit != -1 && writer.maxDoc() != docCountLimit) { throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit); } if (threads.failed.get()) { throw new RuntimeException("exceptions during indexing"); } final long t2; t2 = System.currentTimeMillis(); final Map<String, String> commitData = new HashMap<String, String>(); commitData.put("userData", "multi"); writer.setCommitData(commitData); writer.commit(); final long t3 = System.currentTimeMillis(); LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)"); if (optimize) { LOG.info("Indexer: merging all segments"); writer.forceMerge(1); final long t4 = System.currentTimeMillis(); LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)"); } // TODO: Doesn't compile for 5.3.1 - do we actually need this? //LOG.info("Indexer: at close: " + writer.segString()); final long tCloseStart = System.currentTimeMillis(); writer.close(); LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec"); dir.close(); final long tFinal = System.currentTimeMillis(); LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)"); LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed()); LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.)) + " GB/hour plain text"); }
From source file: io.anserini.index.IndexTweets.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(//from w w w. ja v a2 s . c o m OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexTweets.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(Paths.get(indexPath)); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file: io.anserini.index.IndexTweetsUpdatePlace.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("collection").hasArg() .withDescription("source collection directory").create(COLLECTION_OPTION)); options.addOption(/*from ww w . j av a 2 s. c om*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexTweetsUpdatePlace.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); System.out.println(collectionPath + " " + indexPath); LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } final StatusStream stream = new JsonStatusCorpusReader(file); final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION))); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.APPEND); final IndexWriter writer = new IndexWriter(dir, config); System.out.print("Original # of docs " + writer.numDocs()); int updateCount = 0; Runtime.getRuntime().addShutdownHook(new Thread() { public void run() { try { stream.close(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { writer.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } try { dir.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("Shutting down"); } }); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getPlace() != null) { // Query q = NumericRangeQuery.newLongRange(TweetStreamReader.StatusField.ID.name, status.getId(), // status.getId(), true, true); // System.out.print("Deleting docCount="+writer.numDocs()); // writer.deleteDocuments(q); // writer.commit(); // System.out.print(" Deleted docCount="+writer.numDocs()); Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new 
TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); doc.add(new DoubleField(StatusField.LONGITUDE.name, status.getLongitude(), Store.YES)); doc.add(new DoubleField(StatusField.LATITUDE.name, status.getlatitude(), Store.YES)); doc.add(new StringField(StatusField.PLACE.name, status.getPlace(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } long id = status.getId(); BytesRefBuilder brb = new BytesRefBuilder(); NumericUtils.longToPrefixCodedBytes(id, 0, brb); Term term = new Term(StatusField.ID.name, brb.get()); writer.updateDocument(term, doc); // writer.addDocument(doc); updateCount += 1; if (updateCount % 10000 == 0) { LOG.info(updateCount + " statuses updated"); writer.commit(); System.out.println("Updated docCount=" + writer.numDocs()); } } } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file: io.anserini.index.IndexWebCollection.java
License: Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
  LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

  final Directory dir = FSDirectory.open(indexPath);
  final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());

  iwc.setSimilarity(new BM25Similarity());
  iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  iwc.setRAMBufferSizeMB(512);
  iwc.setUseCompoundFile(false);
  iwc.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, iwc);

  final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);

  final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
  final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);
  // keep at most doclimit files when a document limit is set
  if (doclimit > 0)
    while (warcFiles.size() > doclimit)
      warcFiles.removeFirst();

  long totalWarcFiles = warcFiles.size();
  LOG.info(totalWarcFiles + " " + suffix + " files found under the docs path: " + docDir.toString());

  // Seed the executor with an initial batch of up to 2000 files.
  for (int i = 0; i < 2000; i++) {
    if (!warcFiles.isEmpty())
      executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
    else {
      if (!executor.isShutdown()) {
        Thread.sleep(30000);
        executor.shutdown();
      }
      break;
    }
  }

  long first = 0;
  // add some delay to let the scheduler spawn some threads
  Thread.sleep(30000);

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
      final long completedTaskCount = executor.getCompletedTaskCount();
      LOG.info(String.format("%.2f percent completed", (double) completedTaskCount / totalWarcFiles * 100.0d));
      if (!warcFiles.isEmpty())
        // Feed one new task for each task completed since the last check.
        for (long i = first; i < completedTaskCount; i++) {
          if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
          else {
            if (!executor.isShutdown())
              executor.shutdown();
          }
        }
      first = completedTaskCount;
      Thread.sleep(1000);
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  if (totalWarcFiles != executor.getCompletedTaskCount())
    throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
        + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());

  int numIndexed = writer.maxDoc();

  try {
    writer.commit();
    if (optimize)
      writer.forceMerge(1);
  } finally {
    writer.close();
  }

  return numIndexed;
}
From source file: io.anserini.index.UpdateIndex.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(/*from www . j a v a2s . c om*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(UpdateIndex.class.getName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); textOptions.setStoreTermVectors(true); LOG.info("index: " + indexPath); File file = new File("PittsburghUserTimeline"); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } final StatusStream stream = new JsonStatusCorpusReader(file); Status status; String s; HashMap<Long, String> hm = new HashMap<Long, String>(); try { while ((s = stream.nextRaw()) != null) { try { status = DataObjectFactory.createStatus(s); if (status.getText() == null) { continue; } hm.put(status.getUser().getId(), hm.get(status.getUser().getId()) + status.getText().replaceAll("[\\r\\n]+", " ")); } catch (Exception e) { } } } catch (Exception e) { e.printStackTrace(); } finally { stream.close(); } ArrayList<String> userIDList = new ArrayList<String>(); try (BufferedReader br = new BufferedReader(new FileReader(new File("userID")))) { String line; while ((line = br.readLine()) != null) { userIDList.add(line.replaceAll("[\\r\\n]+", "")); // process the line. 
} } try { reader = DirectoryReader .open(FSDirectory.open(new File(cmdline.getOptionValue(INDEX_OPTION)).toPath())); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION))); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); final IndexWriter writer = new IndexWriter(dir, config); IndexSearcher searcher = new IndexSearcher(reader); System.out.println("The total number of docs indexed " + searcher.collectionStatistics(TweetStreamReader.StatusField.TEXT.name).docCount()); for (int city = 0; city < cityName.length; city++) { // Pittsburgh's coordinate -79.976389, 40.439722 Query q_long = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LONGITUDE.name, new Double(longitude[city] - 0.05), new Double(longitude[city] + 0.05), true, true); Query q_lat = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LATITUDE.name, new Double(latitude[city] - 0.05), new Double(latitude[city] + 0.05), true, true); BooleanQuery bqCityName = new BooleanQuery(); Term t = new Term("place", cityName[city]); TermQuery query = new TermQuery(t); bqCityName.add(query, BooleanClause.Occur.SHOULD); System.out.println(query.toString()); for (int i = 0; i < cityNameAlias[city].length; i++) { t = new Term("place", cityNameAlias[city][i]); query = new TermQuery(t); bqCityName.add(query, BooleanClause.Occur.SHOULD); System.out.println(query.toString()); } BooleanQuery bq = new BooleanQuery(); BooleanQuery finalQuery = new BooleanQuery(); // either a coordinate match bq.add(q_long, BooleanClause.Occur.MUST); bq.add(q_lat, BooleanClause.Occur.MUST); finalQuery.add(bq, BooleanClause.Occur.SHOULD); // or a place city name match finalQuery.add(bqCityName, BooleanClause.Occur.SHOULD); TotalHitCountCollector totalHitCollector = new TotalHitCountCollector(); // Query hasFieldQuery = new ConstantScoreQuery(new // FieldValueFilter("timeline")); // // searcher.search(hasFieldQuery, totalHitCollector); // // if (totalHitCollector.getTotalHits() > 0) { // TopScoreDocCollector collector = // TopScoreDocCollector.create(Math.max(0, // totalHitCollector.getTotalHits())); // searcher.search(finalQuery, collector); // ScoreDoc[] hits = collector.topDocs().scoreDocs; // // // HashMap<String, Integer> hasHit = new HashMap<String, Integer>(); // int dupcount = 0; // for (int i = 0; i < hits.length; ++i) { // int docId = hits[i].doc; // Document d; // // d = searcher.doc(docId); // // System.out.println(d.getFields()); // } // } // totalHitCollector = new TotalHitCountCollector(); searcher.search(finalQuery, totalHitCollector); if (totalHitCollector.getTotalHits() > 0) { TopScoreDocCollector collector = TopScoreDocCollector .create(Math.max(0, totalHitCollector.getTotalHits())); searcher.search(finalQuery, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits."); HashMap<String, Integer> hasHit = new HashMap<String, Integer>(); int dupcount = 0; for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d; d = searcher.doc(docId); if (userIDList.contains(d.get(IndexTweets.StatusField.USER_ID.name)) && hm.containsKey(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name)))) { // System.out.println("Has timeline field?" 
+ (d.get("timeline") != null)); // System.out.println(reader.getDocCount("timeline")); // d.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), // textOptions)); System.out.println("Found a user hit"); BytesRefBuilder brb = new BytesRefBuilder(); NumericUtils.longToPrefixCodedBytes(Long.parseLong(d.get(IndexTweets.StatusField.ID.name)), 0, brb); Term term = new Term(IndexTweets.StatusField.ID.name, brb.get()); // System.out.println(reader.getDocCount("timeline")); Document d_new = new Document(); // for (IndexableField field : d.getFields()) { // d_new.add(field); // } // System.out.println(d_new.getFields()); d_new.add(new StringField("userBackground", d.get(IndexTweets.StatusField.USER_ID.name), Store.YES)); d_new.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), textOptions)); // System.out.println(d_new.get()); writer.addDocument(d_new); writer.commit(); // t = new Term("label", "why"); // TermQuery tqnew = new TermQuery(t); // // totalHitCollector = new TotalHitCountCollector(); // // searcher.search(tqnew, totalHitCollector); // // if (totalHitCollector.getTotalHits() > 0) { // collector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits())); // searcher.search(tqnew, collector); // hits = collector.topDocs().scoreDocs; // // System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits."); // // for (int k = 0; k < hits.length; k++) { // docId = hits[k].doc; // d = searcher.doc(docId); // System.out.println(d.get(IndexTweets.StatusField.ID.name)); // System.out.println(d.get(IndexTweets.StatusField.PLACE.name)); // } // } // writer.deleteDocuments(term); // writer.commit(); // writer.addDocument(d); // writer.commit(); // System.out.println(reader.getDocCount("timeline")); // writer.updateDocument(term, d); // writer.commit(); } } } } reader.close(); writer.close(); }
From source file: io.anserini.IndexerCW09B.java
License: Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
  System.out.println(
      "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

  final Directory dir = FSDirectory.open(indexPath);
  final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

  iwc.setSimilarity(new BM25Similarity());
  iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
  iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  iwc.setRAMBufferSizeMB(256.0);
  iwc.setUseCompoundFile(false);
  iwc.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, iwc);

  final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

  for (Path f : discoverWarcFiles(docDir))
    executor.execute(new IndexerThread(writer, f));

  // add some delay to let some threads spawn by scheduler
  Thread.sleep(30000);
  executor.shutdown(); // Disable new tasks from being submitted

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
      Thread.sleep(1000);
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  int numIndexed = writer.maxDoc();

  try {
    writer.commit();
  } finally {
    writer.close();
  }

  return numIndexed;
}
From source file: io.anserini.integration.IndexerTest.java
License: Apache License
private void buildTestIndex() throws IOException {
  Directory dir = FSDirectory.open(tempDir1);

  Analyzer analyzer = new EnglishAnalyzer();
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

  IndexWriter writer = new IndexWriter(dir, config);

  FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);
  textOptions.setStoreTermVectors(true);
  textOptions.setStoreTermVectorPositions(true);

  Document doc1 = new Document();
  doc1.add(new StringField("docid", "doc1", Field.Store.YES));
  doc1.add(new Field("text", "here is some text here is some more text", textOptions));
  writer.addDocument(doc1);

  Document doc2 = new Document();
  doc2.add(new StringField("docid", "doc2", Field.Store.YES));
  doc2.add(new Field("text", "more text", textOptions));
  writer.addDocument(doc2);

  Document doc3 = new Document();
  doc3.add(new StringField("docid", "doc3", Field.Store.YES));
  doc3.add(new Field("text", "here is a test", textOptions));
  writer.addDocument(doc3);

  writer.commit();
  writer.forceMerge(1);
  writer.close();
}