List of usage examples for org.apache.lucene.index.IndexWriter.forceMerge
public void forceMerge(int maxNumSegments) throws IOException
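forceMerge(maxNumSegments) rewrites the index into at most maxNumSegments segments; the common call, forceMerge(1), merges everything down to a single segment (what older Lucene versions called "optimize"). It is an expensive, blocking operation, so the examples below typically guard it behind an optimize flag. A minimal sketch of the usual commit-then-merge pattern, assuming a recent (5.x+) Lucene API and a placeholder index path:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // hypothetical path
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
    // ... add documents here ...
    writer.commit();      // make pending changes durable first
    writer.forceMerge(1); // blocking call: merge down to a single segment
    writer.close();
    dir.close();
  }
}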
From source file: io.anserini.index.IndexCollection.java
License: Apache License
public void run() throws IOException, InterruptedException {
  final long start = System.nanoTime();
  LOG.info("Starting indexer...");

  int numThreads = args.threads;
  final Directory dir = FSDirectory.open(indexPath);
  final EnglishAnalyzer analyzer = args.keepStopwords
      ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
      : new EnglishAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setSimilarity(new BM25Similarity());
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setRAMBufferSizeMB(args.memorybufferSize);
  config.setUseCompoundFile(false);
  config.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, config);
  final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
  final List<Path> segmentPaths = collection.getFileSegmentPaths();

  final int segmentCnt = segmentPaths.size();
  LOG.info(segmentCnt + " files found in " + collectionPath.toString());
  for (int i = 0; i < segmentCnt; i++) {
    executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
  }

  executor.shutdown();

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
      LOG.info(String.format("%.2f percent completed",
          (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  if (segmentCnt != executor.getCompletedTaskCount()) {
    throw new RuntimeException("totalFiles = " + segmentCnt
        + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
  }

  int numIndexed = writer.maxDoc();

  try {
    writer.commit();
    if (args.optimize)
      writer.forceMerge(1);
  } finally {
    try {
      writer.close();
    } catch (IOException e) {
      // It is possible that this happens... but nothing much we can do at this point,
      // so just log the error and move on.
      LOG.error(e);
    }
  }

  LOG.info("Indexed documents: " + counters.indexedDocuments.get());
  LOG.info("Empty documents: " + counters.emptyDocuments.get());
  LOG.info("Errors: " + counters.errors.get());

  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + numIndexed + " documents indexed in "
      + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
From source file: io.anserini.index.IndexGov2.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { IndexArgs indexArgs = new IndexArgs(); CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90)); try {// w w w. j a va 2 s.c o m parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: IndexGov2" + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } final String dirPath = indexArgs.index; final String dataDir = indexArgs.input; final int docCountLimit = indexArgs.doclimit; final int numThreads = indexArgs.threads; final boolean positions = indexArgs.positions; final boolean optimize = indexArgs.optimize; final Analyzer a = new EnglishAnalyzer(); final TrecContentSource trecSource = createGov2Source(dataDir); final Directory dir = FSDirectory.open(Paths.get(dirPath)); LOG.info("Index path: " + dirPath); LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit)); LOG.info("Threads: " + numThreads); LOG.info("Positions: " + positions); LOG.info("Optimize (merge segments): " + optimize); final IndexWriterConfig config = new IndexWriterConfig(a); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); final IndexWriter writer = new IndexWriter(dir, config); Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit); LOG.info("Indexer: start"); final long t0 = System.currentTimeMillis(); threads.start(); while (!threads.done()) { Thread.sleep(100); } threads.stop(); final long t1 = System.currentTimeMillis(); LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs"); if (docCountLimit != -1 && writer.maxDoc() != docCountLimit) { throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit); } if (threads.failed.get()) { throw new RuntimeException("exceptions during indexing"); } final long t2; t2 = System.currentTimeMillis(); final Map<String, String> commitData = new HashMap<String, String>(); commitData.put("userData", "multi"); writer.setCommitData(commitData); writer.commit(); final long t3 = System.currentTimeMillis(); LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)"); if (optimize) { LOG.info("Indexer: merging all segments"); writer.forceMerge(1); final long t4 = System.currentTimeMillis(); LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)"); } // TODO: Doesn't compile for 5.3.1 - do we actually need this? //LOG.info("Indexer: at close: " + writer.segString()); final long tCloseStart = System.currentTimeMillis(); writer.close(); LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec"); dir.close(); final long tFinal = System.currentTimeMillis(); LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)"); LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed()); LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.)) + " GB/hour plain text"); }
From source file: io.anserini.index.IndexTweets.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(/* w w w.ja v a 2 s. co m*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexTweets.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(Paths.get(indexPath)); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file: io.anserini.index.IndexWebCollection.java
License: Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
  LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

  final Directory dir = FSDirectory.open(indexPath);
  final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());
  iwc.setSimilarity(new BM25Similarity());
  iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  iwc.setRAMBufferSizeMB(512);
  iwc.setUseCompoundFile(false);
  iwc.setMergeScheduler(new ConcurrentMergeScheduler());

  final IndexWriter writer = new IndexWriter(dir, iwc);
  final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);

  final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
  final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);
  // Note: as written, the pruning loop below never runs (i starts at doclimit,
  // which exceeds warcFiles.size() under this condition); kept as in the original source.
  if (doclimit > 0 && warcFiles.size() < doclimit)
    for (int i = doclimit; i < warcFiles.size(); i++)
      warcFiles.removeFirst();

  long totalWarcFiles = warcFiles.size();
  LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString());

  for (int i = 0; i < 2000; i++) {
    if (!warcFiles.isEmpty())
      executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
    else {
      if (!executor.isShutdown()) {
        Thread.sleep(30000);
        executor.shutdown();
      }
      break;
    }
  }

  long first = 0;
  // add some delay to let the scheduler spawn some threads
  Thread.sleep(30000);

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
      final long completedTaskCount = executor.getCompletedTaskCount();
      LOG.info(String.format("%.2f percentage completed",
          (double) completedTaskCount / totalWarcFiles * 100.0d));
      if (!warcFiles.isEmpty())
        for (long i = first; i < completedTaskCount; i++) {
          if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
          else {
            if (!executor.isShutdown())
              executor.shutdown();
          }
        }
      first = completedTaskCount;
      Thread.sleep(1000);
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  if (totalWarcFiles != executor.getCompletedTaskCount())
    throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
        + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());

  int numIndexed = writer.maxDoc();

  try {
    writer.commit();
    if (optimize)
      writer.forceMerge(1);
  } finally {
    writer.close();
  }

  return numIndexed;
}
From source file: io.anserini.integration.IndexerTest.java
License: Apache License
private void buildTestIndex() throws IOException {
  Directory dir = FSDirectory.open(tempDir1);

  Analyzer analyzer = new EnglishAnalyzer();
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

  IndexWriter writer = new IndexWriter(dir, config);

  FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);
  textOptions.setStoreTermVectors(true);
  textOptions.setStoreTermVectorPositions(true);

  Document doc1 = new Document();
  doc1.add(new StringField("docid", "doc1", Field.Store.YES));
  doc1.add(new Field("text", "here is some text here is some more text", textOptions));
  writer.addDocument(doc1);

  Document doc2 = new Document();
  doc2.add(new StringField("docid", "doc2", Field.Store.YES));
  doc2.add(new Field("text", "more text", textOptions));
  writer.addDocument(doc2);

  Document doc3 = new Document();
  doc3.add(new StringField("docid", "doc3", Field.Store.YES));
  doc3.add(new Field("text", "here is a test", textOptions));
  writer.addDocument(doc3);

  writer.commit();
  writer.forceMerge(1);
  writer.close();
}
From source file: io.anserini.integration.IndexerTest.java
License: Apache License
@Test
public void testCloneIndex() throws Exception {
  System.out.println("Cloning index:");
  Directory dir1 = FSDirectory.open(tempDir1);
  IndexReader reader = DirectoryReader.open(dir1);

  Directory dir2 = FSDirectory.open(tempDir2);
  IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir2, config);

  LeafReader leafReader = reader.leaves().get(0).reader();
  CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
  writer.addIndexes(new MyFilterCodecReader(codecReader));
  writer.commit();
  writer.forceMerge(1);
  writer.close();
  reader.close();

  // Open up the cloned index and verify it.
  reader = DirectoryReader.open(dir2);
  assertEquals(3, reader.numDocs());
  assertEquals(1, reader.leaves().size());

  System.out.println("Dumping out postings...");
  dumpPostings(reader);

  assertEquals(2, reader.docFreq(new Term("text", "here")));
  assertEquals(2, reader.docFreq(new Term("text", "more")));
  assertEquals(1, reader.docFreq(new Term("text", "some")));
  assertEquals(1, reader.docFreq(new Term("text", "test")));
  assertEquals(2, reader.docFreq(new Term("text", "text")));

  reader.close();
}
From source file: io.bfscan.clueweb12.BuildWarcTrecIdMapping.java
License: Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*from w w w .j a v a 2 s.c om*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); FileInputStream fis = null; BufferedReader br = null; try { fis = new FileInputStream(new File(path)); byte[] ignoreBytes = new byte[2]; fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8")); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String s; while ((s = br.readLine()) != null) { Runnable worker = new AddDocumentRunnable(writer, s); executor.execute(worker); cnt++; if (cnt % 1000000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); br.close(); fis.close(); } }
From source file: io.datalayer.lucene.index.LuceneLifecycleTest.java
License: Apache License
/**
 * #1 Index contains deletions
 *
 * #2 1 indexed document, 1 deleted document
 *
 * #3 Optimize compacts deletes
 */
@Test
public void testDeleteAfterOptimize() throws IOException {
  IndexWriter writer = getWriter();
  assertEquals(2, writer.numDocs());
  writer.deleteDocuments(new Term("id", "1"));
  writer.forceMerge(1);
  writer.commit();
  assertFalse(writer.hasDeletions());
  assertEquals(1, writer.maxDoc());
  assertEquals(1, writer.numDocs());
  writer.close();
}
From source file: io.datalayer.lucene.read.LuceneReaderTest.java
License: Apache License
private static void addDocuments(Directory dir) throws IOException {
  IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, getAnalyzer());
  IndexWriter writer = new IndexWriter(dir, config);
  for (int i = 0; i < keywords.length; i++) {
    Document doc = new Document();
    doc.add(new Field("id", keywords[i], AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("city", text[i], AosFieldType.INDEXED_STORED_TERMVECTORS));
    doc.add(new Field("country", unindexed[i], AosFieldType.INDEXEDNOT_STORED_TERMVECTORSNOT));
    doc.add(new Field("contents", unstored[i], AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));
    writer.addDocument(doc);
  }
  writer.forceMerge(1);
  writer.close();
}
From source file: io.github.infolis.algorithm.Indexer.java
License: Apache License
@Override
public void execute() throws IOException {
  File indexDir;
  if (null != getExecution().getIndexDirectory() && !getExecution().getIndexDirectory().isEmpty()) {
    indexDir = new File(getExecution().getIndexDirectory());
  } else {
    indexDir = new File(
        Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), INDEX_DIR_PREFIX)
            .toString());
    FileUtils.forceDeleteOnExit(indexDir);
  }
  log.debug("Indexing to: " + indexDir.getAbsolutePath());
  getExecution().setOutputDirectory(indexDir.getAbsolutePath().toString());

  IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35, createAnalyzer());
  indexWriterConfig.setOpenMode(OpenMode.CREATE);
  FSDirectory fsIndexDir = FSDirectory.open(indexDir);

  List<InfolisFile> files = new ArrayList<>();
  for (String fileUri : getExecution().getInputFiles()) {
    try {
      files.add(this.getInputDataStoreClient().get(InfolisFile.class, fileUri));
    } catch (Exception e) {
      error(log, "Could not retrieve file " + fileUri + ": " + e.getMessage());
      getExecution().setStatus(ExecutionStatus.FAILED);
      persistExecution();
      return;
    }
  }

  Date start = new Date();
  log.debug("Starting to index");
  IndexWriter writer = new IndexWriter(fsIndexDir, indexWriterConfig);
  try {
    int counter = 0;
    for (InfolisFile file : files) {
      counter++;
      log.trace("Indexing file " + file);
      writer.addDocument(toLuceneDocument(getInputFileResolver(), file));
      updateProgress(counter, files.size());
    }
  } catch (FileNotFoundException fnfe) {
    // NOTE: at least on Windows, some temporary files raise this exception
    // with an "access denied" message; checking if the file can be read doesn't help
    throw new RuntimeException("Could not write index entry: " + fnfe);
  } finally {
    log.debug("Merging all Lucene segments ...");
    writer.forceMerge(1);
    writer.close();
  }

  getExecution().setStatus(ExecutionStatus.FINISHED);
  fsIndexDir.close();
  log.debug(String.format("Indexing %s documents took %s ms", files.size(),
      new Date().getTime() - start.getTime()));
}