List of usage examples for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB
@Override public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB)
From source file:edu.virginia.cs.index.UserIndexer.java
/** * Creates the initial index files on disk * * @param indexPath/* w w w .j a v a 2 s . c o m*/ * @return * @throws IOException */ private static IndexWriter setupIndex(String indexPath) throws IOException { Analyzer analyzer = new SpecialAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer); config.setOpenMode(OpenMode.CREATE); config.setRAMBufferSizeMB(2048.0); FSDirectory dir = FSDirectory.open(new File(indexPath)); IndexWriter writer = new IndexWriter(dir, config); return writer; }
From source file:eu.eexcess.sourceselection.redde.indexer.BinaryIndexResource.java
License:Apache License
/**
 * Opens the sample index at {@code outIndexPath} for writing; an existing index is
 * overwritten. The resulting writer is stored in {@code outIndexWriter}.
 *
 * @param ramBufferSizeMB
 *            determines the amount of RAM that may be used for buffering
 * @throws IOException
 *             if unable to open/create index (the failure is logged, then rethrown)
 */
void openOutIndex(double ramBufferSizeMB) throws IOException {
    try {
        Directory target = FSDirectory.open(new File(outIndexPath));

        IndexWriterConfig settings = new IndexWriterConfig(luceneVersion, new EnglishAnalyzer());
        settings.setOpenMode(OpenMode.CREATE);
        settings.setRAMBufferSizeMB(ramBufferSizeMB);

        outIndexWriter = new IndexWriter(target, settings);
    } catch (IOException failure) {
        final String message = "unable to open/create index at [" + outIndexPath + "]";
        logger.log(Level.SEVERE, message, failure);
        throw failure;
    }
}
From source file:eu.eexcess.sourceselection.redde.indexer.TrecToLuceneIndexBuilder.java
License:Apache License
/**
 * Builds/overwrites existing Lucene index using TREC documents as source.
 *
 * <p>Reads documents from {@code documentsPath} and writes the index to {@code indexPath}
 * using an {@code EnglishAnalyzer}, {@code OpenMode.CREATE} and the configured
 * {@code ramBufferSize}. If the documents directory is missing or unreadable, the problem is
 * logged and nothing is indexed. I/O failures during indexing are logged, not propagated.
 */
public void index() {
    Date startTimestamp = new Date();
    final File documentsDirectory = new File(documentsPath);

    if (!documentsDirectory.exists() || !documentsDirectory.canRead()) {
        logger.severe("cannot access document directory [" + documentsDirectory.getAbsolutePath() + "]");
        return;
    }

    try {
        logger.info("processing directory [" + documentsPath + "] to index [" + indexPath + "]");

        Directory indexDirectory = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new EnglishAnalyzer();
        IndexWriterConfig writerConfig = new IndexWriterConfig(luceneVersion, analyzer);
        writerConfig.setOpenMode(OpenMode.CREATE);
        writerConfig.setRAMBufferSizeMB(ramBufferSize);

        IndexWriter indexWriter = new IndexWriter(indexDirectory, writerConfig);
        try {
            indexDocs(indexWriter, documentsDirectory);
            indexWriter.commit();
        } finally {
            // Always release the writer (and the index write lock it holds), even when
            // indexing fails half-way; previously a failure left the writer open.
            // NOTE(review): on the failure path close() may persist the documents buffered
            // so far, depending on the Lucene version - confirm this is acceptable.
            indexWriter.close();
        }

        Date stopTimestamp = new Date();
        logger.info("processed [" + dirsCount + "] dirs [" + filesCount + "] files [" + documentsTotal
                + "] documents [" + filesSkipped + "] files skipped in ["
                + (stopTimestamp.getTime() - startTimestamp.getTime()) + "] ms]");
    } catch (IOException e) {
        logger.log(Level.SEVERE, "failed indexing documents", e);
    }
}
From source file:indexer.LuceneIndexer.java
/** * Indexing the files. This method checks for the directories and then * finishes out after the indexing is complete. * @param global This is for reference to the global class variables * and methods.//from www. java 2 s .c om * @param createIndex If true a new index will be created from scratch * and the old index will be destroyed. * @param indexPanel If true it will also print the console printout lines * to the main panel. */ public static void IndexFiles(Global global, Boolean createIndex) { String dataDir = global.dataDir; String indexDir = global.indexDir; //Verifies that the data directory exists if (dataDir == null) { System.err.println("Data Directory Is not accessable, Unable to Index files."); } //Verifies that the data directory is readable and writeable final Path docDir = Paths.get(dataDir); if (!Files.isReadable(docDir)) { System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path"); } startTime = new Date(); try { System.out.println("Indexing to directory '" + indexDir + "'..."); //Setups the analyzer Analyzer analyzer; try (Directory dir = FSDirectory.open(Paths.get(indexDir))) { analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (createIndex) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } iwc.setRAMBufferSizeMB(global.RAM_BUFFER_SIZE); iwc.setMaxBufferedDocs(global.MAX_BUFFERED_DOCS); LogDocMergePolicy ldmp = new LogDocMergePolicy(); ldmp.setMergeFactor(global.MERGE_FACTOR); iwc.setMergePolicy(ldmp); try (IndexWriter writer = new IndexWriter(dir, iwc)) { hm.clear(); indexDocs(writer, docDir, global); //This is a costly operation, we scheduled the time to apply it if (global.merge) { System.out.println("Starting Merge"); 
writer.forceMerge(1); global.merge = false; } writer.close(); } finishTime = new Date(); long millis = finishTime.getTime() - startTime.getTime(); totalTime = String.format("%02dhr %02dmin %02dsec", TimeUnit.MILLISECONDS.toHours(millis), TimeUnit.MILLISECONDS.toMinutes(millis) - TimeUnit.HOURS.toMinutes(TimeUnit.MILLISECONDS.toHours(millis)), // The change is in this line TimeUnit.MILLISECONDS.toSeconds(millis) - TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))); System.out.println(""); System.out.println(""); System.out.println("Start Time: " + global.sdf.format(startTime.getTime())); System.out.println("Building List Time: " + listBuildTime); System.out.println("Indexing Time: " + indexingTime); System.out.println("Total Time: " + totalTime); System.out.println("Number of Documents: " + amountOfDocuments); System.out.println("Finish Time: " + global.sdf.format(finishTime.getTime())); System.out.println(""); } analyzer.close(); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); log.fatal(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:intelligentWebAlgorithms.algos.search.lucene.LuceneIndexBuilder.java
License:Apache License
/**
 * Opens a writer over the index stored in the given directory.
 *
 * <p>The writer uses a {@code StandardAnalyzer}, {@code OpenMode.CREATE_OR_APPEND} and a RAM
 * buffer of {@code RamBufferSizeMB} megabytes.
 *
 * @param file the index directory
 * @return the opened index writer
 * @throws IOException if the directory or the writer cannot be opened
 */
private IndexWriter getIndexWriter(File file) throws IOException {
    FSDirectory indexLocation = FSDirectory.open(file);

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig settings = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    settings.setOpenMode(OpenMode.CREATE_OR_APPEND);
    settings.setRAMBufferSizeMB(RamBufferSizeMB);

    IndexWriter indexWriter = new IndexWriter(indexLocation, settings);
    return indexWriter;
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { System.out.println(//w w w . j a v a 2 s .c om "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ExecutorService executor = Executors.newFixedThreadPool(numThreads); List<Path> warcFiles = discoverWarcFiles(docDir); if (doclimit > 0 && warcFiles.size() < doclimit) warcFiles = warcFiles.subList(0, doclimit); for (Path f : warcFiles) executor.execute(new IndexerThread(writer, f)); //add some delay to let some threads spawn by scheduler Thread.sleep(30000); executor.shutdown(); // Disable new tasks from being submitted try { // Wait for existing tasks to terminate while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } int numIndexed = writer.maxDoc(); try { writer.commit(); if (optimize) writer.forceMerge(1); } finally { writer.close(); } return numIndexed; }
From source file:io.anserini.index.IndexCollection.java
License:Apache License
public void run() throws IOException, InterruptedException { final long start = System.nanoTime(); LOG.info("Starting indexer..."); int numThreads = args.threads; final Directory dir = FSDirectory.open(indexPath); final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer(); final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setSimilarity(new BM25Similarity()); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memorybufferSize); config.setUseCompoundFile(false);/*from w w w . j ava 2 s . c o m*/ config.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, config); final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); final List<Path> segmentPaths = collection.getFileSegmentPaths(); final int segmentCnt = segmentPaths.size(); LOG.info(segmentCnt + " files found in " + collectionPath.toString()); for (int i = 0; i < segmentCnt; i++) { executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i))); } executor.shutdown(); try { // Wait for existing tasks to terminate while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { LOG.info(String.format("%.2f percent completed", (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d)); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } if (segmentCnt != executor.getCompletedTaskCount()) { throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); } int numIndexed = writer.maxDoc(); try { writer.commit(); if (args.optimize) writer.forceMerge(1); } finally { try { writer.close(); } catch (IOException e) { // It is possible that this happens... 
but nothing much we can do at this point, // so just log the error and move on. LOG.error(e); } } LOG.info("Indexed documents: " + counters.indexedDocuments.get()); LOG.info("Empty documents: " + counters.emptyDocuments.get()); LOG.info("Errors: " + counters.errors.get()); final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); }
From source file:io.anserini.index.IndexWebCollection.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(512); iwc.setUseCompoundFile(false);//from www .ja v a2 s . com iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz"; final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix); if (doclimit > 0 && warcFiles.size() < doclimit) for (int i = doclimit; i < warcFiles.size(); i++) warcFiles.removeFirst(); long totalWarcFiles = warcFiles.size(); LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString()); for (int i = 0; i < 2000; i++) { if (!warcFiles.isEmpty()) executor.execute(new IndexerThread(writer, warcFiles.removeFirst())); else { if (!executor.isShutdown()) { Thread.sleep(30000); executor.shutdown(); } break; } } long first = 0; //add some delay to let some threads spawn by scheduler Thread.sleep(30000); try { // Wait for existing tasks to terminate while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { final long completedTaskCount = executor.getCompletedTaskCount(); LOG.info(String.format("%.2f percentage completed", (double) completedTaskCount / totalWarcFiles * 100.0d)); if (!warcFiles.isEmpty()) for (long i = first; i < completedTaskCount; i++) { if (!warcFiles.isEmpty()) executor.execute(new IndexerThread(writer, warcFiles.removeFirst())); else { if (!executor.isShutdown()) executor.shutdown(); } } first = completedTaskCount; Thread.sleep(1000); } 
} catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } if (totalWarcFiles != executor.getCompletedTaskCount()) throw new RuntimeException("totalWarcFiles = " + totalWarcFiles + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); int numIndexed = writer.maxDoc(); try { writer.commit(); if (optimize) writer.forceMerge(1); } finally { writer.close(); } return numIndexed; }
From source file:io.anserini.IndexerCW09B.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { System.out.println(/*from w w w.ja v a2s. c o m*/ "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ExecutorService executor = Executors.newFixedThreadPool(numThreads); for (Path f : discoverWarcFiles(docDir)) executor.execute(new IndexerThread(writer, f)); //add some delay to let some threads spawn by scheduler Thread.sleep(30000); executor.shutdown(); // Disable new tasks from being submitted try { // Wait for existing tasks to terminate while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } int numIndexed = writer.maxDoc(); try { writer.commit(); } finally { writer.close(); } return numIndexed; }
From source file:io.druid.extension.lucene.LuceneDruidSegment.java
License:Apache License
private static IndexWriter buildRamWriter(RAMDirectory dir, Analyzer analyzer, int maxDocsPerSegment) throws IOException { IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer); writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); // some arbitrary large numbers writerConfig.setMaxBufferedDocs(maxDocsPerSegment * 2); writerConfig.setRAMBufferSizeMB(5000); writerConfig.setUseCompoundFile(false); writerConfig.setCommitOnClose(true); writerConfig.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); writerConfig.setMergePolicy(NoMergePolicy.INSTANCE); writerConfig.setMergeScheduler(NoMergeScheduler.INSTANCE); return new IndexWriter(dir, writerConfig); }