List of usage examples for the org.apache.lucene.index.ConcurrentMergeScheduler constructor
public ConcurrentMergeScheduler()
From source file:com.flaptor.hounder.Index.java
License:Apache License
private Index(final File path, boolean create) { if (!create) { if (!path.exists()) { throw new IllegalArgumentException("The path passed to the costructor doesn't exist."); }/*ww w. j av a2 s. c o m*/ if (!path.isDirectory()) { throw new IllegalArgumentException("The path passed to the constructor is not a directory"); } } else { if (path.exists()) { throw new IllegalArgumentException( "Cannot create index on " + path.getAbsolutePath() + ". Path exists."); } } this.path = path; properties = new Properties(); if (!create) { File propFile = new File(path.getAbsolutePath() + File.separator + "index.properties"); if (!propFile.exists()) { throw new IllegalArgumentException( "Cannot find the properties inside the index. Is the index corrupted?."); } if (!propFile.isFile()) { throw new IllegalArgumentException( "There's no file named index.properties in the index. Maybe it is a directory?"); } InputStream is = null; try { is = new FileInputStream(propFile); properties.load(is); indexDescriptor = new IndexDescriptor(properties.get("indexDescriptor").toString()); } catch (IOException e) { logger.error( "Exception while trying to load index.properties for index at " + path.getAbsolutePath(), e); } catch (NullPointerException e) { logger.error("There is no index descriptor on " + propFile.getName() + ". using default. 
Exception caused by " + e.getMessage(), e); indexDescriptor = IndexDescriptor.defaultDescriptor(); } finally { com.flaptor.util.Execute.close(is, logger); } } Config config = Config.getConfig("common.properties"); failOnLegacyParameters(config); ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler(); cms.setMaxThreadCount(12); cms.setMergeThreadPriority(Thread.MIN_PRIORITY); this.mergeScheduler = cms; LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(); smallSegmentSize = config.getFloat("Index.smallSegmentSizeMB"); mp.setMinMergeMB(smallSegmentSize); mp.setMergeFactor(config.getInt("Index.mergeFactor")); mergePolicy = mp; createAnalyzer(); setUpDirectory(create); }
From source file:com.qwazr.search.bench.LuceneCommonIndex.java
License:Apache License
LuceneCommonIndex(final Path rootDirectory, final String schemaName, final String indexName, final double ramBufferSize, final boolean useCompoundFile) throws IOException { final Path schemaDirectory = Files.createDirectory(rootDirectory.resolve(schemaName)); this.indexDirectory = Files.createDirectory(schemaDirectory.resolve(indexName)); this.luceneDirectory = indexDirectory.resolve("data"); this.dataDirectory = FSDirectory.open(luceneDirectory); final IndexWriterConfig indexWriterConfig = new IndexWriterConfig( new PerFieldAnalyzerWrapper(new StandardAnalyzer())); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setRAMBufferSizeMB(ramBufferSize); final ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); mergeScheduler.setMaxMergesAndThreads(MAX_SSD_MERGE_THREADS, MAX_SSD_MERGE_THREADS); indexWriterConfig.setMergeScheduler(mergeScheduler); indexWriterConfig.setUseCompoundFile(useCompoundFile); final TieredMergePolicy mergePolicy = new TieredMergePolicy(); indexWriterConfig.setMergePolicy(mergePolicy); // We use snapshots deletion policy final SnapshotDeletionPolicy snapshotDeletionPolicy = new SnapshotDeletionPolicy( indexWriterConfig.getIndexDeletionPolicy()); indexWriterConfig.setIndexDeletionPolicy(snapshotDeletionPolicy); this.indexWriter = new IndexWriter(this.dataDirectory, indexWriterConfig); this.localReplicator = new LocalReplicator(); }
From source file:io.anserini.embeddings.search.IndexW2V.java
License:Apache License
/**
 * Indexes the tab-separated embeddings file {@code args.input} into a fresh Lucene index at
 * {@code indexPath}.
 *
 * <p>The first line of the input is read and discarded (presumably a header line — confirm
 * against the input format). Every following line is split on tabs: column 0 is stored as the
 * id field and column 1 as the stored body field of one document. Lines with fewer than two
 * columns are skipped with a warning.
 *
 * <p>The reader, the writer and the directory are closed when the method returns, whether or
 * not indexing succeeded.
 *
 * @throws IOException          if the input cannot be read or the index cannot be written
 * @throws InterruptedException declared for interface compatibility
 */
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");

    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    // Resources are released in reverse order: reader, writer, directory.
    try (Directory dir = FSDirectory.open(indexPath);
            IndexWriter writer = new IndexWriter(dir, config);
            BufferedReader bRdr = new BufferedReader(new FileReader(args.input))) {

        // Discard the first line before reading term rows.
        bRdr.readLine();

        String line;
        while ((line = bRdr.readLine()) != null) {
            String[] termEmbedding = line.trim().split("\t");
            if (termEmbedding.length < 2) {
                // E.g. a blank trailing line; indexing it would fail on the missing column.
                LOG.warn("Skipping malformed embedding line: " + line);
                continue;
            }

            // One document per term; reusing a single Document would accumulate fields.
            Document document = new Document();
            document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES));
            document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1]));
            writer.addDocument(document);
        }

        writer.commit();
    }
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { System.out.println(/*from w ww . j a v a 2 s .co m*/ "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ExecutorService executor = Executors.newFixedThreadPool(numThreads); List<Path> warcFiles = discoverWarcFiles(docDir); if (doclimit > 0 && warcFiles.size() < doclimit) warcFiles = warcFiles.subList(0, doclimit); for (Path f : warcFiles) executor.execute(new IndexerThread(writer, f)); //add some delay to let some threads spawn by scheduler Thread.sleep(30000); executor.shutdown(); // Disable new tasks from being submitted try { // Wait for existing tasks to terminate while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } int numIndexed = writer.maxDoc(); try { writer.commit(); if (optimize) writer.forceMerge(1); } finally { writer.close(); } return numIndexed; }
From source file:io.anserini.index.IndexCollection.java
License:Apache License
public void run() throws IOException, InterruptedException { final long start = System.nanoTime(); LOG.info("Starting indexer..."); int numThreads = args.threads; final Directory dir = FSDirectory.open(indexPath); final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer(); final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setSimilarity(new BM25Similarity()); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memorybufferSize); config.setUseCompoundFile(false);//from ww w . j a v a2 s . c om config.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, config); final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); final List<Path> segmentPaths = collection.getFileSegmentPaths(); final int segmentCnt = segmentPaths.size(); LOG.info(segmentCnt + " files found in " + collectionPath.toString()); for (int i = 0; i < segmentCnt; i++) { executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i))); } executor.shutdown(); try { // Wait for existing tasks to terminate while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { LOG.info(String.format("%.2f percent completed", (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d)); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } if (segmentCnt != executor.getCompletedTaskCount()) { throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); } int numIndexed = writer.maxDoc(); try { writer.commit(); if (args.optimize) writer.forceMerge(1); } finally { try { writer.close(); } catch (IOException e) { // It is possible that this happens... 
but nothing much we can do at this point, // so just log the error and move on. LOG.error(e); } } LOG.info("Indexed documents: " + counters.indexedDocuments.get()); LOG.info("Empty documents: " + counters.emptyDocuments.get()); LOG.info("Errors: " + counters.errors.get()); final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); }
From source file:io.anserini.index.IndexWebCollection.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(512);//from ww w . j av a2 s . c o m iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz"; final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix); if (doclimit > 0 && warcFiles.size() < doclimit) for (int i = doclimit; i < warcFiles.size(); i++) warcFiles.removeFirst(); long totalWarcFiles = warcFiles.size(); LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString()); for (int i = 0; i < 2000; i++) { if (!warcFiles.isEmpty()) executor.execute(new IndexerThread(writer, warcFiles.removeFirst())); else { if (!executor.isShutdown()) { Thread.sleep(30000); executor.shutdown(); } break; } } long first = 0; //add some delay to let some threads spawn by scheduler Thread.sleep(30000); try { // Wait for existing tasks to terminate while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { final long completedTaskCount = executor.getCompletedTaskCount(); LOG.info(String.format("%.2f percentage completed", (double) completedTaskCount / totalWarcFiles * 100.0d)); if (!warcFiles.isEmpty()) for (long i = first; i < completedTaskCount; i++) { if (!warcFiles.isEmpty()) executor.execute(new IndexerThread(writer, warcFiles.removeFirst())); else { if (!executor.isShutdown()) executor.shutdown(); } } first = completedTaskCount; 
Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } if (totalWarcFiles != executor.getCompletedTaskCount()) throw new RuntimeException("totalWarcFiles = " + totalWarcFiles + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); int numIndexed = writer.maxDoc(); try { writer.commit(); if (optimize) writer.forceMerge(1); } finally { writer.close(); } return numIndexed; }
From source file:io.anserini.IndexerCW09B.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { System.out.println(//from w w w. j a va 2 s . co m "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ExecutorService executor = Executors.newFixedThreadPool(numThreads); for (Path f : discoverWarcFiles(docDir)) executor.execute(new IndexerThread(writer, f)); //add some delay to let some threads spawn by scheduler Thread.sleep(30000); executor.shutdown(); // Disable new tasks from being submitted try { // Wait for existing tasks to terminate while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } int numIndexed = writer.maxDoc(); try { writer.commit(); } finally { writer.close(); } return numIndexed; }
From source file:org.apache.nifi.provenance.lucene.SimpleIndexManager.java
License:Apache License
/**
 * Opens a Lucene index writer on {@code indexDirectory} and wraps it in an
 * {@code IndexWriterCount}.
 *
 * <p>The writer uses a {@link StandardAnalyzer} and a {@link ConcurrentMergeScheduler} whose
 * merge and thread limits both come from {@code repoConfig.getConcurrentMergeThreads()}. If an
 * {@link IOException} occurs after the directory has been opened, every resource opened so far
 * is closed (close failures are attached as suppressed exceptions) and the original exception
 * is rethrown.
 *
 * @param indexDirectory directory holding the index
 * @return the newly created writer wrapper
 * @throws IOException if the directory or the writer cannot be opened
 */
private IndexWriterCount createWriter(final File indexDirectory) throws IOException {
    // Resources to release if setup fails part-way through.
    final List<Closeable> openedResources = new ArrayList<>();

    final Directory luceneDirectory = FSDirectory.open(indexDirectory);
    openedResources.add(luceneDirectory);

    try {
        final Analyzer standardAnalyzer = new StandardAnalyzer();
        openedResources.add(standardAnalyzer);

        final IndexWriterConfig writerConfig =
                new IndexWriterConfig(LuceneUtil.LUCENE_VERSION, standardAnalyzer);

        final int threadLimit = repoConfig.getConcurrentMergeThreads();
        final ConcurrentMergeScheduler scheduler = new ConcurrentMergeScheduler();
        scheduler.setMaxMergesAndThreads(threadLimit, threadLimit);
        writerConfig.setMergeScheduler(scheduler);

        final IndexWriter luceneWriter = new IndexWriter(luceneDirectory, writerConfig);
        final EventIndexWriter eventWriter = new LuceneEventIndexWriter(luceneWriter, indexDirectory);

        final IndexWriterCount result =
                new IndexWriterCount(eventWriter, standardAnalyzer, luceneDirectory, 1, false);
        logger.debug("Providing new index writer for {}", indexDirectory);
        return result;
    } catch (final IOException failure) {
        for (final Closeable resource : openedResources) {
            try {
                resource.close();
            } catch (final IOException closeFailure) {
                failure.addSuppressed(closeFailure);
            }
        }

        throw failure;
    }
}
From source file:org.apache.nifi.provenance.lucene.StandardIndexManager.java
License:Apache License
private IndexWriterCount createWriter(final File indexDirectory) throws IOException { final List<Closeable> closeables = new ArrayList<>(); final Directory directory = FSDirectory.open(indexDirectory.toPath()); closeables.add(directory);//from w w w .j av a2 s.c o m try { final Analyzer analyzer = new StandardAnalyzer(); closeables.add(analyzer); final IndexWriterConfig config = new IndexWriterConfig(analyzer); final ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); final int mergeThreads = repoConfig.getConcurrentMergeThreads(); mergeScheduler.setMaxMergesAndThreads(mergeThreads, mergeThreads); config.setMergeScheduler(mergeScheduler); final IndexWriter indexWriter = new IndexWriter(directory, config); final EventIndexWriter eventIndexWriter = new LuceneEventIndexWriter(indexWriter, indexDirectory); final IndexWriterCount writerCount = new IndexWriterCount(eventIndexWriter, analyzer, directory, 1, false); logger.debug("Providing new index writer for {}", indexDirectory); return writerCount; } catch (final IOException ioe) { for (final Closeable closeable : closeables) { try { closeable.close(); } catch (final IOException ioe2) { ioe.addSuppressed(ioe2); } } throw ioe; } }
From source file:org.compass.core.lucene.engine.merge.scheduler.ConcurrentMergeSchedulerProvider.java
License:Apache License
/** * Returns Lucene {@link org.apache.lucene.index.ConcurrentMergeScheduler} allowing to configure * using {@link org.compass.core.lucene.LuceneEnvironment.MergeScheduler.Concurrent}. */// w w w . ja v a 2 s. com public MergeScheduler create(LuceneSearchEngineIndexManager indexManager, CompassSettings settings) throws SearchEngineException { ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); mergeScheduler.setMaxThreadCount( settings.getSettingAsInt(LuceneEnvironment.MergeScheduler.Concurrent.MAX_THREAD_COUNT, 3)); mergeScheduler.setMergeThreadPriority(settings.getSettingAsInt( LuceneEnvironment.MergeScheduler.Concurrent.THREAD_PRIORITY, Thread.NORM_PRIORITY)); return mergeScheduler; }