List of usage examples for org.apache.lucene.index IndexWriterConfig setUseCompoundFile
@Override public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile)
From source file:com.github.lucene.store.jdbc.AbstractJdbcDirectoryITest.java
License:Apache License
protected void addDocuments(final Directory directory, final OpenMode openMode, final boolean useCompoundFile, final Collection<String> docs) throws IOException { final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(OpenMode.CREATE); config.setUseCompoundFile(useCompoundFile); final DirectoryTemplate template = new DirectoryTemplate(directory); template.execute(new DirectoryTemplate.DirectoryCallbackWithoutResult() { @Override//from ww w . j a va 2 s . c om public void doInDirectoryWithoutResult(final Directory dir) throws IOException { final IndexWriter writer = new IndexWriter(dir, config); for (final Object element : docs) { final Document doc = new Document(); final String word = (String) element; // FIXME: review // doc.add(new Field("keyword", word, Field.Store.YES, // Field.Index.UN_TOKENIZED)); // doc.add(new Field("unindexed", word, Field.Store.YES, // Field.Index.NO)); // doc.add(new Field("unstored", word, Field.Store.NO, // Field.Index.TOKENIZED)); // doc.add(new Field("text", word, Field.Store.YES, // Field.Index.TOKENIZED)); doc.add(new StringField("keyword", word, Field.Store.YES)); doc.add(new StringField("unindexed", word, Field.Store.YES)); doc.add(new StringField("unstored", word, Field.Store.NO)); doc.add(new StringField("text", word, Field.Store.YES)); writer.addDocument(doc); } // FIXME: review // writer.optimize(); writer.close(); } }); }
From source file:com.github.rnewson.couchdb.lucene.DatabaseIndexer.java
License:Apache License
private IndexWriter newWriter(final Directory dir) throws IOException { final IndexWriterConfig config = new IndexWriterConfig(Constants.VERSION, Constants.ANALYZER); config.setUseCompoundFile(ini.getBoolean("lucene.useCompoundFile", false)); config.setRAMBufferSizeMB(//from w w w .j a v a 2s. c om ini.getDouble("lucene.ramBufferSizeMB", IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB)); return new IndexWriter(dir, config); }
From source file:com.qwazr.search.bench.LuceneCommonIndex.java
License:Apache License
LuceneCommonIndex(final Path rootDirectory, final String schemaName, final String indexName, final double ramBufferSize, final boolean useCompoundFile) throws IOException { final Path schemaDirectory = Files.createDirectory(rootDirectory.resolve(schemaName)); this.indexDirectory = Files.createDirectory(schemaDirectory.resolve(indexName)); this.luceneDirectory = indexDirectory.resolve("data"); this.dataDirectory = FSDirectory.open(luceneDirectory); final IndexWriterConfig indexWriterConfig = new IndexWriterConfig( new PerFieldAnalyzerWrapper(new StandardAnalyzer())); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setRAMBufferSizeMB(ramBufferSize); final ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); mergeScheduler.setMaxMergesAndThreads(MAX_SSD_MERGE_THREADS, MAX_SSD_MERGE_THREADS); indexWriterConfig.setMergeScheduler(mergeScheduler); indexWriterConfig.setUseCompoundFile(useCompoundFile); final TieredMergePolicy mergePolicy = new TieredMergePolicy(); indexWriterConfig.setMergePolicy(mergePolicy); // We use snapshots deletion policy final SnapshotDeletionPolicy snapshotDeletionPolicy = new SnapshotDeletionPolicy( indexWriterConfig.getIndexDeletionPolicy()); indexWriterConfig.setIndexDeletionPolicy(snapshotDeletionPolicy); this.indexWriter = new IndexWriter(this.dataDirectory, indexWriterConfig); this.localReplicator = new LocalReplicator(); }
From source file:com.senseidb.abacus.api.codec.CodecTest.java
License:Apache License
static Directory buildIndex(Iterable<String> datasrc, Codec codec) throws Exception { String idxname = codec == null ? "lucene" : codec.getName(); Directory dir = FSDirectory.open(new File("/tmp/codectest", idxname));//new RAMDirectory(); //Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)); conf.setUseCompoundFile(false); if (codec != null) { conf.setCodec(codec);/*w w w .j a va 2 s . c o m*/ } IndexWriter writer = new IndexWriter(dir, conf); for (String doc : datasrc) { if (doc == null) break; doc = doc.trim(); if (doc.length() == 0) continue; Document d = new Document(); FieldType ft = new FieldType(); ft.setIndexed(true); ft.setStored(false); ft.setIndexOptions(IndexOptions.DOCS_ONLY); ft.setOmitNorms(true); Field f = new Field(FIELD, doc, ft); d.add(f); writer.addDocument(d); } writer.forceMerge(1); writer.commit(); writer.close(); return dir; }
From source file:com.stratio.cassandra.index.LuceneIndex.java
License:Apache License
/** * Initializes this using the specified {@link Sort} for trying to keep the {@link Document}s sorted. * * @param sort The {@link Sort} to be used. *//*from ww w.j av a2 s . com*/ public void init(Sort sort) { Log.debug("Initializing index"); try { this.sort = sort; // Get directory file file = new File(path); // Open or create directory FSDirectory fsDirectory = FSDirectory.open(file); directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB); // Setup index writer IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer); config.setRAMBufferSizeMB(ramBufferMB); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); config.setUseCompoundFile(true); config.setMergePolicy(new SortingMergePolicy(config.getMergePolicy(), sort)); indexWriter = new IndexWriter(directory, config); // Setup NRT search SearcherFactory searcherFactory = new SearcherFactory() { public IndexSearcher newSearcher(IndexReader reader) throws IOException { IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new NoIDFSimilarity()); return searcher; } }; TrackingIndexWriter trackingIndexWriter = new TrackingIndexWriter(indexWriter); searcherManager = new SearcherManager(indexWriter, true, searcherFactory); searcherReopener = new ControlledRealTimeReopenThread<>(trackingIndexWriter, searcherManager, refreshSeconds, refreshSeconds); searcherReopener.start(); // Start the refresher thread } catch (IOException e) { Log.error(e, "Error while initializing index"); throw new RuntimeException(e); } }
From source file:com.stratio.cassandra.lucene.index.FSIndex.java
License:Apache License
/** * Builds a new {@link FSIndex}.//from w ww . j a va 2 s. c o m * * @param name the index name * @param mbeanName the JMX MBean object name * @param path the directory path * @param analyzer the index writer analyzer * @param refresh the index reader refresh frequency in seconds * @param ramBufferMB the index writer RAM buffer size in MB * @param maxMergeMB the directory max merge size in MB * @param maxCachedMB the directory max cache size in MB * @param refreshTask action to be done during refresh */ public FSIndex(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB, int maxMergeMB, int maxCachedMB, Runnable refreshTask) { try { this.path = path; this.name = name; // Open or create directory FSDirectory fsDirectory = FSDirectory.open(path); directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB); // Setup index writer IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setRAMBufferSizeMB(ramBufferMB); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setUseCompoundFile(true); indexWriterConfig.setMergePolicy(new TieredMergePolicy()); indexWriter = new IndexWriter(directory, indexWriterConfig); // Setup NRT search SearcherFactory searcherFactory = new SearcherFactory() { @Override public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) { if (refreshTask != null) { refreshTask.run(); } IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new NoIDFSimilarity()); return searcher; } }; TrackingIndexWriter trackingWriter = new TrackingIndexWriter(indexWriter); searcherManager = new SearcherManager(indexWriter, true, searcherFactory); searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, searcherManager, refresh, refresh); searcherReopener.start(); // Register JMX MBean mbean = new ObjectName(mbeanName); ManagementFactory.getPlatformMBeanServer().registerMBean(this, this.mbean); } catch (Exception e) { throw new IndexException(logger, e, "Error while creating index %s", name); } }
From source file:com.stratio.cassandra.lucene.service.LuceneIndex.java
License:Apache License
/** * Builds a new {@code RowDirectory} using the specified directory path and analyzer. * * @param keyspace The keyspace name. * @param table The table name. * @param name The index name. * @param path The path of the directory in where the Lucene files will be stored. * @param ramBufferMB The index writer buffer size in MB. * @param maxMergeMB NRTCachingDirectory max merge size in MB. * @param maxCachedMB NRTCachingDirectory max cached MB. * @param analyzer The default {@link Analyzer}. * @param refreshSeconds The index readers refresh time in seconds. Writings are not visible until this time. * @param refreshCallback A runnable to be run on index refresh. * @throws IOException If Lucene throws IO errors. *//*w w w .ja va2 s . c o m*/ public LuceneIndex(String keyspace, String table, String name, Path path, Integer ramBufferMB, Integer maxMergeMB, Integer maxCachedMB, Analyzer analyzer, Double refreshSeconds, Runnable refreshCallback) throws IOException { this.path = path; this.refreshCallback = refreshCallback; this.logName = String.format("Lucene index %s.%s.%s", keyspace, table, name); // Open or create directory FSDirectory fsDirectory = FSDirectory.open(path); directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB); // Setup index writer IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setRAMBufferSizeMB(ramBufferMB); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); config.setUseCompoundFile(true); config.setMergePolicy(new TieredMergePolicy()); indexWriter = new IndexWriter(directory, config); // Setup NRT search SearcherFactory searcherFactory = new SearcherFactory() { public IndexSearcher newSearcher(IndexReader reader) throws IOException { LuceneIndex.this.refreshCallBack(); IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new NoIDFSimilarity()); return searcher; } }; TrackingIndexWriter trackingIndexWriter = new TrackingIndexWriter(indexWriter); searcherManager = new SearcherManager(indexWriter, true, searcherFactory); searcherReopener = new ControlledRealTimeReopenThread<>(trackingIndexWriter, searcherManager, refreshSeconds, refreshSeconds); searcherReopener.start(); // Start the refresher thread // Register JMX MBean try { objectName = new ObjectName( String.format("com.stratio.cassandra.lucene:type=LuceneIndexes,keyspace=%s,table=%s,index=%s", keyspace, table, name)); ManagementFactory.getPlatformMBeanServer().registerMBean(this, objectName); } catch (MBeanException | OperationsException e) { Log.error(e, "Error while registering MBean"); } }
From source file:io.anserini.embeddings.search.IndexW2V.java
License:Apache License
public void indexEmbeddings() throws IOException, InterruptedException { LOG.info("Starting indexer..."); final Directory dir = FSDirectory.open(indexPath); final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(); final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, config); Document document = new Document(); BufferedReader bRdr = new BufferedReader(new FileReader(args.input)); String line = null;/* w ww. jav a 2 s.co m*/ bRdr.readLine(); while ((line = bRdr.readLine()) != null) { String[] termEmbedding = line.trim().split("\t"); document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES)); document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1])); } }
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { System.out.println(//from w w w . ja va 2 s . c om "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ExecutorService executor = Executors.newFixedThreadPool(numThreads); List<Path> warcFiles = discoverWarcFiles(docDir); if (doclimit > 0 && warcFiles.size() < doclimit) warcFiles = warcFiles.subList(0, doclimit); for (Path f : warcFiles) executor.execute(new IndexerThread(writer, f)); //add some delay to let some threads spawn by scheduler Thread.sleep(30000); executor.shutdown(); // Disable new tasks from being submitted try { // Wait for existing tasks to terminate while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } int numIndexed = writer.maxDoc(); try { writer.commit(); if (optimize) writer.forceMerge(1); } finally { writer.close(); } return numIndexed; }
From source file:io.anserini.index.IndexCollection.java
License:Apache License
public void run() throws IOException, InterruptedException { final long start = System.nanoTime(); LOG.info("Starting indexer..."); int numThreads = args.threads; final Directory dir = FSDirectory.open(indexPath); final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer(); final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setSimilarity(new BM25Similarity()); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memorybufferSize); config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, config); final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); final List<Path> segmentPaths = collection.getFileSegmentPaths(); final int segmentCnt = segmentPaths.size(); LOG.info(segmentCnt + " files found in " + collectionPath.toString()); for (int i = 0; i < segmentCnt; i++) { executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i))); }/*w w w .j av a 2 s. co m*/ executor.shutdown(); try { // Wait for existing tasks to terminate while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { LOG.info(String.format("%.2f percent completed", (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d)); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } if (segmentCnt != executor.getCompletedTaskCount()) { throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); } int numIndexed = writer.maxDoc(); try { writer.commit(); if (args.optimize) writer.forceMerge(1); } finally { try { writer.close(); } catch (IOException e) { // It is possible that this happens... but nothing much we can do at this point, // so just log the error and move on. LOG.error(e); } } LOG.info("Indexed documents: " + counters.indexedDocuments.get()); LOG.info("Empty documents: " + counters.emptyDocuments.get()); LOG.info("Errors: " + counters.errors.get()); final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); }