Example usage for org.apache.lucene.index IndexWriterConfig setUseCompoundFile

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriterConfig setUseCompoundFile.

Prototype

@Override
    public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile)

Source Link

Usage

From source file:com.github.lucene.store.jdbc.AbstractJdbcDirectoryITest.java

License:Apache License

protected void addDocuments(final Directory directory, final OpenMode openMode, final boolean useCompoundFile,
        final Collection<String> docs) throws IOException {
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(OpenMode.CREATE);
    config.setUseCompoundFile(useCompoundFile);

    final DirectoryTemplate template = new DirectoryTemplate(directory);
    template.execute(new DirectoryTemplate.DirectoryCallbackWithoutResult() {
        @Override//from   ww  w  . j  a va  2  s  . c om
        public void doInDirectoryWithoutResult(final Directory dir) throws IOException {
            final IndexWriter writer = new IndexWriter(dir, config);
            for (final Object element : docs) {
                final Document doc = new Document();
                final String word = (String) element;
                // FIXME: review
                // doc.add(new Field("keyword", word, Field.Store.YES,
                // Field.Index.UN_TOKENIZED));
                // doc.add(new Field("unindexed", word, Field.Store.YES,
                // Field.Index.NO));
                // doc.add(new Field("unstored", word, Field.Store.NO,
                // Field.Index.TOKENIZED));
                // doc.add(new Field("text", word, Field.Store.YES,
                // Field.Index.TOKENIZED));
                doc.add(new StringField("keyword", word, Field.Store.YES));
                doc.add(new StringField("unindexed", word, Field.Store.YES));
                doc.add(new StringField("unstored", word, Field.Store.NO));
                doc.add(new StringField("text", word, Field.Store.YES));
                writer.addDocument(doc);
            }

            // FIXME: review
            // writer.optimize();
            writer.close();
        }
    });
}

From source file:com.github.rnewson.couchdb.lucene.DatabaseIndexer.java

License:Apache License

private IndexWriter newWriter(final Directory dir) throws IOException {
    final IndexWriterConfig config = new IndexWriterConfig(Constants.VERSION, Constants.ANALYZER);
    config.setUseCompoundFile(ini.getBoolean("lucene.useCompoundFile", false));
    config.setRAMBufferSizeMB(//from   w w w .j a  v  a  2s.  c  om
            ini.getDouble("lucene.ramBufferSizeMB", IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB));

    return new IndexWriter(dir, config);
}

From source file:com.qwazr.search.bench.LuceneCommonIndex.java

License:Apache License

LuceneCommonIndex(final Path rootDirectory, final String schemaName, final String indexName,
        final double ramBufferSize, final boolean useCompoundFile) throws IOException {

    final Path schemaDirectory = Files.createDirectory(rootDirectory.resolve(schemaName));
    this.indexDirectory = Files.createDirectory(schemaDirectory.resolve(indexName));
    this.luceneDirectory = indexDirectory.resolve("data");
    this.dataDirectory = FSDirectory.open(luceneDirectory);
    final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(
            new PerFieldAnalyzerWrapper(new StandardAnalyzer()));
    indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    indexWriterConfig.setRAMBufferSizeMB(ramBufferSize);

    final ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler();
    mergeScheduler.setMaxMergesAndThreads(MAX_SSD_MERGE_THREADS, MAX_SSD_MERGE_THREADS);
    indexWriterConfig.setMergeScheduler(mergeScheduler);
    indexWriterConfig.setUseCompoundFile(useCompoundFile);

    final TieredMergePolicy mergePolicy = new TieredMergePolicy();
    indexWriterConfig.setMergePolicy(mergePolicy);

    // We use snapshots deletion policy
    final SnapshotDeletionPolicy snapshotDeletionPolicy = new SnapshotDeletionPolicy(
            indexWriterConfig.getIndexDeletionPolicy());
    indexWriterConfig.setIndexDeletionPolicy(snapshotDeletionPolicy);

    this.indexWriter = new IndexWriter(this.dataDirectory, indexWriterConfig);
    this.localReplicator = new LocalReplicator();
}

From source file:com.senseidb.abacus.api.codec.CodecTest.java

License:Apache License

static Directory buildIndex(Iterable<String> datasrc, Codec codec) throws Exception {
    String idxname = codec == null ? "lucene" : codec.getName();
    Directory dir = FSDirectory.open(new File("/tmp/codectest", idxname));//new RAMDirectory();
    //Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44));
    conf.setUseCompoundFile(false);
    if (codec != null) {
        conf.setCodec(codec);/*w w w .j a  va 2 s  .  c o  m*/
    }

    IndexWriter writer = new IndexWriter(dir, conf);

    for (String doc : datasrc) {
        if (doc == null)
            break;
        doc = doc.trim();
        if (doc.length() == 0)
            continue;
        Document d = new Document();
        FieldType ft = new FieldType();
        ft.setIndexed(true);
        ft.setStored(false);
        ft.setIndexOptions(IndexOptions.DOCS_ONLY);
        ft.setOmitNorms(true);
        Field f = new Field(FIELD, doc, ft);
        d.add(f);
        writer.addDocument(d);
    }
    writer.forceMerge(1);
    writer.commit();
    writer.close();
    return dir;
}

From source file:com.stratio.cassandra.index.LuceneIndex.java

License:Apache License

/**
 * Initializes this using the specified {@link Sort} for trying to keep the {@link Document}s sorted.
 *
 * @param sort The {@link Sort} to be used.
 *//*from  ww w.j  av a2  s . com*/
public void init(Sort sort) {
    Log.debug("Initializing index");
    try {
        this.sort = sort;

        // Get directory file
        file = new File(path);

        // Open or create directory
        FSDirectory fsDirectory = FSDirectory.open(file);
        directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB);

        // Setup index writer
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        config.setRAMBufferSizeMB(ramBufferMB);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        config.setUseCompoundFile(true);
        config.setMergePolicy(new SortingMergePolicy(config.getMergePolicy(), sort));
        indexWriter = new IndexWriter(directory, config);

        // Setup NRT search
        SearcherFactory searcherFactory = new SearcherFactory() {
            public IndexSearcher newSearcher(IndexReader reader) throws IOException {
                IndexSearcher searcher = new IndexSearcher(reader);
                searcher.setSimilarity(new NoIDFSimilarity());
                return searcher;
            }
        };
        TrackingIndexWriter trackingIndexWriter = new TrackingIndexWriter(indexWriter);
        searcherManager = new SearcherManager(indexWriter, true, searcherFactory);
        searcherReopener = new ControlledRealTimeReopenThread<>(trackingIndexWriter, searcherManager,
                refreshSeconds, refreshSeconds);
        searcherReopener.start(); // Start the refresher thread
    } catch (IOException e) {
        Log.error(e, "Error while initializing index");
        throw new RuntimeException(e);
    }
}

From source file:com.stratio.cassandra.lucene.index.FSIndex.java

License:Apache License

/**
 * Builds a new {@link FSIndex}.//from  w ww  . j  a  va 2 s. c o  m
 *
 * @param name the index name
 * @param mbeanName the JMX MBean object name
 * @param path the directory path
 * @param analyzer the index writer analyzer
 * @param refresh the index reader refresh frequency in seconds
 * @param ramBufferMB the index writer RAM buffer size in MB
 * @param maxMergeMB the directory max merge size in MB
 * @param maxCachedMB the directory max cache size in MB
 * @param refreshTask action to be done during refresh
 */
public FSIndex(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB,
        int maxMergeMB, int maxCachedMB, Runnable refreshTask) {
    try {
        this.path = path;
        this.name = name;

        // Open or create directory
        FSDirectory fsDirectory = FSDirectory.open(path);
        directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB);

        // Setup index writer
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        indexWriterConfig.setRAMBufferSizeMB(ramBufferMB);
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        indexWriterConfig.setUseCompoundFile(true);
        indexWriterConfig.setMergePolicy(new TieredMergePolicy());
        indexWriter = new IndexWriter(directory, indexWriterConfig);

        // Setup NRT search
        SearcherFactory searcherFactory = new SearcherFactory() {
            @Override
            public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) {
                if (refreshTask != null) {
                    refreshTask.run();
                }
                IndexSearcher searcher = new IndexSearcher(reader);
                searcher.setSimilarity(new NoIDFSimilarity());
                return searcher;
            }
        };
        TrackingIndexWriter trackingWriter = new TrackingIndexWriter(indexWriter);
        searcherManager = new SearcherManager(indexWriter, true, searcherFactory);
        searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, searcherManager, refresh,
                refresh);
        searcherReopener.start();

        // Register JMX MBean
        mbean = new ObjectName(mbeanName);
        ManagementFactory.getPlatformMBeanServer().registerMBean(this, this.mbean);

    } catch (Exception e) {
        throw new IndexException(logger, e, "Error while creating index %s", name);
    }
}

From source file:com.stratio.cassandra.lucene.service.LuceneIndex.java

License:Apache License

/**
 * Builds a new {@code RowDirectory} using the specified directory path and analyzer.
 *
 * @param keyspace        The keyspace name.
 * @param table           The table name.
 * @param name            The index name.
 * @param path            The path of the directory in where the Lucene files will be stored.
 * @param ramBufferMB     The index writer buffer size in MB.
 * @param maxMergeMB      NRTCachingDirectory max merge size in MB.
 * @param maxCachedMB     NRTCachingDirectory max cached MB.
 * @param analyzer        The default {@link Analyzer}.
 * @param refreshSeconds  The index readers refresh time in seconds. Writings are not visible until this time.
 * @param refreshCallback A runnable to be run on index refresh.
 * @throws IOException If Lucene throws IO errors.
 *//*w w w  .ja va2  s . c o m*/
public LuceneIndex(String keyspace, String table, String name, Path path, Integer ramBufferMB,
        Integer maxMergeMB, Integer maxCachedMB, Analyzer analyzer, Double refreshSeconds,
        Runnable refreshCallback) throws IOException {
    this.path = path;
    this.refreshCallback = refreshCallback;
    this.logName = String.format("Lucene index %s.%s.%s", keyspace, table, name);

    // Open or create directory
    FSDirectory fsDirectory = FSDirectory.open(path);
    directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB);

    // Setup index writer
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setRAMBufferSizeMB(ramBufferMB);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    config.setUseCompoundFile(true);
    config.setMergePolicy(new TieredMergePolicy());
    indexWriter = new IndexWriter(directory, config);

    // Setup NRT search
    SearcherFactory searcherFactory = new SearcherFactory() {
        public IndexSearcher newSearcher(IndexReader reader) throws IOException {
            LuceneIndex.this.refreshCallBack();
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new NoIDFSimilarity());
            return searcher;
        }
    };
    TrackingIndexWriter trackingIndexWriter = new TrackingIndexWriter(indexWriter);
    searcherManager = new SearcherManager(indexWriter, true, searcherFactory);
    searcherReopener = new ControlledRealTimeReopenThread<>(trackingIndexWriter, searcherManager,
            refreshSeconds, refreshSeconds);
    searcherReopener.start(); // Start the refresher thread

    // Register JMX MBean
    try {
        objectName = new ObjectName(
                String.format("com.stratio.cassandra.lucene:type=LuceneIndexes,keyspace=%s,table=%s,index=%s",
                        keyspace, table, name));
        ManagementFactory.getPlatformMBeanServer().registerMBean(this, objectName);
    } catch (MBeanException | OperationsException e) {
        Log.error(e, "Error while registering MBean");
    }
}

From source file:io.anserini.embeddings.search.IndexW2V.java

License:Apache License

public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");

    final Directory dir = FSDirectory.open(indexPath);
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    Document document = new Document();
    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;/*  w  ww. jav  a  2  s.co  m*/
    bRdr.readLine();
    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.YES));
        document.add(new StoredField(LuceneDocumentGenerator.FIELD_BODY, termEmbedding[1]));
    }
}

From source file:io.anserini.index.IndexClueWeb09b.java

License:Apache License

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    System.out.println(//from w  w  w . ja  va  2 s  . c  om
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        warcFiles = warcFiles.subList(0, doclimit);

    for (Path f : warcFiles)
        executor.execute(new IndexerThread(writer, f));

    //add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.anserini.index.IndexCollection.java

License:Apache License

public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();

    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());
    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }/*w  w  w .j  av  a 2 s. co  m*/

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}