Example usage for org.apache.lucene.index IndexWriterConfig setUseCompoundFile

Introduction

This page lists example usages of org.apache.lucene.index.IndexWriterConfig.setUseCompoundFile, collected from open-source projects.

Prototype

@Override
public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile)

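Before the project examples below, here is a minimal, self-contained sketch of calling setUseCompoundFile on a fresh writer. The index path, analyzer, and class name are illustrative assumptions, not taken from any example on this page; the sketch assumes the Lucene 5+ API, where IndexWriterConfig takes only an Analyzer.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetUseCompoundFileSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; replace with your own path.
        Directory dir = FSDirectory.open(Paths.get("/tmp/cfs-demo"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        // true: newly flushed segments are packed into compound (.cfs/.cfe) files,
        // which uses fewer file handles; false: each segment part is a separate file.
        config.setUseCompoundFile(true);
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new StringField("id", "1", Field.Store.YES));
            writer.addDocument(doc);
            writer.commit();
        }
    }
}
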
Usage

From source file: org.elasticsearch.index.store.StoreTest.java

License: Apache License

@Test
public void testRecoveryDiff() throws IOException, InterruptedException {
    int numDocs = 2 + random().nextInt(100);
    List<Document> docs = new ArrayList<>();
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(new TextField("body", TestUtil.randomRealisticUnicodeString(random()),
                random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(new SortedDocValuesField("dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
        docs.add(doc);
    }
    long seed = random().nextLong();
    Store.MetadataSnapshot first;
    {
        Random random = new Random(seed);
        IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
                .setCodec(actualDefaultCodec());
        iwc.setMergePolicy(NoMergePolicy.INSTANCE);
        iwc.setUseCompoundFile(random.nextBoolean());
        iwc.setMaxThreadStates(1);
        final ShardId shardId = new ShardId(new Index("index"), 1);
        DirectoryService directoryService = new LuceneManagedDirectoryService(random);
        Store store = new Store(shardId, ImmutableSettings.EMPTY, directoryService,
                randomDistributor(random, directoryService), new DummyShardLock(shardId));
        IndexWriter writer = new IndexWriter(store.directory(), iwc);
        final boolean lotsOfSegments = rarely(random);
        for (Document d : docs) {
            writer.addDocument(d);
            if (lotsOfSegments && random.nextBoolean()) {
                writer.commit();
            } else if (rarely(random)) {
                writer.commit();
            }
        }
        writer.close();
        first = store.getMetadata();
        assertDeleteContent(store, directoryService);
        store.close();
    }
    long time = new Date().getTime();
    while (time == new Date().getTime()) {
        Thread.sleep(10); // bump the time
    }
    Store.MetadataSnapshot second;
    Store store;
    {
        Random random = new Random(seed);
        IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
                .setCodec(actualDefaultCodec());
        iwc.setMergePolicy(NoMergePolicy.INSTANCE);
        iwc.setUseCompoundFile(random.nextBoolean());
        iwc.setMaxThreadStates(1);
        final ShardId shardId = new ShardId(new Index("index"), 1);
        DirectoryService directoryService = new LuceneManagedDirectoryService(random);
        store = new Store(shardId, ImmutableSettings.EMPTY, directoryService,
                randomDistributor(random, directoryService), new DummyShardLock(shardId));
        IndexWriter writer = new IndexWriter(store.directory(), iwc);
        final boolean lotsOfSegments = rarely(random);
        for (Document d : docs) {
            writer.addDocument(d);
            if (lotsOfSegments && random.nextBoolean()) {
                writer.commit();
            } else if (rarely(random)) {
                writer.commit();
            }
        }
        writer.close();
        second = store.getMetadata();
    }
    Store.RecoveryDiff diff = first.recoveryDiff(second);
    assertThat(first.size(), equalTo(second.size()));
    for (StoreFileMetaData md : first) {
        assertThat(second.get(md.name()), notNullValue());
        // si files are different - containing timestamps etc
        assertThat(second.get(md.name()).isSame(md), equalTo(md.name().endsWith(".si") == false));
    }
    assertThat(diff.different.size(), equalTo(first.size() - 1));
    assertThat(diff.identical.size(), equalTo(1)); // commit point is identical
    assertThat(diff.missing, empty());

    // check the self diff
    Store.RecoveryDiff selfDiff = first.recoveryDiff(first);
    assertThat(selfDiff.identical.size(), equalTo(first.size()));
    assertThat(selfDiff.different, empty());
    assertThat(selfDiff.missing, empty());

    // lets add some deletes
    Random random = new Random(seed);
    IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
            .setCodec(actualDefaultCodec());
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    iwc.setUseCompoundFile(random.nextBoolean());
    iwc.setMaxThreadStates(1);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    IndexWriter writer = new IndexWriter(store.directory(), iwc);
    writer.deleteDocuments(new Term("id", Integer.toString(random().nextInt(numDocs))));
    writer.close();
    Store.MetadataSnapshot metadata = store.getMetadata();
    StoreFileMetaData delFile = null;
    for (StoreFileMetaData md : metadata) {
        if (md.name().endsWith(".del")) {
            delFile = md;
            break;
        }
    }
    Store.RecoveryDiff afterDeleteDiff = metadata.recoveryDiff(second);
    if (delFile != null) {
        assertThat(afterDeleteDiff.identical.size(), equalTo(metadata.size() - 2)); // segments_N + del file
        assertThat(afterDeleteDiff.different.size(), equalTo(0));
        assertThat(afterDeleteDiff.missing.size(), equalTo(2));
    } else {
        // an entire segment must be missing (single doc segment got dropped)
        assertThat(afterDeleteDiff.identical.size(), greaterThan(0));
        assertThat(afterDeleteDiff.different.size(), equalTo(0));
        assertThat(afterDeleteDiff.missing.size(), equalTo(1)); // the commit file is different
    }

    // check the self diff
    selfDiff = metadata.recoveryDiff(metadata);
    assertThat(selfDiff.identical.size(), equalTo(metadata.size()));
    assertThat(selfDiff.different, empty());
    assertThat(selfDiff.missing, empty());

    // add a new commit
    iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodec(actualDefaultCodec());
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    iwc.setUseCompoundFile(true); // force CFS - easier to test here since we know it will add 3 files
    iwc.setMaxThreadStates(1);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    writer = new IndexWriter(store.directory(), iwc);
    writer.addDocument(docs.get(0));
    writer.close();

    Store.MetadataSnapshot newCommitMetaData = store.getMetadata();
    Store.RecoveryDiff newCommitDiff = newCommitMetaData.recoveryDiff(metadata);
    if (delFile != null) {
        assertThat(newCommitDiff.identical.size(), equalTo(newCommitMetaData.size() - 5)); // segments_N, del file, cfs, cfe, si for the new segment
        assertThat(newCommitDiff.different.size(), equalTo(1)); // the del file must be different
        assertThat(newCommitDiff.different.get(0).name(), endsWith(".del"));
        assertThat(newCommitDiff.missing.size(), equalTo(4)); // segments_N,cfs, cfe, si for the new segment
    } else {
        assertThat(newCommitDiff.identical.size(), equalTo(newCommitMetaData.size() - 4)); // segments_N, cfs, cfe, si for the new segment
        assertThat(newCommitDiff.different.size(), equalTo(0));
        assertThat(newCommitDiff.missing.size(), equalTo(4)); // an entire segment must be missing (single doc segment got dropped)  plus the commit is different
    }

    deleteContent(store.directory());
    IOUtils.close(store);
}

From source file: org.elasticsearch.index.store.StoreTests.java

License: Apache License

@Test
public void testRecoveryDiff() throws IOException, InterruptedException {
    int numDocs = 2 + random().nextInt(100);
    List<Document> docs = new ArrayList<>();
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(new TextField("body", TestUtil.randomRealisticUnicodeString(random()),
                random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(new SortedDocValuesField("dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
        docs.add(doc);
    }
    long seed = random().nextLong();
    Store.MetadataSnapshot first;
    {
        Random random = new Random(seed);
        IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random))
                .setCodec(TestUtil.getDefaultCodec());
        iwc.setMergePolicy(NoMergePolicy.INSTANCE);
        iwc.setUseCompoundFile(random.nextBoolean());
        final ShardId shardId = new ShardId(new Index("index"), 1);
        DirectoryService directoryService = new LuceneManagedDirectoryService(random);
        Store store = new Store(shardId, Settings.EMPTY, directoryService, new DummyShardLock(shardId));
        IndexWriter writer = new IndexWriter(store.directory(), iwc);
        final boolean lotsOfSegments = rarely(random);
        for (Document d : docs) {
            writer.addDocument(d);
            if (lotsOfSegments && random.nextBoolean()) {
                writer.commit();
            } else if (rarely(random)) {
                writer.commit();
            }
        }
        writer.commit();
        writer.close();
        first = store.getMetadata();
        assertDeleteContent(store, directoryService);
        store.close();
    }
    long time = new Date().getTime();
    while (time == new Date().getTime()) {
        Thread.sleep(10); // bump the time
    }
    Store.MetadataSnapshot second;
    Store store;
    {
        Random random = new Random(seed);
        IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random))
                .setCodec(TestUtil.getDefaultCodec());
        iwc.setMergePolicy(NoMergePolicy.INSTANCE);
        iwc.setUseCompoundFile(random.nextBoolean());
        final ShardId shardId = new ShardId(new Index("index"), 1);
        DirectoryService directoryService = new LuceneManagedDirectoryService(random);
        store = new Store(shardId, Settings.EMPTY, directoryService, new DummyShardLock(shardId));
        IndexWriter writer = new IndexWriter(store.directory(), iwc);
        final boolean lotsOfSegments = rarely(random);
        for (Document d : docs) {
            writer.addDocument(d);
            if (lotsOfSegments && random.nextBoolean()) {
                writer.commit();
            } else if (rarely(random)) {
                writer.commit();
            }
        }
        writer.commit();
        writer.close();
        second = store.getMetadata();
    }
    Store.RecoveryDiff diff = first.recoveryDiff(second);
    assertThat(first.size(), equalTo(second.size()));
    for (StoreFileMetaData md : first) {
        assertThat(second.get(md.name()), notNullValue());
        // si files are different - containing timestamps etc
        assertThat(second.get(md.name()).isSame(md), equalTo(false));
    }
    assertThat(diff.different.size(), equalTo(first.size()));
    assertThat(diff.identical.size(), equalTo(0)); // in lucene 5 nothing is identical - we use random ids in file headers
    assertThat(diff.missing, empty());

    // check the self diff
    Store.RecoveryDiff selfDiff = first.recoveryDiff(first);
    assertThat(selfDiff.identical.size(), equalTo(first.size()));
    assertThat(selfDiff.different, empty());
    assertThat(selfDiff.missing, empty());

    // lets add some deletes
    Random random = new Random(seed);
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random))
            .setCodec(TestUtil.getDefaultCodec());
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    iwc.setUseCompoundFile(random.nextBoolean());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    IndexWriter writer = new IndexWriter(store.directory(), iwc);
    writer.deleteDocuments(new Term("id", Integer.toString(random().nextInt(numDocs))));
    writer.commit();
    writer.close();
    Store.MetadataSnapshot metadata = store.getMetadata();
    StoreFileMetaData delFile = null;
    for (StoreFileMetaData md : metadata) {
        if (md.name().endsWith(".liv")) {
            delFile = md;
            break;
        }
    }
    Store.RecoveryDiff afterDeleteDiff = metadata.recoveryDiff(second);
    if (delFile != null) {
        assertThat(afterDeleteDiff.identical.size(), equalTo(metadata.size() - 2)); // segments_N + del file
        assertThat(afterDeleteDiff.different.size(), equalTo(0));
        assertThat(afterDeleteDiff.missing.size(), equalTo(2));
    } else {
        // an entire segment must be missing (single doc segment got dropped)
        assertThat(afterDeleteDiff.identical.size(), greaterThan(0));
        assertThat(afterDeleteDiff.different.size(), equalTo(0));
        assertThat(afterDeleteDiff.missing.size(), equalTo(1)); // the commit file is different
    }

    // check the self diff
    selfDiff = metadata.recoveryDiff(metadata);
    assertThat(selfDiff.identical.size(), equalTo(metadata.size()));
    assertThat(selfDiff.different, empty());
    assertThat(selfDiff.missing, empty());

    // add a new commit
    iwc = new IndexWriterConfig(new MockAnalyzer(random)).setCodec(TestUtil.getDefaultCodec());
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    iwc.setUseCompoundFile(true); // force CFS - easier to test here since we know it will add 3 files
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    writer = new IndexWriter(store.directory(), iwc);
    writer.addDocument(docs.get(0));
    writer.close();

    Store.MetadataSnapshot newCommitMetaData = store.getMetadata();
    Store.RecoveryDiff newCommitDiff = newCommitMetaData.recoveryDiff(metadata);
    if (delFile != null) {
        assertThat(newCommitDiff.identical.size(), equalTo(newCommitMetaData.size() - 5)); // segments_N, del file, cfs, cfe, si for the new segment
        assertThat(newCommitDiff.different.size(), equalTo(1)); // the del file must be different
        assertThat(newCommitDiff.different.get(0).name(), endsWith(".liv"));
        assertThat(newCommitDiff.missing.size(), equalTo(4)); // segments_N,cfs, cfe, si for the new segment
    } else {
        assertThat(newCommitDiff.identical.size(), equalTo(newCommitMetaData.size() - 4)); // segments_N, cfs, cfe, si for the new segment
        assertThat(newCommitDiff.different.size(), equalTo(0));
        assertThat(newCommitDiff.missing.size(), equalTo(4)); // an entire segment must be missing (single doc segment got dropped)  plus the commit is different
    }

    deleteContent(store.directory());
    IOUtils.close(store);
}

From source file: org.eu.bitzone.Leia.java

License: Apache License

private IndexWriter createIndexWriter() {
    try {
        final IndexWriterConfig cfg = new IndexWriterConfig(LV, new WhitespaceAnalyzer(LV));
        IndexDeletionPolicy policy;
        if (keepCommits) {
            policy = new KeepAllIndexDeletionPolicy();
        } else {
            policy = new KeepLastIndexDeletionPolicy();
        }
        cfg.setIndexDeletionPolicy(policy);
        final MergePolicy mp = cfg.getMergePolicy();
        cfg.setUseCompoundFile(IndexGate.preferCompoundFormat(dir));
        final IndexWriter iw = new IndexWriter(dir, cfg);
        return iw;
    } catch (final Exception e) {
        errorMsg("Error creating IndexWriter: " + e.toString());
        return null;
    }
}

From source file: org.eu.bitzone.Leia.java

License: Apache License

/**
 * Optimize the index.
 */
public void optimize(final Object dialog) {
    final Thread t = new Thread() {

        @Override
        public void run() {
            IndexWriter iw = null;
            final Object optimizeButton = find(dialog, "optimizeButton");
            setBoolean(optimizeButton, "enabled", false);
            final Object closeButton = find(dialog, "closeButton");
            setBoolean(closeButton, "enabled", false);
            final Object msg = find(dialog, "msg");
            final Object stat = find(dialog, "stat");
            setString(stat, "text", "Running ...");
            final PanelPrintWriter ppw = new PanelPrintWriter(Leia.this, msg);
            final boolean useCompound = getBoolean(find(dialog, "optCompound"), "selected");
            final boolean expunge = getBoolean(find(dialog, "optExpunge"), "selected");
            final boolean keep = getBoolean(find(dialog, "optKeepAll"), "selected");
            final boolean useLast = getBoolean(find(dialog, "optLastCommit"), "selected");
            final Object tiiSpin = find(dialog, "tii");
            final Object segnumSpin = find(dialog, "segnum");
            final int tii = Integer.parseInt(getString(tiiSpin, "text"));
            final int segnum = Integer.parseInt(getString(segnumSpin, "text"));
            try {
                if (is != null) {
                    is = null;
                }
                if (ir != null) {
                    ir.close();
                }
                if (ar != null) {
                    ar.close();
                }
                IndexDeletionPolicy policy;
                if (keep) {
                    policy = new KeepAllIndexDeletionPolicy();
                } else {
                    policy = new KeepLastIndexDeletionPolicy();
                }
                final IndexWriterConfig cfg = new IndexWriterConfig(LV, new WhitespaceAnalyzer(LV));
                if (!useLast) {
                    final IndexCommit ic = ((DirectoryReader) ir).getIndexCommit();
                    if (ic != null) {
                        cfg.setIndexCommit(ic);
                    }
                }
                cfg.setIndexDeletionPolicy(policy);
                cfg.setTermIndexInterval(tii);
                final MergePolicy p = cfg.getMergePolicy();
                cfg.setUseCompoundFile(useCompound);
                if (useCompound) {
                    p.setNoCFSRatio(1.0);
                }
                cfg.setInfoStream(ppw);
                iw = new IndexWriter(dir, cfg);
                final long startSize = Util.calcTotalFileSize(pName, dir);
                final long startTime = System.currentTimeMillis();
                if (expunge) {
                    iw.forceMergeDeletes();
                } else {
                    if (segnum > 1) {
                        iw.forceMerge(segnum, true);
                    } else {
                        iw.forceMerge(1, true);
                    }
                }
                iw.commit();
                final long endTime = System.currentTimeMillis();
                final long endSize = Util.calcTotalFileSize(pName, dir);
                final long deltaSize = startSize - endSize;
                final String sign = deltaSize < 0 ? " Increased " : " Reduced ";
                final String sizeMsg = sign + Util.normalizeSize(Math.abs(deltaSize))
                        + Util.normalizeUnit(Math.abs(deltaSize));
                final String timeMsg = String.valueOf(endTime - startTime) + " ms";
                showStatus(sizeMsg + " in " + timeMsg);
                iw.close();
                setString(stat, "text", "Finished OK.");
            } catch (final Exception e) {
                e.printStackTrace(ppw);
                setString(stat, "text", "ERROR - aborted.");
                errorMsg("ERROR optimizing: " + e.toString());
                if (iw != null) {
                    try {
                        iw.close();
                    } catch (final Exception e1) {
                    }
                }
            } finally {
                setBoolean(closeButton, "enabled", true);
            }
            try {
                actionReopen();
                is = new IndexSearcher(ir);
                // add dialog again
                add(dialog);
            } catch (final Exception e) {
                e.printStackTrace(ppw);
                errorMsg("ERROR reopening after optimize:\n" + e.getMessage());
            }
        }
    };
    t.start();
}

From source file: org.getopt.luke.Luke.java

License: Apache License

private IndexWriter createIndexWriter() {
    try {
        IndexWriterConfig cfg = new IndexWriterConfig(LV, new WhitespaceAnalyzer(LV));
        IndexDeletionPolicy policy;
        if (keepCommits) {
            policy = new KeepAllIndexDeletionPolicy();
        } else {
            policy = new KeepLastIndexDeletionPolicy();
        }
        cfg.setIndexDeletionPolicy(policy);
        cfg.setUseCompoundFile(IndexGate.preferCompoundFormat(dir));
        IndexWriter iw = new IndexWriter(dir, cfg);
        return iw;
    } catch (Exception e) {
        errorMsg("Error creating IndexWriter: " + e.toString());
        return null;
    }
}

From source file: org.getopt.luke.Luke.java

License: Apache License

/**
 * Optimize the index.
 */
public void optimize(final Object dialog) {
    Thread t = new Thread() {
        public void run() {
            IndexWriter iw = null;
            Object optimizeButton = find(dialog, "optimizeButton");
            setBoolean(optimizeButton, "enabled", false);
            Object closeButton = find(dialog, "closeButton");
            setBoolean(closeButton, "enabled", false);
            Object msg = find(dialog, "msg");
            Object stat = find(dialog, "stat");
            setString(stat, "text", "Running ...");
            PanelPrintWriter ppw = new PanelPrintWriter(Luke.this, msg);
            boolean useCompound = getBoolean(find(dialog, "optCompound"), "selected");
            boolean expunge = getBoolean(find(dialog, "optExpunge"), "selected");
            boolean keep = getBoolean(find(dialog, "optKeepAll"), "selected");
            boolean useLast = getBoolean(find(dialog, "optLastCommit"), "selected");
            Object tiiSpin = find(dialog, "tii");
            Object segnumSpin = find(dialog, "segnum");
            int tii = Integer.parseInt(getString(tiiSpin, "text"));
            int segnum = Integer.parseInt(getString(segnumSpin, "text"));
            try {
                if (is != null)
                    is = null;
                if (ir != null)
                    ir.close();
                if (ar != null)
                    ar.close();
                IndexDeletionPolicy policy;
                if (keep) {
                    policy = new KeepAllIndexDeletionPolicy();
                } else {
                    policy = new KeepLastIndexDeletionPolicy();
                }
                IndexWriterConfig cfg = new IndexWriterConfig(LV, new WhitespaceAnalyzer(LV));
                if (!useLast) {
                    IndexCommit ic = ((DirectoryReader) ir).getIndexCommit();
                    if (ic != null) {
                        cfg.setIndexCommit(ic);
                    }
                }
                cfg.setIndexDeletionPolicy(policy);
                cfg.setTermIndexInterval(tii);
                cfg.setUseCompoundFile(useCompound);
                cfg.setInfoStream(ppw);
                iw = new IndexWriter(dir, cfg);
                long startSize = Util.calcTotalFileSize(pName, dir);
                long startTime = System.currentTimeMillis();
                if (expunge) {
                    iw.forceMergeDeletes();
                } else {
                    if (segnum > 1) {
                        iw.forceMerge(segnum, true);
                    } else {
                        iw.forceMerge(1, true);
                    }
                }
                iw.commit();
                long endTime = System.currentTimeMillis();
                long endSize = Util.calcTotalFileSize(pName, dir);
                long deltaSize = startSize - endSize;
                String sign = deltaSize < 0 ? " Increased " : " Reduced ";
                String sizeMsg = sign + Util.normalizeSize(Math.abs(deltaSize))
                        + Util.normalizeUnit(Math.abs(deltaSize));
                String timeMsg = String.valueOf(endTime - startTime) + " ms";
                showStatus(sizeMsg + " in " + timeMsg);
                iw.close();
                setString(stat, "text", "Finished OK.");
            } catch (Exception e) {
                e.printStackTrace(ppw);
                setString(stat, "text", "ERROR - aborted.");
                errorMsg("ERROR optimizing: " + e.toString());
                if (iw != null)
                    try {
                        iw.close();
                    } catch (Exception e1) {
                    }
            } finally {
                setBoolean(closeButton, "enabled", true);
            }
            try {
                actionReopen();
                is = new IndexSearcher(ir);
                // add dialog again
                add(dialog);
            } catch (Exception e) {
                e.printStackTrace(ppw);
                errorMsg("ERROR reopening after optimize:\n" + e.getMessage());
            }
        }
    };
    t.start();
}

From source file: org.languagetool.dev.FrequencyIndexCreator.java

License: Open Source License

private void run(File inputDir, File indexBaseDir) throws IOException {
    List<File> files = Arrays.asList(inputDir.listFiles());
    Collections.sort(files);
    for (File file : files) {
        String name = file.getName();
        if (name.matches(".*_[A-Z]+_.*")) {
            System.out.println("Skipping POS tag file " + name);
            continue;
        }
        File indexDir;
        boolean hiveMode;
        if (name.matches(NAME_REGEX1)) {
            indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX1, "$1"));
            hiveMode = false;
            System.out.println("Running in corpus mode (i.e. aggregation of years)");
        } else if (name.matches(NAME_REGEX2)) {
            indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX2, "$1"));
            hiveMode = true;
            System.out.println("Running in Hive mode (i.e. no aggregation of years)");
        } else {
            System.out.println(
                    "Skipping " + name + " - doesn't match regex " + NAME_REGEX1 + " or " + NAME_REGEX2);
            continue;
        }
        if (indexDir.exists() && indexDir.isDirectory()) {
            System.out.println("Skipping " + name + " - index dir '" + indexDir + "' already exists");
            continue;
        }
        System.out.println("Index dir: " + indexDir);
        Directory directory = FSDirectory.open(indexDir);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
        config.setUseCompoundFile(false); // ~10% speedup
        //config.setRAMBufferSizeMB(1000);
        try (IndexWriter writer = new IndexWriter(directory, config)) {
            indexLinesFromGoogleFile(writer, file, hiveMode);
        }
    }
}

From source file: org.meresco.lucene.numerate.UriEnumerate.java

License: Open Source License

/**
 *
 * @param path
 * @param max_cache_size
 * @param withTransactionLog allows for crash recovery, but slows down UriNumerate considerably because of file system flush.
 * @throws IOException
 */
public UriEnumerate(String path, int max_cache_size, boolean withTransactionLog) throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(null);
    ConcurrentMergeScheduler ms = (ConcurrentMergeScheduler) config.getMergeScheduler();
    ms.setDefaultMaxMergesAndThreads(/* spins= */false);
    LogDocMergePolicy mp = new LogDocMergePolicy();
    mp.setMergeFactor(2);
    mp.setMinMergeDocs(max_cache_size);
    config.setMergePolicy(mp);
    config.setCodec(new Lucene60Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return new BloomFilteringPostingsFormat(super.getPostingsFormatForField(field));
        }
    });
    config.setUseCompoundFile(false);
    this.writer = new IndexWriter(FSDirectory.open(FileSystems.getDefault().getPath(path)), config);
    this.next_ord = writer.numDocs() + 1;
    this.searcher = new SimpleSearcher(this.writer);
    this.cache = new Cache(max_cache_size, () -> this.commit());
    this.transactionLog = new TransactionLog(withTransactionLog ? path + "/transactionLog" : null);
    this.transactionLog.maybeRecover();
}

From source file: org.neo4j.kernel.api.impl.index.IndexWriterConfigs.java

License: Open Source License

public static IndexWriterConfig standard() {
    IndexWriterConfig writerConfig = new IndexWriterConfig(LuceneDataSource.KEYWORD_ANALYZER);

    writerConfig.setMaxBufferedDocs(MAX_BUFFERED_DOCS);
    writerConfig.setIndexDeletionPolicy(new MultipleBackupDeletionPolicy());
    writerConfig.setUseCompoundFile(true);
    writerConfig.setCodec(new Lucene54Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            PostingsFormat postingFormat = super.getPostingsFormatForField(field);
            return CODEC_BLOCK_TREE_ORDS_POSTING_FORMAT ? blockTreeOrdsPostingsFormat : postingFormat;
        }
    });

    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    mergePolicy.setNoCFSRatio(MERGE_POLICY_NO_CFS_RATIO);
    mergePolicy.setMinMergeMB(MERGE_POLICY_MIN_MERGE_MB);
    mergePolicy.setMergeFactor(MERGE_POLICY_MERGE_FACTOR);
    writerConfig.setMergePolicy(mergePolicy);

    return writerConfig;
}

From source file: perf.IDPerfTest.java

License: Apache License

private static Result testOne(String indexPath, String desc, IDIterator ids, final int minTermsInBlock,
        final int maxTermsInBlock) throws IOException {
    System.out.println("\ntest: " + desc + " termBlocks=" + minTermsInBlock + "/" + maxTermsInBlock);
    Directory dir = FSDirectory.open(new File(indexPath));
    //IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48));
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_8, new StandardAnalyzer(Version.LUCENE_4_8));
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // So I can walk the files and get the *.tip sizes:
    iwc.setUseCompoundFile(false);

    iwc.setCodec(new Lucene53Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return new Lucene50PostingsFormat(minTermsInBlock, maxTermsInBlock);
        }
    });

    /// 7/7/7 segment structure:
    iwc.setMaxBufferedDocs(ID_COUNT / 777);
    iwc.setRAMBufferSizeMB(-1);
    //iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    //iwc.setMergePolicy(new LogDocMergePolicy());
    ((TieredMergePolicy) iwc.getMergePolicy()).setFloorSegmentMB(.001);
    ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(0.0);
    //((LogDocMergePolicy) iwc.getMergePolicy()).setMinMergeDocs(1000);
    iwc.getMergePolicy().setNoCFSRatio(0.0);

    IndexWriter w = new IndexWriter(dir, iwc);
    Document doc = new Document();

    FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
    ft.setTokenized(true);
    ft.freeze();

    BytesRef idValue = new BytesRef(64);
    Field idField = new Field("id", new BinaryTokenStream(idValue), ft);
    doc.add(idField);

    long t0 = System.nanoTime();
    BytesRef[] lookupIDs = new BytesRef[ID_SEARCH_COUNT];
    Random random = new Random(17);
    int lookupCount = 0;
    double rate = 1.01 * ((double) ID_SEARCH_COUNT) / ID_COUNT;
    for (int i = 0; i < ID_COUNT; i++) {
        ids.next(idValue);
        if (lookupCount < lookupIDs.length && random.nextDouble() <= rate) {
            lookupIDs[lookupCount++] = BytesRef.deepCopyOf(idValue);
        }
        // Trickery: the idsIter changed the idValue which the BinaryTokenStream reuses for each added doc
        w.addDocument(doc);
    }

    if (lookupCount < lookupIDs.length) {
        throw new RuntimeException("didn't get enough lookup ids: " + lookupCount + " vs " + lookupIDs.length);
    }

    long indexTime = System.nanoTime() - t0;

    System.out.println("  indexing done; waitForMerges...");
    w.waitForMerges();

    IndexReader r = DirectoryReader.open(w, true);
    System.out.println("  reader=" + r);

    shuffle(random, lookupIDs);
    shuffle(random, lookupIDs);

    long bestTime = Long.MAX_VALUE;
    long checksum = 0;

    List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());
    // Sort largest to smallest:
    Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
        @Override
        public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
            return c2.reader().maxDoc() - c1.reader().maxDoc();
        }
    });
    TermsEnum[] termsEnums = new TermsEnum[leaves.size()];
    DocsEnum[] docsEnums = new DocsEnum[leaves.size()];
    int[] docBases = new int[leaves.size()];
    for (int i = 0; i < leaves.size(); i++) {
        //System.out.println("i=" + i + " count=" + leaves.get(i).reader().maxDoc());
        termsEnums[i] = leaves.get(i).reader().fields().terms("id").iterator(null);
        docBases[i] = leaves.get(i).docBase;
    }

    long rawLookupCount = 0;

    int countx = 0;
    for (int iter = 0; iter < 5; iter++) {
        t0 = System.nanoTime();
        BlockTreeTermsReader.seekExactFastNotFound = 0;
        BlockTreeTermsReader.seekExactFastRootNotFound = 0;
        rawLookupCount = 0;
        for (BytesRef id : lookupIDs) {
            if (countx++ < 50) {
                System.out.println("    id=" + id);
            }
            boolean found = false;
            for (int seg = 0; seg < termsEnums.length; seg++) {
                rawLookupCount++;
                if (termsEnums[seg].seekExact(id)) {
                    docsEnums[seg] = termsEnums[seg].docs(null, docsEnums[seg], 0);
                    int docID = docsEnums[seg].nextDoc();
                    if (docID == DocsEnum.NO_MORE_DOCS) {
                        // uh-oh!
                        throw new RuntimeException("id not found: " + id);
                    }
                    // paranoia:
                    checksum += docID + docBases[seg];

                    found = true;

                    // Optimization vs MultiFields: we don't need to check any more segments since id is PK
                    break;
                }
            }
            if (found == false) {
                // uh-oh!
                throw new RuntimeException("id not found: " + id);
            }
        }
        long lookupTime = System.nanoTime() - t0;
        System.out.println(String.format(Locale.ROOT, "  iter=" + iter + " lookupTime=%.3f sec",
                lookupTime / 1000000000.0));
        if (lookupTime < bestTime) {
            bestTime = lookupTime;
            System.out.println("    **");
        }
    }

    long totalBytes = 0;
    long termsIndexTotalBytes = 0;
    for (String fileName : dir.listAll()) {
        long bytes = dir.fileLength(fileName);
        totalBytes += bytes;
        if (fileName.endsWith(".tip")) {
            termsIndexTotalBytes += bytes;
        }
    }

    r.close();
    w.rollback();
    dir.close();

    return new Result(desc, ID_COUNT / (indexTime / 1000000.0), lookupIDs.length / (bestTime / 1000000.0),
            totalBytes, termsIndexTotalBytes, checksum, BlockTreeTermsReader.seekExactFastNotFound,
            BlockTreeTermsReader.seekExactFastRootNotFound, rawLookupCount, minTermsInBlock, maxTermsInBlock);
}