Example usage of the org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper constructor

List of usage examples for the org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper constructor.

Prototype

public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) 

Source Link

Usage

From source file:com.tamingtext.classifier.mlt.MoreLikeThisQueryTest.java

License:Apache License

/**
 * Demonstrates category classification with a {@code MoreLikeThis} query.
 *
 * <p>Opens the index at {@code modelPath}, builds a query from the text read from
 * {@code inputPath}, retrieves up to {@code maxResults} hits, sums the hit scores
 * per value of the {@code categoryFieldName} field and prints one tab-separated
 * label/score line per category to standard output, in the order given by
 * {@code CategoryHits.byScoreComparator()}.
 *
 * <p>The index directory, the index reader and the input reader are always closed,
 * even when the query or the search fails.
 *
 * @throws Exception if the index or the input file cannot be opened or read
 */
@Test
public void testMoreLikeThisQuery() throws Exception {
    //<start id="lucene.examples.mlt.setup"/>
    Directory directory = FSDirectory.open(new File(modelPath));
    try {
        IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);

            Analyzer analyzer //<co id="mlt.analyzersetup"/>
                    = new EnglishAnalyzer(Version.LUCENE_36);

            if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
                analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize, nGramSize);
            }

            MoreLikeThis moreLikeThis = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
            moreLikeThis.setAnalyzer(analyzer);
            moreLikeThis.setFieldNames(new String[] { "content" });

            /*<calloutlist>
            <callout arearefs="mlt.indexsetup">Open Index</callout>
            <callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
            <callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
            <callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
            </calloutlist>*/
            //<end id="lucene.examples.mlt.setup"/>

            // for testing against the same corpus
            moreLikeThis.setMinTermFreq(1);
            moreLikeThis.setMinDocFreq(1);

            //<start id="lucene.examples.mlt.query"/>
            Query query;
            Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
            try {
                query = moreLikeThis.like(reader);
            } finally {
                // The query has been built (or has failed); the input is no longer needed.
                reader.close();
            }

            TopDocs results = indexSearcher.search(query, maxResults); //<co id="mlt.search"/>

            HashMap<String, CategoryHits> categoryHash = new HashMap<String, CategoryHits>();

            for (ScoreDoc sd : results.scoreDocs) { //<co id="mlt.collect"/>
                Document d = indexReader.document(sd.doc);
                Fieldable f = d.getFieldable(categoryFieldName);
                if (f == null) {
                    // A hit without the category field cannot vote for any category.
                    continue;
                }
                String cat = f.stringValue();
                CategoryHits ch = categoryHash.get(cat);
                if (ch == null) {
                    ch = new CategoryHits();
                    ch.setLabel(cat);
                    categoryHash.put(cat, ch);
                }
                ch.incrementScore(sd.score);
            }

            SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
                    = new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
            sortedCats.addAll(categoryHash.values());

            for (CategoryHits c : sortedCats) { //<co id="mlt.display"/>
                System.out.println(c.getLabel() + "\t" + c.getScore());
            }
            /*<calloutlist>
            <callout arearefs="mlt.query">Create Query</callout>
            <callout arearefs="mlt.search">Perform Search</callout>
            <callout arearefs="mlt.collect">Collect Results</callout>
            <callout arearefs="mlt.rank">Rank Categories</callout>
            <callout arearefs="mlt.display">Display Categories</callout>
            </calloutlist>*/
            //<end id="lucene.examples.mlt.query"/>
        } finally {
            indexReader.close();
        }
    } finally {
        directory.close();
    }
}

From source file:jobs.BuildIndexJob.java

/**
 * Builds one Lucene index per year for the configured analysis year and the year before it.
 *
 * <p>For each year {@code t} the citations whose {@code created} date falls inside that
 * year are loaded and written to the index folder {@code /indexes/index-t}. Every
 * document stores the citation's {@code pmid} and, when the citation has any text, a
 * {@code contents} field holding the abstract followed by the title. Text is analyzed
 * with a {@code StandardAnalyzer} using an extended stop-word set, wrapped in a
 * {@code ShingleAnalyzerWrapper} configured for shingles of 2 to 5 tokens.
 *
 * <p>When the job finishes, the elapsed time is e-mailed to the administrator and logged.
 *
 * @throws Exception if the configuration value {@code analysis.year} is not a number,
 *                   or if querying or indexing fails; a failing year is rolled back
 *                   so that no partial index is committed and the write lock is released
 */
@Override
public void doJob() throws Exception {
    Logger.info("Computing the indexes...");
    Stopwatch stopwatch = Stopwatch.createUnstarted();
    stopwatch.start();

    // Start from Lucene's default English stop words and add corpus-specific ones.
    CharArraySet stopSet = CharArraySet.copy(Version.LUCENE_47, StandardAnalyzer.STOP_WORDS_SET);
    String[] extraStopWords = { "we", "pmid", "were", "from", "reply", "can", "between", "using", "used",
            "however", "which", "our", "among", "while", "this" };
    for (String word : extraStopWords) {
        stopSet.add(word);
    }

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_47, stopSet);
    //Does [1-5]-grams, as determined by previous graphs
    ShingleAnalyzerWrapper shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, 2, 5);
    int now = Integer.parseInt((String) play.Play.configuration.get("analysis.year"));

    for (int t = now; t >= now - 1; t--) {
        //Create a folder for the index
        //TODO delete the folders first
        VirtualFile indexFolder = VirtualFile.fromRelativePath("/indexes/index-" + t);
        // mkdirs() also creates a missing parent folder, which mkdir() would not.
        indexFolder.getRealFile().mkdirs();

        Directory directory = FSDirectory.open(indexFolder.getRealFile());
        try {
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, shingleAnalyzer);
            IndexWriter iwriter = new IndexWriter(directory, config);
            boolean indexed = false;
            try {
                //Retrieve the citations given a year t
                SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
                Date start = sdf.parse("01/01/" + t);
                // Exclusive upper bound: "31/12/t" parses to midnight and would drop
                // every citation created later on December 31st.
                Date endExclusive = sdf.parse("01/01/" + (t + 1));

                Logger.info("Query for year " + t + "...");
                List<MorphiaCitation> citations = MorphiaCitation.q().filter("created <", endExclusive)
                        .filter("created >=", start).asList();

                Logger.info("Citation size: " + citations.size());
                int total = citations.size();
                int counter = 0;

                //Iterate over the citations and create the index
                for (MorphiaCitation citation : citations) {
                    counter++;
                    Logger.info("i (" + t + "): " + counter + "/" + total);
                    Document doc = new Document();

                    doc.add(new Field("pmid", citation.pmid, TextField.TYPE_STORED));

                    StringBuilder contents = new StringBuilder();
                    if (citation.abstractText != null) {
                        contents.append(citation.abstractText);
                    }
                    if (citation.title != null) {
                        if (contents.length() > 0) {
                            // Keep the last word of the abstract and the first word of
                            // the title from being fused into a single token.
                            contents.append('\n');
                        }
                        contents.append(citation.title);
                    }

                    if (contents.length() > 0) {
                        doc.add(new Field("contents", contents.toString(), TextField.TYPE_STORED));
                    }

                    iwriter.addDocument(doc);
                }
                iwriter.close();
                indexed = true;
            } finally {
                if (!indexed) {
                    // Discard the partial index and release the write lock.
                    iwriter.rollback();
                }
            }
        } finally {
            directory.close();
        }
    }
    stopwatch.stop();
    long minutes = stopwatch.elapsed(TimeUnit.MINUTES);
    Utils.emailAdmin("Indexing done. ", "Job finished in " + minutes + " minutes.");
    Logger.info("Time to index the documents: " + minutes);
}

From source file:jobs.LuceneStartifiedIndexing.java

/**
 * Builds one Lucene index per year for the configured analysis year and the ten years before it.
 *
 * <p>For each year {@code t} the citations whose {@code created} date falls inside that
 * year are loaded and written to the index folder {@code /indexes/index-t}. One document
 * is added per citation; when the citation has any text the document gets a stored
 * {@code contents} field holding the abstract followed by the title. Text is analyzed
 * with a {@code StandardAnalyzer} wrapped in a {@code ShingleAnalyzerWrapper} configured
 * for shingles of 2 to 5 tokens.
 *
 * <p>When the job finishes, the elapsed time is e-mailed to the administrator.
 *
 * @throws Exception if the configuration value {@code analysis.year} is not a number,
 *                   or if querying or indexing fails; a failing year is rolled back
 *                   so that no partial index is committed and the write lock is released
 */
@Override
public void doJob() throws Exception {
    Logger.info("Stratified indexing started...");
    Stopwatch stopwatch = Stopwatch.createUnstarted();
    stopwatch.start();

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    ShingleAnalyzerWrapper shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, 2, 5);
    int now = Integer.parseInt((String) play.Play.configuration.get("analysis.year"));

    //TODO could possibly do less indexes
    //Just focus on the previous year for simplicity sake
    for (int t = now; t >= now - 10; t--) {
        //Create a folder for the index
        VirtualFile indexFolder = VirtualFile.fromRelativePath("/indexes/index-" + t);
        // mkdirs() also creates a missing parent folder, which mkdir() would not.
        indexFolder.getRealFile().mkdirs();

        Directory directory = FSDirectory.open(indexFolder.getRealFile());
        try {
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, shingleAnalyzer);
            IndexWriter iwriter = new IndexWriter(directory, config);
            boolean indexed = false;
            try {
                //Retrieve the citations given a year t
                SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
                Date start = sdf.parse("01/01/" + t);
                // Exclusive upper bound: "31/12/t" parses to midnight and would drop
                // every citation created later on December 31st.
                Date endExclusive = sdf.parse("01/01/" + (t + 1));

                //Get all the citations given a year
                List<Citation> citations = Citation.find("created >= ? and created < ?", start, endExclusive)
                        .fetch();
                int total = citations.size();
                int counter = 0;

                //Iterate over the citations and create the index
                for (Citation citation : citations) {
                    counter++;
                    Logger.info("i (" + t + "): " + counter + "/" + total);
                    Document doc = new Document();

                    StringBuilder contents = new StringBuilder();
                    if (citation.abstractText != null) {
                        contents.append(citation.abstractText);
                    }
                    if (citation.title != null) {
                        if (contents.length() > 0) {
                            // Keep the last word of the abstract and the first word of
                            // the title from being fused into a single token.
                            contents.append('\n');
                        }
                        contents.append(citation.title);
                    }

                    if (contents.length() > 0) {
                        doc.add(new Field("contents", contents.toString(), TextField.TYPE_STORED));
                    }

                    iwriter.addDocument(doc);
                }

                iwriter.close();
                indexed = true;
            } finally {
                if (!indexed) {
                    // Discard the partial index and release the write lock.
                    iwriter.rollback();
                }
            }
        } finally {
            directory.close();
        }
    }
    Logger.info("Job done!");
    stopwatch.stop();
    Utils.emailAdmin("Stratified index built",
            "Job finished in " + stopwatch.elapsed(TimeUnit.MINUTES) + " minutes.");

}

From source file:perf.Indexer.java

License:Apache License

/**
 * Runs the indexing benchmark described by the command-line arguments.
 *
 * <p>The method parses and validates the arguments, prints the effective settings,
 * configures an {@code IndexWriter} (analyzer, optional index sort, merge scheduler and
 * policy, per-field codec formats) and indexes the documents of the line file with
 * {@code -threadCount} threads. Depending on the flags it then waits for merges,
 * commits (or rolls back), force-merges to one segment, deletes 5% of the documents,
 * and finally prints timing and throughput figures to standard output.
 *
 * @param clArgs raw command-line arguments, parsed with {@code Args}
 * @throws IllegalArgumentException if {@code -indexSort} is not of the form
 *                                  {@code field:type} with type {@code long} or {@code string}
 * @throws RuntimeException if the analyzer name is unknown, if flags are combined in an
 *                          unsupported way, if indexing threads failed, or if the final
 *                          document counts do not match the expected values
 * @throws Exception if opening, writing or committing the index fails
 */
private static void _main(String[] clArgs) throws Exception {

    Args args = new Args(clArgs);

    // EG: -facets Date -facets characterCount ...
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setHierarchical("Date", true);
    final Set<String> facetFields = new HashSet<String>();
    if (args.hasArg("-facets")) {
        for (String arg : args.getStrings("-facets")) {
            facetFields.add(arg);
        }
    }

    final String dirImpl = args.getString("-dirImpl");
    final String dirPath = args.getString("-indexPath") + "/index";

    final Directory dir;
    OpenDirectory od = OpenDirectory.get(dirImpl);

    dir = od.open(Paths.get(dirPath));

    final String analyzer = args.getString("-analyzer");
    final Analyzer a;
    if (analyzer.equals("EnglishAnalyzer")) {
        a = new EnglishAnalyzer();
    } else if (analyzer.equals("StandardAnalyzer")) {
        a = new StandardAnalyzer();
    } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
        a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    } else if (analyzer.equals("ShingleStandardAnalyzer")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2, 2);
    } else if (analyzer.equals("ShingleStandardAnalyzerNoStopWords")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2);
    } else {
        throw new RuntimeException("unknown analyzer " + analyzer);
    }

    final String lineFile = args.getString("-lineDocsFile");

    // -1 means all docs in the line file:
    final int docCountLimit = args.getInt("-docCountLimit");
    final int numThreads = args.getInt("-threadCount");

    final boolean doForceMerge = args.getFlag("-forceMerge");
    final boolean verbose = args.getFlag("-verbose");

    // Optional index sort, given as "field:type"; both stay null when -indexSort is absent.
    String indexSortField = null;
    SortField.Type indexSortType = null;

    if (args.hasArg("-indexSort")) {
        indexSortField = args.getString("-indexSort");

        int i = indexSortField.indexOf(':');
        if (i == -1) {
            throw new IllegalArgumentException(
                    "-indexSort should have form field:type; got: " + indexSortField);
        }
        String typeString = indexSortField.substring(i + 1);
        if (typeString.equals("long")) {
            indexSortType = SortField.Type.LONG;
        } else if (typeString.equals("string")) {
            indexSortType = SortField.Type.STRING;
        } else {
            throw new IllegalArgumentException(
                    "-indexSort can only handle 'long' or 'string' sort; got: " + typeString);
        }
        indexSortField = indexSortField.substring(0, i);
    }

    final double ramBufferSizeMB = args.getDouble("-ramBufferMB");
    final int maxBufferedDocs = args.getInt("-maxBufferedDocs");

    final String defaultPostingsFormat = args.getString("-postingsFormat");
    final boolean doDeletions = args.getFlag("-deletions");
    final boolean printDPS = args.getFlag("-printDPS");
    final boolean waitForMerges = args.getFlag("-waitForMerges");
    final boolean waitForCommit = args.getFlag("-waitForCommit");
    final String mergePolicy = args.getString("-mergePolicy");
    final Mode mode;
    final boolean doUpdate = args.getFlag("-update");
    if (doUpdate) {
        mode = Mode.UPDATE;
    } else {
        mode = Mode.valueOf(args.getString("-mode", "add").toUpperCase(Locale.ROOT));
    }
    int randomDocIDMax;
    if (mode == Mode.UPDATE) {
        randomDocIDMax = args.getInt("-randomDocIDMax");
    } else {
        randomDocIDMax = -1;
    }
    final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
    final boolean addGroupingFields = args.getFlag("-grouping");
    final boolean useCFS = args.getFlag("-cfs");
    final boolean storeBody = args.getFlag("-store");
    final boolean tvsBody = args.getFlag("-tvs");
    final boolean bodyPostingsOffsets = args.getFlag("-bodyPostingsOffsets");
    final int maxConcurrentMerges = args.getInt("-maxConcurrentMerges");
    final boolean addDVFields = args.getFlag("-dvfields");
    final boolean doRandomCommit = args.getFlag("-randomCommit");
    final boolean useCMS = args.getFlag("-useCMS");
    final boolean disableIOThrottle = args.getFlag("-disableIOThrottle");

    // Without -waitForCommit the writer is rolled back and set to null further down,
    // so every later stage that still needs the writer must be rejected up front.
    if (waitForCommit == false && waitForMerges) {
        throw new RuntimeException("pass -waitForCommit if you pass -waitForMerges");
    }

    if (waitForCommit == false && doForceMerge) {
        throw new RuntimeException("pass -waitForCommit if you pass -forceMerge");
    }

    if (waitForCommit == false && doDeletions) {
        throw new RuntimeException("pass -waitForCommit if you pass -deletions");
    }

    if (useCMS == false && disableIOThrottle) {
        throw new RuntimeException("-disableIOThrottle only makes sense with -useCMS");
    }

    final double nrtEverySec;
    if (args.hasArg("-nrtEverySec")) {
        nrtEverySec = args.getDouble("-nrtEverySec");
    } else {
        nrtEverySec = -1.0;
    }

    // True to start back at the beginning if we run out of
    // docs from the line file source:
    final boolean repeatDocs = args.getFlag("-repeatDocs");

    final String facetDVFormatName;
    if (facetFields.isEmpty()) {
        facetDVFormatName = "Lucene54";
    } else {
        facetDVFormatName = args.getString("-facetDVFormat");
    }

    if (addGroupingFields && docCountLimit == -1) {
        a.close();
        throw new RuntimeException("cannot add grouping fields unless docCount is set");
    }

    args.check();

    System.out.println("Dir: " + dirImpl);
    System.out.println("Index path: " + dirPath);
    System.out.println("Analyzer: " + analyzer);
    System.out.println("Line file: " + lineFile);
    System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    System.out.println("Threads: " + numThreads);
    System.out.println("Force merge: " + (doForceMerge ? "yes" : "no"));
    System.out.println("Verbose: " + (verbose ? "yes" : "no"));
    System.out.println("RAM Buffer MB: " + ramBufferSizeMB);
    System.out.println("Max buffered docs: " + maxBufferedDocs);
    System.out.println("Default postings format: " + defaultPostingsFormat);
    System.out.println("Do deletions: " + (doDeletions ? "yes" : "no"));
    System.out.println("Wait for merges: " + (waitForMerges ? "yes" : "no"));
    System.out.println("Wait for commit: " + (waitForCommit ? "yes" : "no"));
    System.out.println("IO throttle: " + (disableIOThrottle ? "no" : "yes"));
    System.out.println("Merge policy: " + mergePolicy);
    System.out.println("Mode: " + mode);
    if (mode == Mode.UPDATE) {
        System.out.println("DocIDMax: " + randomDocIDMax);
    }
    System.out.println("ID field postings format: " + idFieldPostingsFormat);
    System.out.println("Add grouping fields: " + (addGroupingFields ? "yes" : "no"));
    System.out.println("Compound file format: " + (useCFS ? "yes" : "no"));
    System.out.println("Store body field: " + (storeBody ? "yes" : "no"));
    System.out.println("Term vectors for body field: " + (tvsBody ? "yes" : "no"));
    System.out.println("Facet DV Format: " + facetDVFormatName);
    System.out.println("Facet fields: " + facetFields);
    System.out.println("Body postings offsets: " + (bodyPostingsOffsets ? "yes" : "no"));
    System.out.println("Max concurrent merges: " + maxConcurrentMerges);
    System.out.println("Add DocValues fields: " + addDVFields);
    System.out.println("Use ConcurrentMergeScheduler: " + useCMS);
    if (nrtEverySec > 0.0) {
        System.out.println("Open & close NRT reader every: " + nrtEverySec + " sec");
    } else {
        System.out.println("Open & close NRT reader every: never");
    }
    System.out.println("Repeat docs: " + repeatDocs);

    if (verbose) {
        InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }

    final IndexWriterConfig iwc = new IndexWriterConfig(a);

    if (indexSortField != null) {
        iwc.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
    }

    if (mode == Mode.UPDATE) {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    } else {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }

    iwc.setMaxBufferedDocs(maxBufferedDocs);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);

    // So flushed segments do/don't use CFS:
    iwc.setUseCompoundFile(useCFS);

    final AtomicBoolean indexingFailed = new AtomicBoolean();

    iwc.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
    iwc.setMergePolicy(getMergePolicy(mergePolicy, useCFS));

    // Keep all commit points:
    if (doDeletions || doForceMerge) {
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    }

    // Codec that picks the postings format per field ("id" vs. everything else) and a
    // dedicated doc-values format for the facet fields.
    final Codec codec = new Lucene62Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return PostingsFormat.forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
        }

        private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName(facetDVFormatName);
        // Not referenced below; the lookup is kept so that a missing "Direct" format
        // still fails at codec creation exactly as before.
        private final DocValuesFormat directDVFormat = DocValuesFormat.forName("Direct");

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            if (facetFields.contains(field) || field.equals("$facets")) {
                return facetsDVFormat;
            } else {
                // Use default DVFormat for all else:
                return super.getDocValuesFormatForField(field);
            }
        }
    };

    iwc.setCodec(codec);

    System.out.println("IW config=" + iwc);

    IndexWriter w = new IndexWriter(dir, iwc);

    System.out.println("Index has " + w.maxDoc() + " docs");

    final TaxonomyWriter taxoWriter;
    if (facetFields.isEmpty() == false) {
        taxoWriter = new DirectoryTaxonomyWriter(od.open(Paths.get(args.getString("-indexPath"), "facets")),
                IndexWriterConfig.OpenMode.CREATE);
    } else {
        taxoWriter = null;
    }

    // Fixed seed so group field values are always consistent:
    final Random random = new Random(17);

    LineFileDocs lineFileDocs = new LineFileDocs(lineFile, repeatDocs, storeBody, tvsBody, bodyPostingsOffsets,
            false, taxoWriter, facetFields, facetsConfig, addDVFields);

    float docsPerSecPerThread = -1f;

    IndexThreads threads = new IndexThreads(random, w, indexingFailed, lineFileDocs, numThreads, docCountLimit,
            addGroupingFields, printDPS, mode, docsPerSecPerThread, null, nrtEverySec, randomDocIDMax);

    System.out.println("\nIndexer: start");
    final long t0 = System.currentTimeMillis();

    threads.start();

    while (!threads.done() && indexingFailed.get() == false) {
        Thread.sleep(100);

        // Commits once per minute on average:
        if (doRandomCommit && random.nextInt(600) == 17) {
            System.out.println("Indexer: now commit");
            long commitStartNS = System.nanoTime();
            w.commit();
            System.out.println(String.format(Locale.ROOT, "Indexer: commit took %.1f msec",
                    (System.nanoTime() - commitStartNS) / 1000000.));
        }
    }

    threads.stop();

    final long t1 = System.currentTimeMillis();
    System.out.println("\nIndexer: indexing done (" + (t1 - t0) + " msec); total " + w.maxDoc() + " docs");
    // if we update we can not tell how many docs
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }
    if (mode != Mode.UPDATE && docCountLimit != -1 && w.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit);
    }

    final Map<String, String> commitData = new HashMap<String, String>();

    if (waitForMerges) {
        // Close the writer, then reopen it with an equivalent configuration for the
        // commit / force-merge / deletion stages below.
        w.close();
        IndexWriterConfig iwc2 = new IndexWriterConfig(a);
        iwc2.setMergeScheduler(
                getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
        iwc2.setMergePolicy(getMergePolicy(mergePolicy, useCFS));
        iwc2.setCodec(codec);
        iwc2.setUseCompoundFile(useCFS);
        iwc2.setMaxBufferedDocs(maxBufferedDocs);
        iwc2.setRAMBufferSizeMB(ramBufferSizeMB);
        if (indexSortField != null) {
            iwc2.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
        }

        w = new IndexWriter(dir, iwc2);
        long t2 = System.currentTimeMillis();
        System.out.println("\nIndexer: waitForMerges done (" + (t2 - t1) + " msec)");
    }

    if (waitForCommit) {
        commitData.put("userData", "multi");
        w.setLiveCommitData(commitData.entrySet());
        long t2 = System.currentTimeMillis();
        w.commit();
        long t3 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit multi (took " + (t3 - t2) + " msec)");
    } else {
        // Nothing is to be kept; the flag checks above guarantee that no later stage
        // dereferences the writer in this case.
        w.rollback();
        w = null;
    }

    if (doForceMerge) {
        long forceMergeStartMSec = System.currentTimeMillis();
        w.forceMerge(1);
        long forceMergeEndMSec = System.currentTimeMillis();
        System.out.println(
                "\nIndexer: force merge done (took " + (forceMergeEndMSec - forceMergeStartMSec) + " msec)");

        commitData.put("userData", "single");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t5 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit single done (took " + (t5 - forceMergeEndMSec) + " msec)");
    }

    if (doDeletions) {
        final long t5 = System.currentTimeMillis();
        // Randomly delete 5% of the docs
        final Set<Integer> deleted = new HashSet<Integer>();
        final int maxDoc = w.maxDoc();
        final int toDeleteCount = (int) (maxDoc * 0.05);
        System.out.println("\nIndexer: delete " + toDeleteCount + " docs");
        while (deleted.size() < toDeleteCount) {
            final int id = random.nextInt(maxDoc);
            // add() returns false for an id that was already picked, so each id is deleted once.
            if (deleted.add(id)) {
                w.deleteDocuments(new Term("id", LineFileDocs.intToID(id)));
            }
        }
        final long t6 = System.currentTimeMillis();
        System.out.println("\nIndexer: deletes done (took " + (t6 - t5) + " msec)");

        // Report the same label that is stored in the commit user data.
        final String delCommitLabel = doForceMerge ? "delsingle" : "delmulti";
        commitData.put("userData", delCommitLabel);
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t7 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit " + delCommitLabel + " done (took " + (t7 - t6) + " msec)");

        // NOTE(review): with -update this always throws, i.e. -deletions and -update
        // appear to be treated as incompatible -- confirm that this is intended.
        if (doUpdate || w.numDocs() != maxDoc - toDeleteCount) {
            throw new RuntimeException(
                    "count mismatch: w.numDocs()=" + w.numDocs() + " but expected " + (maxDoc - toDeleteCount));
        }
    }

    if (taxoWriter != null) {
        System.out.println("Taxonomy has " + taxoWriter.getSize() + " ords");
        taxoWriter.commit();
        taxoWriter.close();
    }

    final long tCloseStart = System.currentTimeMillis();
    if (w != null) {
        w.close();
        w = null;
    }
    if (waitForCommit) {
        System.out.println("\nIndexer: at close: " + SegmentInfos.readLatestCommit(dir));
        System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) + " msec");
    }

    dir.close();
    final long tFinal = System.currentTimeMillis();
    // Every writer that used the analyzer is closed by now; release it after the final
    // timestamp so the reported times are unaffected.
    a.close();
    System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed());

    final long indexingTime;
    if (waitForCommit) {
        indexingTime = tFinal - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec)");
    } else {
        indexingTime = t1 - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec), excluding commit");
    }
    System.out.println(
            "\nIndexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / (indexingTime / 3600000.))
                    + " GB/hour plain text");
}