List of usage examples for org.apache.lucene.index IndexWriter forceMerge
public void forceMerge(int maxNumSegments) throws IOException
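Basic usage pattern (for orientation before the project-specific examples below): a minimal, self-contained sketch assuming a recent Lucene release (5.x or later API, where IndexWriterConfig takes only an Analyzer). The index path and class name here are illustrative only, not taken from any of the examples.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
    public static void main(String[] args) throws IOException {
        // open (or create) an index in a local directory; the path is illustrative
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            // ... add or update documents here ...
            writer.commit();
            // merge the whole index down to a single segment; this is I/O heavy,
            // so it is usually only worth doing on an index that is no longer changing
            writer.forceMerge(1);
        }
        dir.close();
    }
}

Passing a value larger than 1 (for example writer.forceMerge(5)) merges the index down to at most that many segments instead of a single one.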
From source file:lsre.utils.LuceneUtils.java
License:Open Source License
/**
 * Optimizes an index.
 *
 * @param iw
 * @throws IOException
 */
public static void optimizeWriter(IndexWriter iw) throws IOException {
    iw.forceMerge(1);
}
From source file:net.riezebos.thoth.content.search.Indexer.java
License:Apache License
public void index() throws ContentManagerException {
    String contextName = contentManager.getContextName();
    synchronized (activeIndexers) {
        if (activeIndexers.contains(contextName)) {
            LOG.warn("Indexer for context " + contextName
                    + " is already (still?) active. Not starting a new index operation");
            return;
        }
        activeIndexers.add(contextName);
    }
    try {
        Date start = new Date();
        LOG.info("Indexing " + contextName + " to directory '" + indexFolder + "'...");
        IndexWriter writer = getWriter(recreate);
        IndexingContext indexingContext = new IndexingContext();
        indexDirectory(writer, libraryFolder, indexingContext);
        sortIndexLists(indexingContext.getIndirectReverseIndex());
        sortIndexLists(indexingContext.getDirectReverseIndex());
        cacheResults(indexingContext);
        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        // writer.forceMerge(1);
        writer.close();
        markUnusedDocuments(indexingContext.getDirectReverseIndex());
        Date end = new Date();
        LOG.info("Indexing context " + contextName + " took "
                + (end.getTime() - start.getTime()) + " milliseconds");
    } catch (IOException e) {
        throw new ContentManagerException(e);
    } finally {
        synchronized (activeIndexers) {
            activeIndexers.remove(contextName);
        }
    }
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    init();
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    KMeans k;
    if (useParallelClustering)
        k = new ParallelKMeans(numClusters);
    else
        k = new KMeans(numClusters);
    // fill the KMeans object:
    LinkedList<double[]> features = new LinkedList<double[]>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(f.getDoubleHistogram());
        }
        k.addImage(file, features);
    }
    if (pm != null) { // set to 5 of 100 before clustering starts.
        pm.setProgress(5);
        pm.setNote("Starting clustering");
    }
    if (k.getFeatureCount() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(k.getFeatureCount()));
    System.out.println("Starting clustering ...");
    k.init();
    System.out.println("Step.");
    double time = System.currentTimeMillis();
    double laststress = k.clusteringStep();
    if (pm != null) { // set to 8 of 100 after first step.
        pm.setProgress(8);
        pm.setNote("Step 1 finished");
    }
    System.out.println(getDuration(time) + " -> Next step.");
    time = System.currentTimeMillis();
    double newStress = k.clusteringStep();
    if (pm != null) { // set to 11 of 100 after second step.
        pm.setProgress(11);
        pm.setNote("Step 2 finished");
    }
    // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off.
    double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d);
    System.out.println("Threshold = " + df.format(threshold));
    int cstep = 3;
    while (Math.abs(newStress - laststress) > threshold && cstep < 12) {
        System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress
                + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress)));
        time = System.currentTimeMillis();
        laststress = newStress;
        newStress = k.clusteringStep();
        if (pm != null) { // set to XX of 100 after second step.
            pm.setProgress(cstep * 3 + 5);
            pm.setNote("Step " + cstep + " finished");
        }
        cstep++;
    }
    // Serializing clusters to a file on the disk ...
    clusters = k.getClusters();
    // for (int i = 0; i < clusters.length; i++) {
    //     Cluster cluster = clusters[i];
    //     System.out.print(cluster.getMembers().size() + ", ");
    // }
    // System.out.println();
    Cluster.writeClusters(clusters, clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    time = System.currentTimeMillis();
    // int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    if (pm != null) { // set to 50 of 100 after clustering.
        pm.setProgress(50);
        pm.setNote("Clustering finished");
    }
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 8;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    System.out.println(getDuration(time));
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
public void indexMissing() throws IOException {
    init();
    // Reading clusters from disk:
    clusters = Cluster.readClusters(clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    LireFeature f = getFeatureInstance();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    // based on bug report from Einav Itamar <einavitamar@gmail.com>
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), false,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    int counter = 0;
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(i);
        // Only if there are no values yet:
        if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
            createVisualWords(d, f);
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
            counter++;
        }
    }
    System.out.println(counter + " Documents were updated");
    iw.commit();
    // added to permanently remove the deleted docs.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilder.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    KMeans k;
    if (useParallelClustering)
        k = new ParallelKMeans(numClusters);
    else
        k = new KMeans(numClusters);
    // fill the KMeans object:
    LinkedList<double[]> features = new LinkedList<double[]>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            features.add(((Histogram) f).descriptor);
        }
        k.addImage(file, features);
    }
    if (pm != null) { // set to 5 of 100 before clustering starts.
        pm.setProgress(5);
        pm.setNote("Starting clustering");
    }
    if (k.getFeatureCount() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("k.getFeatureCount() = " + k.getFeatureCount());
    System.out.println("Starting clustering ...");
    k.init();
    System.out.println("Step.");
    double time = System.currentTimeMillis();
    double laststress = k.clusteringStep();
    if (pm != null) { // set to 8 of 100 after first step.
        pm.setProgress(8);
        pm.setNote("Step 1 finished");
    }
    System.out.println(getDuration(time) + " -> Next step.");
    time = System.currentTimeMillis();
    double newStress = k.clusteringStep();
    if (pm != null) { // set to 11 of 100 after second step.
        pm.setProgress(11);
        pm.setNote("Step 2 finished");
    }
    // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off.
    double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d);
    System.out.println("Threshold = " + threshold);
    int cstep = 3;
    while (Math.abs(newStress - laststress) > threshold) {
        System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress
                + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress)));
        time = System.currentTimeMillis();
        laststress = newStress;
        newStress = k.clusteringStep();
        if (pm != null) { // set to XX of 100 after second step.
            pm.setProgress(cstep * 3 + 5);
            pm.setNote("Step " + cstep + " finished");
        }
        cstep++;
    }
    // Serializing clusters to a file on the disk ...
    clusters = k.getClusters();
    // for (int i = 0; i < clusters.length; i++) {
    //     Cluster cluster = clusters[i];
    //     System.out.print(cluster.getMembers().size() + ", ");
    // }
    // System.out.println();
    Cluster.writeClusters(clusters, clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    time = System.currentTimeMillis();
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    if (pm != null) { // set to 50 of 100 after clustering.
        pm.setProgress(50);
        pm.setNote("Clustering finished");
    }
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 4;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    System.out.println(getDuration(time));
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderFromCodeBook.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    clusters = SerializationUtils.readCodeBook(new FileInputStream("codebook128PNG.txt"));
    numClusters = clusters.size();
    System.out.println("Clustering finished, " + clusters.size() + " clusters found");
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 8;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        // features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(f.getDoubleHistogram()));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName,
                    SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    iw.commit(); // this one does the "old" commit(), it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    init();
    // localFeatureFieldName = getFeatureInstance().getFieldName();
    // vladFieldName = localFeatureFieldName + "vlad";
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    KMeans k;
    if (useParallelClustering)
        k = new ParallelKMeans(numClusters);
    else
        k = new KMeans(numClusters);
    // fill the KMeans object:
    LinkedList<double[]> features = new LinkedList<double[]>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            features.add(((Histogram) f).getDoubleHistogram());
        }
        k.addImage(file, features);
    }
    if (pm != null) { // set to 5 of 100 before clustering starts.
        pm.setProgress(5);
        pm.setNote("Starting clustering");
    }
    if (k.getFeatureCount() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("k.getFeatureCount() = " + k.getFeatureCount());
    System.out.println("Starting clustering ...");
    k.init();
    System.out.println("Step.");
    double time = System.currentTimeMillis();
    double laststress = k.clusteringStep();
    if (pm != null) { // set to 8 of 100 after first step.
        pm.setProgress(8);
        pm.setNote("Step 1 finished");
    }
    System.out.println(getDuration(time) + " -> Next step.");
    time = System.currentTimeMillis();
    double newStress = k.clusteringStep();
    if (pm != null) { // set to 11 of 100 after second step.
        pm.setProgress(11);
        pm.setNote("Step 2 finished");
    }
    // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off.
    double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d);
    System.out.println("Threshold = " + threshold);
    int cstep = 3;
    // maximum of 14 steps.
    while (Math.abs(newStress - laststress) > threshold && cstep < 12) {
        System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress
                + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress)));
        time = System.currentTimeMillis();
        laststress = newStress;
        newStress = k.clusteringStep();
        if (pm != null) { // set to XX of 100 after second step.
            pm.setProgress(cstep * 3 + 5);
            pm.setNote("Step " + cstep + " finished");
        }
        cstep++;
    }
    // Serializing clusters to a file on the disk ...
    clusters = k.getClusters();
    // for (int i = 0; i < clusters.length; i++) {
    //     Cluster cluster = clusters[i];
    //     System.out.print(cluster.getMembers().size() + ", ");
    // }
    // System.out.println();
    Cluster.writeClusters(clusters, clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    time = System.currentTimeMillis();
    // int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    if (pm != null) { // set to 50 of 100 after clustering.
        pm.setProgress(50);
        pm.setNote("Clustering finished");
    }
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 4;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    System.out.println(getDuration(time));
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override
public void optimize() {
    IndexWriter iWriter = null;
    try {
        iWriter = createIndexWriter();
        iWriter.forceMerge(1);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        closeWriter(iWriter);
    }
}
From source file:org.ala.lucene.Autocompleter.java
License:Open Source License
@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete, boolean createNewIndex)
        throws CorruptIndexException, IOException {
    // build a dictionary (from the spell package)
    IndexReader sourceReader = IndexReader.open(sourceDirectory);
    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);
    // code from
    // org.apache.lucene.search.spell.SpellChecker.indexDictionary(Dictionary)
    IndexWriter.unlock(autoCompleteDirectory);
    // use a custom analyzer so we can do EdgeNGramFiltering
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, new Analyzer() {
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final StandardTokenizer src = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result,
                    new CharArraySet(SolrUtils.BIE_LUCENE_VERSION,
                            new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)), true));
            result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
            return new TokenStreamComponents(src, result) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    super.setReader(reader);
                }
            };
        }

        // public TokenStream tokenStream(String fieldName, Reader reader) {
        //     TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
        //
        //     result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //     result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //     //result = new ISOLatin1AccentFilter(result);
        //     result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result, new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)));
        //     result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
        //
        //     return result;
        // }
    });
    if (createNewIndex) {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    } else {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }
    indexWriterConfig.setMaxBufferedDocs(150);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, indexWriterConfig);
    // writer.setMergeFactor(300);
    // go through every word, storing the original word (incl. n-grams)
    // and the number of times it occurs
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();
    Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();
        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }
        if (wordsMap.containsKey(word)) {
            throw new IllegalStateException("This should never happen in Lucene 2.3.2");
            // wordsMap.put(word, wordsMap.get(word) + 1);
        } else {
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
        }
    }
    for (String word : wordsMap.keySet()) {
        // ok index the word
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED)); // grammed
        doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO,
                Field.Index.NOT_ANALYZED)); // count
        writer.addDocument(doc);
    }
    sourceReader.close();
    // close writer
    writer.forceMerge(1);
    writer.close();
    // re-open our reader
    reOpenReader();
}