List of usage examples for org.apache.lucene.index IndexWriter forceMerge
public void forceMerge(int maxNumSegments) throws IOException
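Basic usage pattern (for orientation before the project-specific examples below): a minimal, self-contained sketch assuming a recent Lucene release (5.x or later API, where IndexWriterConfig takes only an Analyzer). The index path and class name here are illustrative only, not taken from any of the examples.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
    public static void main(String[] args) throws IOException {
        // open (or create) an index in a local directory; the path is illustrative
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            // ... add or update documents here ...
            writer.commit();
            // merge the whole index down to a single segment; this is I/O heavy,
            // so it is usually only worth doing on an index that is no longer changing
            writer.forceMerge(1);
        }
        dir.close();
    }
}

Passing a value larger than 1 (for example writer.forceMerge(5)) merges the index down to at most that many segments instead of a single one.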
From source file:lsre.utils.LuceneUtils.java
License:Open Source License
/**
 * Optimizes an index.
 *
 * @param iw
 * @throws IOException
 */
public static void optimizeWriter(IndexWriter iw) throws IOException {
    iw.forceMerge(1);
}
From source file:net.riezebos.thoth.content.search.Indexer.java
License:Apache License
public void index() throws ContentManagerException {
    String contextName = contentManager.getContextName();
    synchronized (activeIndexers) {
        if (activeIndexers.contains(contextName)) {
            LOG.warn("Indexer for context " + contextName
                    + " is already (still?) active. Not starting a new index operation");
            return;
        }
        activeIndexers.add(contextName);
    }
    try {
        Date start = new Date();
        LOG.info("Indexing " + contextName + " to directory '" + indexFolder + "'...");
        IndexWriter writer = getWriter(recreate);
        IndexingContext indexingContext = new IndexingContext();
        indexDirectory(writer, libraryFolder, indexingContext);
        sortIndexLists(indexingContext.getIndirectReverseIndex());
        sortIndexLists(indexingContext.getDirectReverseIndex());
        cacheResults(indexingContext);
        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        // writer.forceMerge(1);
        writer.close();
        markUnusedDocuments(indexingContext.getDirectReverseIndex());
        Date end = new Date();
        LOG.info("Indexing context " + contextName + " took "
                + (end.getTime() - start.getTime()) + " milliseconds");
    } catch (IOException e) {
        throw new ContentManagerException(e);
    } finally {
        synchronized (activeIndexers) {
            activeIndexers.remove(contextName);
        }
    }
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    init();
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    KMeans k;
    if (useParallelClustering)
        k = new ParallelKMeans(numClusters);
    else
        k = new KMeans(numClusters);
    // fill the KMeans object:
    LinkedList<double[]> features = new LinkedList<double[]>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(f.getDoubleHistogram());
        }
        k.addImage(file, features);
    }
    if (pm != null) { // set to 5 of 100 before clustering starts.
        pm.setProgress(5);
        pm.setNote("Starting clustering");
    }
    if (k.getFeatureCount() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(k.getFeatureCount()));
    System.out.println("Starting clustering ...");
    k.init();
    System.out.println("Step.");
    double time = System.currentTimeMillis();
    double laststress = k.clusteringStep();
    if (pm != null) { // set to 8 of 100 after first step.
        pm.setProgress(8);
        pm.setNote("Step 1 finished");
    }
    System.out.println(getDuration(time) + " -> Next step.");
    time = System.currentTimeMillis();
    double newStress = k.clusteringStep();
    if (pm != null) { // set to 11 of 100 after second step.
        pm.setProgress(11);
        pm.setNote("Step 2 finished");
    }
    // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off.
    double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d);
    System.out.println("Threshold = " + df.format(threshold));
    int cstep = 3;
    while (Math.abs(newStress - laststress) > threshold && cstep < 12) {
        System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress
                + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress)));
        time = System.currentTimeMillis();
        laststress = newStress;
        newStress = k.clusteringStep();
        if (pm != null) { // set to XX of 100 after second step.
            pm.setProgress(cstep * 3 + 5);
            pm.setNote("Step " + cstep + " finished");
        }
        cstep++;
    }
    // Serializing clusters to a file on the disk ...
    clusters = k.getClusters();
    // for (int i = 0; i < clusters.length; i++) {
    //     Cluster cluster = clusters[i];
    //     System.out.print(cluster.getMembers().size() + ", ");
    // }
    // System.out.println();
    Cluster.writeClusters(clusters, clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    time = System.currentTimeMillis();
    // int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    if (pm != null) { // set to 50 of 100 after clustering.
        pm.setProgress(50);
        pm.setNote("Clustering finished");
    }
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 8;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    System.out.println(getDuration(time));
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
public void indexMissing() throws IOException {
    init();
    // Reading clusters from disk:
    clusters = Cluster.readClusters(clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    LireFeature f = getFeatureInstance();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    // based on bug report from Einav Itamar <einavitamar@gmail.com>
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), false,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    int counter = 0;
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(i);
        // Only if there are no values yet:
        if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
            createVisualWords(d, f);
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
            counter++;
        }
    }
    System.out.println(counter + " Documents were updated");
    iw.commit();
    // added to permanently remove the deleted docs.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilder.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    KMeans k;
    if (useParallelClustering)
        k = new ParallelKMeans(numClusters);
    else
        k = new KMeans(numClusters);
    // fill the KMeans object:
    LinkedList<double[]> features = new LinkedList<double[]>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            features.add(((Histogram) f).descriptor);
        }
        k.addImage(file, features);
    }
    if (pm != null) { // set to 5 of 100 before clustering starts.
        pm.setProgress(5);
        pm.setNote("Starting clustering");
    }
    if (k.getFeatureCount() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("k.getFeatureCount() = " + k.getFeatureCount());
    System.out.println("Starting clustering ...");
    k.init();
    System.out.println("Step.");
    double time = System.currentTimeMillis();
    double laststress = k.clusteringStep();
    if (pm != null) { // set to 8 of 100 after first step.
        pm.setProgress(8);
        pm.setNote("Step 1 finished");
    }
    System.out.println(getDuration(time) + " -> Next step.");
    time = System.currentTimeMillis();
    double newStress = k.clusteringStep();
    if (pm != null) { // set to 11 of 100 after second step.
        pm.setProgress(11);
        pm.setNote("Step 2 finished");
    }
    // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off.
    double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d);
    System.out.println("Threshold = " + threshold);
    int cstep = 3;
    while (Math.abs(newStress - laststress) > threshold) {
        System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress
                + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress)));
        time = System.currentTimeMillis();
        laststress = newStress;
        newStress = k.clusteringStep();
        if (pm != null) { // set to XX of 100 after second step.
            pm.setProgress(cstep * 3 + 5);
            pm.setNote("Step " + cstep + " finished");
        }
        cstep++;
    }
    // Serializing clusters to a file on the disk ...
    clusters = k.getClusters();
    // for (int i = 0; i < clusters.length; i++) {
    //     Cluster cluster = clusters[i];
    //     System.out.print(cluster.getMembers().size() + ", ");
    // }
    // System.out.println();
    Cluster.writeClusters(clusters, clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    time = System.currentTimeMillis();
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    if (pm != null) { // set to 50 of 100 after clustering.
        pm.setProgress(50);
        pm.setNote("Clustering finished");
    }
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 4;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    System.out.println(getDuration(time));
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderFromCodeBook.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    clusters = SerializationUtils.readCodeBook(new FileInputStream("codebook128PNG.txt"));
    numClusters = clusters.size();
    System.out.println("Clustering finished, " + clusters.size() + " clusters found");
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 8;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        // features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(f.getDoubleHistogram()));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName,
                    SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    iw.commit(); // this one does the "old" commit(), it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    init();
    // localFeatureFieldName = getFeatureInstance().getFieldName();
    // vladFieldName = localFeatureFieldName + "vlad";
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    KMeans k;
    if (useParallelClustering)
        k = new ParallelKMeans(numClusters);
    else
        k = new KMeans(numClusters);
    // fill the KMeans object:
    LinkedList<double[]> features = new LinkedList<double[]>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            features.add(((Histogram) f).getDoubleHistogram());
        }
        k.addImage(file, features);
    }
    if (pm != null) { // set to 5 of 100 before clustering starts.
        pm.setProgress(5);
        pm.setNote("Starting clustering");
    }
    if (k.getFeatureCount() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("k.getFeatureCount() = " + k.getFeatureCount());
    System.out.println("Starting clustering ...");
    k.init();
    System.out.println("Step.");
    double time = System.currentTimeMillis();
    double laststress = k.clusteringStep();
    if (pm != null) { // set to 8 of 100 after first step.
        pm.setProgress(8);
        pm.setNote("Step 1 finished");
    }
    System.out.println(getDuration(time) + " -> Next step.");
    time = System.currentTimeMillis();
    double newStress = k.clusteringStep();
    if (pm != null) { // set to 11 of 100 after second step.
        pm.setProgress(11);
        pm.setNote("Step 2 finished");
    }
    // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off.
    double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d);
    System.out.println("Threshold = " + threshold);
    int cstep = 3;
    // maximum of 14 steps.
    while (Math.abs(newStress - laststress) > threshold && cstep < 12) {
        System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress
                + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress)));
        time = System.currentTimeMillis();
        laststress = newStress;
        newStress = k.clusteringStep();
        if (pm != null) { // set to XX of 100 after second step.
            pm.setProgress(cstep * 3 + 5);
            pm.setNote("Step " + cstep + " finished");
        }
        cstep++;
    }
    // Serializing clusters to a file on the disk ...
    clusters = k.getClusters();
    // for (int i = 0; i < clusters.length; i++) {
    //     Cluster cluster = clusters[i];
    //     System.out.print(cluster.getMembers().size() + ", ");
    // }
    // System.out.println();
    Cluster.writeClusters(clusters, clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    time = System.currentTimeMillis();
    // int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    if (pm != null) { // set to 50 of 100 after clustering.
        pm.setProgress(50);
        pm.setNote("Clustering finished");
    }
    // parallelized indexing
    LinkedList<Thread> threads = new LinkedList<Thread>();
    int numThreads = 4;
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    int step = reader.maxDoc() / numThreads;
    for (int part = 0; part < numThreads; part++) {
        Indexer indexer = null;
        if (part < numThreads - 1)
            indexer = new Indexer(part * step, (part + 1) * step, iw, null);
        else
            indexer = new Indexer(part * step, reader.maxDoc(), iw, pm);
        Thread t = new Thread(indexer);
        threads.add(t);
        t.start();
    }
    for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) {
        Thread next = iterator.next();
        try {
            next.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    if (pm != null) {
        pm.setProgress(95);
        pm.setNote("Indexing finished, optimizing index now.");
    }
    System.out.println(getDuration(time));
    iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features.
    iw.forceMerge(1);
    iw.close();
    if (pm != null) {
        pm.setProgress(100);
        pm.setNote("Indexing & optimization finished");
        pm.close();
    }
    System.out.println("Finished.");
}
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override
public void optimize() {
    IndexWriter iWriter = null;
    try {
        iWriter = createIndexWriter();
        iWriter.forceMerge(1);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        closeWriter(iWriter);
    }
}
From source file:org.ala.lucene.Autocompleter.java
License:Open Source License
@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete, boolean createNewIndex)
        throws CorruptIndexException, IOException {
    // build a dictionary (from the spell package)
    IndexReader sourceReader = IndexReader.open(sourceDirectory);
    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);
    // code from
    // org.apache.lucene.search.spell.SpellChecker.indexDictionary(Dictionary)
    IndexWriter.unlock(autoCompleteDirectory);
    // use a custom analyzer so we can do EdgeNGramFiltering
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, new Analyzer() {
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final StandardTokenizer src = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result,
                    new CharArraySet(SolrUtils.BIE_LUCENE_VERSION,
                            new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)), true));
            result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
            return new TokenStreamComponents(src, result) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    super.setReader(reader);
                }
            };
        }

        // public TokenStream tokenStream(String fieldName, Reader reader) {
        //     TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
        //
        //     result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //     result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //     //result = new ISOLatin1AccentFilter(result);
        //     result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result, new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)));
        //     result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
        //
        //     return result;
        // }
    });
    if (createNewIndex) {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    } else {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }
    indexWriterConfig.setMaxBufferedDocs(150);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, indexWriterConfig);
    // writer.setMergeFactor(300);
    // go through every word, storing the original word (incl. n-grams)
    // and the number of times it occurs
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();
    Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();
        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }
        if (wordsMap.containsKey(word)) {
            throw new IllegalStateException("This should never happen in Lucene 2.3.2");
            // wordsMap.put(word, wordsMap.get(word) + 1);
        } else {
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
        }
    }
    for (String word : wordsMap.keySet()) {
        // ok index the word
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED)); // grammed
        doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO,
                Field.Index.NOT_ANALYZED)); // count
        writer.addDocument(doc);
    }
    sourceReader.close();
    // close writer
    writer.forceMerge(1);
    writer.close();
    // re-open our reader
    reOpenReader();
}