List of usage examples for org.apache.lucene.index.IndexWriter#commit()
@Override public final long commit() throws IOException
Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
From source file:net.semanticmetadata.lire.benchmarking.TestGeneral.java
License:Open Source License
private void indexFiles(ArrayList<String> images, DocumentBuilder builder, String indexPath) throws IOException { // eventually check if the directory is there or not ... IndexWriter iw = LuceneUtils.createIndexWriter(testIndex, false); int count = 0; long time = System.currentTimeMillis(); for (String identifier : images) { // TODO: cut toes from the image ... -> doesn't work out very well. Stable at first, decreasing then. // TODO: Joint Histogram ... // TODO: LSA / PCA on the vectors ...-> this looks like a job for me :-D // TODO: local features ... Document doc = null;/*from w ww. j a va 2s . com*/ if (cutImages) { BufferedImage bimg = ImageUtils.cropImage(ImageIO.read(new FileInputStream(identifier)), 0, 0, 200, 69); doc = builder.createDocument(bimg, identifier); } else doc = builder.createDocument(new FileInputStream(identifier), identifier); iw.addDocument(doc); count++; if (count % 100 == 0) { int percent = (int) Math.floor(((double) count * 100.0) / (double) images.size()); double timeTemp = (double) (System.currentTimeMillis() - time) / 1000d; int secsLeft = (int) Math.round(((timeTemp / (double) count) * (double) images.size()) - timeTemp); System.out.println(percent + "% finished (" + count + " files), " + secsLeft + " s left"); } } long timeTaken = (System.currentTimeMillis() - time); float sec = ((float) timeTaken) / 1000f; System.out.println(sec + " seconds taken, " + (timeTaken / count) + " ms per image."); iw.commit(); iw.close(); }
From source file:net.semanticmetadata.lire.benchmarking.TestWang.java
License:Open Source License
private void indexFiles(ArrayList<String> images, DocumentBuilder builder, String indexPath) throws IOException { // System.out.println(">> Indexing " + images.size() + " files."); // DocumentBuilder builder = DocumentBuilderFactory.getExtensiveDocumentBuilder(); // DocumentBuilder builder = DocumentBuilderFactory.getFastDocumentBuilder(); IndexWriter iw = LuceneUtils.createIndexWriter(indexPath, true); int count = 0; long time = System.currentTimeMillis(); for (String identifier : images) { Document doc = builder.createDocument(new FileInputStream(identifier), identifier); iw.addDocument(doc);//from w w w . j a va 2 s . c o m count++; if (count % 100 == 0) System.out.println(count + " files indexed."); // if (count == 200) break; } long timeTaken = (System.currentTimeMillis() - time); float sec = ((float) timeTaken) / 1000f; System.out.println(sec + " seconds taken, " + (timeTaken / count) + " ms per image."); iw.commit(); iw.close(); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException/*from ww w . j a v a 2 s .c om*/ */ public void index() throws IOException { init(); df.setMaximumFractionDigits(3); // find the documents for building the vocabulary: HashSet<Integer> docIDs = selectVocabularyDocs(); KMeans k; if (useParallelClustering) k = new ParallelKMeans(numClusters); else k = new KMeans(numClusters); // fill the KMeans object: LinkedList<double[]> features = new LinkedList<double[]>(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) { int nextDoc = iterator.next(); if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it. Document d = reader.document(nextDoc); features.clear(); IndexableField[] fields = d.getFields(localFeatureFieldName); String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]; for (int j = 0; j < fields.length; j++) { LireFeature f = getFeatureInstance(); f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); // copy the data over to new array ... double[] feat = new double[f.getDoubleHistogram().length]; System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length); features.add(f.getDoubleHistogram()); } k.addImage(file, features); } if (pm != null) { // set to 5 of 100 before clustering starts. pm.setProgress(5); pm.setNote("Starting clustering"); } if (k.getFeatureCount() < numClusters) { // this cannot work. 
You need more data points than clusters. throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in " + numClusters + ". Try to use less clusters or more images."); } // do the clustering: System.out.println("Number of local features: " + df.format(k.getFeatureCount())); System.out.println("Starting clustering ..."); k.init(); System.out.println("Step."); double time = System.currentTimeMillis(); double laststress = k.clusteringStep(); if (pm != null) { // set to 8 of 100 after first step. pm.setProgress(8); pm.setNote("Step 1 finished"); } System.out.println(getDuration(time) + " -> Next step."); time = System.currentTimeMillis(); double newStress = k.clusteringStep(); if (pm != null) { // set to 11 of 100 after second step. pm.setProgress(11); pm.setNote("Step 2 finished"); } // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off. double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d); System.out.println("Threshold = " + df.format(threshold)); int cstep = 3; while (Math.abs(newStress - laststress) > threshold && cstep < 12) { System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress))); time = System.currentTimeMillis(); laststress = newStress; newStress = k.clusteringStep(); if (pm != null) { // set to XX of 100 after second step. pm.setProgress(cstep * 3 + 5); pm.setNote("Step " + cstep + " finished"); } cstep++; } // Serializing clusters to a file on the disk ... 
clusters = k.getClusters(); // for (int i = 0; i < clusters.length; i++) { // Cluster cluster = clusters[i]; // System.out.print(cluster.getMembers().size() + ", "); // } // System.out.println(); Cluster.writeClusters(clusters, clusterFile); // create & store histograms: System.out.println("Creating histograms ..."); time = System.currentTimeMillis(); // int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(50); pm.setNote("Clustering finished"); } // parallelized indexing LinkedList<Thread> threads = new LinkedList<Thread>(); int numThreads = 8; // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); int step = reader.maxDoc() / numThreads; for (int part = 0; part < numThreads; part++) { Indexer indexer = null; if (part < numThreads - 1) indexer = new Indexer(part * step, (part + 1) * step, iw, null); else indexer = new Indexer(part * step, reader.maxDoc(), iw, pm); Thread t = new Thread(indexer); threads.add(t); t.start(); } for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) { Thread next = iterator.next(); try { next.join(); } catch (InterruptedException e) { e.printStackTrace(); } } if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(95); pm.setNote("Indexing finished, optimizing index now."); } System.out.println(getDuration(time)); iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features. iw.forceMerge(1); iw.close(); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(100); pm.setNote("Indexing & optimization finished"); pm.close(); } System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
public void indexMissing() throws IOException { init();/*from ww w . ja v a2 s . c o m*/ // Reading clusters from disk: clusters = Cluster.readClusters(clusterFile); // create & store histograms: System.out.println("Creating histograms ..."); LireFeature f = getFeatureInstance(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); // based on bug report from Einav Itamar <einavitamar@gmail.com> IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), false, LuceneUtils.AnalyzerType.WhitespaceAnalyzer); int counter = 0; for (int i = 0; i < reader.maxDoc(); i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. Document d = reader.document(i); // Only if there are no values yet: if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) { createVisualWords(d, f); // now write the new one. we use the identifier to update ;) iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d); counter++; } } System.out.println(counter + " Documents were updated"); iw.commit(); // added to permanently remove the deleted docs. iw.forceMerge(1); iw.close(); System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilder.java
License:Open Source License
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException//from ww w . jav a 2s .com */ public void index() throws IOException { df.setMaximumFractionDigits(3); // find the documents for building the vocabulary: HashSet<Integer> docIDs = selectVocabularyDocs(); KMeans k; if (useParallelClustering) k = new ParallelKMeans(numClusters); else k = new KMeans(numClusters); // fill the KMeans object: LinkedList<double[]> features = new LinkedList<double[]>(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) { int nextDoc = iterator.next(); if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it. Document d = reader.document(nextDoc); features.clear(); IndexableField[] fields = d.getFields(localFeatureFieldName); String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]; for (int j = 0; j < fields.length; j++) { LireFeature f = getFeatureInstance(); f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); features.add(((Histogram) f).descriptor); } k.addImage(file, features); } if (pm != null) { // set to 5 of 100 before clustering starts. pm.setProgress(5); pm.setNote("Starting clustering"); } if (k.getFeatureCount() < numClusters) { // this cannot work. You need more data points than clusters. throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in " + numClusters + ". 
Try to use less clusters or more images."); } // do the clustering: System.out.println("k.getFeatureCount() = " + k.getFeatureCount()); System.out.println("Starting clustering ..."); k.init(); System.out.println("Step."); double time = System.currentTimeMillis(); double laststress = k.clusteringStep(); if (pm != null) { // set to 8 of 100 after first step. pm.setProgress(8); pm.setNote("Step 1 finished"); } System.out.println(getDuration(time) + " -> Next step."); time = System.currentTimeMillis(); double newStress = k.clusteringStep(); if (pm != null) { // set to 11 of 100 after second step. pm.setProgress(11); pm.setNote("Step 2 finished"); } // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off. double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d); System.out.println("Threshold = " + threshold); int cstep = 3; while (Math.abs(newStress - laststress) > threshold) { System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress))); time = System.currentTimeMillis(); laststress = newStress; newStress = k.clusteringStep(); if (pm != null) { // set to XX of 100 after second step. pm.setProgress(cstep * 3 + 5); pm.setNote("Step " + cstep + " finished"); } cstep++; } // Serializing clusters to a file on the disk ... 
clusters = k.getClusters(); // for (int i = 0; i < clusters.length; i++) { // Cluster cluster = clusters[i]; // System.out.print(cluster.getMembers().size() + ", "); // } // System.out.println(); Cluster.writeClusters(clusters, clusterFile); // create & store histograms: System.out.println("Creating histograms ..."); time = System.currentTimeMillis(); int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(50); pm.setNote("Clustering finished"); } // parallelized indexing LinkedList<Thread> threads = new LinkedList<Thread>(); int numThreads = 4; // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); int step = reader.maxDoc() / numThreads; for (int part = 0; part < numThreads; part++) { Indexer indexer = null; if (part < numThreads - 1) indexer = new Indexer(part * step, (part + 1) * step, iw, null); else indexer = new Indexer(part * step, reader.maxDoc(), iw, pm); Thread t = new Thread(indexer); threads.add(t); t.start(); } for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) { Thread next = iterator.next(); try { next.join(); } catch (InterruptedException e) { e.printStackTrace(); } } if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(95); pm.setNote("Indexing finished, optimizing index now."); } System.out.println(getDuration(time)); iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features. iw.forceMerge(1); iw.close(); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(100); pm.setNote("Indexing & optimization finished"); pm.close(); } System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilder.java
License:Open Source License
public void indexMissing() throws IOException { // Reading clusters from disk: clusters = Cluster.readClusters(clusterFile); // create & store histograms: System.out.println("Creating histograms ..."); int[] tmpHist = new int[numClusters]; LireFeature f = getFeatureInstance(); IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer); for (int i = 0; i < reader.maxDoc(); i++) { // if (!reader.isDeleted(i)) { for (int j = 0; j < tmpHist.length; j++) { tmpHist[j] = 0;/* ww w . j a v a2 s .co m*/ } Document d = reader.document(i); // Only if there are no values yet: if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) { IndexableField[] fields = d.getFields(localFeatureFieldName); // find the appropriate cluster for each feature: for (int j = 0; j < fields.length; j++) { f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); tmpHist[clusterForFeature((Histogram) f)]++; } normalize(tmpHist); d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES)); d.add(new StringField(localFeatureHistFieldName, SerializationUtils.arrayToString(tmpHist), Field.Store.YES)); // now write the new one. we use the identifier to update ;) iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d); } // } } iw.commit(); iw.close(); System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderFromCodeBook.java
License:Open Source License
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException/*from ww w. j a v a 2 s . c om*/ */ public void index() throws IOException { clusters = SerializationUtils.readCodeBook(new FileInputStream("codebook128PNG.txt")); numClusters = clusters.size(); System.out.println("Clustering finished, " + clusters.size() + " clusters found"); System.out.println("Creating histograms ..."); int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); // parallelized indexing LinkedList<Thread> threads = new LinkedList<Thread>(); int numThreads = 8; // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); int step = reader.maxDoc() / numThreads; for (int part = 0; part < numThreads; part++) { Indexer indexer = null; if (part < numThreads - 1) indexer = new Indexer(part * step, (part + 1) * step, iw, null); else indexer = new Indexer(part * step, reader.maxDoc(), iw, pm); Thread t = new Thread(indexer); threads.add(t); t.start(); } for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) { Thread next = iterator.next(); try { next.join(); } catch (InterruptedException e) { e.printStackTrace(); } } if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(95); pm.setNote("Indexing finished, optimizing index now."); } iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features. 
iw.forceMerge(1); iw.close(); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(100); pm.setNote("Indexing & optimization finished"); pm.close(); } System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException// w w w.j a v a 2 s . co m */ public void index() throws IOException { df.setMaximumFractionDigits(3); // find the documents for building the vocabulary: HashSet<Integer> docIDs = selectVocabularyDocs(); System.out.println("Using " + docIDs.size() + " documents to build the vocabulary."); KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15); // fill the KMeans object: LinkedList<DoublePoint> features = new LinkedList<DoublePoint>(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) { int nextDoc = iterator.next(); if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it. Document d = reader.document(nextDoc); // features.clear(); IndexableField[] fields = d.getFields(localFeatureFieldName); String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]; for (int j = 0; j < fields.length; j++) { LireFeature f = getFeatureInstance(); f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); // copy the data over to new array ... double[] feat = new double[f.getDoubleHistogram().length]; System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length); features.add(new DoublePoint(f.getDoubleHistogram())); } } if (features.size() < numClusters) { // this cannot work. You need more data points than clusters. 
throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in " + numClusters + ". Try to use less clusters or more images."); } // do the clustering: System.out.println("Number of local features: " + df.format(features.size())); System.out.println("Starting clustering ..."); List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features); // TODO: Serializing clusters to a file on the disk ... System.out.println("Clustering finished, " + clusterList.size() + " clusters found"); clusters = new LinkedList<double[]>(); for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) { CentroidCluster<DoublePoint> centroidCluster = iterator.next(); clusters.add(centroidCluster.getCenter().getPoint()); } System.out.println("Creating histograms ..."); int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); LireFeature f = getFeatureInstance(); for (int i = 0; i < reader.maxDoc(); i++) { try { if (reader.hasDeletions() && !liveDocs.get(i)) continue; for (int j = 0; j < tmpHist.length; j++) { tmpHist[j] = 0; } Document d = reader.document(i); IndexableField[] fields = d.getFields(localFeatureFieldName); // remove the fields if they are already there ... 
d.removeField(visualWordsFieldName); d.removeField(localFeatureHistFieldName); // find the appropriate cluster for each feature: for (int j = 0; j < fields.length; j++) { f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); tmpHist[clusterForFeature(f, clusters)]++; } // System.out.println(Arrays.toString(tmpHist)); d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(normalize(tmpHist)))); quantize(tmpHist); d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES)); // remove local features to save some space if requested: if (DELETE_LOCAL_FEATURES) { d.removeFields(localFeatureFieldName); } // now write the new one. we use the identifier to update ;) iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d); } catch (IOException e) { e.printStackTrace(); } } iw.commit(); // this one does the "old" commit(), it removes the deleted local features. iw.forceMerge(1); iw.close(); System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java
License:Open Source License
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException/*from w ww . jav a 2s .c o m*/ */ public void index() throws IOException { init(); // localFeatureFieldName = getFeatureInstance().getFieldName(); // vladFieldName = localFeatureFieldName + "vlad"; df.setMaximumFractionDigits(3); // find the documents for building the vocabulary: HashSet<Integer> docIDs = selectVocabularyDocs(); KMeans k; if (useParallelClustering) k = new ParallelKMeans(numClusters); else k = new KMeans(numClusters); // fill the KMeans object: LinkedList<double[]> features = new LinkedList<double[]>(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) { int nextDoc = iterator.next(); if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it. Document d = reader.document(nextDoc); features.clear(); IndexableField[] fields = d.getFields(localFeatureFieldName); String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]; for (int j = 0; j < fields.length; j++) { LireFeature f = getFeatureInstance(); f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); features.add(((Histogram) f).getDoubleHistogram()); } k.addImage(file, features); } if (pm != null) { // set to 5 of 100 before clustering starts. pm.setProgress(5); pm.setNote("Starting clustering"); } if (k.getFeatureCount() < numClusters) { // this cannot work. You need more data points than clusters. 
throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in " + numClusters + ". Try to use less clusters or more images."); } // do the clustering: System.out.println("k.getFeatureCount() = " + k.getFeatureCount()); System.out.println("Starting clustering ..."); k.init(); System.out.println("Step."); double time = System.currentTimeMillis(); double laststress = k.clusteringStep(); if (pm != null) { // set to 8 of 100 after first step. pm.setProgress(8); pm.setNote("Step 1 finished"); } System.out.println(getDuration(time) + " -> Next step."); time = System.currentTimeMillis(); double newStress = k.clusteringStep(); if (pm != null) { // set to 11 of 100 after second step. pm.setProgress(11); pm.setNote("Step 2 finished"); } // critical part: Give the difference in between steps as a constraint for accuracy vs. runtime trade off. double threshold = Math.max(20d, (double) k.getFeatureCount() / 1000d); System.out.println("Threshold = " + threshold); int cstep = 3; // maximum of 14 steps. while (Math.abs(newStress - laststress) > threshold && cstep < 12) { System.out.println(getDuration(time) + " -> Next step. Stress difference ~ |" + (int) newStress + " - " + (int) laststress + "| = " + df.format(Math.abs(newStress - laststress))); time = System.currentTimeMillis(); laststress = newStress; newStress = k.clusteringStep(); if (pm != null) { // set to XX of 100 after second step. pm.setProgress(cstep * 3 + 5); pm.setNote("Step " + cstep + " finished"); } cstep++; } // Serializing clusters to a file on the disk ... 
clusters = k.getClusters(); // for (int i = 0; i < clusters.length; i++) { // Cluster cluster = clusters[i]; // System.out.print(cluster.getMembers().size() + ", "); // } // System.out.println(); Cluster.writeClusters(clusters, clusterFile); // create & store histograms: System.out.println("Creating histograms ..."); time = System.currentTimeMillis(); // int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(50); pm.setNote("Clustering finished"); } // parallelized indexing LinkedList<Thread> threads = new LinkedList<Thread>(); int numThreads = 4; // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); int step = reader.maxDoc() / numThreads; for (int part = 0; part < numThreads; part++) { Indexer indexer = null; if (part < numThreads - 1) indexer = new Indexer(part * step, (part + 1) * step, iw, null); else indexer = new Indexer(part * step, reader.maxDoc(), iw, pm); Thread t = new Thread(indexer); threads.add(t); t.start(); } for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext();) { Thread next = iterator.next(); try { next.join(); } catch (InterruptedException e) { e.printStackTrace(); } } if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(95); pm.setNote("Indexing finished, optimizing index now."); } System.out.println(getDuration(time)); iw.commit(); // this one does the "old" commit(), it removes the deleted SURF features. iw.forceMerge(1); iw.close(); if (pm != null) { // set to 50 of 100 after clustering. pm.setProgress(100); pm.setNote("Indexing & optimization finished"); pm.close(); } System.out.println("Finished."); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.VLADBuilder.java
License:Open Source License
/** * Indexes all documents in the index, that do not include the VLAD feature yet. * * @throws IOException/*from www . j av a 2 s . c o m*/ */ public void indexMissing() throws IOException { init(); // Reading clusters from disk: clusters = Cluster.readClusters(clusterFile); // create & store histograms: System.out.println("Creating histograms ..."); LireFeature f = getFeatureInstance(); IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer); for (int i = 0; i < reader.maxDoc(); i++) { // if (!reader.isDeleted(i)) { Document d = reader.document(i); // Only if there are no values yet: if (d.getValues(vladFieldName) == null || d.getValues(vladFieldName).length == 0) { createVisualWords(d, f); iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d); } // } } iw.commit(); iw.close(); System.out.println("Finished."); }