List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
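numDocs() returns the number of live (non-deleted) documents in the index, while maxDoc() returns the largest document id plus one; that is why the examples below pair the loop bound with a liveDocs check. Before the full examples, here is a minimal sketch of the typical pattern, assuming Lucene 4.x (as used by LIRE below); the index path "/tmp/index" is a placeholder:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiFields;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Bits;

    public class NumDocsExample {
        public static void main(String[] args) throws IOException {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
            try {
                // numDocs() counts live documents only; maxDoc() is the upper bound for doc ids.
                System.out.println("live docs: " + reader.numDocs() + ", maxDoc: " + reader.maxDoc());
                // Needed to check whether a document is deleted (null if there are no deletions).
                Bits liveDocs = MultiFields.getLiveDocs(reader);
                for (int i = 0; i < reader.maxDoc(); i++) {
                    if (liveDocs != null && !liveDocs.get(i))
                        continue; // skip deleted documents
                    Document d = reader.document(i);
                    // ... process the stored fields of document d ...
                }
            } finally {
                reader.close();
            }
        }
    }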
From source file:net.semanticmetadata.lire.impl.searcher.GenericFastImageSearcher.java
License:Open Source License
/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = -1f;
    // overallMaxDistance = -1f;
    // clear result set ...
    docs.clear();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    float tmpDistance;
    int docs = reader.numDocs();
    if (!isCaching) {
        // we read each and every document from the index and then we compare it to the query.
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.
            d = reader.document(i);
            tmpDistance = getDistance(d, lireFeature);
            assert (tmpDistance >= 0);
            // if it is the first document:
            if (maxDistance < 0) {
                maxDistance = tmpDistance;
            }
            // if the array is not full yet:
            if (this.docs.size() < maxHits) {
                this.docs.add(new SimpleResult(tmpDistance, d, i));
                if (tmpDistance > maxDistance)
                    maxDistance = tmpDistance;
            } else if (tmpDistance < maxDistance) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.docs.remove(this.docs.last());
                // add the new one ...
                this.docs.add(new SimpleResult(tmpDistance, d, i));
                // and set our new distance border ...
                maxDistance = this.docs.last().getDistance();
            }
        }
    } else {
        // we use the in-memory cache to find the matching docs from the index.
        int count = 0;
        for (Iterator<byte[]> iterator = featureCache.iterator(); iterator.hasNext();) {
            cachedInstance.setByteArrayRepresentation(iterator.next());
            if (reader.hasDeletions() && !liveDocs.get(count)) {
                count++;
                continue; // if it is deleted, just ignore it.
            } else {
                tmpDistance = lireFeature.getDistance(cachedInstance);
                assert (tmpDistance >= 0);
                // if it is the first document:
                if (maxDistance < 0) {
                    maxDistance = tmpDistance;
                }
                // if the array is not full yet:
                if (this.docs.size() < maxHits) {
                    this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count));
                    if (tmpDistance > maxDistance)
                        maxDistance = tmpDistance;
                } else if (tmpDistance < maxDistance) {
                    // if it is nearer to the sample than at least one of the current set:
                    // remove the last one ...
                    this.docs.remove(this.docs.last());
                    // add the new one ...
                    this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count));
                    // and set our new distance border ...
                    maxDistance = this.docs.last().getDistance();
                }
                count++;
            }
        }
    }
    return maxDistance;
}
From source file:net.semanticmetadata.lire.impl.searcher.LocationBasedImageSearcher.java
License:Open Source License
/**
 * @param reader
 * @param imageInfo
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, ImageInfo imageInfo) throws IOException {
    float maxDistance = -1f, allMaxDistance = -1f;
    float tmpDistance = 0f;
    docs.clear();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    int docNumber = reader.numDocs();
    Document d = null;
    for (int i = 0; i < docNumber; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        d = reader.document(i); // read the document
        tmpDistance = getDistance(d, imageInfo); // compute the distance to the query
        // skip documents outside the allowed threshold
        if (tmpDistance < 0 || tmpDistance > this.threshold)
            continue;
        // track the overall maximum distance
        if (allMaxDistance < tmpDistance) {
            allMaxDistance = tmpDistance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the result list is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // it is nearer to the sample than the current worst result:
            // remove the last one and add the new one ...
            this.docs.remove(this.docs.size() - 1);
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            // set the new distance border ...
            maxDistance = tmpDistance;
            Collections.sort(docs);
        }
    }
    // return the maximum distance found for normalizing
    return maxDistance;
}
From source file:net.semanticmetadata.lire.impl.SimpleImageSearcher.java
License:Open Source License
/**
 * @param reader
 * @param cl
 * @param sc
 * @param eh
 * @return the maximum distance found for normalizing.
 * @throws IOException
 */
private float findSimilar(IndexReader reader, ColorLayoutImpl cl, ScalableColorImpl sc,
                          EdgeHistogramImplementation eh) throws IOException {
    float maxDistance = -1f, overallMaxDistance = -1f;
    boolean hasDeletions = reader.hasDeletions();
    // clear result set ...
    docs.clear();
    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
        // bugfix by Roman Kern
        Document d = reader.document(i);
        float distance = getDistance(d, cl, sc, eh);
        // calculate the overall max distance to normalize score afterwards
        if (overallMaxDistance < distance) {
            overallMaxDistance = distance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = distance;
        }
        // if the array is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(distance, d));
            if (distance > maxDistance)
                maxDistance = distance;
        } else if (distance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult(distance, d));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}
From source file:net.semanticmetadata.lire.impl.SimpleImageSearcher.java
License:Open Source License
public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    if (!IndexReader.indexExists(reader.directory()))
        throw new FileNotFoundException("No index found at this specific location.");
    Document doc = reader.document(0);
    ScalableColorImpl sc = null;
    ColorLayoutImpl cl = null;
    EdgeHistogramImplementation eh = null;
    String[] cls = doc.getValues(DocumentBuilder.FIELD_NAME_COLORLAYOUT);
    if (cls != null && cls.length > 0)
        cl = new ColorLayoutImpl(cls[0]);
    String[] scs = doc.getValues(DocumentBuilder.FIELD_NAME_SCALABLECOLOR);
    if (scs != null && scs.length > 0)
        sc = new ScalableColorImpl(scs[0]);
    String[] ehs = doc.getValues(DocumentBuilder.FIELD_NAME_EDGEHISTOGRAM);
    if (ehs != null && ehs.length > 0)
        eh = new EdgeHistogramImplementation(ehs[0]);
    HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();
    // find duplicates ...
    boolean hasDeletions = reader.hasDeletions();
    int docs = reader.numDocs();
    int numDuplicates = 0;
    for (int i = 0; i < docs; i++) {
        Document d = reader.document(i);
        float distance = getDistance(d, cl, sc, eh);
        if (!duplicates.containsKey(distance)) {
            duplicates.put(distance, new LinkedList<String>());
        } else {
            numDuplicates++;
        }
        duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }
    if (numDuplicates == 0)
        return null;
    LinkedList<List<String>> results = new LinkedList<List<String>>();
    for (float f : duplicates.keySet()) {
        if (duplicates.get(f).size() > 1) {
            results.add(duplicates.get(f));
        }
    }
    return new SimpleImageDuplicates(results);
}
From source file:net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java
License:Open Source License
/**
 * Creates a set of reference objects and stores it in a new index (hashFunctionsFileName "&lt;indexPath&gt;-ro"). Then creates ordered
 * lists of reference object positions for each data item in the index with given feature.
 * Finally a new index (hashFunctionsFileName "&lt;indexPath&gt;-ms") is created where all the original documents as well as the new data
 * are stored.
 *
 * @param indexPath the path to the original index
 * @throws IOException
 */
public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    if (numDocs < numReferenceObjects) {
        throw new UnsupportedOperationException("Too few documents in index.");
    }
    // progress report
    progress.setNumDocsAll(numDocs);
    progress.setCurrentState(State.RoSelection);

    boolean hasDeletions = reader.hasDeletions();

    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);

    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
        referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    }
    int count = 0;

    if (hasDeletions) {
        System.err.println("WARNING: There are deleted docs in your index. You should "
                + "optimize your index before using this method.");
    }

    // progress report
    progress.setCurrentState(State.RoIndexing);

    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
        count++;
        Document document = reader.document(i);
        document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
        iw.addDocument(document);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Indexing);

    // now find the reference objects for each entry ;)
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);

    iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
            sb.append(hits.doc(j).getValues("ro-id")[0]);
            sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Idle);
}
From source file:net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java
License:Open Source License
/**
 * We assume that the initial indexing has been done and a set of reference objects has been
 * found and indexed in the separate fileList. However further documents were added and they
 * now need to get a ranked list of reference objects. So we (i) get all these new documents
 * missing the field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        if (document.getField("ro-order") == null) { // if the field is not here we create it.
            ImageSearchHits hits = searcher.search(document, readerRo);
            sb.delete(0, sb.length());
            for (int j = 0; j < numReferenceObjectsUsed; j++) {
                sb.append(hits.doc(j).getValues("ro-id")[0]);
                sb.append(' ');
            }
            // System.out.println(sb.toString());
            document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
            countUpdated++;
        }
        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
        // debug: System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
}
From source file:net.semanticmetadata.lire.RuntimeTest.java
License:Open Source License
public void testCEDDSearch() throws IOException {
    int numsearches = 10;
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("indexor-1.4mh")));
    int numDocs = reader.numDocs();
    System.out.println("numDocs = " + numDocs);
    // This is the new, shiny and fast one ...
    ImageSearcher searcher = ImageSearcherFactory.createCEDDImageSearcher(30);
    // This is the old and slow one.
    // ImageSearcher searcher = ImageSearcherFactory.createCEDDImageSearcher(30);
    FileInputStream imageStream = new FileInputStream("E:\\Temp\\images1\\1\\im1.jpg");
    BufferedImage bimg = ImageIO.read(imageStream);
    ImageSearchHits hits = null;
    long time = System.currentTimeMillis();
    for (int i = 0; i < numsearches; i++) {
        hits = searcher.search(bimg, reader);
    }
    time = System.currentTimeMillis() - time;
    System.out.println(
            ((float) time / (float) numsearches) + " ms per search with image, averaged on " + numsearches);
    for (int i = 0; i < hits.length(); i++) {
        System.out.println(hits.score(i) + ": "
                + hits.doc(i).getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }
    Document document = hits.doc(4);
    time = System.currentTimeMillis();
    for (int i = 0; i < numsearches; i++) {
        hits = searcher.search(document, reader);
    }
    time = System.currentTimeMillis() - time;
    System.out.println(
            ((float) time / (float) numsearches) + " ms per search with document, averaged on " + numsearches);
    for (int i = 0; i < hits.length(); i++) {
        System.out.println(hits.score(i) + ": "
                + hits.doc(i).getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }
}
From source file:net.semanticmetadata.lire.searchers.custom.SingleNddCeddImageSearcher.java
License:Open Source License
protected void init(IndexReader reader) {
    this.reader = reader;
    if (reader.hasDeletions()) {
        throw new UnsupportedOperationException(
                "The index has to be optimized first to be cached! Use IndexWriter.forceMerge(0) to do this.");
    }
    docs = new TreeSet<SimpleResult>();
    try {
        this.cachedInstance = (GlobalFeature) this.descriptorClass.newInstance();
        if (fieldName == null)
            fieldName = this.cachedInstance.getFieldName();
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher ("
                + descriptorClass.getName() + "): " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher ("
                + descriptorClass.getName() + "): " + e.getMessage());
    }
    // put all respective features into an in-memory cache ...
    if (isCaching && reader != null) {
        int docs = reader.numDocs();
        featureCache = new ArrayList<double[]>(docs);
        try {
            Document d;
            for (int i = 0; i < docs; i++) {
                d = reader.document(i);
                cachedInstance.setByteArrayRepresentation(d.getField(fieldName).binaryValue().bytes,
                        d.getField(fieldName).binaryValue().offset,
                        d.getField(fieldName).binaryValue().length);
                // normalize features, so we can use L1
                if (!halfDimensions) {
                    featureCache.add(normalize(cachedInstance.getFeatureVector()));
                } else {
                    featureCache.add(crunch(cachedInstance.getFeatureVector()));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
From source file:net.semanticmetadata.lire.searchers.FastOpponentImageSearcher.java
License:Open Source License
/**
 * @param reader
 * @param globalFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected double findSimilar(IndexReader reader, GlobalFeature globalFeature) throws IOException {
    maxDistance = -1f;
    // clear result set ...
    docs.clear();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    double tmpDistance;
    int docs = reader.numDocs();
    byte[] histogram = globalFeature.getByteArrayRepresentation();
    for (int i = 0; i < docs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        d = reader.document(i);
        tmpDistance = getDistance(d, histogram);
        assert (tmpDistance >= 0);
        // calculate the overall max distance to normalize score afterwards
        // if (overallMaxDistance < tmpDistance) {
        //     overallMaxDistance = tmpDistance;
        // }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the array is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult(tmpDistance, i));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}
From source file:net.semanticmetadata.lire.searchers.forevaluations.GenericFastImageSearcherForEvaluation.java
License:Open Source License
/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws IOException
 */
protected double findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = -1d;
    // clear result set ...
    docs.clear();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    double tmpDistance;
    int docs = reader.numDocs();
    if (!isCaching) {
        // we read each and every document from the index and then we compare it to the query.
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.
            d = reader.document(i);
            tmpDistance = getDistance(d, lireFeature);
            assert (tmpDistance >= 0);
            // if the array is not full yet:
            if (this.docs.size() < maxHits) {
                this.docs.add(new SimpleResultForEvaluation(tmpDistance, i,
                        d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));
                if (tmpDistance > maxDistance)
                    maxDistance = tmpDistance;
            } else if (tmpDistance < maxDistance) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.docs.remove(this.docs.last());
                // add the new one ...
                this.docs.add(new SimpleResultForEvaluation(tmpDistance, i,
                        d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));
                // and set our new distance border ...
                maxDistance = this.docs.last().getDistance();
            }
        }
    } else {
        LinkedList<Consumer> tasks = new LinkedList<Consumer>();
        LinkedList<Thread> threads = new LinkedList<Thread>();
        Consumer consumer;
        Thread thread;
        Thread p = new Thread(new Producer());
        p.start();
        for (int i = 0; i < numThreads; i++) {
            consumer = new Consumer(lireFeature);
            thread = new Thread(consumer);
            thread.start();
            tasks.add(consumer);
            threads.add(thread);
        }
        for (Thread next : threads) {
            try {
                next.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        TreeSet<SimpleResultForEvaluation> tmpDocs;
        boolean flag;
        SimpleResultForEvaluation simpleResult;
        for (Consumer task : tasks) {
            tmpDocs = task.getResult();
            flag = true;
            while (flag && (tmpDocs.size() > 0)) {
                simpleResult = tmpDocs.pollFirst();
                if (this.docs.size() < maxHits) {
                    this.docs.add(simpleResult);
                    if (simpleResult.getDistance() > maxDistance)
                        maxDistance = simpleResult.getDistance();
                } else if (simpleResult.getDistance() < maxDistance) {
                    // this.docs.remove(this.docs.last());
                    this.docs.pollLast();
                    this.docs.add(simpleResult);
                    maxDistance = this.docs.last().getDistance();
                } else
                    flag = false;
            }
        }
    }
    return maxDistance;
}