List of usage examples for org.apache.lucene.index IndexReader hasDeletions
public boolean hasDeletions()
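IndexReader.hasDeletions() returns true if the index behind the reader contains any deleted documents. The examples below all follow the same pattern: obtain the live-docs bitset once via MultiFields.getLiveDocs(reader) (which is null when nothing is deleted) and skip every document whose bit is cleared. A minimal, self-contained sketch of that pattern, using the Lucene 4.x API the examples are written against; the index path is a placeholder, and it iterates up to maxDoc() because doc IDs can exceed numDocs() once documents have been deleted:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class HasDeletionsExample {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder; point it at an existing Lucene index.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        // liveDocs is null when the index has no deletions, so guard with hasDeletions().
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // deleted document, skip it
            Document d = reader.document(i);
            // ... process d ...
        }
        reader.close();
    }
}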
From source file: net.semanticmetadata.lire.impl.GenericImageSearcher.java
License: Open Source License
/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    float maxDistance = -1f, overallMaxDistance = -1f;
    float tmpDistance = 0f;
    // clear result set ...
    docs.clear();
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    int docs = reader.numDocs();
    Document d = null;
    for (int i = 0; i < docs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        d = reader.document(i);
        tmpDistance = getDistance(d, lireFeature);
        assert (tmpDistance >= 0);
        // calculate the overall max distance to normalize the score afterwards
        if (overallMaxDistance < tmpDistance) {
            overallMaxDistance = tmpDistance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the result set is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}
From source file: net.semanticmetadata.lire.impl.GenericImageSearcher.java
License: Open Source License
public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    SimpleImageDuplicates simpleImageDuplicates = null;
    try {
        Document doc = reader.document(0);
        LireFeature lireFeature = (LireFeature) descriptorClass.newInstance();
        String[] cls = doc.getValues(fieldName);
        if (cls != null && cls.length > 0)
            lireFeature.setStringRepresentation(cls[0]);
        HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();
        // Needed to check whether the document is deleted.
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        int docs = reader.numDocs();
        int numDuplicates = 0;
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.
            Document d = reader.document(i);
            float distance = getDistance(d, lireFeature);
            if (!duplicates.containsKey(distance)) {
                duplicates.put(distance, new LinkedList<String>());
            } else {
                numDuplicates++;
            }
            duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
        }
        if (numDuplicates == 0)
            return null;
        LinkedList<List<String>> results = new LinkedList<List<String>>();
        for (float f : duplicates.keySet()) {
            if (duplicates.get(f).size() > 1) {
                results.add(duplicates.get(f));
            }
        }
        simpleImageDuplicates = new SimpleImageDuplicates(results);
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    }
    return simpleImageDuplicates;
}
From source file: net.semanticmetadata.lire.impl.ParallelImageSearcher.java
License: Open Source License
/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
@SuppressWarnings("unchecked")
private float[] findSimilar(IndexReader reader, LireFeature[] lireFeature) throws IOException {
    float[] maxDistance = new float[lireFeature.length];
    float[] overallMaxDistance = new float[lireFeature.length];
    for (int i = 0; i < overallMaxDistance.length; i++) {
        overallMaxDistance[i] = -1f;
        maxDistance[i] = -1f;
    }
    parDocs = new TreeSet[lireFeature.length];
    for (int i = 0; i < parDocs.length; i++) {
        parDocs[i] = new TreeSet<SimpleResult>();
    }
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(i);
        float[] distance = getDistance(d, lireFeature);
        // calculate the overall max distance to normalize the scores afterwards
        for (int j = 0; j < distance.length; j++) {
            float f = distance[j];
            if (overallMaxDistance[j] < f) {
                overallMaxDistance[j] = f;
            }
            // if it is the first document:
            if (maxDistance[j] < 0) {
                maxDistance[j] = f;
            }
            // if the result set is not full yet:
            if (this.parDocs[j].size() < maxHits) {
                this.parDocs[j].add(new SimpleResult(f, d, i));
                if (f > maxDistance[j]) {
                    maxDistance[j] = f;
                }
            } else if (f < maxDistance[j]) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.parDocs[j].remove(this.parDocs[j].last());
                // add the new one ...
                this.parDocs[j].add(new SimpleResult(f, d, i));
                // and set our new distance border ...
                maxDistance[j] = this.parDocs[j].last().getDistance();
            }
        }
    }
    return maxDistance;
}
From source file: net.semanticmetadata.lire.impl.searcher.GenericFastImageSearcher.java
License: Open Source License
/** * @param reader/*ww w . j a v a 2 s . c om*/ * @param lireFeature * @return the maximum distance found for normalizing. * @throws java.io.IOException */ protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException { maxDistance = -1f; // overallMaxDistance = -1f; // clear result set ... docs.clear(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); Document d; float tmpDistance; int docs = reader.numDocs(); if (!isCaching) { // we read each and every document from the index and then we compare it to the query. for (int i = 0; i < docs; i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. d = reader.document(i); tmpDistance = getDistance(d, lireFeature); assert (tmpDistance >= 0); // if it is the first document: if (maxDistance < 0) { maxDistance = tmpDistance; } // if the array is not full yet: if (this.docs.size() < maxHits) { this.docs.add(new SimpleResult(tmpDistance, d, i)); if (tmpDistance > maxDistance) maxDistance = tmpDistance; } else if (tmpDistance < maxDistance) { // if it is nearer to the sample than at least on of the current set: // remove the last one ... this.docs.remove(this.docs.last()); // add the new one ... this.docs.add(new SimpleResult(tmpDistance, d, i)); // and set our new distance border ... maxDistance = this.docs.last().getDistance(); } } } else { // we use the in-memory cache to find the matching docs from the index. int count = 0; for (Iterator<byte[]> iterator = featureCache.iterator(); iterator.hasNext();) { cachedInstance.setByteArrayRepresentation(iterator.next()); if (reader.hasDeletions() && !liveDocs.get(count)) { count++; continue; // if it is deleted, just ignore it. } else { tmpDistance = lireFeature.getDistance(cachedInstance); assert (tmpDistance >= 0); // if it is the first document: if (maxDistance < 0) { maxDistance = tmpDistance; } // if the array is not full yet: if (this.docs.size() < maxHits) { this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count)); if (tmpDistance > maxDistance) maxDistance = tmpDistance; } else if (tmpDistance < maxDistance) { // if it is nearer to the sample than at least on of the current set: // remove the last one ... this.docs.remove(this.docs.last()); // add the new one ... this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count)); // and set our new distance border ... maxDistance = this.docs.last().getDistance(); } count++; } } } return maxDistance; }
From source file: net.semanticmetadata.lire.impl.searcher.LocationBasedImageSearcher.java
License: Open Source License
/**
 * @param reader
 * @param imageInfo
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, ImageInfo imageInfo) throws IOException {
    float maxDistance = -1f, allMaxDistance = -1f;
    float tmpDistance = 0f;
    // clear result set ...
    docs.clear();
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    int docNumber = reader.numDocs();
    Document d = null;
    for (int i = 0; i < docNumber; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        d = reader.document(i);
        tmpDistance = getDistance(d, imageInfo);
        // skip results that are invalid or beyond the distance threshold
        if (tmpDistance < 0 || tmpDistance > this.threshold)
            continue;
        // calculate the overall max distance to normalize the score afterwards
        if (allMaxDistance < tmpDistance) {
            allMaxDistance = tmpDistance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the result set is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one, add the new one, and re-sort ...
            this.docs.remove(this.docs.size() - 1);
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            maxDistance = tmpDistance;
            Collections.sort(docs);
        }
    }
    return maxDistance;
}
From source file: net.semanticmetadata.lire.impl.searcher.TopDocsImageSearcher.java
License: Open Source License
/** * @param results//from www . jav a 2s . c om * @param reader * @param lireFeature * @return the maximum distance found for normalizing. * @throws java.io.IOException */ protected float findSimilar(TopDocs results, IndexReader reader, LireFeature lireFeature) throws IOException { float maxDistance = -1f, overallMaxDistance = -1f; boolean hasDeletions = reader.hasDeletions(); // clear result set ... docs.clear(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); int docs = results.totalHits; for (int i = 0; i < docs; i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. Document d = reader.document(results.scoreDocs[i].doc); float distance = getDistance(d, lireFeature); assert (distance >= 0); // calculate the overall max distance to normalize score afterwards if (overallMaxDistance < distance) { overallMaxDistance = distance; } // if it is the first document: if (maxDistance < 0) { maxDistance = distance; } // if the array is not full yet: if (this.docs.size() < maxHits) { this.docs.add(new SimpleResult(distance, d, i)); if (distance > maxDistance) maxDistance = distance; } else if (distance < maxDistance) { // if it is nearer to the sample than at least on of the current set: // remove the last one ... this.docs.remove(this.docs.last()); // add the new one ... this.docs.add(new SimpleResult(distance, d, i)); // and set our new distance border ... maxDistance = this.docs.last().getDistance(); } } return maxDistance; }
From source file: net.semanticmetadata.lire.impl.SimpleImageSearcher.java
License: Open Source License
/** * @param reader/*w w w . j a v a 2 s . c o m*/ * @param cl * @param sc * @param eh * @return the maximum distance found for normalizing. * @throws IOException */ private float findSimilar(IndexReader reader, ColorLayoutImpl cl, ScalableColorImpl sc, EdgeHistogramImplementation eh) throws IOException { float maxDistance = -1f, overallMaxDistance = -1f; boolean hasDeletions = reader.hasDeletions(); // clear result set ... docs.clear(); int docs = reader.numDocs(); for (int i = 0; i < docs; i++) { // bugfix by Roman Kern Document d = reader.document(i); float distance = getDistance(d, cl, sc, eh); // calculate the overall max distance to normalize score afterwards if (overallMaxDistance < distance) { overallMaxDistance = distance; } // if it is the first document: if (maxDistance < 0) { maxDistance = distance; } // if the array is not full yet: if (this.docs.size() < maxHits) { this.docs.add(new SimpleResult(distance, d)); if (distance > maxDistance) maxDistance = distance; } else if (distance < maxDistance) { // if it is nearer to the sample than at least on of the current set: // remove the last one ... this.docs.remove(this.docs.last()); // add the new one ... this.docs.add(new SimpleResult(distance, d)); // and set our new distance border ... maxDistance = this.docs.last().getDistance(); } } return maxDistance; }
From source file: net.semanticmetadata.lire.impl.SimpleImageSearcher.java
License: Open Source License
public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    if (!IndexReader.indexExists(reader.directory()))
        throw new FileNotFoundException("No index found at this specific location.");
    Document doc = reader.document(0);
    ScalableColorImpl sc = null;
    ColorLayoutImpl cl = null;
    EdgeHistogramImplementation eh = null;
    String[] cls = doc.getValues(DocumentBuilder.FIELD_NAME_COLORLAYOUT);
    if (cls != null && cls.length > 0)
        cl = new ColorLayoutImpl(cls[0]);
    String[] scs = doc.getValues(DocumentBuilder.FIELD_NAME_SCALABLECOLOR);
    if (scs != null && scs.length > 0)
        sc = new ScalableColorImpl(scs[0]);
    String[] ehs = doc.getValues(DocumentBuilder.FIELD_NAME_EDGEHISTOGRAM);
    if (ehs != null && ehs.length > 0)
        eh = new EdgeHistogramImplementation(ehs[0]);
    HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();
    // find duplicates ...
    boolean hasDeletions = reader.hasDeletions();
    int docs = reader.numDocs();
    int numDuplicates = 0;
    for (int i = 0; i < docs; i++) {
        Document d = reader.document(i);
        float distance = getDistance(d, cl, sc, eh);
        if (!duplicates.containsKey(distance)) {
            duplicates.put(distance, new LinkedList<String>());
        } else {
            numDuplicates++;
        }
        duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }
    if (numDuplicates == 0)
        return null;
    LinkedList<List<String>> results = new LinkedList<List<String>>();
    for (float f : duplicates.keySet()) {
        if (duplicates.get(f).size() > 1) {
            results.add(duplicates.get(f));
        }
    }
    return new SimpleImageDuplicates(results);
}
From source file: net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java
License: Open Source License
/**
 * Creates a set of reference objects and stores it in a new index (named "<indexPath>-ro"). Then creates ordered
 * lists of reference object positions for each data item in the index with the given feature.
 * Finally a new index (named "<indexPath>-ms") is created where all the original documents as well as the new data
 * are stored.
 *
 * @param indexPath the path to the original index
 * @throws IOException
 */
public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    if (numDocs < numReferenceObjects) {
        throw new UnsupportedOperationException("Too few documents in index.");
    }
    // progress report
    progress.setNumDocsAll(numDocs);
    progress.setCurrentState(State.RoSelection);
    boolean hasDeletions = reader.hasDeletions();
    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);
    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
        referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    }
    int count = 0;
    if (hasDeletions) {
        System.err.println("WARNING: There are deleted docs in your index. You should "
                + "optimize your index before using this method.");
    }
    // progress report
    progress.setCurrentState(State.RoIndexing);
    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
        count++;
        Document document = reader.document(i);
        document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
        iw.addDocument(document);
    }
    iw.commit();
    iw.close();
    // progress report
    progress.setCurrentState(State.Indexing);
    // now find the reference objects for each entry ;)
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);
    iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
            sb.append(hits.doc(j).getValues("ro-id")[0]);
            sb.append(' ');
        }
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
    }
    iw.commit();
    iw.close();
    // progress report
    progress.setCurrentState(State.Idle);
}
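The warning in the example above uses the pre-4.0 term "optimize". Under the Lucene 4.x API used here, deletions can be purged beforehand with IndexWriter.forceMergeDeletes(); a minimal sketch under that assumption (the analyzer choice is illustrative, and LuceneUtils.LUCENE_VERSION is LIRE's version constant from the example above):

IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
        new IndexWriterConfig(LuceneUtils.LUCENE_VERSION,
                new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION)));
iw.forceMergeDeletes(); // merge segments containing deletions; a reader opened afterwards reports hasDeletions() == false
iw.close();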
From source file: net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java
License: Open Source License
/**
 * We assume that the initial indexing has been done and a set of reference objects has been
 * found and indexed in the separate index. However, further documents were added and they
 * now need to get a ranked list of reference objects. So we (i) get all these new documents
 * missing the field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);
    IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        if (document.getField("ro-order") == null) { // if the field is not here we create it.
            ImageSearchHits hits = searcher.search(document, readerRo);
            sb.delete(0, sb.length());
            for (int j = 0; j < numReferenceObjectsUsed; j++) {
                sb.append(hits.doc(j).getValues("ro-id")[0]);
                sb.append(' ');
            }
            document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
            countUpdated++;
        }
        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
        // debug: System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
}