Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
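
Note that numDocs() counts live (non-deleted) documents, while maxDoc() returns one greater than the largest document ID; when an index has deletions the two differ, which is why several examples below pair a numDocs() loop bound with a liveDocs check. Below is a minimal, self-contained sketch of the call, assuming a Lucene 4.x-style API as used in the examples on this page (the index path "index" is a placeholder):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws IOException {
        // "index" is a placeholder path to an existing Lucene index.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("index")));
        try {
            // numDocs() returns the number of live (non-deleted) documents.
            System.out.println("numDocs = " + reader.numDocs());
            // maxDoc() is one greater than the largest document ID;
            // with deletions, numDocs() < maxDoc().
            System.out.println("maxDoc  = " + reader.maxDoc());
        } finally {
            reader.close();
        }
    }
}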

Usage

From source file: net.semanticmetadata.lire.impl.searcher.GenericFastImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = -1f;
    //        overallMaxDistance = -1f;

    // clear result set ...
    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    float tmpDistance;
    int docs = reader.numDocs();
    if (!isCaching) {
        // we read each and every document from the index and then we compare it to the query.
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.

            d = reader.document(i);
            tmpDistance = getDistance(d, lireFeature);
            assert (tmpDistance >= 0);
            // if it is the first document:
            if (maxDistance < 0) {
                maxDistance = tmpDistance;
            }
            // if the array is not full yet:
            if (this.docs.size() < maxHits) {
                this.docs.add(new SimpleResult(tmpDistance, d, i));
                if (tmpDistance > maxDistance)
                    maxDistance = tmpDistance;
            } else if (tmpDistance < maxDistance) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.docs.remove(this.docs.last());
                // add the new one ...
                this.docs.add(new SimpleResult(tmpDistance, d, i));
                // and set our new distance border ...
                maxDistance = this.docs.last().getDistance();
            }
        }
    } else {
        // we use the in-memory cache to find the matching docs from the index.
        int count = 0;
        for (Iterator<byte[]> iterator = featureCache.iterator(); iterator.hasNext();) {
            cachedInstance.setByteArrayRepresentation(iterator.next());
            if (reader.hasDeletions() && !liveDocs.get(count)) {
                count++;
                continue; // if it is deleted, just ignore it.
            } else {
                tmpDistance = lireFeature.getDistance(cachedInstance);
                assert (tmpDistance >= 0);
                // if it is the first document:
                if (maxDistance < 0) {
                    maxDistance = tmpDistance;
                }
                // if the array is not full yet:
                if (this.docs.size() < maxHits) {
                    this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count));
                    if (tmpDistance > maxDistance)
                        maxDistance = tmpDistance;
                } else if (tmpDistance < maxDistance) {
                    // if it is nearer to the sample than at least one of the current set:
                    // remove the last one ...
                    this.docs.remove(this.docs.last());
                    // add the new one ...
                    this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count));
                    // and set our new distance border ...
                    maxDistance = this.docs.last().getDistance();
                }
                count++;
            }
        }
    }
    return maxDistance;
}
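
The example above maintains a bounded best-matches set: a TreeSet ordered by distance, capped at maxHits, with maxDistance tracking the worst hit currently retained. The following is a condensed sketch of just that pattern, with a hypothetical Hit class standing in for LIRE's SimpleResult:

import java.util.TreeSet;

// Hypothetical result holder ordered by distance (stand-in for LIRE's SimpleResult).
class Hit implements Comparable<Hit> {
    final double distance;
    final int docId;

    Hit(double distance, int docId) {
        this.distance = distance;
        this.docId = docId;
    }

    public int compareTo(Hit other) {
        int byDistance = Double.compare(distance, other.distance);
        // tie-break on docId so equal distances are not collapsed by the TreeSet
        return byDistance != 0 ? byDistance : Integer.compare(docId, other.docId);
    }
}

class BoundedTopK {
    static TreeSet<Hit> select(double[] distances, int maxHits) {
        TreeSet<Hit> hits = new TreeSet<Hit>();
        double maxDistance = -1d; // distance of the worst retained hit
        for (int i = 0; i < distances.length; i++) {
            double d = distances[i];
            if (hits.size() < maxHits) { // the set is not full yet
                hits.add(new Hit(d, i));
                if (d > maxDistance)
                    maxDistance = d;
            } else if (d < maxDistance) { // nearer than the current worst
                hits.pollLast();                    // drop the worst ...
                hits.add(new Hit(d, i));            // ... admit the better hit ...
                maxDistance = hits.last().distance; // ... and tighten the border
            }
        }
        return hits;
    }
}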

From source file: net.semanticmetadata.lire.impl.searcher.LocationBasedImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, ImageInfo imageInfo) throws IOException {
    float maxDistance = -1f, allMaxDistance = -1f;
    float tmpDistance = 0f;

    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    int docNumber = reader.numDocs();
    Document d = null;
    for (int i = 0; i < docNumber; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.

        d = reader.document(i);
        tmpDistance = getDistance(d, imageInfo);
        // skip results outside the distance threshold:
        if (tmpDistance < 0 || tmpDistance > this.threshold)
            continue;

        // track the overall max distance to normalize scores afterwards:
        if (allMaxDistance < tmpDistance) {
            allMaxDistance = tmpDistance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the result list is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the query than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.size() - 1);
            // add the new one ...
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            // ... and set the new distance border.
            maxDistance = tmpDistance;

            Collections.sort(docs);
        }
    }
    return maxDistance;
}

From source file: net.semanticmetadata.lire.impl.SimpleImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param cl
 * @param sc
 * @param eh
 * @return the maximum distance found for normalizing.
 * @throws IOException
 */
private float findSimilar(IndexReader reader, ColorLayoutImpl cl, ScalableColorImpl sc,
        EdgeHistogramImplementation eh) throws IOException {
    float maxDistance = -1f, overallMaxDistance = -1f;
    boolean hasDeletions = reader.hasDeletions();

    // clear result set ...
    docs.clear();

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
        // bugfix by Roman Kern

        Document d = reader.document(i);
        float distance = getDistance(d, cl, sc, eh);
        // calculate the overall max distance to normalize score afterwards
        if (overallMaxDistance < distance) {
            overallMaxDistance = distance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = distance;
        }
        // if the array is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(distance, d));
            if (distance > maxDistance)
                maxDistance = distance;
        } else if (distance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult(distance, d));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}

From source file: net.semanticmetadata.lire.impl.SimpleImageSearcher.java

License: Open Source License

public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    if (!IndexReader.indexExists(reader.directory()))
        throw new FileNotFoundException("No index found at this specific location.");
    Document doc = reader.document(0);
    ScalableColorImpl sc = null;
    ColorLayoutImpl cl = null;
    EdgeHistogramImplementation eh = null;

    String[] cls = doc.getValues(DocumentBuilder.FIELD_NAME_COLORLAYOUT);
    if (cls != null && cls.length > 0)
        cl = new ColorLayoutImpl(cls[0]);
    String[] scs = doc.getValues(DocumentBuilder.FIELD_NAME_SCALABLECOLOR);
    if (scs != null && scs.length > 0)
        sc = new ScalableColorImpl(scs[0]);
    String[] ehs = doc.getValues(DocumentBuilder.FIELD_NAME_EDGEHISTOGRAM);
    if (ehs != null && ehs.length > 0)
        eh = new EdgeHistogramImplementation(ehs[0]);

    HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();

    // find duplicates ...
    boolean hasDeletions = reader.hasDeletions();

    int docs = reader.numDocs();
    int numDuplicates = 0;
    for (int i = 0; i < docs; i++) {
        Document d = reader.document(i);
        float distance = getDistance(d, cl, sc, eh);

        if (!duplicates.containsKey(distance)) {
            duplicates.put(distance, new LinkedList<String>());
        } else {
            numDuplicates++;
        }
        duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }

    if (numDuplicates == 0)
        return null;

    LinkedList<List<String>> results = new LinkedList<List<String>>();
    for (float f : duplicates.keySet()) {
        if (duplicates.get(f).size() > 1) {
            results.add(duplicates.get(f));
        }
    }
    return new SimpleImageDuplicates(results);
}

From source file: net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java

License: Open Source License

/**
 * Creates a set of reference objects and stores them in a new index (named "<indexPath>-ro"). Then creates ordered
 * lists of reference object positions for each data item in the index with the given feature.
 * Finally a new index (named "<indexPath>-ms") is created where all the original documents as well as the new data
 * are stored.
 *
 * @param indexPath the path to the original index
 * @throws IOException
 */
public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();

    if (numDocs < numReferenceObjects) {
        throw new UnsupportedOperationException("Too few documents in index.");
    }

    // progress report
    progress.setNumDocsAll(numDocs);
    progress.setCurrentState(State.RoSelection);

    boolean hasDeletions = reader.hasDeletions();

    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);

    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
        referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    }
    int count = 0;

    if (hasDeletions) {
        System.err.println("WARNING: There are deleted docs in your index. You should "
                + "optimize your index before using this method.");
    }

    // progress report
    progress.setCurrentState(State.RoIndexing);

    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
        count++;
        Document document = reader.document(i);
        document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
        iw.addDocument(document);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Indexing);

    // now find the reference objects for each entry ;)
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);

    iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
            sb.append(hits.doc(j).getValues("ro-id")[0]);
            sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);

        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Idle);

}

From source file: net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java

License: Open Source License

/**
 * We assume that the initial indexing has been done and a set of reference objects has been
 * found and indexed in the separate index. However, further documents were added and they
 * now need to get a ranked list of reference objects. So we (i) get all these new documents
 * missing the field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        if (document.getField("ro-order") == null) { // if the field is not here we create it.
            ImageSearchHits hits = searcher.search(document, readerRo);
            sb.delete(0, sb.length());
            for (int j = 0; j < numReferenceObjectsUsed; j++) {
                sb.append(hits.doc(j).getValues("ro-id")[0]);
                sb.append(' ');
            }
            // System.out.println(sb.toString());
            document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
            countUpdated++;
        }

        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

        // debug:
        System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
}

From source file: net.semanticmetadata.lire.RuntimeTest.java

License: Open Source License

public void testCEDDSearch() throws IOException {
    int numsearches = 10;
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("indexor-1.4mh")));
    int numDocs = reader.numDocs();
    System.out.println("numDocs = " + numDocs);

    // This is the new, shiny and fast one ...
    ImageSearcher searcher = ImageSearcherFactory.createCEDDImageSearcher(30);

    // This is the old and slow one.
    //        ImageSearcher searcher = ImageSearcherFactory.createCEDDImageSearcher(30);
    FileInputStream imageStream = new FileInputStream("E:\\Temp\\images1\\1\\im1.jpg");
    BufferedImage bimg = ImageIO.read(imageStream);
    ImageSearchHits hits = null;
    long time = System.currentTimeMillis();
    for (int i = 0; i < numsearches; i++) {
        hits = searcher.search(bimg, reader);
    }
    time = System.currentTimeMillis() - time;
    System.out.println(
            ((float) time / (float) numsearches) + " ms per search with image, averaged on " + numsearches);
    for (int i = 0; i < hits.length(); i++) {
        System.out.println(hits.score(i) + ": "
                + hits.doc(i).getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }
    Document document = hits.doc(4);
    time = System.currentTimeMillis();
    for (int i = 0; i < numsearches; i++) {
        hits = searcher.search(document, reader);
    }
    time = System.currentTimeMillis() - time;
    System.out.println(
            ((float) time / (float) numsearches) + " ms per search with document, averaged on " + numsearches);
    for (int i = 0; i < hits.length(); i++) {
        System.out.println(hits.score(i) + ": "
                + hits.doc(i).getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }

}

From source file: net.semanticmetadata.lire.searchers.custom.SingleNddCeddImageSearcher.java

License: Open Source License

protected void init(IndexReader reader) {
    this.reader = reader;
    if (reader.hasDeletions()) {
        throw new UnsupportedOperationException(
                "The index has to be optimized first to be cached! Use IndexWriter.forceMerge(0) to do this.");
    }
    docs = new TreeSet<SimpleResult>();
    try {
        this.cachedInstance = (GlobalFeature) this.descriptorClass.newInstance();
        if (fieldName == null)
            fieldName = this.cachedInstance.getFieldName();
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher ("
                + descriptorClass.getName() + "): " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher ("
                + descriptorClass.getName() + "): " + e.getMessage());
    }
    // put all respective features into an in-memory cache ...
    if (isCaching && reader != null) {
        int docs = reader.numDocs();
        featureCache = new ArrayList<double[]>(docs);
        try {
            Document d;
            for (int i = 0; i < docs; i++) {
                d = reader.document(i);
                cachedInstance.setByteArrayRepresentation(d.getField(fieldName).binaryValue().bytes,
                        d.getField(fieldName).binaryValue().offset, d.getField(fieldName).binaryValue().length);
                // normalize the features so we can use the L1 distance
                if (!halfDimensions) {
                    featureCache.add(normalize(cachedInstance.getFeatureVector()));
                } else {
                    featureCache.add(crunch(cachedInstance.getFeatureVector()));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

From source file: net.semanticmetadata.lire.searchers.FastOpponentImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param globalFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected double findSimilar(IndexReader reader, GlobalFeature globalFeature) throws IOException {
    maxDistance = -1f;
    // clear result set ...
    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    double tmpDistance;
    int docs = reader.numDocs();
    byte[] histogram = globalFeature.getByteArrayRepresentation();
    for (int i = 0; i < docs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.

        d = reader.document(i);
        tmpDistance = getDistance(d, histogram);
        assert (tmpDistance >= 0);
        // calculate the overall max distance to normalize score afterwards
        //            if (overallMaxDistance < tmpDistance) {
        //                overallMaxDistance = tmpDistance;
        //            }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the array is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult(tmpDistance, i));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}

From source file: net.semanticmetadata.lire.searchers.forevaluations.GenericFastImageSearcherForEvaluation.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws IOException
 */
protected double findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = -1d;

    // clear result set ...
    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    double tmpDistance;
    int docs = reader.numDocs();
    if (!isCaching) {
        // we read each and every document from the index and then we compare it to the query.
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.

            d = reader.document(i);
            tmpDistance = getDistance(d, lireFeature);
            assert (tmpDistance >= 0);
            // if the array is not full yet:
            if (this.docs.size() < maxHits) {
                this.docs.add(new SimpleResultForEvaluation(tmpDistance, i,
                        d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));
                if (tmpDistance > maxDistance)
                    maxDistance = tmpDistance;
            } else if (tmpDistance < maxDistance) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.docs.remove(this.docs.last());
                // add the new one ...
                this.docs.add(new SimpleResultForEvaluation(tmpDistance, i,
                        d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));
                // and set our new distance border ...
                maxDistance = this.docs.last().getDistance();
            }
        }
    } else {
        LinkedList<Consumer> tasks = new LinkedList<Consumer>();
        LinkedList<Thread> threads = new LinkedList<Thread>();
        Consumer consumer;
        Thread thread;
        Thread p = new Thread(new Producer());
        p.start();
        for (int i = 0; i < numThreads; i++) {
            consumer = new Consumer(lireFeature);
            thread = new Thread(consumer);
            thread.start();
            tasks.add(consumer);
            threads.add(thread);
        }
        for (Thread next : threads) {
            try {
                next.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        TreeSet<SimpleResultForEvaluation> tmpDocs;
        boolean flag;
        SimpleResultForEvaluation simpleResult;
        for (Consumer task : tasks) {
            tmpDocs = task.getResult();
            flag = true;
            while (flag && (tmpDocs.size() > 0)) {
                simpleResult = tmpDocs.pollFirst();
                if (this.docs.size() < maxHits) {
                    this.docs.add(simpleResult);
                    if (simpleResult.getDistance() > maxDistance)
                        maxDistance = simpleResult.getDistance();
                } else if (simpleResult.getDistance() < maxDistance) {
                    //                        this.docs.remove(this.docs.last());
                    this.docs.pollLast();
                    this.docs.add(simpleResult);
                    maxDistance = this.docs.last().getDistance();
                } else
                    flag = false;
            }
        }
    }
    return maxDistance;
}