Example usage for org.apache.lucene.index IndexReader document

Introduction

On this page you can find example usage of org.apache.lucene.index.IndexReader#document(int docID).

Prototype

public final Document document(int docID) throws IOException

Document

Returns the stored fields of the nth Document in this index.
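
A minimal, self-contained sketch of calling IndexReader#document(int) directly. The index path ("/path/to/index") and the stored field name ("title") are placeholders, and the snippet assumes a Lucene version whose FSDirectory.open accepts a java.nio.file.Path; adjust both to your setup.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class ReadStoredFields {
    public static void main(String[] args) throws IOException {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // document ids run from 0 to maxDoc() - 1; deleted documents are not skipped here
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                Document doc = reader.document(docId); // loads the stored fields only
                System.out.println(doc.get("title"));  // "title" is a hypothetical stored field
            }
        }
    }
}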

Usage

From source file: net.semanticmetadata.lire.impl.CEDDImageSearcher.java

License: Open Source License

public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    SimpleImageDuplicates simpleImageDuplicates = null;
    try {
        if (!IndexReader.indexExists(reader.directory()))
            throw new FileNotFoundException("No index found at this specific location.");
        Document doc = reader.document(0);

        CEDD lireFeature = (CEDD) descriptorClass.newInstance();
        byte[] cls = doc.getBinaryValue(fieldName);
        if (cls != null && cls.length > 0)
            lireFeature.setByteArrayRepresentation(cls);

        HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();

        // find duplicates ...
        boolean hasDeletions = reader.hasDeletions();

        int docs = reader.numDocs();
        int numDuplicates = 0;
        for (int i = 0; i < docs; i++) {
            if (hasDeletions && reader.isDeleted(i)) {
                continue;
            }
            Document d = reader.document(i);
            float distance = getDistance(d, lireFeature);

            if (!duplicates.containsKey(distance)) {
                duplicates.put(distance, new LinkedList<String>());
            } else {
                numDuplicates++;
            }
            duplicates.get(distance).add(d.getFieldable(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
        }

        if (numDuplicates == 0)
            return null;

        LinkedList<List<String>> results = new LinkedList<List<String>>();
        for (float f : duplicates.keySet()) {
            if (duplicates.get(f).size() > 1) {
                results.add(duplicates.get(f));
            }
        }
        simpleImageDuplicates = new SimpleImageDuplicates(results);
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    }
    return simpleImageDuplicates;

}
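
Note that this example (and the ColorLayout variant below) targets the older Lucene 3.x API: IndexReader.indexExists(Directory), reader.directory(), reader.isDeleted(int) and Document#getFieldable were removed or relocated in Lucene 4, where deletions are checked through live docs instead, as the later examples on this page do. Also keep in mind that numDocs() counts only live documents, so when deletions are present maxDoc() is the safer loop bound for document ids. A minimal sketch of the Lucene 4+ style scan (the class and method names are invented for illustration):

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.Bits;

public class LiveDocsScan {
    // Iterates over all live (non-deleted) documents of the given reader.
    static void scan(IndexReader reader) throws IOException {
        Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the reader has no deletions
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (liveDocs != null && !liveDocs.get(i))
                continue; // deleted document, skip it
            Document d = reader.document(i); // stored fields of the live document
            // ... process d ...
        }
    }
}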

From source file: net.semanticmetadata.lire.impl.ColorLayoutImageSearcher.java

License: Open Source License

public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    SimpleImageDuplicates simpleImageDuplicates = null;
    try {
        if (!IndexReader.indexExists(reader.directory()))
            throw new FileNotFoundException("No index found at this specific location.");
        Document doc = reader.document(0);

        ColorLayout lireFeature = (ColorLayout) descriptorClass.newInstance();
        byte[] cls = doc.getBinaryValue(fieldName);
        if (cls != null && cls.length > 0)
            lireFeature.setByteArrayRepresentation(cls);

        HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();

        // find duplicates ...
        boolean hasDeletions = reader.hasDeletions();

        int docs = reader.numDocs();
        int numDuplicates = 0;
        for (int i = 0; i < docs; i++) {
            if (hasDeletions && reader.isDeleted(i)) {
                continue;
            }
            Document d = reader.document(i);
            float distance = getDistance(d, lireFeature);

            if (!duplicates.containsKey(distance)) {
                duplicates.put(distance, new LinkedList<String>());
            } else {
                numDuplicates++;
            }
            duplicates.get(distance).add(d.getFieldable(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
        }

        if (numDuplicates == 0)
            return null;

        LinkedList<List<String>> results = new LinkedList<List<String>>();
        for (float f : duplicates.keySet()) {
            if (duplicates.get(f).size() > 1) {
                results.add(duplicates.get(f));
            }
        }
        simpleImageDuplicates = new SimpleImageDuplicates(results);
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    }
    return simpleImageDuplicates;

}

From source file: net.semanticmetadata.lire.impl.custom.SingleNddCeddImageSearcher.java

License: Open Source License

protected void init(IndexReader reader) {
    this.reader = reader;
    if (reader.hasDeletions()) {
        throw new UnsupportedOperationException(
                "The index has to be optimized first to be cached! Use IndexWriter.forceMerge(0) to do this.");
    }
    docs = new TreeSet<SimpleResult>();
    try {
        this.cachedInstance = (LireFeature) this.descriptorClass.newInstance();
        if (fieldName == null)
            fieldName = this.cachedInstance.getFieldName();
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher ("
                + descriptorClass.getName() + "): " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher ("
                + descriptorClass.getName() + "): " + e.getMessage());
    }
    // put all respective features into an in-memory cache ...
    if (isCaching && reader != null) {
        int docs = reader.numDocs();
        featureCache = new ArrayList<double[]>(docs);
        try {
            Document d;
            for (int i = 0; i < docs; i++) {
                d = reader.document(i);
                cachedInstance.setByteArrayRepresentation(d.getField(fieldName).binaryValue().bytes,
                        d.getField(fieldName).binaryValue().offset, d.getField(fieldName).binaryValue().length);
                // normalize features, so we can use L1
                if (!halfDimensions) {
                    featureCache.add(normalize(cachedInstance.getDoubleHistogram()));
                } else {
                    featureCache.add(crunch(cachedInstance.getDoubleHistogram()));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

From source file: net.semanticmetadata.lire.impl.custom.SingleNddCeddImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = Double.MAX_VALUE;

    // clear result set ...
    docs.clear();
    double tmpDistance;

    // we use the in-memory cache to find the matching docs from the index.
    int count = 0;
    double[] doubleHistogram;
    if (!halfDimensions) {
        doubleHistogram = normalize(lireFeature.getDoubleHistogram());
    } else {
        doubleHistogram = crunch(lireFeature.getDoubleHistogram());
    }
    double[] tmp;
    int index = -1;
    for (Iterator<double[]> iterator = featureCache.iterator(); iterator.hasNext();) {
        tmp = iterator.next();
        tmpDistance = MetricsUtils.distL1(doubleHistogram, tmp);
        assert (tmpDistance >= 0);
        if (tmpDistance < maxDistance) {
            maxDistance = tmpDistance;
            index = count;
        }
        count++;
    }
    this.docs.add(new SimpleResult((float) maxDistance, reader.document(index), index));
    return (float) maxDistance;
}

From source file: net.semanticmetadata.lire.impl.FastOpponentImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected double findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = -1f;
    // clear result set ...
    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    double tmpDistance;
    int docs = reader.numDocs();
    byte[] histogram = lireFeature.getByteArrayRepresentation();
    for (int i = 0; i < docs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.

        d = reader.document(i);
        tmpDistance = getDistance(d, histogram);
        assert (tmpDistance >= 0);
        // calculate the overall max distance to normalize score afterwards
        //            if (overallMaxDistance < tmpDistance) {
        //                overallMaxDistance = tmpDistance;
        //            }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the array is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult((float) tmpDistance, d, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult((float) tmpDistance, d, i));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}
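
The loop above implements a common bounded top-k pattern: keep at most maxHits results in a sorted set and evict the current worst hit whenever a closer candidate shows up; the same structure reappears in the searchers below. A generic, self-contained sketch of just that pattern (the Hit class and the sample distances are invented for illustration; LIRE itself uses its SimpleResult type):

import java.util.TreeSet;

public class BoundedTopK {

    // Minimal result holder ordered by distance, then doc id, so that equal
    // distances do not collapse into a single TreeSet entry.
    static class Hit implements Comparable<Hit> {
        final float distance;
        final int docId;

        Hit(float distance, int docId) {
            this.distance = distance;
            this.docId = docId;
        }

        public int compareTo(Hit o) {
            int byDistance = Float.compare(distance, o.distance);
            return byDistance != 0 ? byDistance : Integer.compare(docId, o.docId);
        }

        public String toString() {
            return "doc " + docId + " (" + distance + ")";
        }
    }

    public static void main(String[] args) {
        int maxHits = 3;
        float[] distances = { 0.9f, 0.1f, 0.5f, 0.3f, 0.7f }; // invented sample distances
        TreeSet<Hit> docs = new TreeSet<Hit>();
        float maxDistance = Float.MAX_VALUE;
        for (int i = 0; i < distances.length; i++) {
            float tmpDistance = distances[i];
            if (docs.size() < maxHits) {            // result set not full yet
                docs.add(new Hit(tmpDistance, i));
                maxDistance = docs.last().distance; // current distance border
            } else if (tmpDistance < maxDistance) { // closer than the worst kept hit
                docs.remove(docs.last());           // evict the worst ...
                docs.add(new Hit(tmpDistance, i));  // ... add the new one ...
                maxDistance = docs.last().distance; // ... and tighten the border
            }
        }
        System.out.println(docs); // the three nearest: docs 1, 3 and 2
    }
}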

From source file: net.semanticmetadata.lire.impl.GenericFastImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    maxDistance = Float.MAX_VALUE;
    //        overallMaxDistance = -1f;

    // clear result set ...
    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Document d;
    float tmpDistance;
    int docs = reader.numDocs();
    if (!isCaching) {
        // we read each and every document from the index and then we compare it to the query.
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.

            d = reader.document(i);
            tmpDistance = getDistance(d, lireFeature);
            assert (tmpDistance >= 0);
            // if the array is not full yet:
            if (this.docs.size() < maxHits) {
                this.docs.add(new SimpleResult(tmpDistance, d, i));
                if (tmpDistance > maxDistance)
                    maxDistance = tmpDistance;
            } else if (tmpDistance < maxDistance) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.docs.remove(this.docs.last());
                // add the new one ...
                this.docs.add(new SimpleResult(tmpDistance, d, i));
                // and set our new distance border ...
                maxDistance = this.docs.last().getDistance();
            }
        }
    } else {
        // we use the in-memory cache to find the matching docs from the index.
        int count = 0;
        for (Iterator<byte[]> iterator = featureCache.iterator(); iterator.hasNext();) {
            cachedInstance.setByteArrayRepresentation(iterator.next());
            tmpDistance = lireFeature.getDistance(cachedInstance);
            assert (tmpDistance >= 0);
            // if the array is not full yet:
            if (this.docs.size() < maxHits) {
                this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count));
                if (tmpDistance > maxDistance)
                    maxDistance = tmpDistance;
            } else if (tmpDistance < maxDistance) {
                // if it is nearer to the sample than at least one of the current set:
                // remove the last one ...
                this.docs.remove(this.docs.last());
                // add the new one ...
                this.docs.add(new SimpleResult(tmpDistance, reader.document(count), count));
                // and set our new distance border ...
                maxDistance = this.docs.last().getDistance();
            }
            count++;
        }
    }
    return maxDistance;
}

From source file: net.semanticmetadata.lire.impl.GenericFastImageSearcher.java

License: Open Source License

public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    SimpleImageDuplicates simpleImageDuplicates = null;
    try {
        //            if (!IndexReader.indexExists(reader.directory()))
        //                throw new FileNotFoundException("No index found at this specific location.");
        Document doc = reader.document(0);

        LireFeature lireFeature = (LireFeature) descriptorClass.newInstance();
        if (doc.getField(fieldName).binaryValue() != null && doc.getField(fieldName).binaryValue().length > 0)
            lireFeature.setByteArrayRepresentation(doc.getField(fieldName).binaryValue().bytes,
                    doc.getField(fieldName).binaryValue().offset, doc.getField(fieldName).binaryValue().length);

        HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();

        // Needed to check whether a document is deleted.
        Bits liveDocs = MultiFields.getLiveDocs(reader);

        int docs = reader.numDocs();
        int numDuplicates = 0;
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.

            Document d = reader.document(i);
            float distance = getDistance(d, lireFeature);

            if (!duplicates.containsKey(distance)) {
                duplicates.put(distance, new LinkedList<String>());
            } else {
                numDuplicates++;
            }
            duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
        }

        if (numDuplicates == 0)
            return null;

        LinkedList<List<String>> results = new LinkedList<List<String>>();
        for (float f : duplicates.keySet()) {
            if (duplicates.get(f).size() > 1) {
                results.add(duplicates.get(f));
            }
        }
        simpleImageDuplicates = new SimpleImageDuplicates(results);
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    }
    return simpleImageDuplicates;

}

From source file: net.semanticmetadata.lire.impl.GenericImageSearcher.java

License: Open Source License

/**
 * @param reader
 * @param lireFeature
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
protected float findSimilar(IndexReader reader, LireFeature lireFeature) throws IOException {
    float maxDistance = -1f, overallMaxDistance = -1f;
    float tmpDistance = 0f;
    // clear result set ...
    docs.clear();
    // Needed to check whether a document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    int docs = reader.numDocs();
    Document d = null;
    for (int i = 0; i < docs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.

        d = reader.document(i);
        tmpDistance = getDistance(d, lireFeature);
        //            if (distance < 0 || Float.isNaN(distance))
        //                System.out.println("X");
        assert (tmpDistance >= 0);
        // calculate the overall max distance to normalize score afterwards
        if (overallMaxDistance < tmpDistance) {
            overallMaxDistance = tmpDistance;
        }
        // if it is the first document:
        if (maxDistance < 0) {
            maxDistance = tmpDistance;
        }
        // if the array is not full yet:
        if (this.docs.size() < maxHits) {
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            if (tmpDistance > maxDistance)
                maxDistance = tmpDistance;
        } else if (tmpDistance < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            this.docs.remove(this.docs.last());
            // add the new one ...
            this.docs.add(new SimpleResult(tmpDistance, d, i));
            // and set our new distance border ...
            maxDistance = this.docs.last().getDistance();
        }
    }
    return maxDistance;
}

From source file: net.semanticmetadata.lire.impl.GenericImageSearcher.java

License: Open Source License

public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    SimpleImageDuplicates simpleImageDuplicates = null;
    try {
        //            if (!IndexReader.indexExists(reader.directory()))
        //                throw new FileNotFoundException("No index found at this specific location.");
        Document doc = reader.document(0);

        LireFeature lireFeature = (LireFeature) descriptorClass.newInstance();
        String[] cls = doc.getValues(fieldName);
        if (cls != null && cls.length > 0)
            lireFeature.setStringRepresentation(cls[0]);

        HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();

        // Needed to check whether a document is deleted.
        Bits liveDocs = MultiFields.getLiveDocs(reader);

        int docs = reader.numDocs();
        int numDuplicates = 0;
        for (int i = 0; i < docs; i++) {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue; // if it is deleted, just ignore it.
            Document d = reader.document(i);
            float distance = getDistance(d, lireFeature);

            if (!duplicates.containsKey(distance)) {
                duplicates.put(distance, new LinkedList<String>());
            } else {
                numDuplicates++;
            }
            duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
        }

        if (numDuplicates == 0)
            return null;

        LinkedList<List<String>> results = new LinkedList<List<String>>();
        for (float f : duplicates.keySet()) {
            if (duplicates.get(f).size() > 1) {
                results.add(duplicates.get(f));
            }
        }
        simpleImageDuplicates = new SimpleImageDuplicates(results);
    } catch (InstantiationException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    } catch (IllegalAccessException e) {
        logger.log(Level.SEVERE, "Error instantiating class for generic image searcher: " + e.getMessage());
    }
    return simpleImageDuplicates;

}

From source file: net.semanticmetadata.lire.impl.LshImageSearcher.java

License: Open Source License

private ImageSearchHits search(String[] hashes, LireFeature queryFeature, IndexReader reader)
        throws IOException {
    // first search by text:
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new DefaultSimilarity() {
        @Override
        public float tf(float freq) {
            return 1;
        }

        @Override
        public float idf(long docFreq, long numDocs) {
            return 1;
        }

        @Override
        public float coord(int overlap, int maxOverlap) {
            return 1;
        }

        @Override
        public float queryNorm(float sumOfSquaredWeights) {
            return 1;
        }

        @Override
        public float sloppyFreq(int distance) {
            return 1;
        }

        @Override
        public float lengthNorm(FieldInvertState state) {
            return 1;
        }
    });
    BooleanQuery query = new BooleanQuery();
    for (int i = 0; i < hashes.length; i++) {
        // be aware that the hashes field name must match the one the hashes were indexed under.
        query.add(new BooleanClause(new TermQuery(new Term(hashesFieldName, hashes[i] + "")),
                BooleanClause.Occur.SHOULD));
    }
    TopDocs docs = searcher.search(query, maxResultsHashBased);
    // then re-rank
    TreeSet<SimpleResult> resultScoreDocs = new TreeSet<SimpleResult>();
    float maxDistance = 0f;
    float tmpScore = 0f;
    for (int i = 0; i < docs.scoreDocs.length; i++) {
        feature.setByteArrayRepresentation(
                reader.document(docs.scoreDocs[i].doc).getBinaryValue(featureFieldName).bytes,
                reader.document(docs.scoreDocs[i].doc).getBinaryValue(featureFieldName).offset,
                reader.document(docs.scoreDocs[i].doc).getBinaryValue(featureFieldName).length);
        tmpScore = queryFeature.getDistance(feature);
        if (resultScoreDocs.size() < maximumHits) {
            resultScoreDocs.add(
                    new SimpleResult(tmpScore, reader.document(docs.scoreDocs[i].doc), docs.scoreDocs[i].doc));
            maxDistance = Math.max(maxDistance, tmpScore);
        } else if (tmpScore < maxDistance) {
            resultScoreDocs.add(
                    new SimpleResult(tmpScore, reader.document(docs.scoreDocs[i].doc), docs.scoreDocs[i].doc));
        }
        while (resultScoreDocs.size() > maximumHits) {
            resultScoreDocs.remove(resultScoreDocs.last());
            maxDistance = resultScoreDocs.last().getDistance();
        }
        //            resultScoreDocs.add(new SimpleResult(tmpScore, reader.document(docs.scoreDocs[i].doc), docs.scoreDocs[i].doc));
    }
    return new SimpleImageSearchHits(resultScoreDocs, maxDistance);
}