Example usage for org.apache.lucene.index IndexReader document

Introduction

This page lists usage examples for org.apache.lucene.index.IndexReader.document.

Prototype

public final Document document(int docID) throws IOException

Returns the stored fields of the nth Document in this index.
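For orientation, here is a minimal, self-contained sketch of the call. The index path is a placeholder, and the snippet assumes the Lucene 4.x API used by the examples below, where FSDirectory.open still accepts a java.io.File:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.FSDirectory;

public class ReadStoredFieldsExample {
    public static void main(String[] args) throws IOException {
        // Open an existing index; "/tmp/example-index" is a placeholder path.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/example-index")));
        try {
            // document(int) returns only the *stored* fields of the document
            // with the given ID; indexed-but-not-stored fields are not included.
            Document doc = reader.document(0);
            for (IndexableField field : doc.getFields()) {
                System.out.println(field.name() + " = " + field.stringValue());
            }
        } finally {
            reader.close();
        }
    }
}

Note that IDs of deleted documents remain valid arguments until segments are merged, so code iterating over all IDs typically consults MultiFields.getLiveDocs(reader) first, as the MetricSpacesInvertedListIndexing examples below do.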

Usage

From source file:net.semanticmetadata.lire.indexing.HashingTest.java

License:Open Source License

public void testImageSearcher() throws IOException {
    BitSamplingImageSearcher is = new BitSamplingImageSearcher(60, DocumentBuilder.FIELD_NAME_PHOG,
            DocumentBuilder.FIELD_NAME_PHOG + "_hash", new PHOG(), 500);
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("E:\\wipo1m-idx")));
    Document queryDoc = reader.document(1);
    ImageSearchHits search = is.search(queryDoc, reader);
    long ms = System.currentTimeMillis();
    int runs = 50;
    for (int i = 0; i < runs; i++)
        search = is.search(queryDoc, reader);
    ms = System.currentTimeMillis() - ms;
    //        String file = FileUtils.saveImageResultsToHtml("wipo", search, queryDoc.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]);
    //        FileUtils.browseUri(file);
    System.out.println(((double) ms) / ((double) runs) + " ms per search.");
}

From source file:net.semanticmetadata.lire.indexing.HashingTest.java

License:Open Source License

private String printToHtml(TopDocs topDocs, IndexReader reader) throws IOException {
    String fileName = "results-" + System.currentTimeMillis() / 1000 + ".html";
    BufferedWriter bw = new BufferedWriter(new FileWriter(fileName));
    bw.write("<html>\n" + "<head><title>Search Results</title></head>\n" + "<body bgcolor=\"#FFFFFF\">\n");
    bw.write("<h3>query</h3>\n");
    bw.write("<a href=\"" + queryFile + "\"><img src=\"" + queryFile + "\"></a><p>\n");
    bw.write("<h3>results</h3>\n<table>");
    int elems = Math.min(topDocs.scoreDocs.length, 50);
    for (int i = 0; i < elems; i++) {
        if (i % 3 == 0)
            bw.write("<tr>");
        String s = reader.document(topDocs.scoreDocs[i].doc).get("descriptorImageIdentifier");
        s = new File(s).getAbsolutePath();
        bw.write("<td><a href=\"" + s + "\"><img style=\"max-width:220px\"src=\"" + s + "\"></a></td>\n");
        if (i % 3 == 2)
            bw.write("</tr>");
    }
    if (elems % 3 != 0) { // pad the last row to three cells
        if (elems % 3 == 1) {
            bw.write("<td>-</td>\n");
            bw.write("<td>-</td>\n");
        } else { // elems % 3 == 2
            bw.write("<td>-</td>\n");
        }
        bw.write("</tr>");
    }
    bw.write("</table></body>\n" + "</html>");
    bw.close();
    return new File(fileName).getPath();
}

From source file:net.semanticmetadata.lire.indexing.IndexVisualWordsTest.java

License:Open Source License

public void testIndexMissingFiles() throws IOException {
    // first delete some of the existing ones ...
    System.out.println("Deleting visual words from docs ...");
    IndexReader ir = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexWriter iw = LuceneUtils.createIndexWriter(index, false);
    int maxDocs = ir.maxDoc();
    for (int i = 0; i < maxDocs / 10; i++) {
        Document d = ir.document(i);
        d.removeFields(DocumentBuilder.FIELD_NAME_SURF + DocumentBuilder.FIELD_NAME_BOVW);
        //            d.removeFields(DocumentBuilder.FIELD_NAME_SURF_LOCAL_FEATURE_HISTOGRAM);
        d.removeFields(DocumentBuilder.FIELD_NAME_SURF + DocumentBuilder.FIELD_NAME_BOVW_VECTOR);
        //            d.removeFields(DocumentBuilder.FIELD_NAME_SURF);
        iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
    }
    System.out.println("# of deleted docs:  " + maxDocs / 10);
    System.out.println("Optimizing and closing ...");
    iw.close();
    ir.close();
    System.out.println("Creating new visual words ...");
    BOVWBuilder surfFeatureHistogramBuilder = new BOVWBuilder(
            DirectoryReader.open(FSDirectory.open(new File(index))), new SurfFeature(), numSamples, clusters);
    surfFeatureHistogramBuilder.indexMissing();
    System.out.println("Finished.");
}

From source file:net.semanticmetadata.lire.indexing.LocalitySensitiveHashingTest.java

License:Open Source License

public double singleSearch(int docNum) throws IOException, InstantiationException, IllegalAccessException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));

    // -----------

    Document doc = reader.document(docNum); // read the stored fields once and reuse them
    String query = doc.getValues("hash")[0];
    CEDD ceddQuery = new CEDD();
    BytesRef cedd = doc.getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue();
    ceddQuery.setByteArrayRepresentation(cedd.bytes, cedd.offset, cedd.length);

    // -----------

    HashSet<String> gold = new HashSet<String>(numImagesEval);
    ImageSearcher cis = ImageSearcherFactory.createCEDDImageSearcher(100);
    ImageSearchHits hits = cis.search(doc, reader);
    for (int i = 0; i < 10; i++) {
        gold.add(hits.doc(i).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]);
    }

    // ------------

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new SimilarityBase() {
        @Override
        protected float score(BasicStats basicStats, float freq, float v2) {
            return 1;
        }

        @Override
        public String toString() {
            return "constant similarity"; // toString() should not return null
        }
    });
    TopDocs topDocs = searcher.search(createQuery(query), 500);
    topDocs = rerank(topDocs, ceddQuery, reader);
    //        System.out.println("topDocs.scoreDocs.length = " + topDocs.scoreDocs.length);
    double numMatches = 0;
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = topDocs.scoreDocs[i];
        //            System.out.print(scoreDoc.score + ": ");
        String file = reader.document(scoreDoc.doc).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        //            System.out.println(file.substring(file.lastIndexOf('/') + 1) + (gold.contains(file)?" x":" o"));
        if (gold.contains(file))
            numMatches++;
    }
    return numMatches;
}

From source file:net.semanticmetadata.lire.indexing.LocalitySensitiveHashingTest.java

License:Open Source License

public void testOutputSearchResults() throws IOException, InstantiationException, IllegalAccessException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int docNum = 0; // doc to search for.
    // -----------

    Document doc = reader.document(docNum); // read the stored fields once and reuse them
    String query = doc.getValues("hash")[0];
    CEDD ceddQuery = new CEDD();
    BytesRef cedd = doc.getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue();
    ceddQuery.setByteArrayRepresentation(cedd.bytes, cedd.offset, cedd.length);

    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(createQuery(query), numImagesEval);
    FileUtils.saveImageResultsToPng("result_lsh", topDocs,
            doc.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0], reader);
}

From source file:net.semanticmetadata.lire.indexing.LocalitySensitiveHashingTest.java

License:Open Source License

private TopDocs rerank(TopDocs docs, LireFeature feature, IndexReader reader)
        throws IOException, IllegalAccessException, InstantiationException {
    LireFeature tmp = new CEDD();
    ArrayList<ScoreDoc> res = new ArrayList<ScoreDoc>(docs.scoreDocs.length);
    float maxScore = 0f;
    for (int i = 0; i < docs.scoreDocs.length; i++) {
        Document doc = reader.document(docs.scoreDocs[i].doc);
        BytesRef cedd = doc.getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue();
        tmp.setByteArrayRepresentation(cedd.bytes, cedd.offset, cedd.length);
        float score = 1 / tmp.getDistance(feature); // compute the distance once per document
        maxScore = Math.max(score, maxScore);
        res.add(new ScoreDoc(docs.scoreDocs[i].doc, score));
    }
    // sorting res ...
    Collections.sort(res, new Comparator<ScoreDoc>() {
        @Override
        public int compare(ScoreDoc o1, ScoreDoc o2) {
            return (int) Math.signum(o2.score - o1.score);
        }
    });
    return new TopDocs(numImagesEval, res.toArray(new ScoreDoc[res.size()]), maxScore);
}

From source file:net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java

License:Open Source License

/**
 * Creates a set of reference objects and stores them in a new index (named "<indexPath>-ro"). Then creates
 * ordered lists of reference object positions for each data item in the index with the given feature.
 * Finally, a new index (named "<indexPath>-ms") is created where the original documents as well as the new
 * data are stored.
 *
 * @param indexPath the path to the original index
 * @throws IOException
 */
public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();

    if (numDocs < numReferenceObjects) {
        throw new UnsupportedOperationException("Too few documents in index.");
    }

    // progress report
    progress.setNumDocsAll(numDocs);
    progress.setCurrentState(State.RoSelection);

    boolean hasDeletions = reader.hasDeletions();

    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);

    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
        referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    }
    int count = 0;

    if (hasDeletions) {
        System.err.println("WARNING: There are deleted docs in your index. You should "
                + "optimize your index before using this method.");
    }

    // progress report
    progress.setCurrentState(State.RoIndexing);

    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
        count++;
        Document document = reader.document(i);
        document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
        iw.addDocument(document);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Indexing);

    // now find the reference objects for each entry ;)
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);

    iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether a document has been deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < reader.maxDoc(); i++) { // doc IDs run up to maxDoc(), not numDocs(), when there are deletions
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
            sb.append(hits.doc(j).getValues("ro-id")[0]);
            sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);

        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Idle);

}

From source file:net.semanticmetadata.lire.indexing.MetricSpacesInvertedListIndexing.java

License:Open Source License

/**
 * We assume that the initial indexing has been done and a set of reference objects has been
 * found and indexed in the separate reference-object index. However, further documents were added and they
 * now need to get a ranked list of reference objects. So we (i) get all these new documents
 * missing the field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)); // appending keeps untouched documents; CREATE would drop them
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether a document has been deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < reader.maxDoc(); i++) { // doc IDs run up to maxDoc(), not numDocs(), when there are deletions
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        if (document.getField("ro-order") == null) { // if the field is not here we create it.
            ImageSearchHits hits = searcher.search(document, readerRo);
            sb.delete(0, sb.length());
            for (int j = 0; j < numReferenceObjectsUsed; j++) {
                sb.append(hits.doc(j).getValues("ro-id")[0]);
                sb.append(' ');
            }
            // System.out.println(sb.toString());
            document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
            countUpdated++;
        }

        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

        // debug:
        System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
}

From source file:net.semanticmetadata.lire.indexing.MetricSpacesTest.java

License:Open Source License

public void testSearch() throws IOException {
    int docNumber = 1;
    MetricSpacesInvertedListIndexing ms = MetricSpacesInvertedListIndexing.getDefaultInstance();
    MetricSpacesInvertedListIndexing.numReferenceObjectsUsed = 10;
    MetricSpacesInvertedListIndexing.numReferenceObjects = 50;
    IndexReader reader = ms.getIndexReader(indexPath);
    TopDocs docs = ms.search(reader.document(docNumber), indexPath);

    // print the results
    BufferedWriter bw = new BufferedWriter(new FileWriter("out.html"));
    bw.write("<html><body>");
    for (int i = 0; i < docs.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = docs.scoreDocs[i];
        bw.write("<img title=\"ID: " + scoreDoc.doc + ", " + "Score: " + scoreDoc.score + "\" src=\"file:///"
                + reader.document(scoreDoc.doc).getValues("descriptorImageIdentifier")[0] + "\"> ");
    }
    bw.write("</body></html>");
    bw.close();
    showUrl("out.html");

}

From source file:net.semanticmetadata.lire.indexing.MetricSpacesTest.java

License:Open Source License

public void testPerformance() throws IOException {
    MetricSpacesInvertedListIndexing mes = MetricSpacesInvertedListIndexing.getDefaultInstance();
    int numSearches = 10;
    IndexReader reader = mes.getIndexReader(indexPath);
    System.out.println(reader.maxDoc() + " documents");
    TopDocs docs;

    long ms = System.currentTimeMillis();
    for (int i = 0; i < numSearches; i++) {
        docs = mes.search(reader.document(i), indexPath);
    }
    ms = System.currentTimeMillis() - ms;
    System.out.println("ms = " + ms);

    ImageSearcher ceddSearcher = ImageSearcherFactory.createCEDDImageSearcher(100);
    ms = System.currentTimeMillis();
    for (int i = 0; i < numSearches; i++) {
        ceddSearcher.search(reader.document(i), reader);
    }
    ms = System.currentTimeMillis() - ms;
    System.out.println("ms = " + ms);
}