Example usage for org.apache.lucene.index IndexReader document

Introduction

On this page you can find example usages of org.apache.lucene.index IndexReader.document.

Prototype

public final Document document(int docID) throws IOException 

Document

Returns the stored fields of the nth Document in this index.
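
Below is a minimal, self-contained sketch of calling this method. The index path, class name, and the choice to print every document are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocumentExample {
    public static void main(String[] args) throws IOException {
        // "path/to/index" is a placeholder; point it at an existing index.
        Directory dir = FSDirectory.open(Paths.get("path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Valid docIDs run from 0 to maxDoc() - 1; with deletions some of
            // these IDs may belong to deleted documents.
            for (int docID = 0; docID < reader.maxDoc(); docID++) {
                Document doc = reader.document(docID); // stored fields only
                System.out.println(doc);
            }
        } finally {
            reader.close();
        }
    }
}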

Usage

From source file: io.anserini.integration.IndexerTest.java

License: Apache License

@Test
public void testIterateThroughDocumentVector() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        System.out.println(reader.document(i));
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        Term term;
        while (te.next() != null) {
            term = new Term("text", te.term());
            long tf = te.totalTermFreq();
            // Print out the term and its term frequency
            System.out.println(term.bytes().utf8ToString() + " " + tf);
        }
    }
}

From source file: io.anserini.integration.IndexerTest.java

License: Apache License

@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term)); // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder(); // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}
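
A lighter-weight variant (my sketch, not part of the Anserini test above): since i is already the Lucene-internal docID, IndexSearcher.explain can surface the same BM25 weight without building a two-clause Boolean query. Assuming the same imports plus org.apache.lucene.search.Explanation, the body of the inner loop could instead do:

            // Hedged sketch: the explanation's value should match the BM25
            // weight that the Boolean-query approach recovers via getMaxScore().
            Query termQuery = new TermQuery(new Term("text", term));
            Explanation explanation = searcher.explain(termQuery, i);
            System.out.println(term + " " + tf + " " + explanation.getValue());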

From source file: io.datalayer.lucene.delete.LuceneDeleteTest.java

License: Apache License

@Test
public void testDelete() throws IOException {

    IndexWriter writer = AosIndexUtil.newIndexWithDocuments();

    Term term = new Term(ID, "1");
    Query query = new TermQuery(term);

    IndexReader reader = DirectoryReader.open(writer, true);
    IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(writer, true));

    TopDocs topDocs = indexSearcher.search(query, 1);
    LOGGER.info("" + topDocs.scoreDocs[0].doc);
    assertNotNull(reader.document(topDocs.scoreDocs[0].doc));

    LOGGER.info("Deleting documents containing " + term);
    writer.deleteDocuments(term);
    //        writer.deleteDocuments(query);
    writer.commit();

    indexSearcher = new IndexSearcher(DirectoryReader.open(writer, true));
    topDocs = indexSearcher.search(query, 1);
    assertEquals(0, topDocs.scoreDocs.length);

    reader.close();
    writer.close();

}
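
Note that the test builds a brand-new IndexSearcher after the commit; a point-in-time reader never sees later deletes. A cheaper general Lucene idiom (my sketch, not used by the test above) reopens the existing reader:

    DirectoryReader oldReader = DirectoryReader.open(writer, true);
    // ... writer.deleteDocuments(term); writer.commit(); ...
    DirectoryReader newReader = DirectoryReader.openIfChanged(oldReader);
    if (newReader != null) {
        oldReader.close();     // close the old reader only once the new one is open
        oldReader = newReader; // the reopened reader reflects the committed deletes
    }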

From source file: ir.project.TFIDFMatrix.java

private void createMatrix() {
    try {
        this.matrix = new TFIDFBookVector[numDocs];

        IndexReader reader = DirectoryReader.open(this.index);

        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");

            // get title
            IndexableField titleField = reader.document(i).getField("title");
            String title = titleField.stringValue();

            // get isbn
            IndexableField isbnField = reader.document(i).getField("isbn");
            String isbn = isbnField.stringValue();

            // get author
            IndexableField authorField = reader.document(i).getField("author");
            String author = authorField.stringValue();

            this.matrix[i] = new TFIDFBookVector(numTerms, title, isbn, author);

            if (vector == null) {
                System.err.println("Vector is null");
                continue;
            }

            TermsEnum it = vector.iterator();

            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                // it.totalTermFreq() is the term's frequency within this document,
                // since the TermsEnum comes from this document's term vector.
                Long tf = it.totalTermFreq();
                // reader.totalTermFreq(t) is the collection-wide frequency, so this
                // weight is an inverse collection frequency rather than a classic IDF.
                double idf = (double) 1 / (double) reader.totalTermFreq(t);

                double tfIdfWeight = tf * idf;

                // put TF-IDF weight in matrix
                int termIndex = this.termMap.get(it.term().utf8ToString());
                this.matrix[i].editValue(termIndex, tfIdfWeight);
            }
        }

        reader.close();

    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }

}
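
If a more conventional IDF is wanted (my suggestion, not what TFIDFMatrix computes), Lucene exposes the per-term document frequency directly:

                // Classic IDF sketch: log(N / df), where df counts the documents
                // that contain the term rather than its total occurrences.
                double idf = Math.log((double) reader.numDocs() / (double) reader.docFreq(t));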

From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License: Apache License

/**
 * Loads the map containing the conversion from the Wikipedia ids to the
 * Lucene Ids.
 */
protected void parseWikiIdToLuceneId() {
    logger.warn("no index wikiID -> lucene found - I'll generate");
    IndexReader reader = getReader();
    wikiIdToLuceneId = new HashMap<Integer, Integer>(reader.numDocs());
    ProgressLogger pl = new ProgressLogger("creating wiki2lucene, read {} docs", 100000);
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        pl.up();
        try {
            Document doc = reader.document(i);
            IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
            Integer wikiId = new Integer(f.stringValue());
            wikiIdToLuceneId.put(wikiId, i);
        } catch (CorruptIndexException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }

}
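
One caveat (mine, not from LuceneHelper): looping from 0 to numDocs() - 1 only visits every document when the index has no deletions, because docIDs run up to maxDoc() - 1 and may contain holes. A deletion-safe loop might look like the following, assuming imports for org.apache.lucene.util.Bits and org.apache.lucene.index.MultiFields (the Lucene 4-7 spelling; Lucene 8 moved getLiveDocs to MultiBits):

    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when nothing is deleted
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
            continue; // skip deleted documents
        }
        Document doc = reader.document(i);
        // ... same per-document work as above ...
    }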

From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License: Apache License

private Document getDoc(int wikiId) {
    IndexReader reader = getReader();

    // System.out.println("get docId "+pos);
    if (wikiId <= 0)
        return null;
    int docId = getLuceneId(wikiId);
    if (docId < 0) {
        logger.warn("no id for wikiId {}", wikiId);

        return null;
    }
    logger.debug("get wikiId {}  ->  docId {}", wikiId, docId);
    Document doc = null;
    try {
        doc = reader.document(docId);
    } catch (Exception e) {
        logger.error("retrieving doc in position {} {}", docId, e.toString());
        System.exit(-1);
    }

    return doc;
}

From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License: Apache License

public int getWikiId(int luceneId) {
    IndexReader reader = getReader();

    // System.out.println("get docId "+pos);

    Document doc = null;
    try {
        doc = reader.document(luceneId);
    } catch (Exception e) {
        logger.error("retrieving doc in position {} {}", luceneId, e.toString());
        System.exit(-1);
    }
    return Integer.parseInt(doc.get(LUCENE_ARTICLE_ID));
}

From source file: it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java

License: Open Source License

protected Set<String> deleteContainerAndBelow(String nodeRef, IndexReader reader, boolean delete,
        boolean cascade) throws LuceneIndexException {
    Set<String> refs = new LinkedHashSet<String>();
    List<Integer> deletions = new ArrayList<Integer>();
    try {
        if (delete) {
            service.delete(new Term("ID", nodeRef), deltaId, getRepoStorePath());
        }
        refs.add(nodeRef);
        if (cascade) {
            TermDocs td = reader.termDocs(new Term("ANCESTOR", nodeRef));
            while (td.next()) {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete) {
                    deletions.add(new Integer(doc));
                    //                  service.delete(doc, deltaId);
                }
            }
            if (delete) {
                service.delete(deletions, deltaId, getRepoStorePath());
            }
        }

    } catch (IOException e) {
        throw new LuceneIndexException("Failed to delete container and below for " + nodeRef, e);
    }
    return refs;
}
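
TermDocs belongs to the pre-4.0 postings API, so this class targets Lucene 3.x. A rough modern equivalent of the ANCESTOR loop (my sketch, reusing the same field names) walks the hits of a TermQuery instead:

        IndexSearcher searcher = new IndexSearcher(reader);
        Query q = new TermQuery(new Term("ANCESTOR", nodeRef));
        // reader.maxDoc() is a safe upper bound on the number of possible hits
        TopDocs hits = searcher.search(q, Math.max(1, reader.maxDoc()));
        for (ScoreDoc sd : hits.scoreDocs) {
            Document document = reader.document(sd.doc);
            String[] ids = document.getValues("ID");
            // ... same bookkeeping as the loop above ...
        }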

From source file: it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java

License: Open Source License

protected Set<String> deletePrimary(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException {
    Set<String> refs = new LinkedHashSet<String>();
    List<Integer> deletions = new ArrayList<Integer>();
    for (String nodeRef : nodeRefs) {
        try {
            TermDocs td = reader.termDocs(new Term("PRIMARYPARENT", nodeRef));
            while (td.next()) {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete) {
                    deletions.add(new Integer(doc));
                }
            }
            if (delete) {
                service.delete(deletions, deltaId, getRepoStorePath());
            }
        } catch (IOException e) {
            throw new LuceneIndexException("Failed to delete node by primary parent for " + nodeRef, e);
        }
    }

    return refs;

}

From source file: it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java

License: Open Source License

protected Set<String> deleteReference(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException {
    Set<String> refs = new LinkedHashSet<String>();
    List<Integer> deletions = new ArrayList<Integer>();
    for (String nodeRef : nodeRefs) {
        try {
            TermDocs td = reader.termDocs(new Term("PARENT", nodeRef));
            while (td.next()) {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete) {
                    deletions.add(new Integer(doc));
                    //                  service.delete(doc, deltaId);
                }
            }
            if (delete) {
                service.delete(deletions, deltaId, getRepoStorePath());
            }
        } catch (IOException e) {
            throw new LuceneIndexException("Failed to delete node by parent for " + nodeRef, e);
        }
    }

    return refs;

}