Example usage for org.apache.lucene.index IndexReader document

List of usage examples for org.apache.lucene.index IndexReader document

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexReader document.

Prototype




public final Document document(int docID) throws IOException 

Source Link

Document

Returns the stored fields of the document with the given ID (docID) in this index.

Usage

From source file:vagueobjects.ir.lda.lucene.Indexer.java

License: Apache License

/**
 * Copies every live document whose {@code term.field()} equals {@code term.text()}
 * into {@code writer}, keeping only the fields listed in {@code fieldNames}
 * (all of them re-stored under the single {@code FIELD} name, analyzed).
 *
 * @param term       field/value pair a document must match to be copied
 * @param reader     source index reader (Lucene 3.x style API)
 * @param writer     destination index writer
 * @param fieldNames stored fields to carry over from each matching document
 * @throws IOException on index read/write failure
 * @throws IllegalStateException if no document matched the term
 */
private void collect(Term term, IndexReader reader, IndexWriter writer, String[] fieldNames)
        throws IOException {
    // Iterate over ALL doc-ID slots: when the index contains deletions,
    // live IDs range up to maxDoc() - 1, so bounding the loop by numDocs()
    // (as the original did) silently skips the tail of the index.
    int maxDoc = reader.maxDoc();
    String field = term.field();
    String value = term.text();
    int count = 0;
    for (int d = 0; d < maxDoc; ++d) {
        // Check for deletion BEFORE loading: in this IndexReader API,
        // document() on a deleted slot throws rather than returning null.
        if (reader.isDeleted(d)) {
            continue;
        }
        Document source = reader.document(d);
        if (value.equals(source.get(field))) {
            ++count;
            if (count % 100000 == 0) {
                logger.debug("Passed " + count + "  documents");
            }
            Document document = new Document();
            for (String fieldName : fieldNames) {
                String v = source.get(fieldName);
                // A document may legitimately lack some of the requested fields.
                if (v != null) {
                    document.add(new Field(FIELD, v, Field.Store.YES, Field.Index.ANALYZED));
                }
            }
            writer.addDocument(document);
        }
    }
    if (count == 0) {
        throw new IllegalStateException("No matching documents found");
    }
}

From source file:vectorizer.TermInfo.java

/**
 * Builds the term vector for a single document, one {@link TermInfo} per
 * distinct term in its {@code CONTENT_FIELD_NAME} field, while folding each
 * term's frequency into the collection-wide {@code collFreq} statistics.
 *
 * @param reader  index reader supplying the stored doc and its term vector
 * @param docId   Lucene document ID to vectorize
 * @param numDocs total document count, used for the idf computation
 * @param dict    optional translation dictionary; when non-null, each term's
 *                tf is scaled by the weights of its translations
 * @return the populated vector, or {@code null} if the document has no
 *         indexed term vector for the content field
 * @throws Exception propagated from index access
 */
private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception {
    DocVector docVec = new DocVector(reader.document(docId).get(ID_FIELD_NAME));

    Terms termVector = reader.getTermVector(docId, CONTENT_FIELD_NAME);
    if (termVector == null) {
        return null;
    }

    TermsEnum it = termVector.iterator(); // walk the distinct terms of this field
    for (BytesRef bytes = it.next(); bytes != null; bytes = it.next()) {
        String text = bytes.utf8ToString();
        int freq = (int) it.totalTermFreq();

        // Smoothed inverse document frequency for this term.
        float docFreq = reader.docFreq(new Term(CONTENT_FIELD_NAME, text));
        float inverseDf = (float) Math.log(1 + numDocs / docFreq);

        TermInfo info = new TermInfo(text, freq, getTermId(text), inverseDf);
        if (dict != null) {
            // Scale tf by every translation weight of this term.
            for (TranslationInfo tinfo : dict.getTranslationTerms(text).getTranslationInfo()) {
                info.tf *= tinfo.weight;
            }
        }

        // Accumulate the collection frequency for this term.
        TermInfo global = collFreq.get(text);
        if (global != null) {
            global.tf += info.tf;
        } else {
            collFreq.put(text, new TermInfo(info.term, info.tf, info.id, info.idf));
        }

        docVec.addTermInfo(info);
    }

    return docVec;
}

From source file:wvec.WordVecsIndexer.java

/**
 * Loads every word vector from the index at {@code indexPath}, clusters them
 * with k-means++, and writes one (word, clusterId) document per vector into
 * {@code clusterIndexWriter}.
 *
 * @param clusterIndexWriter destination index for the cluster assignments
 * @param numClusters        number of k-means clusters to form
 * @throws Exception on index access or clustering failure
 */
void clusterWordVecs(IndexWriter clusterIndexWriter, int numClusters) throws Exception {
    // try-with-resources: the original closed the reader only on the happy
    // path, leaking it whenever reading or clustering threw.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath).toPath()))) {
        int numDocs = reader.numDocs();
        KMeansPlusPlusClusterer<WordVec> clusterer = new KMeansPlusPlusClusterer<>(numClusters);
        List<WordVec> wordList = new ArrayList<>(numDocs);

        // Read every wvec and load in memory.
        // NOTE(review): iterating doc IDs 0..numDocs assumes the index has no
        // deletions (numDocs == maxDoc) — confirm for this word-vector index.
        for (int i = 0; i < numDocs; i++) {
            Document doc = reader.document(i);
            wordList.add(new WordVec(doc.get(FIELD_WORD_VEC)));
        }

        // Call K-means clustering
        System.out.println("Clustering the entire vocabulary...");
        List<CentroidCluster<WordVec>> clusters = clusterer.cluster(wordList);

        // Save the cluster info
        System.out.println("Writing out cluster ids in Lucene index...");
        int clusterId = 0;
        for (CentroidCluster<WordVec> c : clusters) {
            List<WordVec> pointsInThisCluster = c.getPoints();
            for (WordVec thisPoint : pointsInThisCluster) {
                Document clusterInfo = constructDoc(thisPoint.word, String.valueOf(clusterId));
                clusterIndexWriter.addDocument(clusterInfo);
            }
            clusterId++;
        }
    }
}