Example usage for org.apache.lucene.index IndexReader getTermVector

List of usage examples for org.apache.lucene.index IndexReader getTermVector

Introduction

On this page you can find example usages for org.apache.lucene.index IndexReader getTermVector.

Prototype

public final Terms getTermVector(int docID, String field) throws IOException 

Source Link

Document

Retrieve term vector for this document and field, or null if term vectors were not indexed.
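Below is a minimal, self-contained sketch of calling getTermVector and iterating the returned Terms. It assumes a recent Lucene release (5.x or later, where a TermsEnum is obtained via terms.iterator() with no argument); the index path and the field name "contents" are illustrative placeholders for an index whose field was indexed with term vectors enabled.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class GetTermVectorExample {
    public static void main(String[] args) throws Exception {
        // Open an existing index; the path is a placeholder.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Retrieve the term vector of document 0 for the "contents" field.
            Terms terms = reader.getTermVector(0, "contents");
            if (terms == null) {
                // null means term vectors were not indexed for this document/field.
                return;
            }
            TermsEnum te = terms.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                // Within a single document's term vector, totalTermFreq() is the
                // term's frequency in that document.
                System.out.println(term.utf8ToString() + " " + te.totalTermFreq());
            }
        }
    }
}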

Usage

From source file:Evaluator.TermFreq.java

public DocVector(String docid, IndexReader reader, IndexSearcher searcher) throws IOException, ParseException {
    this.docid = docid;
    this.termVector = new ArrayList<TermFreq>();
    Analyzer analyzer = new KeywordAnalyzer();
    QueryParser parser = new QueryParser("id", analyzer);
    Query query = parser.parse(docid);
    TopDocs topdocs = searcher.search(query, 1);
    int index = topdocs.scoreDocs[0].doc;

    // Fetch the stored term vector for the "words" field of the matched document
    Terms terms = reader.getTermVector(index, "words");
    TermsEnum termsEnum = null;
    termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        TermFreq tf = new TermFreq(term.utf8ToString(), (int) termsEnum.totalTermFreq());
        this.termVector.add(tf);
    }

}

From source file:game.TermFreq.java

void loadTfVec() throws Exception {

    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);

    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);

            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testIterateThroughDocumentVector() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        System.out.println(reader.document(i));
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        Term term;
        while (te.next() != null) {
            term = new Term("text", te.term());
            long tf = te.totalTermFreq();
            // Print out the term and its term frequency
            System.out.println(term.bytes().utf8ToString() + " " + tf);
        }
    }
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term)); // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder(); // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Extracts ALL the terms from the document pool.
 *
 * @param docIds The reranking pool; see {@link #selectDocs} for details
 * @param context An instance of RerankerContext
 * @param filterPattern A regex pattern; terms are collected only if they match the pattern; may be null
 * @return A Map from each term to the Set of docIds in which it occurs (a small inverted list)
 */
private Map<String, Set<Integer>> extractTerms(Set<Integer> docIds, RerankerContext<T> context,
        Pattern filterPattern) throws Exception, IOException {
    IndexReader reader;
    IndexSearcher searcher;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
        searcher = new IndexSearcher(reader);
    } else {
        searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    Map<String, Set<Integer>> termDocidSets = new HashMap<>();
    for (int docid : docIds) {
        Terms terms = reader.getTermVector(docid, LuceneDocumentGenerator.FIELD_BODY);
        if (terms == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        TermsEnum te = terms.iterator();
        if (te == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        while ((te.next()) != null) {
            String term = te.term().utf8ToString();
            // We do some noisy filtering here ... pure empirical heuristic
            if (term.length() < 2)
                continue;
            if (!term.matches("[a-z]+"))
                continue;
            if (filterPattern == null || filterPattern.matcher(term).matches()) {
                if (!termDocidSets.containsKey(term)) {
                    termDocidSets.put(term, new HashSet<>());
                }
                termDocidSets.get(term).add(docid);
            }
        }
    }
    return termDocidSets;
}

From source file:io.anserini.rerank.lib.Rm3Reranker.java

License:Apache License

private FeatureVector estimateRelevanceModel(ScoredDocuments docs, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    Set<String> vocab = new HashSet<>();
    int numdocs = docs.documents.length < fbDocs ? docs.documents.length : fbDocs;
    FeatureVector[] docvectors = new FeatureVector[numdocs];

    for (int i = 0; i < numdocs; i++) {
        try {
            FeatureVector docVector = createdFeatureVector(reader.getTermVector(docs.ids[i], field), reader,
                    tweetsearch);
            docVector.pruneToSize(fbTerms);

            vocab.addAll(docVector.getFeatures());
            docvectors[i] = docVector;
        } catch (IOException e) {
            e.printStackTrace();
            // Just return empty feature vector.
            return f;
        }
    }

    // Precompute the norms once and cache results.
    float[] norms = new float[docvectors.length];
    for (int i = 0; i < docvectors.length; i++) {
        norms[i] = (float) docvectors[i].computeL1Norm();
    }

    for (String term : vocab) {
        float fbWeight = 0.0f;
        for (int i = 0; i < docvectors.length; i++) {
            fbWeight += (docvectors[i].getFeatureWeight(term) / norms[i]) * docs.scores[i];
        }
        f.addFeatureWeight(term, fbWeight);
    }

    f.pruneToSize(fbTerms);
    f.scaleToUnitL1Norm();

    return f;
}

From source file:ir.project.TFIDFMatrix.java

private void createTermMap() {
    try {
        IndexReader reader = DirectoryReader.open(this.index);

        this.termMap = new HashMap<>(); // Map from each term to its position in the TF-IDF vectors
        this.numDocs = reader.maxDoc();
        int count = 0;

        // Setup the termMap
        for (int i = 0; i < numDocs; i++) {

            Terms vector = reader.getTermVector(i, "text");
            if (vector == null) {
                System.err.println("Vector is null!");
                continue;
            }

            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                if (!termMap.containsKey(it.term().utf8ToString())) {
                    termMap.put(it.term().utf8ToString(), count);
                    count += 1;
                }
            }
        }

        this.numTerms = count;
        reader.close();

    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:ir.project.TFIDFMatrix.java

private void createMatrix() {
    try {
        this.matrix = new TFIDFBookVector[numDocs];

        IndexReader reader = DirectoryReader.open(this.index);

        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");

            // get title
            IndexableField titleField = reader.document(i).getField("title");
            String title = titleField.stringValue();

            // get isbn
            IndexableField isbnField = reader.document(i).getField("isbn");
            String isbn = isbnField.stringValue();

            // get author
            IndexableField authorField = reader.document(i).getField("author");
            String author = authorField.stringValue();

            this.matrix[i] = new TFIDFBookVector(numTerms, title, isbn, author);

            if (vector == null) {
                System.err.println("Vector is null");
                continue;
            }

            TermsEnum it = vector.iterator();

            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                // TotalTermFreq returns frequency of term in document.
                Long tf = it.totalTermFreq();
                double idf = (double) 1 / (double) reader.totalTermFreq(t);

                double tfIdfWeight = tf * idf;

                // put TF-IDF weight in matrix
                int termIndex = this.termMap.get(it.term().utf8ToString());
                this.matrix[i].editValue(termIndex, tfIdfWeight);
            }
        }

        reader.close();

    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License:Apache License

/**
 * Returns the cosine similarity between two documents
 *
 * @param x
 *            - the WikiId of the first document
 * @param y
 *            - the WikiId of the second document
 * @param field
 *            - the field on which to compute the similarity
 * 
 * @return a double between 0 (not similar) and 1 (same content),
 *         representing the similarity between the 2 documents
 */
public double getCosineSimilarity(int x, int y, String field) {

    IndexReader reader = getReader();
    Terms tfvX = null;
    Terms tfvY = null;
    try {
        tfvX = reader.getTermVector(getLuceneId(x), field);
        tfvY = reader.getTermVector(getLuceneId(y), field);

        // try {
        // tfvX = reader.document(idX).getBinaryValue("asd")
        // getTermFreqVectors(idX);
        // tfvY = reader.getTermFreqVectors(idY);
    } catch (IOException e) {
        logger.error("computing cosine similarity ({}) ", e.toString());
        System.exit(-1);
    }

    Map<String, Integer> xfrequencies = new HashMap<String, Integer>();
    Map<String, Integer> yfrequencies = new HashMap<String, Integer>();
    TermsEnum xtermsEnum = null;
    try {
        xtermsEnum = tfvX.iterator(null);

        BytesRef text;

        while ((text = xtermsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) xtermsEnum.totalTermFreq();
            xfrequencies.put(term, freq);
        }

        TermsEnum ytermsEnum = tfvY.iterator(null);
        while ((text = ytermsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) ytermsEnum.totalTermFreq();
            yfrequencies.put(term, freq);
        }

    } catch (IOException e) {
        logger.error("computing cosine similarity ({}) ", e.toString());
        System.exit(-1);
    }
    Map<String, Double> xTfidf = new HashMap<String, Double>();
    Map<String, Double> yTfidf = new HashMap<String, Double>();
    double xnorm = tfidfVector(xTfidf, xfrequencies, field);
    double ynorm = tfidfVector(yTfidf, yfrequencies, field);

    double dotproduct = 0;

    for (Map.Entry<String, Double> k : xTfidf.entrySet()) {
        if (yTfidf.containsKey(k.getKey())) {
            logger.info("key {}", k.getKey());
            logger.info("key x {} y {} ", k.getValue(), yTfidf.get(k.getKey()));
            dotproduct += k.getValue() * yTfidf.get(k.getKey());
            logger.info("dotproduct {} ", dotproduct);
        }

    }
    return dotproduct / (xnorm * ynorm);

}

From source file:lia.chapter5.CategorizerTest.java

License:Apache License

private void buildCategoryVectors() throws IOException {
    IndexSearcher searcher = Utils.getBookIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    int maxDoc = reader.maxDoc();
    System.out.println(maxDoc);
    for (int i = 0; i < maxDoc; i++) {
        Document doc = reader.document(i);
        String category = doc.get("category");
        System.out.println("\n" + doc.get("subject") + "\n");
        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
            vectorMap = new TreeMap();
            categoryMap.put(category, vectorMap);
        }

        Terms termsVector = reader.getTermVector(i, "subject");

        addTermFreqToMap(vectorMap, termsVector);
    }
}