List of usage examples for org.apache.lucene.util BytesRef utf8ToString
public String utf8ToString()
From source file: util.AllTerms.java
public void initAllTerms() throws IOException { int pos = 0;//from ww w . jav a 2s . c om for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) { Terms vector = indexReader.getTermVector(docId, Configuration.FIELD_CONTENT); TermsEnum termsEnum = null; termsEnum = vector.iterator(termsEnum); BytesRef text = null; while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); allTerms.put(term, pos++); } } //Update postition pos = 0; for (Iterator it = allTerms.entrySet().iterator(); it.hasNext();) { Entry<String, Integer> s = (Entry<String, Integer>) it.next(); System.out.println(s.getKey()); s.setValue(pos++); } }
From source file: util.VectorGenerator.java
public DocVector[] GetDocumentVectors() throws IOException { for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) { Terms vector = indexReader.getTermVector(docId, Configuration.FIELD_CONTENT); TermsEnum termsEnum = null;// w w w .j av a 2s . c o m termsEnum = vector.iterator(termsEnum); BytesRef text = null; docVector[docId] = new DocVector(allterms); while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int freq = (int) termsEnum.totalTermFreq(); docVector[docId].setEntry(term, freq); } docVector[docId].normalize(); } indexReader.close(); return docVector; }
From source file: vectorizer.TermInfo.java
private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception { DocVector wmap = new DocVector(reader.document(docId).get(ID_FIELD_NAME)); Terms tfvector;/*from w w w.ja va 2 s .c o m*/ TermsEnum termsEnum; String termText; BytesRef term; int tf; float idf; tfvector = reader.getTermVector(docId, CONTENT_FIELD_NAME); if (tfvector == null) return null; // Construct the normalized tf vector termsEnum = tfvector.iterator(); // access the terms for this field while ((term = termsEnum.next()) != null) { // explore the terms for this field tf = (int) termsEnum.totalTermFreq(); termText = term.utf8ToString(); float df = reader.docFreq(new Term(CONTENT_FIELD_NAME, termText)); idf = (float) Math.log(1 + numDocs / df); TermInfo termInfo = new TermInfo(termText, tf, getTermId(termText), idf); if (dict != null) { Translations translations = dict.getTranslationTerms(termText); for (TranslationInfo tinfo : translations.getTranslationInfo()) { termInfo.tf *= tinfo.weight; } } // Update global stats TermInfo seenTermInfo = collFreq.get(termText); if (seenTermInfo == null) { seenTermInfo = new TermInfo(termInfo.term, termInfo.tf, termInfo.id, termInfo.idf); collFreq.put(termText, seenTermInfo); } else { seenTermInfo.tf += termInfo.tf; // coll freq } wmap.addTermInfo(termInfo); } return wmap; }
From source file: yasoco.TermScore.java
List<TermScore> selTerms(int docId, String fieldName, Query q) throws Exception { int num_q_terms = Integer.parseInt(prop.getProperty("num_q_terms", "10")); int N = reader.numDocs(); List<TermScore> tlist = new Vector<>(); Terms terms = reader.getTermVector(docId, fieldName); //get terms vectors for one document and one field if (terms == null || terms.size() == 0) return tlist; TermsEnum termsEnum = terms.iterator(null); // access the terms for this field BytesRef term = null; int docLen = 0; while ((term = termsEnum.next()) != null) {// explore the terms for this field DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one int docIdEnum; while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //get the term frequency in the document docLen += docsEnum.freq();/* w ww. j av a2 s.co m*/ } } termsEnum = terms.iterator(null); // access the terms for this field while ((term = termsEnum.next()) != null) {// explore the terms for this field Term t = new Term(fieldName, term); DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one int docIdEnum; while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //get the term frequency in the document int tf = docsEnum.freq(); float ntf = tf / (float) docLen; int df = (int) (reader.totalTermFreq(t)); float idf = N / (float) df; float tf_idf = lambda * ntf + (1 - lambda) * idf; tlist.add(new TermScore(term.utf8ToString(), tf_idf)); } } Collections.sort(tlist); // desc List<TermScore> topList = tlist.subList(0, Math.min(tlist.size(), num_q_terms)); return topList; }