Example usage for org.apache.lucene.util BytesRef utf8ToString

Introduction

This page collects example usages of org.apache.lucene.util.BytesRef.utf8ToString().

Prototype

public String utf8ToString()

Source Link

Document

Interprets the stored bytes as UTF-8 and returns the resulting string.

Usage

From source file:lab_mri.RocchioExpander.java

/**
 * Computes a TF-IDF weight for {@code term} using the "abst" term vectors of the given results.
 *
 * <p>The results are scanned in order; the score of the first document whose term vector
 * contains {@code term} (compared case-insensitively) is returned immediately.
 *
 * @param results search results whose ids are parsed as integers; the Lucene document number
 *                used is {@code id - 1} (presumably ids are 1-based - confirm against the indexer)
 * @param term    the term to score
 * @return {@code tf * ln(numDocs / df)} for the first matching document, or {@code 0} when no
 *         result's term vector contains the term
 * @throws IOException if the index cannot be opened or read
 */
private float getScore(List<SearchResult> results, String term) throws IOException {
    try (IndexReader idxreader = IndexReader
            .open(FSDirectory.open(new File("/home/luigi/NetBeansProjects/LAB_mri/inv_index")))) {

        // The document count does not change while this reader is open, so read it once.
        int docsnum = idxreader.numDocs();

        for (SearchResult res : results) {
            Terms termVector = idxreader.getTermVector(Integer.parseInt(res.getId()) - 1, "abst");
            if (termVector == null) {
                // No term vector stored for this document/field.
                continue;
            }

            TermsEnum itr = termVector.iterator(null);
            BytesRef this_term;
            while ((this_term = itr.next()) != null) {
                String termTxt = this_term.utf8ToString();
                if (term.equalsIgnoreCase(termTxt)) {
                    double tf = itr.totalTermFreq();
                    // Look up the document frequency with the spelling that is actually indexed.
                    // The match above ignores case, so using the caller's spelling could yield
                    // df == 0 and an infinite idf.
                    double df = idxreader.docFreq(new Term("abst", termTxt));
                    float idf = (float) Math.log(docsnum / df);
                    return (float) (tf * idf);
                }
            }
        }
    }
    return 0;
}

From source file:lia.chapter5.CategorizerTest.java

License:Apache License

/**
 * Adds the total term frequency of every term in {@code termsVector} to {@code vectorMap},
 * printing each term and its frequency to standard output along the way.
 *
 * @param vectorMap   accumulator mapping term text ({@code String}) to a running count
 *                    ({@code Long}); updated in place
 * @param termsVector the terms to enumerate
 * @throws IOException if the terms cannot be read
 */
@SuppressWarnings({ "rawtypes", "unchecked" }) // the raw Map is part of the existing signature
private void addTermFreqToMap(Map vectorMap, Terms termsVector) throws IOException {
    TermsEnum termsEnum = termsVector.iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        String term = bytesRef.utf8ToString();
        long freq = termsEnum.totalTermFreq();
        System.out.println(term + " " + freq);

        // Accumulate as long: narrowing the running total to int would silently corrupt
        // counts larger than Integer.MAX_VALUE.
        Long previous = (Long) vectorMap.get(term);
        long total = (previous == null) ? freq : previous.longValue() + freq;
        vectorMap.put(term, Long.valueOf(total));

        bytesRef = termsEnum.next();
    }
    System.out.println();
}

From source file:lucene.CosineDocumentSimilarity.java

/**
 * Reads the term vector of the {@code CONTENT} field for one document and returns the
 * frequency of each term. Every term seen is also added to the enclosing {@code terms}
 * collection.
 *
 * @param reader index reader to read the term vector from
 * @param docId  Lucene document number
 * @return map from term text to its frequency in the term vector; empty when the document
 *         has no term vector for the field
 * @throws IOException if the term vector cannot be read
 */
Map<String, Integer> getTermFrequencies(IndexReader reader, int docId) throws IOException {
    Map<String, Integer> frequencies = new HashMap<>();

    Terms vector = reader.getTermVector(docId, CONTENT);
    if (vector == null) {
        // Term vectors were not stored for this document/field.
        return frequencies;
    }

    // The enumerator must actually be obtained from the vector; leaving it null made every
    // call fail with a NullPointerException.
    // NOTE(review): iterator(TermsEnum reuse) is the Lucene 4.x signature, matching the
    // previously commented-out call; on Lucene 5+ this is iterator() - confirm the version.
    TermsEnum termsEnum = vector.iterator(null);

    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsEnum.totalTermFreq();
        frequencies.put(term, freq);
        terms.add(term);
    }
    return frequencies;
}

From source file:lucene.security.index.SecureAtomicReaderTestBase.java

License:Apache License

/**
 * Walks every field, term and matching document visible through the secure reader and
 * prints them to standard output.
 *
 * @throws IOException    if the index cannot be read
 * @throws ParseException declared for the reader setup
 */
@Test
public void testTermWalk() throws IOException, ParseException {
    SecureAtomicReader secureReader = getSecureReader();
    try {
        Fields fields = secureReader.fields();
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef ref;
            while ((ref = termsEnum.next()) != null) {
                System.out.println(field + " " + ref.utf8ToString());
                DocsEnum docsEnum = termsEnum.docs(null, null);
                int doc;
                while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    System.out.println(field + " " + ref.utf8ToString() + " " + doc);
                }
            }
        }
    } finally {
        // Release the reader even when the walk fails part-way through.
        secureReader.close();
    }
}

From source file:lucenesearch.NGram.java

/**
 * Builds all pairs of adjacent terms from the "Body" term vector of {@code doc}.
 * Terms are taken in the order the term vector's enumerator yields them.
 *
 * @param doc document whose "Body" term vector is read
 * @return one two-element array per adjacent pair; empty when fewer than two terms exist
 * @throws IOException if the term vector cannot be read
 */
private ArrayList<String[]> get2Gram(ExtendedDocument doc) throws IOException {
    // Materialise the enumerated terms as strings first.
    ArrayList<String> tokens = new ArrayList<>();
    TermsEnum cursor = doc.getTermVector("Body").iterator();
    for (BytesRef ref = cursor.next(); ref != null; ref = cursor.next()) {
        tokens.add(ref.utf8ToString());
    }

    // Slide a window of width two across the token list.
    ArrayList<String[]> bigrams = new ArrayList<>();
    for (int start = 0; start + 1 < tokens.size(); start++) {
        bigrams.add(new String[] { tokens.get(start), tokens.get(start + 1) });
    }
    return bigrams;
}

From source file:lucenesearch.NGram.java

/**
 * Builds all runs of four adjacent terms from the "Body" term vector of {@code doc}.
 * Terms are taken in the order the term vector's enumerator yields them.
 *
 * @param doc document whose "Body" term vector is read
 * @return one four-element array per window; empty when fewer than four terms exist
 * @throws IOException if the term vector cannot be read
 */
private ArrayList<String[]> get4Gram(ExtendedDocument doc) throws IOException {
    // Materialise the enumerated terms as strings first.
    ArrayList<String> tokens = new ArrayList<>();
    TermsEnum cursor = doc.getTermVector("Body").iterator();
    for (BytesRef ref = cursor.next(); ref != null; ref = cursor.next()) {
        tokens.add(ref.utf8ToString());
    }

    // Slide a window of width four across the token list.
    final int width = 4;
    ArrayList<String[]> fourGrams = new ArrayList<>();
    for (int start = 0; start + width <= tokens.size(); start++) {
        String[] window = new String[width];
        for (int offset = 0; offset < width; offset++) {
            window[offset] = tokens.get(start + offset);
        }
        fourGrams.add(window);
    }
    return fourGrams;
}

From source file:lux.IndexTestSupport.java

License:Mozilla Public License

/**
 * Prints up to 100 terms, with their document frequencies, for every indexed field except
 * the field configured for the URI role.
 *
 * @param dir     directory containing the index
 * @param indexer indexer whose configuration supplies the URI field name to skip
 * @throws IOException if the index cannot be opened or read
 */
public static void printAllTerms(Directory dir, XmlIndexer indexer) throws IOException {
    // try-with-resources guarantees the reader is closed even when enumeration fails.
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        Fields fields = MultiFields.getFields(reader);
        System.out.println("Printing all terms (except uri)");
        if (fields == null) {
            // The index has no postings, so there is nothing to print.
            return;
        }
        String uriFieldName = indexer.getConfiguration().getFieldName(FieldRole.URI);
        for (String field : fields) {
            if (field.equals(uriFieldName)) {
                continue;
            }
            Terms terms = fields.terms(field);
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef text;
            int count = 0;
            while ((text = termsEnum.next()) != null && count++ < 100) {
                System.out.println(field + " " + text.utf8ToString() + ' ' + termsEnum.docFreq());
            }
        }
    }
}

From source file:nicta.com.au.patent.pac.index.TermFreqVector.java

/**
 * Builds the term-to-frequency map from the given terms.
 *
 * <p>For each term, the first document of its postings is visited and that document's
 * frequency is stored. The map stays empty when {@code terms} is {@code null} or reports a
 * size that is not positive.
 *
 * @param terms the terms to read; may be {@code null}
 * @throws IOException if the terms cannot be read
 */
public TermFreqVector(Terms terms) throws IOException {
    TermFreqVector = new HashMap<>();
    if (terms == null || terms.size() <= 0) {
        return;
    }

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        DocsAndPositionsEnum docsPosEnum = termsEnum.docsAndPositions(null, null);
        // Advance to the first posting before reading its frequency.
        docsPosEnum.nextDoc();
        TermFreqVector.put(term.utf8ToString(), docsPosEnum.freq());
    }
}

From source file:org.alfresco.solr.AlfrescoFieldType.java

License:Open Source License

/**
 * Converts an indexed term to its external object form by decoding its bytes as UTF-8.
 *
 * @param sf   the schema field the term belongs to (not consulted)
 * @param term the indexed term bytes
 * @return the decoded term text
 */
@Override
public Object toObject(SchemaField sf, BytesRef term) {
    final String decoded = term.utf8ToString();
    return decoded;
}

From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java

License:Apache License

/**
 * Builds a new term-range {@link TextPattern} ({@code TPTermRange}) instance.
 *
 * <p>Either bound may be {@code null}; it is then passed to the pattern as {@code null}.
 * NOTE(review): presumably {@code TPTermRange} treats a {@code null} bound as open-ended,
 * as Lucene's TermRangeQuery does - confirm.
 *
 * @param field Field
 * @param part1 min, or {@code null}
 * @param part2 max, or {@code null}
 * @param startInclusive true if the start of the range is inclusive
 * @param endInclusive true if the end of the range is inclusive
 * @return new term-range {@link TextPattern} instance
 */
protected TextPattern newRangeQuery(String field, String part1, String part2, boolean startInclusive,
        boolean endInclusive) {
    final BytesRef start;
    final BytesRef end;

    if (part1 == null) {
        start = null;
    } else {
        start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
    }

    if (part2 == null) {
        end = null;
    } else {
        end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
    }

    // Convert null-safely: a missing bound leaves the BytesRef null, and dereferencing it
    // would throw a NullPointerException.
    final String startText = (start == null) ? null : start.utf8ToString();
    final String endText = (end == null) ? null : end.utf8ToString();

    // BL was: TermRangeQuery
    final TPTermRange query = new TPTermRange(field, startText, endText, startInclusive, endInclusive);

    //query.setRewriteMethod(multiTermRewriteMethod); // BL disabled
    return query;
}