Example usage for org.apache.lucene.index Fields terms

Introduction

This page collects example usages of org.apache.lucene.index.Fields.terms(String).

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field. This will return null if the field does not exist.
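
As a minimal sketch of the pattern the examples below share (assuming Lucene 5.x/6.x, where MultiFields.getFields is still available and Terms.iterator() takes no argument; the index path, field name, and class name are placeholders):

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ListTerms {
    public static void main(String[] args) throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            Fields fields = MultiFields.getFields(reader);
            if (fields == null) {
                return; // index has no postings at all
            }
            Terms terms = fields.terms("body"); // null if the field does not exist
            if (terms != null) {
                TermsEnum te = terms.iterator();
                BytesRef term;
                while ((term = te.next()) != null) {
                    System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
                }
            }
        }
    }
}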

Usage

From source file:perf.ShowFields.java

License:Apache License

public static void main(String[] args) throws CorruptIndexException, IOException {
    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(
            new File("/home/simon/work/projects/lucene/bench/indices/Standard.work.trunk.wiki.nd0.1M/index")));
    Fields fields = MultiFields.getFields(reader);
    for (String name : fields) {
        System.out.println(name);
        if (name.equals("docdate")) {
            TermsEnum terms = fields.terms(name).iterator(null);
            BytesRef ref;
            int i = 0;
            while ((ref = terms.next()) != null) {
                System.out.println(ref.utf8ToString());
                if (i++ == 10) {
                    break;
                }
            }
        }
    }
}
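
Since MultiFields.getFields returns a merged view over all index segments, the terms printed here are enumerated index-wide rather than per segment.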

From source file:pretraga.IsolationSimilarity.java

public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);

        // this example assumes the classic TF/IDF similarity; bail out early
        // instead of risking a NullPointerException per term below
        if (!(searcher.getSimilarity(true) instanceof ClassicSimilarity)) {
            return;
        }
        ClassicSimilarity sim = (ClassicSimilarity) searcher.getSimilarity(true);

        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            if (ff == null) {
                continue; // this document stores no term vectors
            }
            Terms terms = ff.terms(CONTENT);
            if (terms == null) {
                continue; // no term vector for this field
            }
            Document d = reader.document(docId.get(i));

            TermsEnum te = terms.iterator();
            BytesRef by;
            while ((by = te.next()) != null) {
                String term = by.utf8ToString();

                // note: on a term-vector enum the statistics are per-document:
                // docFreq() is 1 and totalTermFreq() is the within-document frequency
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace(); // don't swallow failures silently
    }
}

From source file:project.lucene.RelativeTermWeightQuery.java

License:Apache License

public void collectTermContext(IndexReader reader, List<AtomicReaderContext> leaves, TermContext[] contextArray,
        Term[] queryTerms) throws IOException {
    TermsEnum termsEnum = null;
    for (AtomicReaderContext context : leaves) {
        final Fields fields = context.reader().fields();
        if (fields == null) {
            // reader has no fields
            continue;
        }
        for (int i = 0; i < queryTerms.length; i++) {
            Term term = queryTerms[i];
            TermContext termContext = contextArray[i];
            final Terms terms = fields.terms(term.field());
            if (terms == null) {
                // field does not exist
                continue;
            }
            termsEnum = terms.iterator(termsEnum);
            assert termsEnum != null;

            if (termsEnum == TermsEnum.EMPTY)
                continue;
            if (termsEnum.seekExact(term.bytes())) {
                if (termContext == null) {
                    contextArray[i] = new TermContext(reader.getContext(), termsEnum.termState(), context.ord,
                            termsEnum.docFreq(), termsEnum.totalTermFreq());
                } else {
                    termContext.register(termsEnum.termState(), context.ord, termsEnum.docFreq(),
                            termsEnum.totalTermFreq());
                }
            }
        }
    }
}
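
This helper shows the null-handling contract at two levels: context.reader().fields() returns null when a segment has no postings at all, and fields.terms(term.field()) returns null when the field is absent from that segment, so both results are checked before use.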

From source file:searchenginelucene.LuceneSearchEngine.java

public static void getTermFrequencyPairs(String indexLocation) throws IOException {
    Map<String, Integer> termfrequency = new HashMap<String, Integer>();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    // Temporary location to store the intermediate term frequency results
    PrintWriter writer_tf = new PrintWriter("..\\term-frequency.csv");

    int docnum = reader.numDocs();
    // System.out.println("docnum:" + docnum);
    Fields fields1 = MultiFields.getFields(reader);
    for (String field : fields1) {
        Terms terms1 = fields1.terms(field); // look up the current field, not a hard-coded one
        if (terms1 == null) {
            continue;
        }
        TermsEnum termsEnum = terms1.iterator(null);
        int noWords = 0;

        while (termsEnum.next() != null) {
            noWords++;
            int count = 0;
            DocsEnum docsEnum = termsEnum.docs(null, null);
            int docIdEnum;
            //System.out.print("The term is->" + termsEnum.term().utf8ToString());
            while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                count += docsEnum.freq();
            }
            //System.out.println("count:" + count);
            termfrequency.put(termsEnum.term().utf8ToString(), count);
        }
        System.out.println("Total Number of Words:" + noWords);
    }

    // =========================================================
    // Write the terms and their frequencies to a file
    // =========================================================
    for (String key : termfrequency.keySet()) {
        writer_tf.print(key + ",");
        writer_tf.println(termfrequency.get(key));
    }
    writer_tf.close();

}
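
When term frequencies were indexed for the field, termsEnum.totalTermFreq() reports the same index-wide count directly, so the inner postings loop is mainly useful when you need per-document control.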

From source file:stackoverflow.lucene.modified.MoreLikeThis.java

License:Apache License

/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<String, Int>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector);
        }
    }

    return createQueue(termFreqMap);
}
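
When the field carries no term vector, the method falls back to re-analyzing the stored field values, so the frequency map is populated either way; fields that are neither stored nor vectorized contribute nothing.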

From source file:tech.beshu.ror.es.security.DocumentFieldReader.java

License:Open Source License

@Override
public Fields getTermVectors(int docID) throws IOException {
    Fields original = in.getTermVectors(docID);
    if (original == null) {
        return null; // the document has no term vectors
    }

    return new Fields() {
        @Override
        public Iterator<String> iterator() {
            return Iterators.filter(original.iterator(), s -> policy.canKeep(s));
        }

        @Override
        public Terms terms(String field) throws IOException {
            return policy.canKeep(field) ? original.terms(field) : null;
        }

        @Override
        public int size() {
            return remainingFieldsInfo.size();
        }
    };
}
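
Returning null from terms(field) for a disallowed field is all the filtering needed: callers must null-check the result anyway (see the contract above), so restricted fields simply appear not to exist.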

From source file:tw.com.kyle.luminance.LumReader.java

public TokenStream GetTokenStream(int docId, String field) throws IOException {
    Fields tvFields = reader.getTermVectors(docId);
    // getTermVectors returns null when the document stores no term vectors at all
    if (tvFields != null && tvFields.terms(field) != null) {
        TokenStream tokenStream = TokenSources.getTokenStream(field, tvFields, "", new StandardAnalyzer(), -1);
        return tokenStream;
    } else {
        return null;
    }

}

From source file:utils.HighFreqTerms.java

License:Apache License

/**
 * @param reader the index reader to scan
 * @param numTerms the maximum number of terms to return
 * @param fieldNames the fields to inspect, or null for all fields
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
        return EMPTY_STATS;
    }
    TermStatsQueue tiq = new TermStatsQueue(numTerms);

    // scan the requested fields, or every field in the index when fieldNames is null
    Iterable<String> fieldsToScan;
    if (fieldNames != null) {
        fieldsToScan = Arrays.asList(fieldNames);
    } else {
        fieldsToScan = fields; // Fields implements Iterable<String>
    }
    for (String field : fieldsToScan) {
        Terms terms = fields.terms(field);
        if (terms != null) {
            TermsEnum te = terms.iterator();
            fillQueue(te, tiq, field);
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}