Usage examples for org.apache.lucene.index.Fields.terms(String field)
public abstract Terms terms(String field) throws IOException;
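Before the collected examples, here is a minimal sketch of the typical call pattern (not taken from any of the source files below): obtain the per-segment Fields, look up the Terms for one field, and walk its TermsEnum. The field name "body" and the wrapper class/method are placeholders for illustration, and the API shown (LeafReaderContext, no-argument Terms.iterator()) follows the Lucene 5.x/6.x style used by several of the examples; older releases use AtomicReaderContext and iterator(null), as seen further down. Note that terms(field) returns null when the field does not exist in that segment, so callers should check for null before iterating.

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermsUsageSketch {

    // Enumerate all terms of the given field on each leaf (segment) of an already-open reader.
    static void listTerms(IndexReader reader, String field) throws IOException {
        for (LeafReaderContext leaf : reader.leaves()) {
            Fields fields = leaf.reader().fields();   // per-segment view of all indexed fields
            Terms terms = fields.terms(field);        // null if this segment has no such field
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
            }
        }
    }
}

A typical call would be listTerms(DirectoryReader.open(directory), "body"); the examples below show the same pattern embedded in REST handlers, highlighters, codec tests, and feature extractors.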
From source file:com.github.flaxsearch.util.ReaderManager.java
License:Apache License
default TermsEnum findTermPostings(Integer segment, String field, String term) throws IOException {
    Fields fields = getFields(segment);
    Terms terms = fields.terms(field);
    if (terms == null) {
        String msg = String.format("No field %s", field);
        throw new WebApplicationException(msg, Response.Status.NOT_FOUND);
    }
    TermsEnum te = terms.iterator();
    assert (term != null);
    if (!te.seekExact(new BytesRef(term))) {
        String msg = String.format("No term %s on field %s", term, field);
        throw new WebApplicationException(msg, Response.Status.NOT_FOUND);
    }
    return te;
}
From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License:Apache License
/**
 * a constructor.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null) return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }
        dpEnum.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();

        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();

            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }

            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }
    }

    // sort by position
    Collections.sort(termList);
}
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()

  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {

        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:

        return new PostingsFormat(defaultPostingsFormat.getName()) {
          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }

                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));

                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;
                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    if (isMerge == false && addOnSecondPass) {
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println(" term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);

  r.close();
  dir.close();
}
From source file:com.senseidb.abacus.api.codec.CodecTest.java
License:Apache License
static void testThreaded(int numThreads, final int numIter, final AtomicReader reader, final String field) {
    Runnable runnable = new Runnable() {
        public void run() {
            try {
                Fields f = reader.fields();
                Terms t = f.terms(field);
                TermsEnum te = t.iterator(null);
                ArrayList<BytesRef> termList = new ArrayList<BytesRef>();
                BytesRef termText;
                while ((termText = te.next()) != null) {
                    termList.add(termText);
                }
                Random rand = new Random();
                for (int i = 0; i < numIter; ++i) {
                    int idx = rand.nextInt(termList.size());
                    termText = termList.get(idx);
                    te = t.iterator(null);
                    te.seekCeil(termText);
                    DocsEnum de = te.docs(null, null);
                    int doc;
                    while ((doc = de.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    }
                    de = te.docs(null, null);
                    doc = -1;
                    while ((doc = de.advance(doc + 2)) != DocIdSetIterator.NO_MORE_DOCS) {
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    };
    Thread[] threads = new Thread[numThreads];
    for (int i = 0; i < numThreads; ++i) {
        threads[i] = new Thread(runnable);
    }
    for (int i = 0; i < numThreads; ++i) {
        threads[i].start();
    }
    for (int i = 0; i < numThreads; ++i) {
        try {
            threads[i].join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
From source file:com.sindicetech.siren.search.node.NodeTermCollectingRewrite.java
License:Open Source License
final void collectTerms(final IndexReader reader, final MultiNodeTermQuery query, final TermCollector collector)
        throws IOException {
    final IndexReaderContext topReaderContext = reader.getContext();
    Comparator<BytesRef> lastTermComp = null;
    for (final AtomicReaderContext context : topReaderContext.leaves()) {
        final Fields fields = context.reader().fields();
        if (fields == null) {
            // reader has no fields
            continue;
        }
        final Terms terms = fields.terms(query.field);
        if (terms == null) {
            // field does not exist
            continue;
        }

        final TermsEnum termsEnum = this.getTermsEnum(query, terms, collector.attributes);
        assert termsEnum != null;

        if (termsEnum == TermsEnum.EMPTY)
            continue;

        // Check comparator compatibility:
        final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
        if (lastTermComp != null && newTermComp != null && newTermComp != lastTermComp)
            throw new RuntimeException("term comparator should not change between segments: "
                    + lastTermComp + " != " + newTermComp);
        lastTermComp = newTermComp;
        collector.setReaderContext(topReaderContext, context);
        collector.setNextEnum(termsEnum);
        BytesRef bytes;
        while ((bytes = termsEnum.next()) != null) {
            if (!collector.collect(bytes))
                return; // interrupt whole term collection, so also don't iterate other subReaders
        }
    }
}
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);

            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);

            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);

        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase.java
License:Apache License
@Override
protected FrequencyDistribution<String> getTopNgrams() throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(getTopN()).create();
    long ngramVocabularySize = 0;

    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(getFieldName());
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    if (passesScreening(term)) {
                        topN.add(new TermFreqTuple(term, freq));
                        ngramVocabularySize += freq;
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        long absCount = tuple.getFreq();
        double relFrequency = ((double) absCount) / ngramVocabularySize;

        if (relFrequency >= ngramFreqThreshold) {
            topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
        }
    }

    logSelectionProcess(topNGrams.getB());

    return topNGrams;
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java
License:Apache License
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                // Bits liveDocs = MultiFields.getLiveDocs(index);
                // DocsEnum docs = termsEnum.docs(liveDocs, null);
                // int docId;
                // while((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                //     index.g
                // }
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(35, i);
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramCPFE.java
License:Apache License
private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();

    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    // add conditions here, like ngram1 is in most freq ngrams1...
                    String combo1 = term.split(ComboUtils.JOINT)[0];
                    String combo2 = term.split(ComboUtils.JOINT)[1];
                    int combinedSize = combo1.split("_").length + combo2.split("_").length;
                    if (topKSetView1.contains(combo1) && topKSet.contains(combo1)
                            && topKSetView2.contains(combo2) && topKSet.contains(combo2)
                            && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                        // print out here for testing
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramPFE.java
License:Apache License
private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();

    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}