List of usage examples for org.apache.lucene.index.Fields#terms

public abstract Terms terms(String field) throws IOException;

Gets the Terms for the given field, or null if the field does not exist.
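Before the collected examples, a minimal sketch of the typical call pattern, assuming the Lucene 5.x-era APIs that most of the examples below use; the class name FieldsTermsSketch, the method dumpTerms, and its reader/fieldName parameters are hypothetical placeholders, not taken from any example. The one point shared by every usage below: terms(String) may return null when a segment does not contain the field, so callers must check before iterating.

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class FieldsTermsSketch {

    // Prints every unique term of one field, segment by segment.
    // "reader" and "fieldName" are hypothetical inputs for illustration;
    // Fields.terms(String) may return null for a field that is missing
    // from a segment, so the null check is required.
    static void dumpTerms(IndexReader reader, String fieldName) throws IOException {
        for (LeafReaderContext ctx : reader.leaves()) {
            Fields fields = ctx.reader().fields(); // per-segment postings (Lucene 5.x-era API)
            Terms terms = fields.terms(fieldName); // null if this segment lacks the field
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                System.out.println(text.utf8ToString());
            }
        }
    }
}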
From source file: org.codelibs.elasticsearch.common.lucene.search.MoreLikeThisQuery.java
License: Apache License
private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    Set<Term> skipTerms = new HashSet<>();
    // handle unlike text
    if (unlikeText != null) {
        for (String text : unlikeText) {
            // only use the first field to be consistent
            String fieldName = moreLikeFields[0];
            try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    skipTerms.add(new Term(fieldName, termAtt.toString()));
                }
                ts.end();
            }
        }
    }
    // handle unlike fields
    if (unlikeFields != null) {
        for (Fields fields : unlikeFields) {
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                final TermsEnum termsEnum = terms.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    skipTerms.add(new Term(fieldName, text.utf8ToString()));
                }
            }
        }
    }
    if (!skipTerms.isEmpty()) {
        mlt.setSkipTerms(skipTerms);
    }
}
From source file: org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java
License: Apache License
/**
 * Return a query that will return docs like the passed Fields.
 *
 * @return a query that will return docs like the passed Fields.
 */
public Query like(Fields... likeFields) throws IOException {
    // get all field names
    Set<String> fieldNames = new HashSet<>();
    for (Fields fields : likeFields) {
        for (String fieldName : fields) {
            fieldNames.add(fieldName);
        }
    }
    // term selection is per field, then appended to a single boolean query
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    for (String fieldName : fieldNames) {
        Map<String, Int> termFreqMap = new HashMap<>();
        for (Fields fields : likeFields) {
            Terms vector = fields.terms(fieldName);
            if (vector != null) {
                addTermFrequencies(termFreqMap, vector, fieldName);
            }
        }
        addToQuery(createQueue(termFreqMap, fieldName), bq);
    }
    return bq.build();
}
From source file: org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java
License: Apache License
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }
        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }
    return createQueue(termFreqMap);
}
From source file: org.codelibs.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java
License: Apache License
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
                CodecUtil.writeFooter(output);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                terms.iterator();
                new SuggestPayload();
                // the FST-building body is stubbed out in this fork; the original
                // implementation is preserved below as commented-out code
                throw new UnsupportedOperationException("QueryBuilders does not support this operation.");
                // final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                //         maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                // int docCount = 0;
                // while (true) {
                //     BytesRef term = termsEnum.next();
                //     if (term == null) {
                //         break;
                //     }
                //     docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                //     builder.startTerm(term);
                //     int docFreq = 0;
                //     while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                //         for (int i = 0; i < docsEnum.freq(); i++) {
                //             final int position = docsEnum.nextPosition();
                //             AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                //             builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                //             // multi fields have the same surface form so we sum up here
                //             maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                //         }
                //         docFreq++;
                //         docCount = Math.max(docCount, docsEnum.docID() + 1);
                //     }
                //     builder.finishTerm(docFreq);
                // }
                // /*
                //  * Here we are done processing the field and we can
                //  * build the FST and write it to disk.
                //  */
                // FST<Pair<Long, BytesRef>> build = builder.build();
                // assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
                // /*
                //  * it's possible that the FST is null if we have 2 segments that get merged
                //  * and all docs that have a value in this field are deleted. This will cause
                //  * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                //  * to return null.
                //  */
                // if (build != null) {
                //     fieldOffsets.put(field, output.getFilePointer());
                //     build.save(output);
                //     /* write some more meta-info */
                //     output.writeVInt(maxAnalyzedPathsForOneInput);
                //     output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                //     output.writeInt(maxGraphExpansions); // can be negative
                //     int options = 0;
                //     options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                //     options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                //     options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                //     output.writeVInt(options);
                //     output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                //     output.writeVInt(XAnalyzingSuggester.END_BYTE);
                //     output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                //     output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                // }
            }
        }
    };
}
From source file: org.codelibs.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.java
License: Apache License
/**
 * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code>O(numIndexedFields)</code>.
 *
 * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out
 *                          its in-heap bytes separately in the returned {@link CompletionStats}
 */
public CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    CompletionStats completionStats = new CompletionStats();
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
        LeafReader atomicReader = atomicReaderContext.reader();
        try {
            Fields fields = atomicReader.fields();
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                if (terms instanceof CompletionTerms) {
                    CompletionTerms completionTerms = (CompletionTerms) terms;
                    completionStats.add(completionTerms.stats(fieldNamePatterns));
                }
            }
        } catch (IOException ioe) {
            logger.error("Could not get completion stats", ioe);
        }
    }
    return completionStats;
}
From source file: org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License: Apache License
/**
 * Find words for a more-queryFromDocuments-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @param fields the list of fields of the lucene document from which to extract terms
 * @param fieldToTermFreqMap data structure to populate with term frequencies
 */
public Map<String, Map<String, Flt>> retrieveTerms(int docNum, String[] fields,
        Map<String, Map<String, Flt>> fieldToTermFreqMap) throws IOException {
    if (fieldToTermFreqMap == null) {
        fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
    }
    if (fields == null || fields.length == 0) {
        return fieldToTermFreqMap;
    }

    final Fields vectors = ir.getTermVectors(docNum);
    final Document document = ir.document(docNum);

    for (String fieldName : fields) {
        Map<String, Flt> termFreqMap = null;
        if (fieldToTermFreqMap.containsKey(fieldName)) {
            termFreqMap = fieldToTermFreqMap.get(fieldName);
        } else {
            termFreqMap = new HashMap<String, Flt>();
            fieldToTermFreqMap.put(fieldName, termFreqMap);
        }

        Terms vector = null;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        }

        // field does not store term vector info
        // even if term vectors enabled, need to extract payload from regular field reader
        if (vector == null || isPayloadField(fieldName)) {
            IndexableField[] docFields = document.getFields(fieldName);
            for (IndexableField field : docFields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermWeights(termFreqMap, vector);
        }
    }
    return fieldToTermFreqMap;
}
From source file: org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License: Apache License
public Map<String, Map<String, Flt>> retrieveTerms(int docNum, Map<String, Map<String, Flt>> fieldToTermFreqMap)
        throws IOException {
    if (fieldToTermFreqMap == null) {
        fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
    }
    for (String fieldName : getFieldNames()) {
        Map<String, Flt> termFreqMap = null;
        if (fieldToTermFreqMap.containsKey(fieldName)) {
            termFreqMap = fieldToTermFreqMap.get(fieldName);
        } else {
            termFreqMap = new HashMap<String, Flt>();
            fieldToTermFreqMap.put(fieldName, termFreqMap);
        }

        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        // even if term vectors enabled, need to extract payload from regular field reader
        if (vector == null || isPayloadField(fieldName)) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermWeights(termFreqMap, vector);
        }
    }
    return fieldToTermFreqMap;
}
From source file: org.dkpro.tc.features.ngram.LuceneNgramDocumentTest.java
License: Apache License
private Set<String> getTokensFromIndex(File luceneFolder) throws Exception {
    Set<String> token = new HashSet<>();
    @SuppressWarnings("deprecation")
    IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder));
    Fields fields = MultiFields.getFields(idxReader);
    for (String field : fields) {
        if (field.equals("id")) {
            continue;
        }
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            token.add(text.utf8ToString());
        }
    }
    return token;
}
From source file: org.dkpro.tc.features.ngram.meta.LuceneCharacterNGramMetaCollectorTest.java
License: Apache License
@SuppressWarnings("unused") @Test/*from w ww . ja v a2 s .c om*/ public void luceneCharacterNgramMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "charMetaCollectorTest.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory .createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneCharacterNGramMetaCollector.class, LuceneCharacterNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneCharacterNGram.PARAM_NGRAM_MIN_N, 2, LuceneCharacterNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir, LuceneCharacterNGram.PARAM_SOURCE_LOCATION, tmpDir); for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) { // System.out.println(jcas.getDocumentText().length()); } Set<String> freq2terms = new HashSet<>(); int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneCharacterNGramMetaCollector.LUCENE_CHAR_NGRAM_FIELD + "123"); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); BytesRef text = null; while ((text = termsEnum.next()) != null) { if (termsEnum.totalTermFreq() == 2) { freq2terms.add(text.utf8ToString()); } // System.out.println(text.utf8ToString() + " " + termsEnum.totalTermFreq()); i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(10, i); assertEquals(1, freq2terms.size()); }
From source file: org.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java
License: Apache License
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir,
            LuceneNGramMetaCollector.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);

    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
}