Example usage for org.apache.lucene.index Fields terms

List of usage examples for org.apache.lucene.index Fields terms

Introduction

This page collects usage examples of org.apache.lucene.index Fields.terms(String) from open-source projects.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field, or null if the field does not exist.
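
Before the examples, a minimal sketch of the typical call pattern (the class and method names here are illustrative; the Fields instance would come from a reader, e.g. term vectors or a leaf reader). Because terms(String) returns null for a missing field, callers should null-check before iterating, as most of the examples below do. Note that Terms.iterator() takes no argument from Lucene 5 on; the Lucene 4.x examples below pass a reuse argument, iterator(null).

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class FieldsTermsSketch {
    // Enumerate and print every term indexed in the given field.
    static void printTerms(Fields fields, String fieldName) throws IOException {
        Terms terms = fields.terms(fieldName); // null when the field does not exist
        if (terms == null) {
            return;
        }
        TermsEnum termsEnum = terms.iterator(); // Lucene 5+ signature
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            System.out.println(text.utf8ToString());
        }
    }
}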

Usage

From source file:org.codelibs.elasticsearch.common.lucene.search.MoreLikeThisQuery.java

License:Apache License

private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    Set<Term> skipTerms = new HashSet<>();
    // handle unlike text
    if (unlikeText != null) {
        for (String text : unlikeText) {
            // only use the first field to be consistent
            String fieldName = moreLikeFields[0];
            try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    skipTerms.add(new Term(fieldName, termAtt.toString()));
                }
                ts.end();
            }
        }
    }
    // handle unlike fields
    if (unlikeFields != null) {
        for (Fields fields : unlikeFields) {
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                final TermsEnum termsEnum = terms.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    skipTerms.add(new Term(fieldName, text.utf8ToString()));
                }
            }
        }
    }
    if (!skipTerms.isEmpty()) {
        mlt.setSkipTerms(skipTerms);
    }
}

From source file:org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java

License:Apache License

/**
 * Return a query that will return docs like the passed Fields.
 *
 * @return a query that will return docs like the passed Fields.
 */
public Query like(Fields... likeFields) throws IOException {
    // get all field names
    Set<String> fieldNames = new HashSet<>();
    for (Fields fields : likeFields) {
        for (String fieldName : fields) {
            fieldNames.add(fieldName);
        }
    }
    // term selection is per field, then appended to a single boolean query
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    for (String fieldName : fieldNames) {
        Map<String, Int> termFreqMap = new HashMap<>();
        for (Fields fields : likeFields) {
            Terms vector = fields.terms(fieldName);
            if (vector != null) {
                addTermFrequencies(termFreqMap, vector, fieldName);
            }
        }
        addToQuery(createQueue(termFreqMap, fieldName), bq);
    }
    return bq.build();
}

From source file:org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java

License:Apache License

/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }

    return createQueue(termFreqMap);
}

From source file:org.codelibs.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java

License:Apache License

@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
                CodecUtil.writeFooter(output);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                // this fork stubs out the FST building here; the upstream
                // implementation is preserved in the commented-out block below
                throw new UnsupportedOperationException("QueryBuilders does not support this operation.");
                //                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                //                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                //                    int docCount = 0;
                //                    while (true) {
                //                        BytesRef term = termsEnum.next();
                //                        if (term == null) {
                //                            break;
                //                        }
                //                        docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                //                        builder.startTerm(term);
                //                        int docFreq = 0;
                //                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                //                            for (int i = 0; i < docsEnum.freq(); i++) {
                //                                final int position = docsEnum.nextPosition();
                //                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                //                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                //                                // multi fields have the same surface form so we sum up here
                //                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                //                            }
                //                            docFreq++;
                //                            docCount = Math.max(docCount, docsEnum.docID()+1);
                //                        }
                //                        builder.finishTerm(docFreq);
                //                    }
                //                    /*
                //                     * Here we are done processing the field and we can
                //                     * build the FST and write it to disk.
                //                     */
                //                    FST<Pair<Long, BytesRef>> build = builder.build();
                //                    assert build != null || docCount == 0: "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
                //                    /*
                //                     * it's possible that the FST is null if we have 2 segments that get merged
                //                     * and all docs that have a value in this field are deleted. This will cause
                //                     * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                //                     * to return null.
                //                     */
                //                    if (build != null) {
                //                        fieldOffsets.put(field, output.getFilePointer());
                //                        build.save(output);
                //                        /* write some more meta-info */
                //                        output.writeVInt(maxAnalyzedPathsForOneInput);
                //                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                //                        output.writeInt(maxGraphExpansions); // can be negative
                //                        int options = 0;
                //                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                //                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                //                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                //                        output.writeVInt(options);
                //                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                //                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                //                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                //                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                //                    }
            }
        }
    };
}

From source file:org.codelibs.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.java

License:Apache License

/**
 * Returns total in-heap bytes used by all suggesters.  This method has CPU cost <code>O(numIndexedFields)</code>.
 *
 * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out its in-heap bytes
 * separately in the returned {@link CompletionStats}
 */
public CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    CompletionStats completionStats = new CompletionStats();
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
        LeafReader atomicReader = atomicReaderContext.reader();
        try {
            Fields fields = atomicReader.fields();
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                if (terms instanceof CompletionTerms) {
                    CompletionTerms completionTerms = (CompletionTerms) terms;
                    completionStats.add(completionTerms.stats(fieldNamePatterns));
                }
            }
        } catch (IOException ioe) {
            logger.error("Could not get completion stats", ioe);
        }
    }

    return completionStats;
}

From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java

License:Apache License

/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @param fields the list of field of the lucene document from which to extract terms
 * @param fieldToTermFreqMap data structure to populate with term frequencies
 */
public Map<String, Map<String, Flt>> retrieveTerms(int docNum, String[] fields,
        Map<String, Map<String, Flt>> fieldToTermFreqMap) throws IOException {

    if (fieldToTermFreqMap == null) {
        fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
    }

    if (fields == null || fields.length == 0) {
        return fieldToTermFreqMap;
    }

    final Fields vectors = ir.getTermVectors(docNum);
    final Document document = ir.document(docNum);

    for (String fieldName : fields) {

        Map<String, Flt> termFreqMap = null;
        if (fieldToTermFreqMap.containsKey(fieldName)) {
            termFreqMap = fieldToTermFreqMap.get(fieldName);
        } else {
            termFreqMap = new HashMap<String, Flt>();
            fieldToTermFreqMap.put(fieldName, termFreqMap);
        }

        Terms vector = null;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        }

        // field does not store term vector info
        // even if term vectors enabled, need to extract payload from regular field reader
        if (vector == null || isPayloadField(fieldName)) {
            IndexableField[] docFields = document.getFields(fieldName);
            for (IndexableField field : docFields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermWeights(termFreqMap, vector);
        }
    }

    return fieldToTermFreqMap;
}

From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java

License:Apache License

public Map<String, Map<String, Flt>> retrieveTerms(int docNum, Map<String, Map<String, Flt>> fieldToTermFreqMap)
        throws IOException {

    if (fieldToTermFreqMap == null) {
        fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
    }
    for (String fieldName : getFieldNames()) {

        Map<String, Flt> termFreqMap = null;
        if (fieldToTermFreqMap.containsKey(fieldName)) {
            termFreqMap = fieldToTermFreqMap.get(fieldName);
        } else {
            termFreqMap = new HashMap<String, Flt>();
            fieldToTermFreqMap.put(fieldName, termFreqMap);
        }

        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        // even if term vectors enabled, need to extract payload from regular field reader
        if (vector == null || isPayloadField(fieldName)) {
            Document d = ir.document(docNum);

            IndexableField[] fields = d.getFields(fieldName);

            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermWeights(termFreqMap, vector);
        }
    }

    return fieldToTermFreqMap;
}

From source file:org.dkpro.tc.features.ngram.LuceneNgramDocumentTest.java

License:Apache License

private Set<String> getTokensFromIndex(File luceneFolder) throws Exception {
    Set<String> token = new HashSet<>();
    @SuppressWarnings("deprecation")
    IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder));
    Fields fields = MultiFields.getFields(idxReader);
    for (String field : fields) {
        if (field.equals("id")) {
            continue;
        }
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            token.add(text.utf8ToString());
        }
    }
    return token;
}

From source file:org.dkpro.tc.features.ngram.meta.LuceneCharacterNGramMetaCollectorTest.java

License:Apache License

@SuppressWarnings("unused")
@Test
public void luceneCharacterNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "charMetaCollectorTest.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneCharacterNGramMetaCollector.class, LuceneCharacterNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            LuceneCharacterNGram.PARAM_NGRAM_MIN_N, 2, LuceneCharacterNGramMetaCollector.PARAM_TARGET_LOCATION,
            tmpDir, LuceneCharacterNGram.PARAM_SOURCE_LOCATION, tmpDir);

    // iterating runs the pipeline; the JCas instances themselves are not used
    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        // collection happens as a side effect of iteration
    }

    Set<String> freq2terms = new HashSet<>();

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneCharacterNGramMetaCollector.LUCENE_CHAR_NGRAM_FIELD + "123");
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (termsEnum.totalTermFreq() == 2) {
                        freq2terms.add(text.utf8ToString());
                    }
                    //                        System.out.println(text.utf8ToString() + " " + termsEnum.totalTermFreq());
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(10, i);
    assertEquals(1, freq2terms.size());
}

From source file:org.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java

License:Apache License

@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir,
            LuceneNGramMetaCollector.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);

    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
}