Example usage for org.apache.lucene.search TermStatistics TermStatistics

List of usage examples for org.apache.lucene.search TermStatistics TermStatistics

Introduction

On this page you can find an example usage of the org.apache.lucene.search TermStatistics constructor.

Prototype

public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) 

Source Link

Document

Creates statistics instance for a term.

Usage

From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleWeight.java

License:Apache License

/**
 * Builds per-(field, term) scoring weights for a {@link FlexibleQuery}.
 *
 * <p>When global IDF is enabled, the doc/total-term frequencies used for scoring a
 * term are the maxima of that term's statistics across all queried fields (assumes a
 * rectangular term matrix: every field row has the same number of terms).
 *
 * @param query    the flexible query whose terms are weighted
 * @param searcher searcher providing the reader context and collection statistics
 * @throws IOException if term statistics cannot be read from the index
 */
public FlexibleWeight(FlexibleQuery query, IndexSearcher searcher) throws IOException {
    this.query = query;
    this.similarity = searcher.getSimilarity();
    final IndexReaderContext context = searcher.getTopReaderContext();

    int[] maxDocFreqs = null;
    long[] maxTotalTermFreqs = null;
    // Shared cache: each distinct Term is resolved against the reader only once,
    // even if it appears in both the global and the per-query term matrices.
    Map<Term, TermContext> builtTermMap = new HashMap<>();
    if (query.enableGlobalIDF()) {
        FlexibleQuery.FlexibleTerm[][] globalTerms = query.getGlobalTerms();
        TermContext[][] globalStates = buildTermContexts(context, globalTerms, builtTermMap);
        int fieldLength = globalTerms.length;
        int termLength = globalTerms[0].length;
        // For each term position, take the maximum docFreq/totalTermFreq over all fields.
        maxDocFreqs = new int[termLength];
        maxTotalTermFreqs = new long[termLength];
        for (int i = 0; i < termLength; ++i) {
            int maxDocFreq = 0;
            long maxTotalTermFreq = 0;
            for (int j = 0; j < fieldLength; ++j) {
                maxDocFreq = Math.max(globalStates[j][i].docFreq(), maxDocFreq);
                maxTotalTermFreq = Math.max(globalStates[j][i].totalTermFreq(), maxTotalTermFreq);
            }
            maxDocFreqs[i] = maxDocFreq;
            maxTotalTermFreqs[i] = maxTotalTermFreq;
        }
    }

    FlexibleQuery.FlexibleTerm[][] terms = query.getTerms();
    TermContext[][] states = buildTermContexts(context, terms, builtTermMap);
    termStatsMatrix = new TermStats[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        termStatsMatrix[i] = new TermStats[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            FlexibleQuery.FlexibleTerm term = terms[i][j];
            TermContext state = states[i][j];
            TermStatistics termStats;
            if (query.enableGlobalIDF()) {
                // Global IDF: substitute the cross-field maxima for this term position.
                termStats = new TermStatistics(term.term.bytes(), maxDocFreqs[j], maxTotalTermFreqs[j]);
            } else {
                termStats = searcher.termStatistics(term.term, state);
            }
            Similarity.SimWeight stats = similarity.computeWeight(term.boost,
                    searcher.collectionStatistics(term.term.field()), termStats);
            TermStats termStatsInfo = new TermStats();
            termStatsInfo.stats = stats;
            termStatsInfo.term = term.term;
            termStatsInfo.termContext = state;
            termStatsMatrix[i][j] = termStatsInfo;
        }
    }
}

/**
 * Builds (or reuses from {@code cache}) a {@link TermContext} for every entry of
 * {@code terms}, returning a matrix parallel to the input.
 */
private static TermContext[][] buildTermContexts(IndexReaderContext context,
        FlexibleQuery.FlexibleTerm[][] terms, Map<Term, TermContext> cache) throws IOException {
    TermContext[][] states = new TermContext[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        states[i] = new TermContext[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            Term term = terms[i][j].term;
            TermContext termContext = cache.get(term);
            if (termContext == null) {
                termContext = TermContext.build(context, term);
                cache.put(term, termContext);
            }
            states[i][j] = termContext;
        }
    }
    return states;
}

From source file:org.apache.solr.search.stats.TermStats.java

License:Apache License

/** Converts this accumulated entry into a Lucene {@link TermStatistics} for term {@code t}. */
public TermStatistics toTermStatistics() {
    final BytesRef termBytes = t.bytes();
    return new TermStatistics(termBytes, docFreq, totalTermFreq);
}

From source file:org.elasticsearch.action.search.SearchPhaseController.java

License:Apache License

/**
 * Merges the per-shard distributed-frequency (DFS) results into a single
 * {@link AggregatedDfs}: term statistics are summed per term, field statistics
 * are summed per field, and maxDoc is summed across shards.
 */
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) {
    ObjectObjectHashMap<Term, TermStatistics> termStats = HppcMaps.newNoNullKeysMap();
    ObjectObjectHashMap<String, CollectionStatistics> fieldStats = HppcMaps.newNoNullKeysMap();
    long maxDocTotal = 0;
    for (AtomicArray.Entry<DfsSearchResult> entry : results.asList()) {
        final DfsSearchResult shardResult = entry.value;
        final Term[] shardTerms = shardResult.terms();
        final TermStatistics[] shardTermStats = shardResult.termStatistics();
        assert shardTerms.length == shardTermStats.length;
        for (int i = 0; i < shardTerms.length; i++) {
            assert shardTerms[i] != null;
            final Term term = shardTerms[i];
            final TermStatistics incoming = shardTermStats[i];
            final TermStatistics current = termStats.get(term);
            if (current == null) {
                termStats.put(term, incoming);
            } else {
                assert term.bytes().equals(current.term());
                // totalTermFreq is optional: -1 means "not present", and optionalSum
                // propagates -1 if either side is missing.
                termStats.put(term, new TermStatistics(current.term(),
                        current.docFreq() + incoming.docFreq(),
                        optionalSum(current.totalTermFreq(), incoming.totalTermFreq())));
            }
        }

        assert !shardResult.fieldStatistics().containsKey(null);
        final Object[] keys = shardResult.fieldStatistics().keys;
        final Object[] values = shardResult.fieldStatistics().values;
        for (int i = 0; i < keys.length; i++) {
            if (keys[i] == null) {
                continue; // empty hash slot
            }
            final String field = (String) keys[i];
            final CollectionStatistics incoming = (CollectionStatistics) values[i];
            final CollectionStatistics current = fieldStats.get(field);
            if (current == null) {
                fieldStats.put(field, incoming);
            } else {
                // docCount / sumTotalTermFreq / sumDocFreq are optional (-1), hence optionalSum.
                fieldStats.put(field, new CollectionStatistics(field,
                        current.maxDoc() + incoming.maxDoc(),
                        optionalSum(current.docCount(), incoming.docCount()),
                        optionalSum(current.sumTotalTermFreq(), incoming.sumTotalTermFreq()),
                        optionalSum(current.sumDocFreq(), incoming.sumDocFreq())));
            }
        }
        maxDocTotal += shardResult.maxDoc();
    }
    return new AggregatedDfs(termStats, fieldStats, maxDocTotal);
}

From source file:org.elasticsearch.action.termvectors.TermVectorsFilter.java

License:Apache License

/**
 * Returns statistics for {@code term}: from the aggregated distributed stats when
 * available, otherwise shard-local statistics straight from the terms enum.
 */
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
    if (dfs == null) {
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }
    return dfs.termStatistics().get(term);
}

From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java

License:Apache License

/**
 * Serializes the selected fields of a term-vectors response into the output,
 * optionally emitting field/term statistics (taken from the aggregated distributed
 * frequencies when {@code dfs} is non-null, else from the top-level index terms)
 * and per-term scores when a filter is supplied.
 *
 * @param termVectorsByField term vectors to serialize
 * @param selectedFields     if non-null, only fields in this set are written
 * @param flags              which optional sections (positions/offsets/payloads/statistics) to emit
 * @param topLevelFields     index-level fields used as the statistics source
 * @param dfs                optional aggregated distributed frequencies; overrides shard-local stats
 * @param termVectorsFilter  optional filter; when set, only its scored terms are written
 * @throws IOException on index access or serialization failure
 */
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
        Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter)
        throws IOException {
    int numFieldsWritten = 0;
    PostingsEnum docsAndPosEnum = null; // reused across terms/fields to avoid reallocation
    PostingsEnum docsEnum = null;
    boolean hasScores = termVectorsFilter != null;

    for (String field : termVectorsByField) {
        // Skip fields the caller did not ask for.
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }

        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);

        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = fieldTermVector;
        }

        TermsEnum topLevelIterator = topLevelTerms.iterator();
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();

        long termsSize = fieldTermVector.size();
        if (hasScores) {
            // Filtering keeps only the best-scoring terms, so the written count shrinks.
            termsSize = Math.min(termsSize, termVectorsFilter.size(field));
        }
        startField(field, termsSize, positions, offsets, payloads);

        if (flags.contains(Flag.FieldStatistics)) {
            if (dfs != null) {
                writeFieldStatistics(dfs.fieldStatistics().get(field));
            } else {
                writeFieldStatistics(topLevelTerms);
            }
        }
        TermsEnum iterator = fieldTermVector.iterator();
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) { // iterate all terms of the current field
            BytesRef termBytesRef = iterator.term();
            Term term = new Term(field, termBytesRef);

            // with filtering we only keep the best terms
            if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                continue;
            }

            startTerm(termBytesRef);
            if (flags.contains(Flag.TermStatistics)) {
                // get the doc frequency
                if (dfs != null) {
                    // Missing aggregated entry: write zeroed stats rather than nothing.
                    final TermStatistics statistics = dfs.termStatistics().get(term);
                    writeTermStatistics(
                            statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
                } else {
                    boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                    if (foundTerm) {
                        writeTermStatistics(topLevelIterator);
                    } else {
                        // Term absent from the top-level index: write zeroed stats.
                        writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
                    }
                }
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets,
                        payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a PostingsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
            if (hasScores) {
                writeScoreTerm(termVectorsFilter.getScoreTerm(term));
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorsField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics),
            flags.contains(Flag.FieldStatistics), hasScores));
}

From source file:org.elasticsearch.search.controller.SearchPhaseController.java

License:Apache License

/**
 * Merges the per-shard distributed-frequency (DFS) results into a single
 * {@link AggregatedDfs}: term statistics are summed per term, field statistics
 * per field, and maxDoc across shards.
 */
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) {
    ObjectObjectOpenHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap();
    ObjectObjectOpenHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
    long aggMaxDoc = 0;
    for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) {
        final Term[] terms = lEntry.value.terms();
        final TermStatistics[] stats = lEntry.value.termStatistics();
        assert terms.length == stats.length;
        for (int i = 0; i < terms.length; i++) {
            assert terms[i] != null;
            TermStatistics existing = termStatistics.get(terms[i]);
            if (existing != null) {
                assert terms[i].bytes().equals(existing.term());
                // totalTermFrequency is an optional statistic we need to check if either one or both
                // are set to -1 which means not present and then set it globally to -1
                termStatistics.put(terms[i],
                        new TermStatistics(existing.term(), existing.docFreq() + stats[i].docFreq(),
                                optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq())));
            } else {
                termStatistics.put(terms[i], stats[i]);
            }

        }
        // Iterate the open-addressing map's internal arrays; `allocated` marks used slots.
        final boolean[] states = lEntry.value.fieldStatistics().allocated;
        final Object[] keys = lEntry.value.fieldStatistics().keys;
        final Object[] values = lEntry.value.fieldStatistics().values;
        for (int i = 0; i < states.length; i++) {
            if (states[i]) {
                String key = (String) keys[i];
                CollectionStatistics value = (CollectionStatistics) values[i];
                assert key != null;
                CollectionStatistics existing = fieldStatistics.get(key);
                if (existing != null) {
                    // docCount/sumTotalTermFreq/sumDocFreq are optional (-1), hence optionalSum.
                    CollectionStatistics merged = new CollectionStatistics(key,
                            existing.maxDoc() + value.maxDoc(),
                            optionalSum(existing.docCount(), value.docCount()),
                            optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()),
                            optionalSum(existing.sumDocFreq(), value.sumDocFreq()));
                    fieldStatistics.put(key, merged);
                } else {
                    fieldStatistics.put(key, value);
                }
            }
        }
        aggMaxDoc += lEntry.value.maxDoc();
    }
    return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc);
}

From source file:org.elasticsearch.search.dfs.AggregatedDfs.java

License:Apache License

@Override
public void readFrom(StreamInput in) throws IOException {
    // Wire format per entry: field name, term bytes (map key), term bytes again
    // (statistics payload), vlong docFreq, vlong totalTermFreq (+1-encoded so that
    // -1 "not present" survives the unsigned vlong; subOne decodes it).
    final int size = in.readVInt();
    termStatistics = HppcMaps.newMap(size);
    for (int i = 0; i < size; i++) {
        final String field = in.readString();
        final BytesRef termBytes = in.readBytesRef();
        final Term term = new Term(field, termBytes);
        final BytesRef statsTermBytes = in.readBytesRef();
        final long docFreq = in.readVLong();
        final long totalTermFreq = DfsSearchResult.subOne(in.readVLong());
        termStatistics.put(term, new TermStatistics(statsTermBytes, docFreq, totalTermFreq));
    }
    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}

From source file:org.elasticsearch.search.dfs.DfsSearchResult.java

License:Apache License

/**
 * Reads per-term statistics from the stream; {@code terms[i]} supplies the term
 * bytes for entry {@code i}. Returns the shared empty array when the stream holds
 * zero entries.
 */
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException {
    final int count = in.readVInt();
    if (count == 0) {
        return EMPTY_TERM_STATS;
    }
    assert terms.length == count;
    final TermStatistics[] stats = new TermStatistics[count];
    for (int i = 0; i < count; i++) {
        final long docFreq = in.readVLong();
        assert docFreq >= 0;
        // totalTermFreq was written +1-encoded; subOne restores -1 for "not present".
        final long totalTermFreq = subOne(in.readVLong());
        stats[i] = new TermStatistics(terms[i].bytes(), docFreq, totalTermFreq);
    }
    return stats;
}

From source file:org.elasticsearch.vectorize.VectorizeService.java

License:Apache License

/**
 * Feeds every term of every field in the given term vectors into the vectorizer,
 * along with its index-level statistics and its in-document frequency.
 */
private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException {
    for (String fieldName : termVectorsFields) {
        final TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator();
        while (termsEnum.next() != null) {
            final BytesRef termBytes = termsEnum.term();
            final TermStatistics stats =
                    new TermStatistics(termBytes, termsEnum.docFreq(), termsEnum.totalTermFreq());
            final int freq = termsEnum.postings(null, null, PostingsEnum.ALL).freq();
            vectorizer.add(new Term(fieldName, termBytes), stats, freq);
        }
    }
}

From source file:pretraga.IsolationSimilarity.java

/**
 * Scores each document matched by the query vector against the terms it shares
 * with the vector, using ClassicSimilarity TF/IDF, and prints per-term scores.
 *
 * <p>Fixes over the previous version: the Directory/IndexReader are closed via
 * try-with-resources (they leaked before), a non-ClassicSimilarity searcher now
 * fails fast instead of throwing a NullPointerException later, and exceptions
 * are reported instead of being silently swallowed.
 *
 * @param vec the raw query vector string (parsed by processInput)
 */
public void test(String vec) {
    List<String> vector = processInput(vec);
    try (Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
            IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);

        // The scoring below requires ClassicSimilarity; fail fast otherwise
        // (the old code left `sim` null and NPE'd on first use).
        if (!(searcher.getSimilarity(true) instanceof ClassicSimilarity)) {
            throw new IllegalStateException(
                    "Expected ClassicSimilarity but got " + searcher.getSimilarity(true));
        }
        ClassicSimilarity sim = (ClassicSimilarity) searcher.getSimilarity(true);

        List<Integer> docIds = getDocumentsFromVector(vector, reader, searcher);
        for (int docId : docIds) {
            Fields ff = reader.getTermVectors(docId);
            Terms terms = ff.terms(CONTENT);
            CollectionStatistics collectionStats = new CollectionStatistics(CONTENT, reader.maxDoc(),
                    terms.getDocCount(), terms.getSumTotalTermFreq(), terms.getSumDocFreq());
            Document d = reader.document(docId);

            TermsEnum te = terms.iterator();
            BytesRef by;
            while ((by = te.next()) != null) {
                String term = by.utf8ToString();
                // Only score terms that the document shares with the query vector.
                if (!vector.contains(term)) {
                    continue;
                }
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                float ttt = sim.simScorer(sim.computeWeight(collectionStats, ts),
                        reader.getContext().leaves().get(0)).score(docId, te.totalTermFreq());
                System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
            }
        }
    } catch (Exception e) {
        // Surface failures instead of swallowing them silently.
        e.printStackTrace();
    }
}