Example usage for org.apache.lucene.search CollectionStatistics docCount

List of usage examples for org.apache.lucene.search CollectionStatistics docCount

Introduction

In this page you can find the example usage for org.apache.lucene.search CollectionStatistics docCount.

Prototype

long docCount

To view the source code for org.apache.lucene.search CollectionStatistics docCount, click the Source Link.

Usage

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes the average length of a field from its collection-level statistics.
 *
 * <p>The default implementation computes the average as
 * <code>sumTotalTermFreq / docCount</code>, or returns <code>1</code> if
 * <code>sumTotalTermFreq</code> is not positive (the field does not exist, or
 * the statistic is unsupported, e.g. for a field that omits frequency
 * information). When <code>docCount</code> is <code>-1</code>,
 * <code>maxDoc</code> is used as the divisor instead.
 *
 * @param collectionStats collection-level statistics of the field
 * @return the average field length, or <code>1</code> if it cannot be computed
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long totalTermFreq = collectionStats.sumTotalTermFreq();
    if (totalTermFreq <= 0) {
        // field does not exist, or stat is unsupported
        return 1f;
    }

    // A doc count of -1 is replaced by maxDoc().
    long docs = collectionStats.docCount();
    if (docs == -1) {
        docs = collectionStats.maxDoc();
    }
    return (float) (totalTermFreq / (double) docs);
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor./*w ww.  j av a 2s.  c  o  m*/
 *
 * <p>
 * The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * Note that {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
 * {@link TermStatistics#docFreq()} is used, and when the latter
 * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc()
            : collectionStats.docCount();
    final float idf = idf(df, docCount);
    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 *
 * <p>/*from   w  w  w  .  java2s  .c o m*/
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc()
            : collectionStats.docCount();
    float idf = 0.0f;
    List<Explanation> details = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, docCount);
        details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", details);
}

From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java

License:Open Source License

/**
 * Same as {@link TFIDFSimilarity#idfExplain(CollectionStatistics,
 * TermStatistics)} but uses {@link CollectionStatistics#docCount()} instead
 * of {@link CollectionStatistics#maxDoc()}.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor and an
 * explanation for the term./* w ww.ja v  a  2 s .  c o m*/
 * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics)
 */
@Override
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.docCount();
    final float idf = idf(df, max);
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ')');
}

From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java

License:Open Source License

/**
 * Phrase variant of the idf explanation: same as
 * {@link TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics[])},
 * but uses {@link CollectionStatistics#docCount()} instead of
 * {@link CollectionStatistics#maxDoc()}.
 *
 * <p>The returned explanation carries the sum of the per-term idf values and
 * one detail entry per term.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf score factor for the
 * phrase and an explanation for each term.
 * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics[])
 */
@SuppressWarnings("ObjectAllocationInLoop")
@Override
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats[]) {
    final long docCount = collectionStats.docCount();

    final Explanation result = new Explanation();
    result.setDescription("idf(), sum of:");

    float idfSum = 0.0f;
    for (final TermStatistics termStat : termStats) {
        final long docFreq = termStat.docFreq();
        final float partialIdf = idf(docFreq, docCount);
        idfSum += partialIdf;
        result.addDetail(
                new Explanation(partialIdf, "idf(docFreq=" + docFreq + ", maxDocs=" + docCount + ')'));
    }

    result.setValue(idfSum);
    return result;
}

From source file:eu.europeana.ranking.bm25f.similarity.BM25FSimilarity.java

License:Apache License

/**
 * Compute the average length for a field, given its stats.
 *
 * <p>The average is {@code sumTotalTermFreq / docCount}. If
 * {@code sumTotalTermFreq} is not positive, {@code 1} is returned instead of
 * a negative or zero average. If {@code docCount} is {@code -1},
 * {@code maxDoc} is used as the divisor; if the divisor is still not
 * positive, {@code 1} is returned to avoid a NaN or infinite result.
 *
 * @param stats
 *            the length statistics of a field.
 * @return the average length of the field, or {@code 1} if it cannot be
 *         computed from the statistics.
 */
private float avgFieldLength(CollectionStatistics stats) {
    final long sumTotalTermFreq = stats.sumTotalTermFreq();
    if (sumTotalTermFreq <= 0) {
        return 1f;
    }

    long docCount = stats.docCount();
    if (docCount == -1) {
        docCount = stats.maxDoc();
    }
    if (docCount <= 0) {
        // Guard against division by zero (or a negative divisor).
        return 1f;
    }
    return (float) sumTotalTermFreq / (float) docCount;
}

From source file:org.apache.solr.search.stats.CollectionStats.java

License:Apache License

/**
 * Copies the field name and the four counters (maxDoc, docCount,
 * sumTotalTermFreq, sumDocFreq) out of the given Lucene statistics.
 *
 * @param stats the collection statistics to copy the values from
 */
public CollectionStats(CollectionStatistics stats) {
    // Field identity.
    field = stats.field();

    // Document-level counters.
    docCount = stats.docCount();
    maxDoc = stats.maxDoc();

    // Term-level sums.
    sumDocFreq = stats.sumDocFreq();
    sumTotalTermFreq = stats.sumTotalTermFreq();
}

From source file:org.elasticsearch.action.search.SearchPhaseController.java

License:Apache License

/**
 * Merges the term statistics and field statistics carried by the given DFS
 * results into one {@link AggregatedDfs}, and sums the {@code maxDoc} of all
 * entries.
 *
 * <p>Statistics for a term or field that appears in more than one entry are
 * combined: {@code docFreq} and {@code maxDoc} are added directly, while
 * {@code totalTermFreq}, {@code docCount}, {@code sumTotalTermFreq} and
 * {@code sumDocFreq} are combined through {@code optionalSum}.
 *
 * @param results the per-entry DFS results to merge
 * @return the aggregated term statistics, field statistics and total maxDoc
 */
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) {
    ObjectObjectHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap();
    ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
    long aggMaxDoc = 0;
    for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) {
        // terms and stats are parallel arrays: stats[i] belongs to terms[i]
        final Term[] terms = lEntry.value.terms();
        final TermStatistics[] stats = lEntry.value.termStatistics();
        assert terms.length == stats.length;
        for (int i = 0; i < terms.length; i++) {
            assert terms[i] != null;
            TermStatistics existing = termStatistics.get(terms[i]);
            if (existing != null) {
                assert terms[i].bytes().equals(existing.term());
                // totalTermFrequency is an optional statistic we need to check if either one or both
                // are set to -1 which means not present and then set it globally to -1
                termStatistics.put(terms[i],
                        new TermStatistics(existing.term(), existing.docFreq() + stats[i].docFreq(),
                                optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq())));
            } else {
                // first occurrence of this term: take the entry's statistics as-is
                termStatistics.put(terms[i], stats[i]);
            }

        }

        assert !lEntry.value.fieldStatistics().containsKey(null);
        // Walk the field-statistics map through its keys/values arrays; slots with a
        // null key are skipped (presumably unused hash slots -- confirm against HPPC).
        final Object[] keys = lEntry.value.fieldStatistics().keys;
        final Object[] values = lEntry.value.fieldStatistics().values;
        for (int i = 0; i < keys.length; i++) {
            if (keys[i] != null) {
                String key = (String) keys[i];
                CollectionStatistics value = (CollectionStatistics) values[i];
                assert key != null;
                CollectionStatistics existing = fieldStatistics.get(key);
                if (existing != null) {
                    // maxDoc is summed directly; the other statistics go through optionalSum
                    CollectionStatistics merged = new CollectionStatistics(key,
                            existing.maxDoc() + value.maxDoc(),
                            optionalSum(existing.docCount(), value.docCount()),
                            optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()),
                            optionalSum(existing.sumDocFreq(), value.sumDocFreq()));
                    fieldStatistics.put(key, merged);
                } else {
                    // first occurrence of this field: take the entry's statistics as-is
                    fieldStatistics.put(key, value);
                }
            }
        }
        aggMaxDoc += lEntry.value.maxDoc();
    }
    return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc);
}

From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java

License:Apache License

/**
 * Writes the statistics of a field in this order: sum of total term
 * frequencies, sum of document frequencies, document count.
 *
 * <p>Each value is asserted to be at least {@code -1} immediately before it
 * is written. The document count is narrowed to an {@code int} and written
 * as a VInt; the other two values are written as VLongs.
 *
 * @param fieldStats the collection statistics of the field to write
 * @throws IOException declared by this method; presumably raised by the
 *         write helpers -- confirm against their definitions
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // The doc count is narrowed from long to int before it is written.
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}

From source file:org.elasticsearch.index.similarity.ScriptedSimilarity.java

License:Apache License

/**
 * Builds the weight for a query from the boost, the field's collection
 * statistics and the statistics of each term.
 *
 * <p>The field's document count is taken from
 * {@code collectionStats.docCount()}; when that is {@code -1},
 * {@code collectionStats.maxDoc()} is used instead. One {@code Term} is
 * created per entry of {@code termStats}, in the same order.
 *
 * @param boost the boost passed to the {@code Query}
 * @param collectionStats collection-level statistics of the field
 * @param termStats term-level statistics, one per term
 * @return a {@code Weight} built from the field name, query, field and terms
 */
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    final Query query = new Query(boost);

    // A doc count of -1 is replaced by maxDoc().
    final long reportedDocCount = collectionStats.docCount();
    final long docCount = reportedDocCount == -1 ? collectionStats.maxDoc() : reportedDocCount;
    final Field field = new Field(docCount, collectionStats.sumDocFreq(), collectionStats.sumTotalTermFreq());

    final Term[] terms = new Term[termStats.length];
    int slot = 0;
    for (final TermStatistics stat : termStats) {
        terms[slot++] = new Term(stat.docFreq(), stat.totalTermFreq());
    }

    return new Weight(collectionStats.field(), query, field, terms);
}