Example usage for org.apache.lucene.search CollectionStatistics maxDoc

List of usage examples for org.apache.lucene.search CollectionStatistics maxDoc

Introduction

In this page you can find the example usage for org.apache.lucene.search CollectionStatistics maxDoc.

Prototype

long maxDoc

To view the source code for org.apache.lucene.search CollectionStatistics maxDoc, click Source Link.

Usage

From source file:BM25LSimilarity.java

License:Apache License

/**
 * The default implementation computes the average as
 * <code>sumTotalTermFreq / maxDoc</code>, or returns <code>1</code> if the
 * index does not store sumTotalTermFreq: any field that omits frequency
 * information)./*from   w w w.  ja v  a 2s .c o  m*/
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    if (sumTotalTermFreq <= 0) {
        return 1f; // field does not exist, or stat is unsupported
    } else {
        return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
    }
}

From source file:BM25LSimilarity.java

License:Apache License

/**
 * Builds the explanation of the idf score factor for a single term.
 *
 * <p>
 * The factor is obtained from:
 *
 * <pre class="prettyprint">
 * idf(termStats.docFreq(), collectionStats.maxDoc());
 * </pre>
 *
 * {@link CollectionStatistics#maxDoc()} is used rather than
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * because it is combined with {@link TermStatistics#docFreq()}: when the
 * latter is inaccurate, {@link CollectionStatistics#maxDoc()} is inaccurate
 * in the same direction. {@link CollectionStatistics#maxDoc()} is also more
 * efficient to compute.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explanation holding the idf score factor and a description of
 * the document frequency and maxDoc values it was computed from
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDocs = collectionStats.maxDoc();
    final float factor = idf(docFreq, maxDocs);
    final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
    return Explanation.match(factor, description);
}

From source file:BM25LSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 *
 * <p>/*ww  w .  j ava  2  s .com*/
 * The default implementation sums the idf factor for each term in the
 * phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf score factor for the
 * phrase and an explanation for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    List<Explanation> details = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", details);
}

From source file:com.core.nlp.similarity.TFIDFSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 * <p/>/*  ww  w .  j  ava 2  s .c  o  m*/
 * <p/>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats       term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf
 * score factor for the phrase and an explanation
 * for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    List<Explanation> subs = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", subs);
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Returns the average length of the field, computed as
 * <code>sumTotalTermFreq / docCount</code>. When
 * {@link CollectionStatistics#docCount()} reports <code>-1</code>,
 * {@link CollectionStatistics#maxDoc()} is used as the divisor instead.
 * Returns <code>1</code> when <code>sumTotalTermFreq</code> is not positive
 * (for example, a field that omits frequency information).
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long totalTermFreq = collectionStats.sumTotalTermFreq();
    if (totalTermFreq > 0) {
        final long divisor;
        if (collectionStats.docCount() == -1) {
            divisor = collectionStats.maxDoc();
        } else {
            divisor = collectionStats.docCount();
        }
        return (float) (totalTermFreq / (double) divisor);
    }
    // field does not exist, or stat is unsupported
    return 1f;
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor./*from   w  ww . j av  a 2 s.c o m*/
 *
 * <p>
 * The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * Note that {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
 * {@link TermStatistics#docFreq()} is used, and when the latter
 * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc()
            : collectionStats.docCount();
    final float idf = idf(df, docCount);
    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 *
 * <p>/*from  ww w.j av  a 2 s  .  co m*/
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc()
            : collectionStats.docCount();
    float idf = 0.0f;
    List<Explanation> details = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, docCount);
        details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", details);
}

From source file:com.xiaomi.linden.lucene.similarity.LindenSimilarity.java

License:Apache License

/**
 * Explains the idf factor of a single term. The idf value itself is looked
 * up from {@code idfManager} by the term's text; the document frequency and
 * maxDoc statistics appear only in the explanation's description.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explanation carrying the looked-up idf value and a description
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final String description = "idf(docFreq=" + termStats.docFreq()
            + ", maxDocs=" + collectionStats.maxDoc() + ")";
    final String termText = termStats.term().utf8ToString();
    final float termIdf = idfManager.getIDF(termText);
    return new Explanation(termIdf, description);
}

From source file:elhuyar.bilakit.SimilarityCLIRFactory.java

License:Open Source License

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor./*from   w  w  w  . j  a va 2s . com*/
 * 
 * <p>
 * The default implementation uses:
 * 
 * <pre class="prettyprint">
 * idf(docFreq, searcher.maxDoc());
 * </pre>
 * 
 * Note that {@link CollectionStatistics#maxDoc()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
 * {@link TermStatistics#docFreq()} is used, and when the latter 
 * is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
 * In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
 *   
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor 
        and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.maxDoc();
    final float idf = idf(df, max);
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}

From source file:elhuyar.bilakit.SimilarityCLIRFactory.java

License:Open Source License

/**
 * Computes a score factor for a phrase.
 * /*from   ww w . ja v  a 2  s.  c  o m*/
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 * 
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf 
 *         score factor for the phrase and an explanation 
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    exp.setValue(idf);
    return exp;
}