Example usage for org.apache.lucene.search TermStatistics docFreq

List of usage examples for org.apache.lucene.search TermStatistics docFreq

Introduction

In this page you can find the example usage for org.apache.lucene.search TermStatistics docFreq.

Prototype

long docFreq

To view the source code for org.apache.lucene.search TermStatistics docFreq, click the source link below.

Click Source Link

Usage

From source file:BM25LSimilarity.java

License:Apache License

/**
 * Computes a score factor for a simple term and returns an explanation for
 * that score factor./*from ww  w .  j  a v  a 2 s  . c o m*/
 *
 * <p>
 * The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, searcher.maxDoc());
 * </pre>
 *
 * Note that {@link CollectionStatistics#maxDoc()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * because also {@link TermStatistics#docFreq()} is used, and when the
 * latter is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in
 * the same direction. In addition, {@link CollectionStatistics#maxDoc()} is
 * more efficient to compute
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor and an
 * explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.maxDoc();
    final float idf = idf(df, max);
    return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}

From source file:BM25LSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 *
 * <p>/*from   ww w.  j av a 2  s. c o  m*/
 * The default implementation sums the idf factor for each term in the
 * phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf score factor for the
 * phrase and an explanation for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    List<Explanation> details = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", details);
}

From source file:com.core.nlp.similarity.TFIDFSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 * <p/>/*from   w w  w . jav  a2 s.  c  o  m*/
 * <p/>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats       term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf
 * score factor for the phrase and an explanation
 * for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    List<Explanation> subs = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", subs);
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor./*  w  ww. j  a v a2  s.com*/
 *
 * <p>
 * The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * Note that {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
 * {@link TermStatistics#docFreq()} is used, and when the latter
 * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc()
            : collectionStats.docCount();
    final float idf = idf(df, docCount);
    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}

From source file:com.o19s.bm25f.BM25FSimilarity.java

License:Apache License

/**
 * Computes a score factor for a phrase.
 *
 * <p>/*from www . j  av  a2 s .c o m*/
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc()
            : collectionStats.docCount();
    float idf = 0.0f;
    List<Explanation> details = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, docCount);
        details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
        idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", details);
}

From source file:com.o19s.es.explore.ExplorerQuery.java

License:Apache License

// Builds a Weight that scores every matching document with a single constant
// derived from per-term index statistics (df / classic idf / total term
// frequency), selected by the `type` string. When scores are not requested the
// wrapped query's weight is returned unchanged.
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    // No scoring requested: nothing to explore, delegate to the wrapped query.
    if (!needsScores) {
        return searcher.createWeight(query, false, boost);
    }
    // Build the sub-weight only to extract the query's terms from it.
    final Weight subWeight = searcher.createWeight(query, true, boost);
    Set<Term> terms = new HashSet<>();
    subWeight.extractTerms(terms);
    if (isCollectionScoped()) {
        // Collection-scoped statistics: aggregate one value per term across the
        // whole index, then reduce (sum/mean/max/min/stddev) to one constant.
        ClassicSimilarity sim = new ClassicSimilarity();
        StatisticsHelper df_stats = new StatisticsHelper();   // raw document frequency per term
        StatisticsHelper idf_stats = new StatisticsHelper();  // classic idf per term
        StatisticsHelper ttf_stats = new StatisticsHelper();  // total term frequency per term

        for (Term term : terms) {
            TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term);
            TermStatistics tStats = searcher.termStatistics(term, ctx);
            df_stats.add(tStats.docFreq());
            // NOTE(review): idf uses numDocs() here while df is the raw docFreq —
            // presumably intentional for the "classic idf" feature; confirm.
            idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
            ttf_stats.add(tStats.totalTermFreq());
        }

        /*
        If no terms are parsed in the query we opt for returning 0
        instead of throwing an exception that could break various
        pipelines.
         */
        float constantScore;

        if (terms.size() > 0) {
            // Map the requested statistic name onto the matching aggregate.
            switch (type) {
            case ("sum_classic_idf"):
                constantScore = idf_stats.getSum();
                break;
            case ("mean_classic_idf"):
                constantScore = idf_stats.getMean();
                break;
            case ("max_classic_idf"):
                constantScore = idf_stats.getMax();
                break;
            case ("min_classic_idf"):
                constantScore = idf_stats.getMin();
                break;
            case ("stddev_classic_idf"):
                constantScore = idf_stats.getStdDev();
                break;
            case "sum_raw_df":
                constantScore = df_stats.getSum();
                break;
            case "min_raw_df":
                constantScore = df_stats.getMin();
                break;
            case "max_raw_df":
                constantScore = df_stats.getMax();
                break;
            case "mean_raw_df":
                constantScore = df_stats.getMean();
                break;
            case "stddev_raw_df":
                constantScore = df_stats.getStdDev();
                break;
            case "sum_raw_ttf":
                constantScore = ttf_stats.getSum();
                break;
            case "min_raw_ttf":
                constantScore = ttf_stats.getMin();
                break;
            case "max_raw_ttf":
                constantScore = ttf_stats.getMax();
                break;
            case "mean_raw_ttf":
                constantScore = ttf_stats.getMean();
                break;
            case "stddev_raw_ttf":
                constantScore = ttf_stats.getStdDev();
                break;
            case "unique_terms_count":
                constantScore = terms.size();
                break;

            default:
                throw new RuntimeException("Invalid stat type specified.");
            }
        } else {
            constantScore = 0.0f;
        }

        // Every document gets the same aggregated score.
        return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {

            @Override
            public Explanation explain(LeafReaderContext context, int doc) throws IOException {
                Scorer scorer = scorer(context);
                int newDoc = scorer.iterator().advance(doc);
                assert newDoc == doc; // this is a DocIdSetIterator.all
                return Explanation.match(scorer.score(), "Stat Score: " + type);
            }

            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
                // Matches all docs in the segment at the constant score.
                return new ConstantScoreScorer(this, constantScore,
                        DocIdSetIterator.all(context.reader().maxDoc()));
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return true;
            }

        };
    } else if (type.endsWith("_raw_tf")) {
        // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery
        BooleanQuery.Builder qb = new BooleanQuery.Builder();
        for (Term t : terms) {
            qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF),
                    BooleanClause.Occur.SHOULD));
        }
        // FIXME: completely refactor this class and stop accepting a random query but a list of terms directly
        // rewriting at this point is wrong, additionally we certainly build the TermContext twice for every terms
        // problem is that we rely on extractTerms which happen too late in the process
        Query q = qb.build().rewrite(searcher.getIndexReader());
        return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type);
    }
    throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]");
}

From source file:com.xiaomi.linden.lucene.similarity.LindenSimilarity.java

License:Apache License

// Explains an idf factor whose value comes from an external IDF manager
// (looked up by the term's text) rather than from docFreq/maxDoc — those
// statistics appear only in the explanation message.
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDocs = collectionStats.maxDoc();
    final float managedIdf = idfManager.getIDF(termStats.term().utf8ToString());
    return new Explanation(managedIdf, "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")");
}

From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java

License:Open Source License

/**
 * Same as {@link TFIDFSimilarity#idfExplain(CollectionStatistics,
 * TermStatistics)} but uses {@link CollectionStatistics#docCount()} instead
 * of {@link CollectionStatistics#maxDoc()}.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor and an
 * explanation for the term./*from  w  w w  .j ava 2s  .com*/
 * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics)
 */
@Override
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.docCount();
    final float idf = idf(df, max);
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ')');
}

From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java

License:Open Source License

/**
 * Like {@link TFIDFSimilarity#idfExplain(CollectionStatistics,
 * TermStatistics[])}, but uses {@link CollectionStatistics#docCount()} in
 * place of {@link CollectionStatistics#maxDoc()}. Sums the idf factor of
 * every term in the phrase and attaches one detail per term.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explanation whose value is the summed idf, with per-term details
 * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics[])
 */
@SuppressWarnings("ObjectAllocationInLoop")
@Override
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats[]) {
    final long docCount = collectionStats.docCount();
    final Explanation summary = new Explanation();
    summary.setDescription("idf(), sum of:");
    float idfSum = 0.0f;
    for (int i = 0; i < termStats.length; i++) {
        final long docFreq = termStats[i].docFreq();
        final float termIdf = idf(docFreq, docCount);
        summary.addDetail(new Explanation(termIdf,
                "idf(docFreq=" + docFreq + ", maxDocs=" + docCount + ')'));
        idfSum += termIdf;
    }
    summary.setValue(idfSum);
    return summary;
}

From source file:elhuyar.bilakit.SimilarityCLIRFactory.java

License:Open Source License

/**
 * Builds an idf score factor for a single term and explains it.
 *
 * <p>{@link CollectionStatistics#maxDoc()} is used rather than
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * so that any inaccuracy in {@link TermStatistics#docFreq()} is matched by an
 * inaccuracy in the same direction, and because maxDoc is cheaper to compute.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explanation carrying the idf factor and its derivation
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDocs = collectionStats.maxDoc();
    return new Explanation(idf(docFreq, maxDocs),
            "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")");
}