List of usage examples for org.apache.lucene.search TermStatistics docFreq
long docFreq
To view the source code for org.apache.lucene.search.TermStatistics#docFreq, click the Source Link below.
From source file:BM25LSimilarity.java
License:Apache License
/** * Computes a score factor for a simple term and returns an explanation for * that score factor./*from ww w . j a v a 2 s . c o m*/ * * <p> * The default implementation uses: * * <pre class="prettyprint"> * idf(docFreq, searcher.maxDoc()); * </pre> * * Note that {@link CollectionStatistics#maxDoc()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} * because also {@link TermStatistics#docFreq()} is used, and when the * latter is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in * the same direction. In addition, {@link CollectionStatistics#maxDoc()} is * more efficient to compute * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an * explanation for the term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { final long df = termStats.docFreq(); final long max = collectionStats.maxDoc(); final float idf = idf(df, max); return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); }
From source file:BM25LSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * * <p>/*from ww w. j av a 2 s. c o m*/ * The default implementation sums the idf factor for each term in the * phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf score factor for the * phrase and an explanation for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long max = collectionStats.maxDoc(); float idf = 0.0f; List<Explanation> details = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, max); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", details); }
From source file:com.core.nlp.similarity.TFIDFSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * <p/>/*from w w w . jav a2 s. c o m*/ * <p/> * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long max = collectionStats.maxDoc(); float idf = 0.0f; List<Explanation> subs = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, max); subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", subs); }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** * Computes a score factor for a simple term and returns an explanation * for that score factor./* w ww. j a v a2 s.com*/ * * <p> * The default implementation uses: * * <pre class="prettyprint"> * idf(docFreq, docCount); * </pre> * * Note that {@link CollectionStatistics#docCount()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link TermStatistics#docFreq()} is used, and when the latter * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction. * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an explanation for the term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * * <p>/*from www . j av a2 s .c o m*/ * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); float idf = 0.0f; List<Explanation> details = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, docCount); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", details); }
From source file:com.o19s.es.explore.ExplorerQuery.java
License:Apache License
/**
 * Builds the Weight for this exploration query.
 *
 * <p>Three paths, selected by {@code type}:
 * <ul>
 *   <li>scores not needed: delegate straight to the wrapped query;</li>
 *   <li>collection-scoped stats ({@code *_classic_idf}, {@code *_raw_df},
 *       {@code *_raw_ttf}, {@code unique_terms_count}): aggregate term
 *       statistics once up front and return a constant-score weight;</li>
 *   <li>{@code *_raw_tf}: rewrite into a boolean OR of per-term
 *       {@link PostingsExplorerQuery} clauses scored by ExplorerWeight.</li>
 * </ul>
 * Any other {@code type} value throws {@link IllegalArgumentException}.
 */
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    if (!needsScores) {
        // No scoring requested: the wrapped query's weight suffices.
        return searcher.createWeight(query, false, boost);
    }
    final Weight subWeight = searcher.createWeight(query, true, boost);
    Set<Term> terms = new HashSet<>();
    subWeight.extractTerms(terms);
    if (isCollectionScoped()) {
        // Collect per-term df / classic idf / ttf into running-statistics helpers.
        ClassicSimilarity sim = new ClassicSimilarity();
        StatisticsHelper df_stats = new StatisticsHelper();
        StatisticsHelper idf_stats = new StatisticsHelper();
        StatisticsHelper ttf_stats = new StatisticsHelper();
        for (Term term : terms) {
            TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term);
            TermStatistics tStats = searcher.termStatistics(term, ctx);
            df_stats.add(tStats.docFreq());
            idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
            ttf_stats.add(tStats.totalTermFreq());
        }
        /* If no terms are parsed in the query we opt for returning 0
           instead of throwing an exception that could break various pipelines. */
        float constantScore;
        if (terms.size() > 0) {
            // Map the requested statistic name onto the matching aggregate.
            switch (type) {
            case ("sum_classic_idf"):
                constantScore = idf_stats.getSum();
                break;
            case ("mean_classic_idf"):
                constantScore = idf_stats.getMean();
                break;
            case ("max_classic_idf"):
                constantScore = idf_stats.getMax();
                break;
            case ("min_classic_idf"):
                constantScore = idf_stats.getMin();
                break;
            case ("stddev_classic_idf"):
                constantScore = idf_stats.getStdDev();
                break;
            case "sum_raw_df":
                constantScore = df_stats.getSum();
                break;
            case "min_raw_df":
                constantScore = df_stats.getMin();
                break;
            case "max_raw_df":
                constantScore = df_stats.getMax();
                break;
            case "mean_raw_df":
                constantScore = df_stats.getMean();
                break;
            case "stddev_raw_df":
                constantScore = df_stats.getStdDev();
                break;
            case "sum_raw_ttf":
                constantScore = ttf_stats.getSum();
                break;
            case "min_raw_ttf":
                constantScore = ttf_stats.getMin();
                break;
            case "max_raw_ttf":
                constantScore = ttf_stats.getMax();
                break;
            case "mean_raw_ttf":
                constantScore = ttf_stats.getMean();
                break;
            case "stddev_raw_ttf":
                constantScore = ttf_stats.getStdDev();
                break;
            case "unique_terms_count":
                constantScore = terms.size();
                break;
            default:
                throw new RuntimeException("Invalid stat type specified.");
            }
        } else {
            constantScore = 0.0f;
        }
        // Every document gets the same precomputed score.
        return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {
            @Override
            public Explanation explain(LeafReaderContext context, int doc) throws IOException {
                Scorer scorer = scorer(context);
                int newDoc = scorer.iterator().advance(doc);
                assert newDoc == doc; // this is a DocIdSetIterator.all
                return Explanation.match(scorer.score(), "Stat Score: " + type);
            }

            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
                return new ConstantScoreScorer(this, constantScore, DocIdSetIterator.all(context.reader().maxDoc()));
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return true;
            }
        };
    } else if (type.endsWith("_raw_tf")) {
        // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery
        BooleanQuery.Builder qb = new BooleanQuery.Builder();
        for (Term t : terms) {
            qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF),
                    BooleanClause.Occur.SHOULD));
        }
        // FIXME: completely refactor this class and stop accepting a random query but a list of terms directly
        // rewriting at this point is wrong, additionally we certainly build the TermContext twice for every terms
        // problem is that we rely on extractTerms which happen too late in the process
        Query q = qb.build().rewrite(searcher.getIndexReader());
        return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type);
    }
    throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]");
}
From source file:com.xiaomi.linden.lucene.similarity.LindenSimilarity.java
License:Apache License
/**
 * Explains the idf factor for a term.
 *
 * <p>The idf value itself comes from the external {@code idfManager}, looked
 * up by the term's text — not from the docFreq/maxDoc statistics. Those
 * statistics are still reported in the explanation message for reference.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explanation carrying the externally managed idf value
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDoc = collectionStats.maxDoc();
    // idf is resolved through the IDF manager keyed by the raw term text.
    final float idf = idfManager.getIDF(termStats.term().utf8ToString());
    return new Explanation(idf, "idf(docFreq=" + docFreq + ", maxDocs=" + maxDoc + ")");
}
From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java
License:Open Source License
/** * Same as {@link TFIDFSimilarity#idfExplain(CollectionStatistics, * TermStatistics)} but uses {@link CollectionStatistics#docCount()} instead * of {@link CollectionStatistics#maxDoc()}. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an * explanation for the term./*from w w w .j ava 2s .com*/ * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics) */ @Override public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats) { final long df = termStats.docFreq(); final long max = collectionStats.docCount(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ')'); }
From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java
License:Open Source License
/**
 * Same as {@link TFIDFSimilarity#idfExplain(CollectionStatistics,
 * TermStatistics[])}, but uses {@link CollectionStatistics#docCount()}
 * instead of {@link CollectionStatistics#maxDoc()}.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf score factor for the
 * phrase and an explanation for each term.
 * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics[])
 */
@SuppressWarnings("ObjectAllocationInLoop")
@Override
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats[]) {
    final long docCount = collectionStats.docCount();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, docCount);
        // BUGFIX: label the statistic as docCount — the previous "maxDocs"
        // label misreported which collection statistic was actually used.
        exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ')'));
        idf += termIdf;
    }
    exp.setValue(idf);
    return exp;
}
From source file:elhuyar.bilakit.SimilarityCLIRFactory.java
License:Open Source License
/**
 * Computes an idf score factor for a single term and explains it.
 *
 * <p>Uses {@code idf(docFreq, searcher.maxDoc())}. {@link
 * CollectionStatistics#maxDoc()} is used rather than
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * because it errs in the same direction as {@link TermStatistics#docFreq()}
 * when the latter is inaccurate, and is cheaper to compute.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explanation carrying the idf factor for the term
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDoc = collectionStats.maxDoc();
    return new Explanation(idf(docFreq, maxDoc),
            "idf(docFreq=" + docFreq + ", maxDocs=" + maxDoc + ")");
}