List of usage examples for org.apache.lucene.search CollectionStatistics maxDoc
long maxDoc
To view the source code for org.apache.lucene.search CollectionStatistics maxDoc,
click Source Link.
From source file:BM25LSimilarity.java
License:Apache License
/** * The default implementation computes the average as * <code>sumTotalTermFreq / maxDoc</code>, or returns <code>1</code> if the * index does not store sumTotalTermFreq: any field that omits frequency * information)./*from w w w. ja v a 2s .c o m*/ */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc()); } }
From source file:BM25LSimilarity.java
License:Apache License
/**
 * Builds the idf explanation for a single term.
 *
 * <p>The value is obtained from:
 *
 * <pre class="prettyprint">
 * idf(docFreq, maxDoc);
 * </pre>
 *
 * where <code>docFreq</code> comes from {@link TermStatistics#docFreq()} and
 * <code>maxDoc</code> from {@link CollectionStatistics#maxDoc()}. Both numbers
 * are echoed in the description of the returned explanation.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor and an
 *         explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long docTotal = collectionStats.maxDoc();
    final float score = idf(docFreq, docTotal);
    final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + docTotal + ")";
    return Explanation.match(score, description);
}
From source file:BM25LSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * * <p>/*ww w . j ava 2 s .com*/ * The default implementation sums the idf factor for each term in the * phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf score factor for the * phrase and an explanation for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long max = collectionStats.maxDoc(); float idf = 0.0f; List<Explanation> details = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, max); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", details); }
From source file:com.core.nlp.similarity.TFIDFSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * <p/>/* ww w . j ava 2 s .c o m*/ * <p/> * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long max = collectionStats.maxDoc(); float idf = 0.0f; List<Explanation> subs = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, max); subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", subs); }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else {//from w w w . j a va 2s.c om final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); } }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** * Computes a score factor for a simple term and returns an explanation * for that score factor./*from w ww . j av a 2 s.c o m*/ * * <p> * The default implementation uses: * * <pre class="prettyprint"> * idf(docFreq, docCount); * </pre> * * Note that {@link CollectionStatistics#docCount()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link TermStatistics#docFreq()} is used, and when the latter * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction. * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an explanation for the term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * * <p>/*from ww w.j av a 2 s . co m*/ * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); float idf = 0.0f; List<Explanation> details = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, docCount); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", details); }
From source file:com.xiaomi.linden.lucene.similarity.LindenSimilarity.java
License:Apache License
/**
 * Builds the idf explanation for a single term.
 *
 * <p>The idf value is looked up from <code>idfManager</code> by the term's
 * text; the term's document frequency and the collection's maxDoc are only
 * reported in the description of the returned explanation.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an explanation carrying the looked-up idf value
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDocs = collectionStats.maxDoc();
    final String termText = termStats.term().utf8ToString();
    final float idfValue = idfManager.getIDF(termText);
    final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
    return new Explanation(idfValue, description);
}
From source file:elhuyar.bilakit.SimilarityCLIRFactory.java
License:Open Source License
/** * Computes a score factor for a simple term and returns an explanation * for that score factor./*from w w w . j a va 2s . com*/ * * <p> * The default implementation uses: * * <pre class="prettyprint"> * idf(docFreq, searcher.maxDoc()); * </pre> * * Note that {@link CollectionStatistics#maxDoc()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link TermStatistics#docFreq()} is used, and when the latter * is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction. * In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an explanation for the term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { final long df = termStats.docFreq(); final long max = collectionStats.maxDoc(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); }
From source file:elhuyar.bilakit.SimilarityCLIRFactory.java
License:Open Source License
/** * Computes a score factor for a phrase. * /*from ww w . ja v a 2 s. c o m*/ * <p> * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long max = collectionStats.maxDoc(); float idf = 0.0f; final Explanation exp = new Explanation(); exp.setDescription("idf(), sum of:"); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, max); exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; } exp.setValue(idf); return exp; }