List of usage examples for org.apache.lucene.search CollectionStatistics docCount
long docCount
To view the source code for org.apache.lucene.search.CollectionStatistics#docCount(), click the Source Link.
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else {//from ww w . j av a2 s. c o m final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); } }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** * Computes a score factor for a simple term and returns an explanation * for that score factor./*w ww. j av a 2s. c o m*/ * * <p> * The default implementation uses: * * <pre class="prettyprint"> * idf(docFreq, docCount); * </pre> * * Note that {@link CollectionStatistics#docCount()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link TermStatistics#docFreq()} is used, and when the latter * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction. * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an explanation for the term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); }
From source file:com.o19s.bm25f.BM25FSimilarity.java
License:Apache License
/** * Computes a score factor for a phrase. * * <p>/*from w w w . java2s .c o m*/ * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); float idf = 0.0f; List<Explanation> details = new ArrayList<>(); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, docCount); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); idf += termIdf; } return Explanation.match(idf, "idf(), sum of:", details); }
From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java
License:Open Source License
/** * Same as {@link TFIDFSimilarity#idfExplain(CollectionStatistics, * TermStatistics)} but uses {@link CollectionStatistics#docCount()} instead * of {@link CollectionStatistics#maxDoc()}. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an * explanation for the term./* w ww.ja v a 2 s . c o m*/ * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics) */ @Override public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats) { final long df = termStats.docFreq(); final long max = collectionStats.docCount(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ')'); }
From source file:de.unihildesheim.iw.lucene.search.FDRDefaultSimilarity.java
License:Open Source License
/** * Same as {@link TFIDFSimilarity#idfExplain(CollectionStatistics, * TermStatistics[])}, but uses {@link CollectionStatistics#docCount()} * instead of {@link CollectionStatistics#maxDoc()}. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf score factor for the * phrase and an explanation for each term. * @see TFIDFSimilarity#idfExplain(CollectionStatistics, TermStatistics[]) *///from w w w . j a v a 2 s . c o m @SuppressWarnings("ObjectAllocationInLoop") @Override public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics termStats[]) { final long max = collectionStats.docCount(); float idf = 0.0f; final Explanation exp = new Explanation(); exp.setDescription("idf(), sum of:"); for (final TermStatistics stat : termStats) { final long df = stat.docFreq(); final float termIdf = idf(df, max); exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ')')); idf += termIdf; } exp.setValue(idf); return exp; }
From source file:eu.europeana.ranking.bm25f.similarity.BM25FSimilarity.java
License:Apache License
/** * Compute the average length for a field, given its stats. * //from w w w .j a va 2s .c om * @param the * length statistics of a field. * @return the average length of the field. */ private float avgFieldLength(CollectionStatistics stats) { // logger.info("sum total term freq \t {}", stats.sumTotalTermFreq()); // logger.info("doc count \t {}", stats.docCount()); return (float) stats.sumTotalTermFreq() / (float) stats.docCount(); }
From source file:org.apache.solr.search.stats.CollectionStats.java
License:Apache License
/**
 * Creates an instance by copying the field name, maxDoc, docCount,
 * sumTotalTermFreq and sumDocFreq values from the given statistics.
 *
 * @param stats the collection statistics to copy from
 */
public CollectionStats(CollectionStatistics stats) {
    // Aggregate frequency sums.
    this.sumDocFreq = stats.sumDocFreq();
    this.sumTotalTermFreq = stats.sumTotalTermFreq();
    // Document counts.
    this.docCount = stats.docCount();
    this.maxDoc = stats.maxDoc();
    // Field the statistics belong to.
    this.field = stats.field();
}
From source file:org.elasticsearch.action.search.SearchPhaseController.java
License:Apache License
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) { ObjectObjectHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap(); ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap(); long aggMaxDoc = 0; for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) { final Term[] terms = lEntry.value.terms(); final TermStatistics[] stats = lEntry.value.termStatistics(); assert terms.length == stats.length; for (int i = 0; i < terms.length; i++) { assert terms[i] != null; TermStatistics existing = termStatistics.get(terms[i]); if (existing != null) { assert terms[i].bytes().equals(existing.term()); // totalTermFrequency is an optional statistic we need to check if either one or both // are set to -1 which means not present and then set it globally to -1 termStatistics.put(terms[i], new TermStatistics(existing.term(), existing.docFreq() + stats[i].docFreq(), optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq()))); } else { termStatistics.put(terms[i], stats[i]); }/*from w w w . ja v a2 s . 
c o m*/ } assert !lEntry.value.fieldStatistics().containsKey(null); final Object[] keys = lEntry.value.fieldStatistics().keys; final Object[] values = lEntry.value.fieldStatistics().values; for (int i = 0; i < keys.length; i++) { if (keys[i] != null) { String key = (String) keys[i]; CollectionStatistics value = (CollectionStatistics) values[i]; assert key != null; CollectionStatistics existing = fieldStatistics.get(key); if (existing != null) { CollectionStatistics merged = new CollectionStatistics(key, existing.maxDoc() + value.maxDoc(), optionalSum(existing.docCount(), value.docCount()), optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()), optionalSum(existing.sumDocFreq(), value.sumDocFreq())); fieldStatistics.put(key, merged); } else { fieldStatistics.put(key, value); } } } aggMaxDoc += lEntry.value.maxDoc(); } return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc); }
From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java
License:Apache License
/**
 * Writes the field-level statistics in the order sumTotalTermFreq,
 * sumDocFreq, docCount. Each value is asserted to be at least -1 before it is
 * written.
 *
 * @param fieldStats the collection statistics of the field to write
 * @throws IOException if one of the underlying write calls fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // docCount is narrowed to int and written as a VInt
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
From source file:org.elasticsearch.index.similarity.ScriptedSimilarity.java
License:Apache License
@Override public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { Query query = new Query(boost); long docCount = collectionStats.docCount(); if (docCount == -1) { docCount = collectionStats.maxDoc(); }//from www . j a v a 2 s . c o m Field field = new Field(docCount, collectionStats.sumDocFreq(), collectionStats.sumTotalTermFreq()); Term[] terms = new Term[termStats.length]; for (int i = 0; i < termStats.length; ++i) { terms[i] = new Term(termStats[i].docFreq(), termStats[i].totalTermFreq()); } return new Weight(collectionStats.field(), query, field, terms); }