List of usage examples for org.apache.lucene.search TermStatistics totalTermFreq
long totalTermFreq
To view the source code for org.apache.lucene.search TermStatistics totalTermFreq, click the Source Link.
From source file:com.o19s.es.explore.ExplorerQuery.java
License:Apache License
@Override public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { if (!needsScores) { return searcher.createWeight(query, false, boost); }//from w ww. j a v a 2s. c o m final Weight subWeight = searcher.createWeight(query, true, boost); Set<Term> terms = new HashSet<>(); subWeight.extractTerms(terms); if (isCollectionScoped()) { ClassicSimilarity sim = new ClassicSimilarity(); StatisticsHelper df_stats = new StatisticsHelper(); StatisticsHelper idf_stats = new StatisticsHelper(); StatisticsHelper ttf_stats = new StatisticsHelper(); for (Term term : terms) { TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term); TermStatistics tStats = searcher.termStatistics(term, ctx); df_stats.add(tStats.docFreq()); idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs())); ttf_stats.add(tStats.totalTermFreq()); } /* If no terms are parsed in the query we opt for returning 0 instead of throwing an exception that could break various pipelines. 
*/ float constantScore; if (terms.size() > 0) { switch (type) { case ("sum_classic_idf"): constantScore = idf_stats.getSum(); break; case ("mean_classic_idf"): constantScore = idf_stats.getMean(); break; case ("max_classic_idf"): constantScore = idf_stats.getMax(); break; case ("min_classic_idf"): constantScore = idf_stats.getMin(); break; case ("stddev_classic_idf"): constantScore = idf_stats.getStdDev(); break; case "sum_raw_df": constantScore = df_stats.getSum(); break; case "min_raw_df": constantScore = df_stats.getMin(); break; case "max_raw_df": constantScore = df_stats.getMax(); break; case "mean_raw_df": constantScore = df_stats.getMean(); break; case "stddev_raw_df": constantScore = df_stats.getStdDev(); break; case "sum_raw_ttf": constantScore = ttf_stats.getSum(); break; case "min_raw_ttf": constantScore = ttf_stats.getMin(); break; case "max_raw_ttf": constantScore = ttf_stats.getMax(); break; case "mean_raw_ttf": constantScore = ttf_stats.getMean(); break; case "stddev_raw_ttf": constantScore = ttf_stats.getStdDev(); break; case "unique_terms_count": constantScore = terms.size(); break; default: throw new RuntimeException("Invalid stat type specified."); } } else { constantScore = 0.0f; } return new ConstantScoreWeight(ExplorerQuery.this, constantScore) { @Override public Explanation explain(LeafReaderContext context, int doc) throws IOException { Scorer scorer = scorer(context); int newDoc = scorer.iterator().advance(doc); assert newDoc == doc; // this is a DocIdSetIterator.all return Explanation.match(scorer.score(), "Stat Score: " + type); } @Override public Scorer scorer(LeafReaderContext context) throws IOException { return new ConstantScoreScorer(this, constantScore, DocIdSetIterator.all(context.reader().maxDoc())); } @Override public boolean isCacheable(LeafReaderContext ctx) { return true; } }; } else if (type.endsWith("_raw_tf")) { // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery BooleanQuery.Builder qb = new 
BooleanQuery.Builder(); for (Term t : terms) { qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF), BooleanClause.Occur.SHOULD)); } // FIXME: completely refactor this class and stop accepting a random query but a list of terms directly // rewriting at this point is wrong, additionally we certainly build the TermContext twice for every terms // problem is that we rely on extractTerms which happen too late in the process Query q = qb.build().rewrite(searcher.getIndexReader()); return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type); } throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]"); }
From source file:org.apache.solr.search.stats.TermStats.java
License:Apache License
/**
 * Captures the distributed statistics for a single term.
 *
 * @param field the field the term belongs to
 * @param stats the Lucene term statistics (doc frequency and total term frequency)
 */
public TermStats(String field, TermStatistics stats) {
    // Reconstruct the Term from the field plus the raw term bytes.
    this.t = new Term(field, stats.term());
    // Human-readable "field:text" key for this term.
    this.term = field + ":" + stats.term().utf8ToString();
    this.docFreq = stats.docFreq();
    this.totalTermFreq = stats.totalTermFreq();
}
From source file:org.elasticsearch.action.search.SearchPhaseController.java
License:Apache License
/**
 * Merges the distributed-frequency (DFS) results from every shard into a single
 * {@link AggregatedDfs}: per-term statistics are summed, per-field collection
 * statistics are summed, and maxDoc is accumulated across shards.
 *
 * @param results per-shard DFS search results
 * @return the globally aggregated statistics
 */
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) {
    ObjectObjectHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap();
    ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
    long aggMaxDoc = 0;
    for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) {
        final Term[] terms = lEntry.value.terms();
        final TermStatistics[] stats = lEntry.value.termStatistics();
        assert terms.length == stats.length;
        for (int i = 0; i < terms.length; i++) {
            assert terms[i] != null;
            TermStatistics existing = termStatistics.get(terms[i]);
            if (existing != null) {
                assert terms[i].bytes().equals(existing.term());
                // totalTermFrequency is an optional statistic: if either side reports -1
                // ("not present"), optionalSum propagates -1 for the merged value.
                termStatistics.put(terms[i],
                        new TermStatistics(existing.term(),
                                existing.docFreq() + stats[i].docFreq(),
                                optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq())));
            } else {
                termStatistics.put(terms[i], stats[i]);
            }
        }
        assert !lEntry.value.fieldStatistics().containsKey(null);
        // Iterate the HPPC map's internal arrays directly; null keys mark empty slots.
        final Object[] keys = lEntry.value.fieldStatistics().keys;
        final Object[] values = lEntry.value.fieldStatistics().values;
        for (int i = 0; i < keys.length; i++) {
            if (keys[i] != null) {
                String key = (String) keys[i];
                CollectionStatistics value = (CollectionStatistics) values[i];
                assert key != null;
                CollectionStatistics existing = fieldStatistics.get(key);
                if (existing != null) {
                    // docCount, sumTotalTermFreq and sumDocFreq are also optional (-1
                    // when unavailable), hence merged via optionalSum.
                    CollectionStatistics merged = new CollectionStatistics(key,
                            existing.maxDoc() + value.maxDoc(),
                            optionalSum(existing.docCount(), value.docCount()),
                            optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()),
                            optionalSum(existing.sumDocFreq(), value.sumDocFreq()));
                    fieldStatistics.put(key, merged);
                } else {
                    fieldStatistics.put(key, value);
                }
            }
        }
        aggMaxDoc += lEntry.value.maxDoc();
    }
    return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc);
}
From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java
License:Apache License
private void writeTermStatistics(TermStatistics termStatistics) throws IOException { int docFreq = (int) termStatistics.docFreq(); assert (docFreq >= -1); writePotentiallyNegativeVInt(docFreq); long ttf = termStatistics.totalTermFreq(); assert (ttf >= -1); writePotentiallyNegativeVLong(ttf);//from ww w . j a v a2 s . c o m }
From source file:org.elasticsearch.search.controller.SearchPhaseController.java
License:Apache License
/**
 * Merges the distributed-frequency (DFS) results from every shard into a single
 * {@link AggregatedDfs}: per-term statistics are summed, per-field collection
 * statistics are summed, and maxDoc is accumulated across shards.
 *
 * @param results per-shard DFS search results
 * @return the globally aggregated statistics
 */
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) {
    ObjectObjectOpenHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap();
    ObjectObjectOpenHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
    long aggMaxDoc = 0;
    for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) {
        final Term[] terms = lEntry.value.terms();
        final TermStatistics[] stats = lEntry.value.termStatistics();
        assert terms.length == stats.length;
        for (int i = 0; i < terms.length; i++) {
            assert terms[i] != null;
            TermStatistics existing = termStatistics.get(terms[i]);
            if (existing != null) {
                assert terms[i].bytes().equals(existing.term());
                // totalTermFrequency is an optional statistic: if either side reports -1
                // ("not present"), optionalSum propagates -1 for the merged value.
                termStatistics.put(terms[i],
                        new TermStatistics(existing.term(),
                                existing.docFreq() + stats[i].docFreq(),
                                optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq())));
            } else {
                termStatistics.put(terms[i], stats[i]);
            }
        }
        // Iterate the HPPC map's internal arrays directly; `allocated` marks live slots.
        final boolean[] states = lEntry.value.fieldStatistics().allocated;
        final Object[] keys = lEntry.value.fieldStatistics().keys;
        final Object[] values = lEntry.value.fieldStatistics().values;
        for (int i = 0; i < states.length; i++) {
            if (states[i]) {
                String key = (String) keys[i];
                CollectionStatistics value = (CollectionStatistics) values[i];
                assert key != null;
                CollectionStatistics existing = fieldStatistics.get(key);
                if (existing != null) {
                    // docCount, sumTotalTermFreq and sumDocFreq are also optional (-1
                    // when unavailable), hence merged via optionalSum.
                    CollectionStatistics merged = new CollectionStatistics(key,
                            existing.maxDoc() + value.maxDoc(),
                            optionalSum(existing.docCount(), value.docCount()),
                            optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()),
                            optionalSum(existing.sumDocFreq(), value.sumDocFreq()));
                    fieldStatistics.put(key, merged);
                } else {
                    fieldStatistics.put(key, value);
                }
            }
        }
        aggMaxDoc += lEntry.value.maxDoc();
    }
    return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc);
}
From source file:org.elasticsearch.search.dfs.AggregatedDfs.java
License:Apache License
/**
 * Serializes the aggregated DFS statistics to the stream.
 * Wire layout: term-statistics count, then one entry per term (field, term bytes,
 * stats term bytes, doc frequency, totalTermFreq shifted by one via
 * {@link DfsSearchResult#addOne} so that -1 fits a VLong), then the field
 * statistics block, then maxDoc.
 *
 * @param out the output stream
 * @throws IOException on stream write failure
 */
@Override
public void writeTo(final StreamOutput out) throws IOException {
    out.writeVInt(termStatistics.size());
    // Walk the HPPC map's internal arrays; `allocated` marks live slots.
    final boolean[] allocated = termStatistics.allocated;
    final Object[] termKeys = termStatistics.keys;
    final Object[] statValues = termStatistics.values;
    for (int slot = 0; slot < allocated.length; slot++) {
        if (!allocated[slot]) {
            continue; // empty hash slot
        }
        Term term = (Term) termKeys[slot];
        out.writeString(term.field());
        out.writeBytesRef(term.bytes());
        TermStatistics stats = (TermStatistics) statValues[slot];
        out.writeBytesRef(stats.term());
        out.writeVLong(stats.docFreq());
        out.writeVLong(DfsSearchResult.addOne(stats.totalTermFreq()));
    }
    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
From source file:org.elasticsearch.search.dfs.DfsSearchResult.java
License:Apache License
/**
 * Writes one term's statistics: doc frequency followed by total term frequency.
 * totalTermFreq may be -1 ("unknown"), so it is shifted by one via {@code addOne}
 * to fit the non-negative VLong encoding.
 *
 * @param out           the output stream
 * @param termStatistic the statistics to write; docFreq must be non-negative
 * @throws IOException on stream write failure
 */
public static void writeSingleTermStats(StreamOutput out, TermStatistics termStatistic) throws IOException {
    final long df = termStatistic.docFreq();
    assert df >= 0;
    out.writeVLong(df);
    out.writeVLong(addOne(termStatistic.totalTermFreq()));
}
From source file:org.elasticsearch.vectorize.Vectorizer.java
License:Apache License
private int getValue(String fieldName, @Nullable TermStatistics termStatistics, int freq) { ValueOption valueOption = valueOptions.get(fieldName); switch (valueOption) { case BINARY:/*from w w w .ja va 2s .c om*/ return 1; case TERM_FREQ: return freq; case DOC_FREQ: return termStatistics != null ? (int) termStatistics.docFreq() : -1; // -1 term stats not requested case TTF: return termStatistics != null ? (int) termStatistics.totalTermFreq() : -1; // -1 term stats not requested default: throw new IllegalArgumentException("[" + valueOption + "] is not a valid valud option"); } }