Usage examples for the org.apache.lucene.search.TermStatistics constructor
public TermStatistics(BytesRef term, long docFreq, long totalTermFreq)
From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleWeight.java
License:Apache License
/**
 * Builds the weight for a {@link FlexibleQuery}.
 *
 * <p>For every cell of {@code query.getTerms()} a {@code TermContext} is resolved against the
 * searcher's top-level reader context and a {@code Similarity.SimWeight} is computed; the results
 * are stored in {@code termStatsMatrix}. When {@code query.enableGlobalIDF()} is true, the term
 * statistics fed to the similarity are not the per-cell ones but, for each term column, the
 * maximum docFreq / totalTermFreq found across all rows of {@code query.getGlobalTerms()}.
 *
 * @param query    the query this weight belongs to
 * @param searcher the searcher providing the similarity, reader context and statistics
 * @throws IOException if building a {@code TermContext} or reading statistics fails
 */
public FlexibleWeight(FlexibleQuery query, IndexSearcher searcher) throws IOException {
    this.query = query;
    this.similarity = searcher.getSimilarity();
    final IndexReaderContext context = searcher.getTopReaderContext();

    // Shared by the global and the local term matrices so that each distinct Term is built once.
    Map<Term, TermContext> builtTermMap = new HashMap<>();

    int[] maxDocFreqs = null;
    long[] maxTotalTermFreqs = null;
    if (query.enableGlobalIDF()) {
        FlexibleQuery.FlexibleTerm[][] globalTerms = query.getGlobalTerms();
        TermContext[][] globalStates = buildTermContexts(globalTerms, context, builtTermMap);

        // NOTE(review): assumes globalTerms has at least one row and that every row has the same
        // length as row 0 - confirm against FlexibleQuery.
        int fieldLength = globalTerms.length;
        int termLength = globalTerms[0].length;
        maxDocFreqs = new int[termLength];
        maxTotalTermFreqs = new long[termLength];
        for (int i = 0; i < termLength; ++i) {
            int maxDocFreq = 0;
            long maxTotalTermFreq = 0;
            for (int j = 0; j < fieldLength; ++j) {
                maxDocFreq = Math.max(globalStates[j][i].docFreq(), maxDocFreq);
                maxTotalTermFreq = Math.max(globalStates[j][i].totalTermFreq(), maxTotalTermFreq);
            }
            maxDocFreqs[i] = maxDocFreq;
            maxTotalTermFreqs[i] = maxTotalTermFreq;
        }
    }

    FlexibleQuery.FlexibleTerm[][] terms = query.getTerms();
    TermContext[][] states = buildTermContexts(terms, context, builtTermMap);

    termStatsMatrix = new TermStats[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        termStatsMatrix[i] = new TermStats[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            FlexibleQuery.FlexibleTerm term = terms[i][j];
            TermContext state = states[i][j];
            TermStatistics termStats;
            if (query.enableGlobalIDF()) {
                // NOTE(review): indexes the global maxima by the local column j, i.e. presumes
                // the local and global matrices share the same column layout - confirm.
                termStats = new TermStatistics(term.term.bytes(), maxDocFreqs[j], maxTotalTermFreqs[j]);
            } else {
                termStats = searcher.termStatistics(term.term, state);
            }
            Similarity.SimWeight stats = similarity.computeWeight(term.boost,
                    searcher.collectionStatistics(term.term.field()), termStats);
            TermStats termStatsInfo = new TermStats();
            termStatsInfo.stats = stats;
            termStatsInfo.term = term.term;
            termStatsInfo.termContext = state;
            termStatsMatrix[i][j] = termStatsInfo;
        }
    }
}

/**
 * Resolves a {@code TermContext} for every cell of {@code terms}, reusing any context already
 * present in {@code cache} and recording newly built ones there.
 */
private static TermContext[][] buildTermContexts(FlexibleQuery.FlexibleTerm[][] terms,
        IndexReaderContext context, Map<Term, TermContext> cache) throws IOException {
    TermContext[][] states = new TermContext[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        states[i] = new TermContext[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            Term term = terms[i][j].term;
            TermContext termContext = cache.get(term);
            if (termContext == null) {
                termContext = TermContext.build(context, term);
                cache.put(term, termContext);
            }
            states[i][j] = termContext;
        }
    }
    return states;
}
From source file:org.apache.solr.search.stats.TermStats.java
License:Apache License
/**
 * Creates a Lucene {@code TermStatistics} from this object's term bytes, {@code docFreq} and
 * {@code totalTermFreq}.
 *
 * @return a new {@code TermStatistics} instance built from the current field values
 */
public TermStatistics toTermStatistics() {
    final TermStatistics converted = new TermStatistics(t.bytes(), docFreq, totalTermFreq);
    return converted;
}
From source file:org.elasticsearch.action.search.SearchPhaseController.java
License:Apache License
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) { ObjectObjectHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap(); ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap(); long aggMaxDoc = 0; for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) { final Term[] terms = lEntry.value.terms(); final TermStatistics[] stats = lEntry.value.termStatistics(); assert terms.length == stats.length; for (int i = 0; i < terms.length; i++) { assert terms[i] != null; TermStatistics existing = termStatistics.get(terms[i]); if (existing != null) { assert terms[i].bytes().equals(existing.term()); // totalTermFrequency is an optional statistic we need to check if either one or both // are set to -1 which means not present and then set it globally to -1 termStatistics.put(terms[i], new TermStatistics(existing.term(), existing.docFreq() + stats[i].docFreq(), optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq()))); } else { termStatistics.put(terms[i], stats[i]); }// ww w . ja va 2 s . co m } assert !lEntry.value.fieldStatistics().containsKey(null); final Object[] keys = lEntry.value.fieldStatistics().keys; final Object[] values = lEntry.value.fieldStatistics().values; for (int i = 0; i < keys.length; i++) { if (keys[i] != null) { String key = (String) keys[i]; CollectionStatistics value = (CollectionStatistics) values[i]; assert key != null; CollectionStatistics existing = fieldStatistics.get(key); if (existing != null) { CollectionStatistics merged = new CollectionStatistics(key, existing.maxDoc() + value.maxDoc(), optionalSum(existing.docCount(), value.docCount()), optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()), optionalSum(existing.sumDocFreq(), value.sumDocFreq())); fieldStatistics.put(key, merged); } else { fieldStatistics.put(key, value); } } } aggMaxDoc += lEntry.value.maxDoc(); } return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc); }
From source file:org.elasticsearch.action.termvectors.TermVectorsFilter.java
License:Apache License
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException { if (dfs != null) { return dfs.termStatistics().get(term); }//from w w w . ja va 2 s.co m return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq()); }
From source file:org.elasticsearch.action.termvectors.TermVectorsWriter.java
License:Apache License
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException { int numFieldsWritten = 0; PostingsEnum docsAndPosEnum = null;/* w w w . j a v a2s .c o m*/ PostingsEnum docsEnum = null; boolean hasScores = termVectorsFilter != null; for (String field : termVectorsByField) { if ((selectedFields != null) && (!selectedFields.contains(field))) { continue; } Terms fieldTermVector = termVectorsByField.terms(field); Terms topLevelTerms = topLevelFields.terms(field); // if no terms found, take the retrieved term vector fields for stats if (topLevelTerms == null) { topLevelTerms = fieldTermVector; } TermsEnum topLevelIterator = topLevelTerms.iterator(); boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions(); boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets(); boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads(); long termsSize = fieldTermVector.size(); if (hasScores) { termsSize = Math.min(termsSize, termVectorsFilter.size(field)); } startField(field, termsSize, positions, offsets, payloads); if (flags.contains(Flag.FieldStatistics)) { if (dfs != null) { writeFieldStatistics(dfs.fieldStatistics().get(field)); } else { writeFieldStatistics(topLevelTerms); } } TermsEnum iterator = fieldTermVector.iterator(); final boolean useDocsAndPos = positions || offsets || payloads; while (iterator.next() != null) { // iterate all terms of the current field BytesRef termBytesRef = iterator.term(); Term term = new Term(field, termBytesRef); // with filtering we only keep the best terms if (hasScores && !termVectorsFilter.hasScoreTerm(term)) { continue; } startTerm(termBytesRef); if (flags.contains(Flag.TermStatistics)) { // get the doc frequency if (dfs != null) { final TermStatistics statistics = dfs.termStatistics().get(term); writeTermStatistics( statistics 
== null ? new TermStatistics(termBytesRef, 0, 0) : statistics); } else { boolean foundTerm = topLevelIterator.seekExact(termBytesRef); if (foundTerm) { writeTermStatistics(topLevelIterator); } else { writeTermStatistics(new TermStatistics(termBytesRef, 0, 0)); } } } if (useDocsAndPos) { // given we have pos or offsets docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads); } else { // if we do not have the positions stored, we need to // get the frequency from a PostingsEnum. docsEnum = writeTermWithDocsOnly(iterator, docsEnum); } if (hasScores) { writeScoreTerm(termVectorsFilter.getScoreTerm(term)); } } numFieldsWritten++; } response.setTermVectorsField(output); response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores)); }
From source file:org.elasticsearch.search.controller.SearchPhaseController.java
License:Apache License
public AggregatedDfs aggregateDfs(AtomicArray<DfsSearchResult> results) { ObjectObjectOpenHashMap<Term, TermStatistics> termStatistics = HppcMaps.newNoNullKeysMap(); ObjectObjectOpenHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap(); long aggMaxDoc = 0; for (AtomicArray.Entry<DfsSearchResult> lEntry : results.asList()) { final Term[] terms = lEntry.value.terms(); final TermStatistics[] stats = lEntry.value.termStatistics(); assert terms.length == stats.length; for (int i = 0; i < terms.length; i++) { assert terms[i] != null; TermStatistics existing = termStatistics.get(terms[i]); if (existing != null) { assert terms[i].bytes().equals(existing.term()); // totalTermFrequency is an optional statistic we need to check if either one or both // are set to -1 which means not present and then set it globally to -1 termStatistics.put(terms[i], new TermStatistics(existing.term(), existing.docFreq() + stats[i].docFreq(), optionalSum(existing.totalTermFreq(), stats[i].totalTermFreq()))); } else { termStatistics.put(terms[i], stats[i]); }/*from w w w.j a v a2 s . 
c om*/ } final boolean[] states = lEntry.value.fieldStatistics().allocated; final Object[] keys = lEntry.value.fieldStatistics().keys; final Object[] values = lEntry.value.fieldStatistics().values; for (int i = 0; i < states.length; i++) { if (states[i]) { String key = (String) keys[i]; CollectionStatistics value = (CollectionStatistics) values[i]; assert key != null; CollectionStatistics existing = fieldStatistics.get(key); if (existing != null) { CollectionStatistics merged = new CollectionStatistics(key, existing.maxDoc() + value.maxDoc(), optionalSum(existing.docCount(), value.docCount()), optionalSum(existing.sumTotalTermFreq(), value.sumTotalTermFreq()), optionalSum(existing.sumDocFreq(), value.sumDocFreq())); fieldStatistics.put(key, merged); } else { fieldStatistics.put(key, value); } } } aggMaxDoc += lEntry.value.maxDoc(); } return new AggregatedDfs(termStatistics, fieldStatistics, aggMaxDoc); }
From source file:org.elasticsearch.search.dfs.AggregatedDfs.java
License:Apache License
/**
 * Restores this object's state from {@code in}.
 *
 * <p>Reads, in order: the number of term entries; for each entry a term (field string and bytes)
 * followed by its statistics (bytes, docFreq, and a total term frequency that is passed through
 * {@code DfsSearchResult.subOne}); then the field statistics; then maxDoc.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
@Override
public void readFrom(StreamInput in) throws IOException {
    final int entryCount = in.readVInt();
    termStatistics = HppcMaps.newMap(entryCount);
    for (int remaining = entryCount; remaining > 0; remaining--) {
        // The key must be fully read before the statistics: the stream is strictly sequential.
        final Term key = new Term(in.readString(), in.readBytesRef());
        termStatistics.put(key, new TermStatistics(in.readBytesRef(), in.readVLong(),
                DfsSearchResult.subOne(in.readVLong())));
    }
    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}
From source file:org.elasticsearch.search.dfs.DfsSearchResult.java
License:Apache License
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException { int termsStatsSize = in.readVInt(); final TermStatistics[] termStatistics; if (termsStatsSize == 0) { termStatistics = EMPTY_TERM_STATS; } else {//from w w w . ja va 2 s. c o m termStatistics = new TermStatistics[termsStatsSize]; assert terms.length == termsStatsSize; for (int i = 0; i < termStatistics.length; i++) { BytesRef term = terms[i].bytes(); final long docFreq = in.readVLong(); assert docFreq >= 0; final long totalTermFreq = subOne(in.readVLong()); termStatistics[i] = new TermStatistics(term, docFreq, totalTermFreq); } } return termStatistics; }
From source file:org.elasticsearch.vectorize.VectorizeService.java
License:Apache License
/**
 * Feeds every term of every field in {@code termVectorsFields} to {@code vectorizer}, together
 * with the term's statistics and its frequency in the first document of its postings.
 *
 * @param vectorizer        receiver of the (term, statistics, frequency) triples
 * @param termVectorsFields the term vector fields to iterate
 * @throws IOException if reading terms or postings fails
 */
private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException {
    for (String fieldName : termVectorsFields) {
        TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator();
        while (termsEnum.next() != null) {
            Term term = new Term(fieldName, termsEnum.term());
            TermStatistics termStatistics = new TermStatistics(termsEnum.term(), termsEnum.docFreq(),
                    termsEnum.totalTermFreq());

            // freq() must not be called before the enum is positioned on a document, so advance
            // to the first document before reading the frequency.
            PostingsEnum postings = termsEnum.postings(null, null, PostingsEnum.ALL);
            if (postings.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
                // No document carries this term, so there is no frequency to report.
                continue;
            }
            int freq = postings.freq();
            vectorizer.add(term, termStatistics, freq);
        }
    }
}
From source file:pretraga.IsolationSimilarity.java
public void test(String vec) { List<String> vector = processInput(vec); HashMap<String, Long> map = new HashMap<>(); try {//www .j a va2s . com Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath()); IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); List<Integer> docId = getDocumentsFromVector(vector, reader, searcher); for (int i = 0; i < docId.size(); i++) { Fields ff = reader.getTermVectors(docId.get(i)); Terms terms = ff.terms(CONTENT); TermsEnum te = terms.iterator(); Object tmp = te.next(); while (tmp != null) { BytesRef by = (BytesRef) tmp; String term = by.utf8ToString(); ClassicSimilarity sim = null; if (searcher.getSimilarity(true) instanceof ClassicSimilarity) { sim = (ClassicSimilarity) searcher.getSimilarity(true); } float idf = sim.idf(te.docFreq(), reader.maxDoc()); float tf = sim.tf(te.totalTermFreq()); //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq()); TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq()); CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(), terms.getSumTotalTermFreq(), terms.getSumDocFreq()); Document d = reader.document(docId.get(i)); if (vector.contains(term)) { float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0)) .score(docId.get(i), te.totalTermFreq()); System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term); } tmp = te.next(); } /*Iterator<String> ss = ff.iterator(); while (ss.hasNext()) { String fieldString = ss.next(); System.out.println(fieldString); }*/ } } catch (Exception e) { } }