List of usage examples for org.apache.lucene.index PostingsEnum FREQS
short FREQS
To view the source code for org.apache.lucene.index PostingsEnum FREQS, click the Source Link below each example.
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
/**
 * Indexes random line-file documents while a stats-gathering {@link PostingsFormat}
 * (wrapped around the codec's default format for the "body" field) independently tallies
 * per-term docFreq/totalTermFreq during flush and merge, then verifies those tallies
 * against what the final {@link IndexReader} reports. Exercises the freedom to iterate
 * the provided postings more than once at flush/merge time.
 */
@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();
    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()
    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            // Unwrap per-field formats so we delegate to the concrete format for this field.
            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;
            final Thread mainThread = Thread.currentThread();
            if (field.equals("body")) {
                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:
                return new PostingsFormat(defaultPostingsFormat.getName()) {
                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);
                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                // First let the real consumer write, then re-iterate to gather stats.
                                fieldsConsumer.write(fields);

                                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;

                                // We iterate the provided TermsEnum
                                // twice, so we excercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;

                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        // Consume a random prefix of the positions, when requested:
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }
                                    String termString = term.utf8ToString();

                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));

                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }
                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }
                                        if (isMerge == false && addOnSecondPass) {
                                            // Second pass fills in the placeholder counts added above.
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }
                                        // Per-segment counts can never exceed the accumulated totals:
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(
                                            TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        // NOT_FOUND must position the enum on the next-greater term:
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    // Verify the aggregate stats the wrapper collected against the reader's view:
    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    // Some TermsEnum impls don't support ord(); detect that lazily on first failure.
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}
From source file:org.codelibs.elasticsearch.common.lucene.index.FilterableTermsEnum.java
License:Apache License
/**
 * Builds per-leaf (TermsEnum, filter-bits) pairs for {@code field}.
 *
 * @param reader       index to walk, one entry per leaf that has the field
 * @param field        field whose terms will be enumerated
 * @param docsEnumFlag must be {@link PostingsEnum#FREQS} or {@link PostingsEnum#NONE};
 *                     controls whether totalTermFreq is computed later
 * @param filter       optional query restricting which docs count; leaves with no
 *                     matching docs are skipped entirely
 * @throws IOException on index access failure
 */
public FilterableTermsEnum(IndexReader reader, String field, int docsEnumFlag, @Nullable Query filter)
        throws IOException {
    // Only FREQS or NONE are meaningful for the downstream counting loops.
    if ((docsEnumFlag != PostingsEnum.FREQS) && (docsEnumFlag != PostingsEnum.NONE)) {
        throw new IllegalArgumentException("invalid docsEnumFlag of " + docsEnumFlag);
    }
    this.docsEnumFlag = docsEnumFlag;
    List<LeafReaderContext> leaves = reader.leaves();
    List<Holder> enums = new ArrayList<>(leaves.size());
    final Weight weight;
    if (filter == null) {
        weight = null;
    } else {
        final IndexSearcher searcher = new IndexSearcher(reader);
        // Throwaway searcher: don't populate any shared query cache.
        searcher.setQueryCache(null);
        weight = searcher.createNormalizedWeight(filter, false);
    }
    for (LeafReaderContext context : leaves) {
        Terms terms = context.reader().terms(field);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum == null) {
            continue;
        }
        BitSet bits = null;
        if (weight != null) {
            Scorer scorer = weight.scorer(context);
            if (scorer == null) {
                // fully filtered, none matching, no need to iterate on this
                continue;
            }
            DocIdSetIterator docs = scorer.iterator();

            // we want to force apply deleted docs
            final Bits liveDocs = context.reader().getLiveDocs();
            if (liveDocs != null) {
                docs = new FilteredDocIdSetIterator(docs) {
                    @Override
                    protected boolean match(int doc) {
                        return liveDocs.get(doc);
                    }
                };
            }
            // Materialize the filtered iterator into a fixed bitset for random access later.
            bits = BitSet.of(docs, context.reader().maxDoc());
        }
        enums.add(new Holder(termsEnum, bits));
    }
    this.enums = enums.toArray(new Holder[enums.size()]);
}
From source file:org.codelibs.elasticsearch.common.lucene.index.FilterableTermsEnum.java
License:Apache License
/**
 * Seeks every per-leaf TermsEnum to {@code text} and aggregates docFreq (and,
 * when this enum was built with {@link PostingsEnum#FREQS}, totalTermFreq)
 * across leaves, honoring each leaf's optional filter bits.
 *
 * @return true if the term matched at least one counted doc; updates the
 *         current* fields either way
 */
@Override
public boolean seekExact(BytesRef text) throws IOException {
    int docFreq = 0;
    long totalTermFreq = 0;
    for (Holder anEnum : enums) {
        if (anEnum.termsEnum.seekExact(text)) {
            if (anEnum.bits == null) {
                // Unfiltered leaf: use the cheap segment-level statistics directly.
                docFreq += anEnum.termsEnum.docFreq();
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
                    if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
                        // -1 means "stat unavailable"; once any leaf lacks it, the total is unknowable.
                        totalTermFreq = -1;
                        continue;
                    }
                    totalTermFreq += leafTotalTermFreq;
                }
            } else {
                // Filtered leaf: walk the postings and count only docs passing the bitset.
                final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum,
                        docsEnumFlag);
                // 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    for (int docId = docsEnum
                            .nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        docFreq++;
                        // docsEnum.freq() returns 1 if doc indexed with IndexOptions.DOCS_ONLY so no way of knowing if value
                        // is really 1 or unrecorded when filtering like this
                        totalTermFreq += docsEnum.freq();
                    }
                } else {
                    for (int docId = docsEnum
                            .nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        // docsEnum.freq() behaviour is undefined if docsEnumFlag==PostingsEnum.FLAG_NONE so don't bother with call
                        docFreq++;
                    }
                }
            }
        }
    }
    if (docFreq > 0) {
        currentDocFreq = docFreq;
        currentTotalTermFreq = totalTermFreq;
        current = text;
        return true;
    } else {
        currentDocFreq = NOT_FOUND;
        currentTotalTermFreq = NOT_FOUND;
        current = null;
        return false;
    }
}
From source file:org.codelibs.elasticsearch.common.lucene.index.FreqTermsEnum.java
License:Apache License
/**
 * Creates a frequency-caching terms enum over {@code field}.
 *
 * @param reader            index to read from
 * @param field             field whose term frequencies are cached
 * @param needDocFreq       whether per-term document frequencies should be cached
 * @param needTotalTermFreq whether per-term total frequencies should be cached;
 *                          also decides whether postings are pulled with FREQS or NONE
 * @param filter            optional query restricting counted documents
 * @param bigArrays         allocator for the frequency caches
 * @throws IOException on index access failure
 */
public FreqTermsEnum(IndexReader reader, String field, boolean needDocFreq, boolean needTotalTermFreq,
        @Nullable Query filter, BigArrays bigArrays) throws IOException {
    // Only request FREQS from postings when total term frequencies are actually wanted.
    super(reader, field, needTotalTermFreq ? PostingsEnum.FREQS : PostingsEnum.NONE, filter);
    this.bigArrays = bigArrays;
    this.needDocFreqs = needDocFreq;
    this.needTotalTermFreqs = needTotalTermFreq;
    // Allocate a cache only for the statistics the caller asked for.
    termDocFreqs = needDocFreq ? bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, false) : null;
    termsTotalFreqs = needTotalTermFreq ? bigArrays.newLongArray(INITIAL_NUM_TERM_FREQS_CACHED, false) : null;
    cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays);
}
From source file:org.elasticsearch.common.lucene.index.FilterableTermsEnum.java
License:Apache License
/**
 * Builds per-leaf (TermsEnum, filter-bits) pairs for {@code field} and tracks
 * {@code numDocs}: either the reader's maxDoc (unfiltered) or the cardinality
 * of the filtered doc sets.
 *
 * @param reader       index to walk
 * @param field        field whose terms will be enumerated
 * @param docsEnumFlag must be {@link PostingsEnum#FREQS} or {@link PostingsEnum#NONE}
 * @param filter       optional query restricting which docs count
 * @throws IOException on index access failure
 */
public FilterableTermsEnum(IndexReader reader, String field, int docsEnumFlag, @Nullable Query filter)
        throws IOException {
    if ((docsEnumFlag != PostingsEnum.FREQS) && (docsEnumFlag != PostingsEnum.NONE)) {
        throw new IllegalArgumentException("invalid docsEnumFlag of " + docsEnumFlag);
    }
    this.docsEnumFlag = docsEnumFlag;
    if (filter == null) {
        // Important - need to use the doc count that includes deleted docs
        // or we have this issue: https://github.com/elasticsearch/elasticsearch/issues/7951
        numDocs = reader.maxDoc();
    }
    List<LeafReaderContext> leaves = reader.leaves();
    List<Holder> enums = new ArrayList<>(leaves.size());
    final Weight weight;
    if (filter == null) {
        weight = null;
    } else {
        final IndexSearcher searcher = new IndexSearcher(reader);
        // Throwaway searcher: don't populate any shared query cache.
        searcher.setQueryCache(null);
        weight = searcher.createNormalizedWeight(filter, false);
    }
    for (LeafReaderContext context : leaves) {
        Terms terms = context.reader().terms(field);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum == null) {
            continue;
        }
        BitSet bits = null;
        if (weight != null) {
            Scorer scorer = weight.scorer(context);
            if (scorer == null) {
                // fully filtered, none matching, no need to iterate on this
                continue;
            }
            DocIdSetIterator docs = scorer.iterator();

            // we want to force apply deleted docs
            final Bits liveDocs = context.reader().getLiveDocs();
            if (liveDocs != null) {
                docs = new FilteredDocIdSetIterator(docs) {
                    @Override
                    protected boolean match(int doc) {
                        return liveDocs.get(doc);
                    }
                };
            }
            BitDocIdSet.Builder builder = new BitDocIdSet.Builder(context.reader().maxDoc());
            builder.or(docs);
            bits = builder.build().bits();

            // Count how many docs are in our filtered set
            // TODO make this lazy-loaded only for those that need it?
            numDocs += bits.cardinality();
        }
        enums.add(new Holder(termsEnum, bits));
    }
    this.enums = enums.toArray(new Holder[enums.size()]);
}
From source file:org.opengrok.suggest.SuggesterSearcher.java
License:Open Source License
/**
 * Scores suggestion candidates for one leaf: iterates the terms produced by
 * {@code suggesterQuery}, scores each by document/phrase frequency plus its
 * search popularity, and collects the best into a bounded priority queue.
 * Aborts early (setting {@code interrupted}) if the thread is interrupted.
 *
 * @param query             the already-entered part of the user's query; may be null
 * @param leafReaderContext leaf being searched
 * @param project           project name attached to each result
 * @param suggesterQuery    the token being completed
 * @param searchCounts      popularity counts used to boost previously searched terms
 * @return scored lookup results for this leaf (possibly empty)
 * @throws IOException on index access failure
 */
private List<LookupResultItem> suggest(final Query query, final LeafReaderContext leafReaderContext,
        final String project, final SuggesterQuery suggesterQuery, final PopularityCounter searchCounts)
        throws IOException {
    if (Thread.currentThread().isInterrupted()) {
        interrupted = true;
        return Collections.emptyList();
    }

    boolean shouldLeaveOutSameTerms = shouldLeaveOutSameTerms(query, suggesterQuery);
    // Only populated (and only read) when shouldLeaveOutSameTerms is true.
    Set<BytesRef> tokensAlreadyIncluded = null;
    if (shouldLeaveOutSameTerms) {
        tokensAlreadyIncluded = SuggesterUtils.intoTermsExceptPhraseQuery(query).stream()
                .filter(t -> t.field().equals(suggesterQuery.getField())).map(Term::bytes)
                .collect(Collectors.toSet());
    }

    // A MatchAllDocsQuery (or no query) needs no per-document intersection.
    boolean needsDocumentIds = query != null && !(query instanceof MatchAllDocsQuery);

    ComplexQueryData complexQueryData = null;
    if (needsDocumentIds) {
        complexQueryData = getComplexQueryData(query, leafReaderContext);
        if (interrupted) {
            return Collections.emptyList();
        }
    }

    Terms terms = leafReaderContext.reader().terms(suggesterQuery.getField());
    TermsEnum termsEnum = suggesterQuery.getTermsEnumForSuggestions(terms);

    LookupPriorityQueue queue = new LookupPriorityQueue(resultSize);

    boolean needPositionsAndFrequencies = needPositionsAndFrequencies(query);

    PostingsEnum postingsEnum = null;

    BytesRef term = termsEnum.next();
    while (term != null) {
        if (Thread.currentThread().isInterrupted()) {
            interrupted = true;
            break;
        }

        // Reuse the postings enum across terms; request positions only when phrase scoring needs them.
        if (needPositionsAndFrequencies) {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.POSITIONS | PostingsEnum.FREQS);
        } else {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        }

        int score;
        if (!needsDocumentIds) {
            score = normalizeDocumentFrequency(termsEnum.docFreq(), numDocs);
        } else if (needPositionsAndFrequencies) {
            score = getPhraseScore(complexQueryData, leafReaderContext.docBase, postingsEnum);
        } else {
            score = getDocumentFrequency(complexQueryData.documentIds, leafReaderContext.docBase, postingsEnum);
        }

        if (score > 0) {
            if (!shouldLeaveOutSameTerms || !tokensAlreadyIncluded.contains(term)) {
                // Boost terms the user has searched for before.
                score += searchCounts.get(term) * TERM_ALREADY_SEARCHED_MULTIPLIER;

                if (queue.canInsert(score)) {
                    queue.insertWithOverflow(new LookupResultItem(term.utf8ToString(), project, score));
                }
            }
        }

        term = termsEnum.next();
    }

    return queue.getResult();
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentCollocates.java
License:Open Source License
/**
 * Computes collocates for one document: for each query keyword's span hits, counts
 * non-stopword terms appearing within {@code context} tokens of the hit, then combines
 * those context counts with whole-document frequencies (from the term vector) into
 * {@link DocumentCollocate} objects in a bounded queue.
 *
 * @param LeafReader        leaf reader holding the document's term vector
 * @param luceneDoc         Lucene-internal document id
 * @param corpusDocIndex    corpus-level document index carried into results
 * @param lastToken         position of the document's last token (right-context clamp)
 * @param documentSpansData span hit positions for each query string
 * @param stopwords         terms to exclude from collocate counting
 * @return queue of collocates ordered by {@code comparator}, capped at {@code limit}
 * @throws IOException on index access failure
 */
private FlexibleQueue<DocumentCollocate> getCollocates(LeafReader LeafReader, int luceneDoc, int corpusDocIndex,
        int lastToken, List<DocumentSpansData> documentSpansData, Keywords stopwords) throws IOException {
    Map<Integer, TermInfo> termsOfInterest = getTermsOfInterest(LeafReader, luceneDoc, lastToken,
            documentSpansData, true);
    Map<String, Map<String, AtomicInteger>> mapOfTermsMap = new HashMap<String, Map<String, AtomicInteger>>();
    Map<String, Integer> queryStringFrequencyMap = new HashMap<String, Integer>();

    // this keeps track of the terms we want to lookup total document frequencies
    Map<String, Integer> stringsOfInterestMap = new HashMap<String, Integer>();

    for (DocumentSpansData dsd : documentSpansData) {
        Map<String, AtomicInteger> termsMap = new HashMap<String, AtomicInteger>();
        queryStringFrequencyMap.put(dsd.queryString, dsd.spansData.length);
        int contextTotalTokens = 0;
        for (int[] data : dsd.spansData) {
            int keywordstart = data[0];
            int keywordend = data[1];

            // Left context window, clamped at the start of the document.
            int leftstart = keywordstart - context;
            if (leftstart < 0) {
                leftstart = 0;
            }
            // NOTE(review): the bound `i < keywordstart - 1` skips the token immediately
            // before the keyword, while the right-side loop below includes its boundary
            // token (`i = keywordend`). Looks like an off-by-one — confirm intent.
            for (int i = leftstart; i < keywordstart - 1; i++) {
                contextTotalTokens++;
                String term = termsOfInterest.get(i).getText();
                if (stopwords.isKeyword(term)) {
                    continue;
                }
                if (collocatesWhitelist.isEmpty() == false && collocatesWhitelist.isKeyword(term) == false) {
                    continue;
                }
                stringsOfInterestMap.put(term, 0);
                if (termsMap.containsKey(term)) {
                    termsMap.get(term).getAndIncrement();
                } else {
                    termsMap.put(term, new AtomicInteger(1));
                }
            }

            // Keyword tokens themselves: recorded for document-frequency lookup,
            // but not counted as context collocates.
            for (int i = keywordstart; i < keywordend; i++) {
                String term = termsOfInterest.get(i).getText();
                if (stopwords.isKeyword(term)) {
                    continue;
                }
                if (collocatesWhitelist.isEmpty() == false && collocatesWhitelist.isKeyword(term) == false) {
                    continue;
                }
                stringsOfInterestMap.put(term, 0);
            }

            // Right context window, clamped at the last token.
            int rightend = keywordend + context;
            if (rightend > lastToken) {
                rightend = lastToken;
            }
            for (int i = keywordend; i < rightend; i++) {
                contextTotalTokens++;
                String term = termsOfInterest.get(i).getText();
                if (stopwords.isKeyword(term)) {
                    continue;
                }
                if (collocatesWhitelist.isEmpty() == false && collocatesWhitelist.isKeyword(term) == false) {
                    continue;
                }
                stringsOfInterestMap.put(term, 0);
                if (termsMap.containsKey(term)) {
                    termsMap.get(term).getAndIncrement();
                } else {
                    termsMap.put(term, new AtomicInteger(1));
                }
            }
        }
        mapOfTermsMap.put(dsd.queryString, termsMap);
    }

    // gather document frequency for strings of interest
    int documentTotalTokens = 0;
    Terms terms = LeafReader.getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    while (true) {
        BytesRef term = termsEnum.next();
        if (term != null) {
            String termString = term.utf8ToString();
            // Term vector of a single doc: the first (only) posting carries the in-document freq.
            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS);
            postingsEnum.nextDoc();
            int freq = postingsEnum.freq();
            documentTotalTokens += freq;
            if (stringsOfInterestMap.containsKey(termString)) {
                stringsOfInterestMap.put(termString, freq);
            }
        } else {
            break;
        }
    }

    FlexibleQueue<DocumentCollocate> documentCollocatesQueue = new FlexibleQueue(comparator, limit);
    for (Map.Entry<String, Map<String, AtomicInteger>> keywordMapEntry : mapOfTermsMap.entrySet()) {
        String keyword = keywordMapEntry.getKey();
        int keywordContextRawFrequency = queryStringFrequencyMap.get(keyword);
        Map<String, AtomicInteger> termsMap = keywordMapEntry.getValue();

        // once through to determine contextTotalTokens
        int contextTotalTokens = 0;
        for (Map.Entry<String, AtomicInteger> termsMapEntry : termsMap.entrySet()) {
            contextTotalTokens += termsMapEntry.getValue().intValue();
        }

        // and now to create document collocate objects
        for (Map.Entry<String, AtomicInteger> termsMapEntry : termsMap.entrySet()) {
            String term = termsMapEntry.getKey();
            int termDocumentRawFrequency = stringsOfInterestMap.get(term);
            int termContextRawFrequency = termsMapEntry.getValue().intValue();
            DocumentCollocate documentCollocate = new DocumentCollocate(corpusDocIndex, keyword, term,
                    keywordContextRawFrequency, termContextRawFrequency, termDocumentRawFrequency,
                    contextTotalTokens, documentTotalTokens);
            documentCollocatesQueue.offer(documentCollocate);
        }
    }
    return documentCollocatesQueue;
}