List of usage examples for org.apache.lucene.index.PostingsEnum#docID()
public abstract int docID();
Returns -1 if #nextDoc() or #advance(int) were not called yet, DocIdSetIterator.NO_MORE_DOCS once the iterator is exhausted, and otherwise the ID of the document the enum is currently positioned on.
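A minimal sketch of that contract before the examples. The setup is illustrative and not taken from any of the source files below: RAMDirectory, the "body" field, and the term "hello" are assumptions; the MultiFields.getTermDocsEnum call mirrors the io.anserini example further down.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

public class DocIdContractSketch {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("body", "hello world", Field.Store.NO));
            writer.addDocument(doc);
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            PostingsEnum postings = MultiFields.getTermDocsEnum(reader, "body", new BytesRef("hello"));
            System.out.println(postings.docID());     // -1: nextDoc()/advance() not called yet
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                System.out.println(postings.docID()); // the ID of the current document
            }
            System.out.println(postings.docID() == DocIdSetIterator.NO_MORE_DOCS); // true: exhausted
        }
    }
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java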
License:Apache License
/** Checks docs + freqs + positions + payloads, sequentially. */
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
    assertNotNull(leftDocs);
    assertNotNull(rightDocs);
    assertEquals(-1, leftDocs.docID());
    assertEquals(-1, rightDocs.docID());
    int docid;
    while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        assertEquals(docid, rightDocs.nextDoc());
        int freq = leftDocs.freq();
        assertEquals(freq, rightDocs.freq());
        for (int i = 0; i < freq; i++) {
            assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
            // we don't assert offsets/payloads, they are allowed to be different
        }
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/** Checks docs + freqs, sequentially. */
public void assertDocsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
    if (leftDocs == null) {
        assertNull(rightDocs);
        return;
    }
    assertEquals(-1, leftDocs.docID());
    assertEquals(-1, rightDocs.docID());
    int docid;
    while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        assertEquals(docid, rightDocs.nextDoc());
        // we don't assert freqs, they are allowed to be different
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
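None of the examples on this page call #advance(int), so here is a short, hedged fragment showing that docID() tracks advance() the same way it tracks nextDoc(). It reuses the illustrative reader and term from the sketch at the top of the page:

PostingsEnum postings = MultiFields.getTermDocsEnum(reader, "body", new BytesRef("hello"));
assert postings.docID() == -1;      // not positioned yet
int doc = postings.advance(0);      // jump to the first document with ID >= 0
if (doc != DocIdSetIterator.NO_MORE_DOCS) {
    assert doc == postings.docID(); // advance() and docID() agree on the position
}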
From source file:com.shaie.utils.IndexUtils.java
License:Apache License
/** Prints the terms indexed under the given fields with full postings information. */
public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException {
    for (final String field : fields) {
        System.out.println(format("Terms for field [%s], with positional info:", field));
        final TermsEnum te = reader.terms(field).iterator();
        BytesRef scratch;
        PostingsEnum postings = null;
        while ((scratch = te.next()) != null) {
            System.out.println(format("  %s", scratch.utf8ToString()));
            postings = te.postings(postings, PostingsEnum.ALL); // reuse the enum across terms
            for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) {
                final Map<Integer, BytesRef> positions = Maps.newTreeMap();
                boolean addedPayload = false;
                for (int i = 0; i < postings.freq(); i++) {
                    final int pos = postings.nextPosition();
                    final BytesRef payload = postings.getPayload();
                    if (payload != null) {
                        positions.put(pos, BytesRef.deepCopyOf(payload));
                        addedPayload = true;
                    } else {
                        positions.put(pos, null);
                    }
                }
                if (addedPayload) {
                    // (a stray third argument to format() is dropped here; the positions map is printed below)
                    System.out.println(format("    doc=%d, freq=%d", postings.docID(), postings.freq()));
                    for (final Entry<Integer, BytesRef> e : positions.entrySet()) {
                        System.out.println(format("      pos=%d, payload=%s", e.getKey(), e.getValue()));
                    }
                } else {
                    System.out.println(format("    doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(),
                            positions.keySet()));
                }
            }
        }
    }
}
From source file:edu.upenn.library.solrplugins.ProofOfConceptPayloadHandler.java
License:Apache License
private NamedList<Object> buildEntryValue(long count, PostingsEnum postings, Bits liveDocs) throws IOException {
    NamedList<Object> entry = new NamedList<>();
    entry.add("count", count);
    int i = -1;
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        if (!liveDocs.get(postings.docID())) { // skip deleted documents (liveDocs is assumed non-null here)
            continue;
        }
        i++;
        NamedList<Object> documentEntry = new NamedList<>();
        entry.add("doc" + i, documentEntry);
        for (int j = 0; j < postings.freq(); j++) {
            postings.nextPosition();
            String extra = postings.getPayload().utf8ToString();
            documentEntry.add("position" + j, extra);
        }
    }
    return entry;
}
From source file:io.anserini.index.IndexUtils.java
License:Apache License
public void printTermCounts(String termStr) throws IOException, ParseException {
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    TermQuery q = (TermQuery) qp.parse(termStr);
    Term t = q.getTerm();
    System.out.println("raw term: " + termStr);
    System.out.println("stemmed term: " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("collection frequency: " + reader.totalTermFreq(t));
    System.out.println("document frequency: " + reader.docFreq(t));
    PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY,
            t.bytes());
    System.out.println("postings:\n");
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
    }
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");
        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");
        bytesRef = termsEnum.next();
    }
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.next() == null) {
        // Ran off the end of the terms enum without finding any live docs with that field in them.
        return null;
    }
    PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    // Bits.get(docID) is true for *live* documents, so only bail out when the first doc
    // is deleted (the check is negated here; the flattened original tested liveDocs.get(...) directly,
    // which would have returned null exactly when a live doc was found).
    if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
            || (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
        return null;
    }
    return reader.document(postingsEnum.docID());
}
From source file:org.eclipse.rdf4j.sail.lucene.LuceneIndexTest.java
License:Open Source License
@Test
public void testAddStatement() throws IOException, ParseException {
    // add a statement to an index
    index.begin();
    index.addStatement(statement11);
    index.commit();

    // check that it arrived properly
    DirectoryReader reader = DirectoryReader.open(directory);
    assertEquals(1, reader.numDocs());

    Term term = new Term(SearchFields.URI_FIELD_NAME, subject.toString());
    PostingsEnum docs = termDocs(reader, term);
    assertTrue(next(docs));

    int documentNr = docs.docID();
    Document document = reader.document(documentNr);
    assertEquals(subject.toString(), document.get(SearchFields.URI_FIELD_NAME));
    assertEquals(object1.getLabel(), document.get(predicate1.toString()));
    assertFalse(next(docs));
    reader.close();

    // add another statement
    index.begin();
    index.addStatement(statement12);
    index.commit();

    // See if everything remains consistent. We must create a new IndexReader
    // in order to be able to see the updates.
    reader = DirectoryReader.open(directory);
    assertEquals(1, reader.numDocs()); // #docs should *not* have increased

    docs = termDocs(reader, term);
    assertTrue(next(docs));
    documentNr = docs.docID();
    document = reader.document(documentNr);
    assertEquals(subject.toString(), document.get(SearchFields.URI_FIELD_NAME));
    assertEquals(object1.getLabel(), document.get(predicate1.toString()));
    assertEquals(object2.getLabel(), document.get(predicate2.toString()));
    assertFalse(next(docs));

    // see if we can query for these literals
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(SearchFields.TEXT_FIELD_NAME, analyzer);

    Query query = parser.parse(object1.getLabel());
    System.out.println("query=" + query);
    TotalHitCountCollector results = new TotalHitCountCollector();
    searcher.search(query, results);
    assertEquals(1, results.getTotalHits());

    query = parser.parse(object2.getLabel());
    results = new TotalHitCountCollector();
    searcher.search(query, results);
    assertEquals(1, results.getTotalHits());
    reader.close();

    // remove the first statement
    index.begin();
    index.removeStatement(statement11);
    index.commit();

    // check that that statement is actually removed and that the other still exists
    reader = DirectoryReader.open(directory);
    assertEquals(1, reader.numDocs());

    docs = termDocs(reader, term);
    assertTrue(next(docs));
    documentNr = docs.docID();
    document = reader.document(documentNr);
    assertEquals(subject.toString(), document.get(SearchFields.URI_FIELD_NAME));
    assertNull(document.get(predicate1.toString()));
    assertEquals(object2.getLabel(), document.get(predicate2.toString()));
    assertFalse(next(docs));
    reader.close();

    // remove the other statement
    index.begin();
    index.removeStatement(statement12);
    index.commit();

    // check that there are no documents left (i.e. the last Document was
    // removed completely, rather than its remaining triple removed)
    reader = DirectoryReader.open(directory);
    assertEquals(0, reader.numDocs());
    reader.close();
}
From source file:org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProvider.java
License:Apache License
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
                CodecUtil.writeFooter(output);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docsEnum = null;
                final SuggestPayload spare = new SuggestPayload();
                int maxAnalyzedPathsForOneInput = 0;
                final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                int docCount = 0;
                while (true) {
                    BytesRef term = termsEnum.next();
                    if (term == null) {
                        break;
                    }
                    docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                    builder.startTerm(term);
                    int docFreq = 0;
                    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        for (int i = 0; i < docsEnum.freq(); i++) {
                            final int position = docsEnum.nextPosition();
                            AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                            builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                            // multi fields have the same surface form so we sum up here
                            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                        }
                        docFreq++;
                        docCount = Math.max(docCount, docsEnum.docID() + 1);
                    }
                    builder.finishTerm(docFreq);
                }
                /*
                 * Here we are done processing the field and we can
                 * build the FST and write it to disk.
                 */
                FST<Pair<Long, BytesRef>> build = builder.build();
                assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                        + docCount + "]";
                /*
                 * it's possible that the FST is null if we have 2 segments that get merged
                 * and all docs that have a value in this field are deleted. This will cause
                 * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                 * to return null.
                 */
                if (build != null) {
                    fieldOffsets.put(field, output.getFilePointer());
                    build.save(output);
                    /* write some more meta-info */
                    output.writeVInt(maxAnalyzedPathsForOneInput);
                    output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                    output.writeInt(maxGraphExpansions); // can be negative
                    int options = 0;
                    options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                    options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                    options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                    output.writeVInt(options);
                    output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                    output.writeVInt(XAnalyzingSuggester.END_BYTE);
                    output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                    output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                }
            }
        }
    };
}
From source file:org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProviderV1.java
License:Apache License
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    // TODO write index header?
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docsEnum = null;
                final SuggestPayload spare = new SuggestPayload();
                int maxAnalyzedPathsForOneInput = 0;
                final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                int docCount = 0;
                while (true) {
                    BytesRef term = termsEnum.next();
                    if (term == null) {
                        break;
                    }
                    docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                    builder.startTerm(term);
                    int docFreq = 0;
                    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        for (int i = 0; i < docsEnum.freq(); i++) {
                            final int position = docsEnum.nextPosition();
                            AnalyzingCompletionLookupProviderV1.this.parsePayload(docsEnum.getPayload(), spare);
                            builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                            // multi fields have the same surface form so we sum up here
                            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                        }
                        docFreq++;
                        docCount = Math.max(docCount, docsEnum.docID() + 1);
                    }
                    builder.finishTerm(docFreq);
                }
                /*
                 * Here we are done processing the field and we can
                 * build the FST and write it to disk.
                 */
                FST<Pair<Long, BytesRef>> build = builder.build();
                assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                        + docCount + "]";
                /*
                 * it's possible that the FST is null if we have 2 segments that get merged
                 * and all docs that have a value in this field are deleted. This will cause
                 * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                 * to return null.
                 */
                if (build != null) {
                    fieldOffsets.put(field, output.getFilePointer());
                    build.save(output);
                    /* write some more meta-info */
                    output.writeVInt(maxAnalyzedPathsForOneInput);
                    output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                    output.writeInt(maxGraphExpansions); // can be negative
                    int options = 0;
                    options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                    options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                    options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                    output.writeVInt(options);
                }
            }
        }
    };
}