Usage examples for `org.apache.lucene.index.PostingsEnum#nextDoc`.
public abstract int nextDoc() throws IOException;
From source file: BlockBuilding.AbstractBlockBuilding.java
License: Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) { try {//from ww w . ja v a2 s . co m int[] documentIds = getDocumentIds(d1Index); final Map<String, int[]> hashedBlocks = new HashMap<>(); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { // check whether it is a common term int d2DocFrequency = d2Index.docFreq(new Term(field, text)); if (d2DocFrequency == 0) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); hashedBlocks.put(text.utf8ToString(), idsArray); } } return hashedBlocks; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file: BlockBuilding.AbstractBlockBuilding.java
License: Apache License
/**
 * Scans every term of the second (D2) index and, for each term already
 * hashed from D1, emits one {@link BilateralBlock} pairing the D1 entity
 * ids with the D2 entity ids that share the term.
 *
 * @param d2Index      reader over the second data source
 * @param hashedBlocks term string -> D1 entity ids, as built by parseD1Index
 */
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
    try {
        final int[] documentIds = getDocumentIds(d2Index);
        final Fields fields = MultiFields.getFields(d2Index);
        for (String field : fields) {
            final Terms terms = fields.terms(field);
            final TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // Decode the term once; the original decoded it twice
                // (once for containsKey, once for get).
                final String termString = text.utf8ToString();
                final int[] d1Entities = hashedBlocks.get(termString);
                if (d1Entities == null) {
                    continue; // term does not occur in D1: no bilateral block
                }
                final List<Integer> entityIds = new ArrayList<>();
                final PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }
                blocks.add(new BilateralBlock(d1Entities, Converter.convertCollectionToArray(entityIds)));
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}
From source file: BlockBuilding.AbstractBlockBuilding.java
License: Apache License
protected void parseIndex(IndexReader d1Index) { try {//from w w w . j a v a2 s .c om int[] documentIds = getDocumentIds(d1Index); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (termsEnum.docFreq() < 2) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock block = new UnilateralBlock(idsArray); blocks.add(block); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file: BlockBuilding.SortedNeighborhoodBlocking.java
License: Apache License
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) { try {// w w w .j a v a 2 s. c o m Term term = new Term(VALUE_LABEL, blockingKey); List<Integer> entityIds = new ArrayList<>(); int docFrequency = iReader.docFreq(term); if (0 < docFrequency) { BytesRef text = term.bytes(); PostingsEnum pe = MultiFields.getTermDocsEnum(iReader, VALUE_LABEL, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(docIds[doc]); } } return entityIds; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file: br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License: Open Source License
/**
 * Computes a Rocchio term score (BETA * tf * idf) for every term of the
 * DOC_CONTENT field in the given feedback index.
 *
 * BUG FIX: the original closed {@code idxReader} in a {@code finally}
 * inside the per-leaf lambda, i.e. while the reader's leaves were still
 * being iterated — and the try-with-resources then closed it a second
 * time. The erroneous close is removed; try-with-resources alone owns the
 * reader's lifetime. The unused local {@code docsNum} is also dropped.
 *
 * @param directory directory holding the feedback-document index
 * @return the (term, score) entries; later leaves overwrite earlier ones
 *         for the same term, as in the original
 */
private List<Entry<String, Float>> getTermScoreList(Directory directory) throws CorruptIndexException, IOException {
    Map<String, Float> termScoreMap = new HashMap<>();
    ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();
    try (IndexReader idxReader = DirectoryReader.open(directory)) {
        idxReader.leaves().stream().map((leaf) -> leaf.reader()).forEach((reader) -> {
            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    postings = termsEnum.postings(postings);
                    while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        int freq = postings.freq();
                        float tf = sim.tf(freq);
                        // NOTE(review): idf uses the field 'indexReader' (presumably the
                        // full collection), not the local feedback reader — confirm this
                        // is intentional.
                        float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs());
                        termScoreMap.put(text.utf8ToString(), BETA * (tf * idf));
                    }
                }
            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        });
    }
    return new ArrayList<>(termScoreMap.entrySet());
}
From source file: br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License: Open Source License
/**
 * Returns the tf-idf weight of {@code term} in the feedback index stored in
 * {@code directory}, or 0 when the term does not occur. The weight is taken
 * from the first posting of the first term that matches case-insensitively.
 *
 * @param directory directory holding the feedback-document index
 * @param term      the term to score
 * @return tf * idf of the first matching posting, or 0 when absent
 */
private float getScore(Directory directory, String term) throws CorruptIndexException, IOException {
    try (IndexReader idxReader = DirectoryReader.open(directory)) {
        final ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();
        for (LeafReaderContext context : idxReader.leaves()) {
            final LeafReader leafReader = context.reader();
            try {
                final TermsEnum termsEnum = leafReader.terms(Constants.DOC_CONTENT).iterator();
                PostingsEnum postings = null;
                for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                    // Postings are advanced/reused for every term, matching or not,
                    // exactly as in the original.
                    postings = termsEnum.postings(postings);
                    if (!text.utf8ToString().equalsIgnoreCase(term)) {
                        continue;
                    }
                    if (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        final float tf = sim.tf(postings.freq());
                        // NOTE(review): idf is computed against the field 'indexReader'
                        // (the full collection), not the local 'idxReader' — confirm
                        // this is intentional.
                        final float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs());
                        return tf * idf;
                    }
                }
            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
    return 0;
}
From source file: com.github.flaxsearch.resources.PostingsResource.java
License: Apache License
/**
 * REST endpoint returning up to {@code count} live doc ids for a term's
 * postings list, together with its docFreq and totalTermFreq.
 *
 * BUG FIX: the result array was sized min(docFreq, count), but deleted
 * documents filtered out via {@code liveDocs} left trailing zero entries
 * that read as bogus doc id 0. The array is now trimmed to the number of
 * ids actually collected.
 *
 * @param segment optional segment ordinal (null = whole index)
 * @param field   field whose postings to read
 * @param term    term whose postings to read
 * @param count   maximum number of doc ids to return (default: unlimited)
 */
@GET
public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count)
        throws IOException {
    TermsEnum te = readerManager.findTermPostings(segment, field, term);
    Bits liveDocs = readerManager.getLiveDocs(segment);
    PostingsEnum pe = te.postings(null, PostingsEnum.NONE);
    int docFreq = te.docFreq();
    long totalTermFreq = te.totalTermFreq();
    int size = (docFreq < count) ? docFreq : count;
    int[] postings = new int[size];
    int i = 0;
    int docId;
    while (i < size && (docId = pe.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
        if (liveDocs != null && liveDocs.get(docId) == false) {
            continue; // skip deleted documents
        }
        postings[i] = docId;
        i++;
    }
    // Trim trailing unfilled slots left by deleted documents.
    if (i < size) {
        int[] trimmed = new int[i];
        System.arraycopy(postings, 0, trimmed, 0, i);
        postings = trimmed;
    }
    return new TermData(term, docFreq, totalTermFreq, postings);
}
From source file: com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License: Apache License
// NOTE(review): scraped listing — this test method's original newlines were
// collapsed onto a few long lines, so each inline '//' comment swallows the
// rest of its collapsed line; the code below is preserved byte-for-byte from
// the listing rather than restyled. What the test does (from the visible
// code): it installs a wrapping PostingsFormat for the "body" field that, at
// flush/merge time, iterates the provided TermsEnum twice (forward iteration
// plus seekExact, optionally accumulating stats on the second pass), tallies
// per-term docFreq/totalTermFreq into a ConcurrentHashMap, and finally
// asserts those tallies against what the committed IndexReader reports.
@Override public void testInvertedWrite() throws Exception { Directory dir = newDirectory();/*from ww w . j ava2s . com*/ MockAnalyzer analyzer = new MockAnalyzer(random()); analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH)); IndexWriterConfig iwc = newIndexWriterConfig(analyzer); // Must be concurrent because thread(s) can be merging // while up to one thread flushes, and each of those // threads iterates over the map while the flushing // thread might be adding to it: final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>(); final AtomicLong sumDocFreq = new AtomicLong(); final AtomicLong sumTotalTermFreq = new AtomicLong(); // TODO: would be better to use / delegate to the current // Codec returned by getCodec() iwc.setCodec(new AssertingCodec() { @Override public PostingsFormat getPostingsFormatForField(String field) { PostingsFormat p = getCodec().postingsFormat(); if (p instanceof PerFieldPostingsFormat) { p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field); } if (p instanceof RocanaPerFieldPostingsFormat) { p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field); } final PostingsFormat defaultPostingsFormat = p; final Thread mainThread = Thread.currentThread(); if (field.equals("body")) { // A PF that counts up some stats and then in // the end we verify the stats match what the // final IndexReader says, just to exercise the // new freedom of iterating the postings more // than once at flush/merge: return new PostingsFormat(defaultPostingsFormat.getName()) { @Override public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException { final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state); return new FieldsConsumer() { @Override public void write(Fields fields) throws IOException { fieldsConsumer.write(fields); boolean isMerge = state.context.context == IOContext.Context.MERGE; // We only use one thread for flushing // in this test: 
assert isMerge || Thread.currentThread() == mainThread; // We iterate the provided TermsEnum // twice, so we excercise this new freedom // with the inverted API; if // addOnSecondPass is true, we add up // term stats on the 2nd iteration: boolean addOnSecondPass = random().nextBoolean(); //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass); // Gather our own stats: Terms terms = fields.terms("body"); assert terms != null; TermsEnum termsEnum = terms.iterator(); PostingsEnum docs = null; while (termsEnum.next() != null) { BytesRef term = termsEnum.term(); // TODO: also sometimes ask for payloads/offsets? boolean noPositions = random().nextBoolean(); if (noPositions) { docs = termsEnum.postings(docs, PostingsEnum.FREQS); } else { docs = termsEnum.postings(null, PostingsEnum.POSITIONS); } int docFreq = 0; long totalTermFreq = 0; while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) { docFreq++; totalTermFreq += docs.freq(); int limit = TestUtil.nextInt(random(), 1, docs.freq()); if (!noPositions) { for (int i = 0; i < limit; i++) { docs.nextPosition(); } } } String termString = term.utf8ToString(); // During merge we should only see terms // we had already seen during a // previous flush: assertTrue(isMerge == false || termFreqs.containsKey(termString)); if (isMerge == false) { if (addOnSecondPass == false) { TermFreqs tf = termFreqs.get(termString); if (tf == null) { tf = new TermFreqs(); termFreqs.put(termString, tf); } tf.docFreq += docFreq; tf.totalTermFreq += totalTermFreq; sumDocFreq.addAndGet(docFreq); sumTotalTermFreq.addAndGet(totalTermFreq); } else if (termFreqs.containsKey(termString) == false) { // Add placeholder (2nd pass will // set its counts): termFreqs.put(termString, new TermFreqs()); } } } // Also test seeking the TermsEnum: for (String term : termFreqs.keySet()) { if (termsEnum.seekExact(new BytesRef(term))) { // TODO: also sometimes ask for payloads/offsets? 
boolean noPositions = random().nextBoolean(); if (noPositions) { docs = termsEnum.postings(docs, PostingsEnum.FREQS); } else { docs = termsEnum.postings(null, PostingsEnum.POSITIONS); } int docFreq = 0; long totalTermFreq = 0; while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) { docFreq++; totalTermFreq += docs.freq(); int limit = TestUtil.nextInt(random(), 1, docs.freq()); if (!noPositions) { for (int i = 0; i < limit; i++) { docs.nextPosition(); } } } if (isMerge == false && addOnSecondPass) { TermFreqs tf = termFreqs.get(term); assert tf != null; tf.docFreq += docFreq; tf.totalTermFreq += totalTermFreq; sumDocFreq.addAndGet(docFreq); sumTotalTermFreq.addAndGet(totalTermFreq); } //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term)); assertTrue(docFreq <= termFreqs.get(term).docFreq); assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq); } } // Also test seekCeil for (int iter = 0; iter < 10; iter++) { BytesRef term = new BytesRef( TestUtil.randomRealisticUnicodeString(random())); SeekStatus status = termsEnum.seekCeil(term); if (status == SeekStatus.NOT_FOUND) { assertTrue(term.compareTo(termsEnum.term()) < 0); } } } @Override public void close() throws IOException { fieldsConsumer.close(); } }; } @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { return defaultPostingsFormat.fieldsProducer(state); } }; } else { return defaultPostingsFormat; } } }); RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); LineFileDocs docs = new LineFileDocs(random()); int bytesToIndex = atLeast(100) * 1024; int bytesIndexed = 0; while (bytesIndexed < bytesToIndex) { Document doc = docs.nextDoc(); w.addDocument(doc); bytesIndexed += RamUsageTester.sizeOf(doc); } IndexReader r = w.getReader(); w.close(); Terms terms = MultiFields.getTerms(r, "body"); assertEquals(sumDocFreq.get(), terms.getSumDocFreq()); assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq()); 
TermsEnum termsEnum = terms.iterator(); long termCount = 0; boolean supportsOrds = true; while (termsEnum.next() != null) { BytesRef term = termsEnum.term(); assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq()); assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq()); if (supportsOrds) { long ord; try { ord = termsEnum.ord(); } catch (UnsupportedOperationException uoe) { supportsOrds = false; ord = -1; } if (ord != -1) { assertEquals(termCount, ord); } } termCount++; } assertEquals(termFreqs.size(), termCount); r.close(); dir.close(); }
From source file: com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License: Apache License
/** * checks docs + freqs + positions + payloads, sequentially *//*from w w w . j a v a2s . co m*/ public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { assertNotNull(leftDocs); assertNotNull(rightDocs); assertEquals(-1, leftDocs.docID()); assertEquals(-1, rightDocs.docID()); int docid; while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { assertEquals(docid, rightDocs.nextDoc()); int freq = leftDocs.freq(); assertEquals(freq, rightDocs.freq()); for (int i = 0; i < freq; i++) { assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition()); // we don't assert offsets/payloads, they are allowed to be different } } assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc()); }
From source file: com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License: Apache License
/** * checks docs + freqs, sequentially//from w w w. j a v a 2 s . c o m */ public void assertDocsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { if (leftDocs == null) { assertNull(rightDocs); return; } assertEquals(-1, leftDocs.docID()); assertEquals(-1, rightDocs.docID()); int docid; while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { assertEquals(docid, rightDocs.nextDoc()); // we don't assert freqs, they are allowed to be different } assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc()); }