Usage examples for org.apache.lucene.index.PostingsEnum#advance:
public abstract int advance(int target) throws IOException;
From source file:com.github.flaxsearch.resources.PositionsResource.java
License:Apache License
@GET public DocTermData getDocTermData(@QueryParam("segment") Integer segment, @PathParam("field") String field, @PathParam("term") String term, @PathParam("docId") int docId) throws Exception { TermsEnum te = readerManager.findTermPostings(segment, field, term); PostingsEnum pe = te.postings(null, PostingsEnum.ALL); if (pe.advance(docId) != docId) { String seg = segment == null ? "" : " in segment " + segment; String msg = String.format(Locale.ROOT, "No document %d%s in index", docId, seg); throw new WebApplicationException(msg, Response.Status.NOT_FOUND); }//from w w w .j a v a 2 s . c o m List<PositionData> positions = new ArrayList<>(); int remaining = pe.freq(); while (remaining > 0) { remaining--; positions.add(new PositionData(pe)); } return new DocTermData(docId, positions); }
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/** * checks advancing docs/*from w ww . j a v a2s .c o m*/ */ public void assertDocsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { if (leftDocs == null) { assertNull(rightDocs); return; } int docid = -1; int averageGap = MAXDOC / (1 + docFreq); int skipInterval = 16; while (true) { if (random().nextBoolean()) { // nextDoc() docid = leftDocs.nextDoc(); assertEquals(docid, rightDocs.nextDoc()); } else { // advance() int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap)); docid = leftDocs.advance(skip); assertEquals(docid, rightDocs.advance(skip)); } if (docid == DocIdSetIterator.NO_MORE_DOCS) { return; } // we don't assert freqs, they are allowed to be different } }
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/** * checks advancing docs + positions/*from w w w . ja v a 2 s . c o m*/ */ public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { if (leftDocs == null || rightDocs == null) { assertNull(leftDocs); assertNull(rightDocs); return; } int docid = -1; int averageGap = MAXDOC / (1 + docFreq); int skipInterval = 16; while (true) { if (random().nextBoolean()) { // nextDoc() docid = leftDocs.nextDoc(); assertEquals(docid, rightDocs.nextDoc()); } else { // advance() int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap)); docid = leftDocs.advance(skip); assertEquals(docid, rightDocs.advance(skip)); } if (docid == DocIdSetIterator.NO_MORE_DOCS) { return; } int freq = leftDocs.freq(); assertEquals(freq, rightDocs.freq()); for (int i = 0; i < freq; i++) { assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition()); // we don't compare the payloads, it's allowed that one is empty etc } } }
From source file:org.nlp4l.lucene.BuddyWordsFinder.java
License:Apache License
/**
 * Finds terms that co-occur with {@code term} in {@code field} within a
 * {@code slop}-sized position window, scoring each co-occurring term by the
 * number of documents in which such a window match was found.
 *
 * @param field the field to analyze
 * @param term  the base term whose neighbors are collected
 * @return the top co-occurring terms (best score last in the array), or null
 *         when the base term is filtered out, has no postings, or a scanned
 *         document lacks term vectors with positions
 * @throws IOException on index access failure
 */
public Scorer[] findCoincidentalTerms(String field, BytesRef term) throws IOException {
    baseTermFilter.start(reader, field, term);
    // skip base terms ruled out by the filter (e.g. stopwords / popularity cutoff)
    if (baseTermFilter.skip(term) || baseTermFilter.skipByPopularity(term))
        return null;
    // NOTE(review): liveDocs is fetched but never consulted below, so deleted
    // docs are not excluded from the scan — confirm whether that is intended
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    PostingsEnum de = MultiFields.getTermDocsEnum(reader, field, term);
    if (de == null)
        return null;
    int numDocsAnalyzed = 0;
    phraseTerms.clear();
    // scan up to maxDocsToAnalyze documents containing the base term
    while (de.nextDoc() != PostingsEnum.NO_MORE_DOCS && numDocsAnalyzed < maxDocsToAnalyze) {
        int docId = de.docID();
        // first record all of the positions of the term in a bitset which
        // represents terms in the current doc
        int freq = de.freq();
        PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, field, term);
        int ret = pe.advance(docId);
        if (ret == PostingsEnum.NO_MORE_DOCS)
            continue;
        termPos.clear();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition();
            // NOTE(review): positions >= termPos.size() are silently dropped,
            // so matches late in long documents may be missed — confirm that
            // termPos is sized for the longest expected document
            if (pos < termPos.size())
                termPos.set(pos);
        }
        // now look at all OTHER terms in this doc and see if they are
        // positioned in a pre-defined sized window around the current term
        Fields vectors = reader.getTermVectors(docId);
        // abort the whole analysis if this doc has no term vectors
        if (vectors == null)
            return null;
        Terms vector = vectors.terms(field);
        // abort the whole analysis if the field lacks position info
        if (vector == null || !vector.hasPositions())
            return null;
        TermsEnum te = vector.iterator();
        BytesRef otherTerm = null;
        while ((otherTerm = te.next()) != null) {
            if (term.bytesEquals(otherTerm))
                continue;
            // apply the co-occurring-term filter before doing position work
            coiTermFilter.start(reader, field, otherTerm);
            if (coiTermFilter.skip(otherTerm) || coiTermFilter.skipByPopularity(otherTerm))
                continue;
            PostingsEnum pe2 = MultiFields.getTermPositionsEnum(reader, field, otherTerm);
            ret = pe2.advance(docId);
            if (ret == PostingsEnum.NO_MORE_DOCS)
                continue;
            // note: freq is reused here for the other term's frequency
            freq = pe2.freq();
            boolean matchFound = false;
            // count at most one match per (doc, otherTerm) pair
            for (int i = 0; i < freq && (!matchFound); i++) {
                int pos = pe2.nextPosition();
                int startpos = Math.max(0, pos - slop);
                int endpos = pos + slop;
                for (int prevpos = startpos; (prevpos <= endpos) && (!matchFound); prevpos++) {
                    if (termPos.get(prevpos)) {
                        // Add term to hashmap containing co-occurence
                        // counts for this term
                        Scorer pt = phraseTerms.get(otherTerm.utf8ToString());
                        if (pt == null) {
                            pt = new Scorer(baseTermFilter.getCurrentTermDocFreq(), otherTerm.utf8ToString(),
                                    coiTermFilter.getCurrentTermDocFreq());
                            phraseTerms.put(pt.coiTerm, pt);
                        }
                        pt.incCoiDocCount();
                        matchFound = true;
                    }
                }
            }
        }
        numDocsAnalyzed++;
    } // end of while loop
    // now sort and dump the top terms associated with this term
    TopTerms topTerms = new TopTerms(maxCoiTermsPerTerm);
    for (String key : phraseTerms.keySet()) {
        Scorer pt = phraseTerms.get(key);
        topTerms.insertWithOverflow(pt);
    }
    // pop from the priority queue (worst first) and fill the array back-to-front
    Scorer[] tops = new Scorer[topTerms.size()];
    int tp = tops.length - 1;
    while (topTerms.size() > 0) {
        Scorer top = topTerms.pop();
        tops[tp--] = top;
    }
    return tops;
}