Example usage for org.apache.lucene.index PostingsEnum freq

List of usage examples for org.apache.lucene.index PostingsEnum freq

Introduction

In this page you can find the example usage for org.apache.lucene.index PostingsEnum freq.

Prototype

public abstract int freq() throws IOException;

Source Link

Document

Returns the term frequency in the current document, or 1 if the field was indexed with IndexOptions#DOCS.

Usage

From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License:Open Source License

/**
 * Builds a (term, Rocchio-weighted tf-idf score) list over every term of the
 * content field, across all index segments.
 *
 * <p>Fixes over the previous revision:
 * <ul>
 *   <li>{@code indexReader.numDocs()} referenced an undefined name; the
 *       try-with-resources reader is {@code idxReader}.</li>
 *   <li>The old per-leaf {@code finally} block closed {@code idxReader} while
 *       the try-with-resources still owned it and while later leaves still
 *       needed it (double close / use-after-close). The reader is now closed
 *       exactly once, by try-with-resources.</li>
 *   <li>IOExceptions now propagate (the method already declares
 *       {@code throws IOException}) instead of being logged and swallowed.</li>
 * </ul>
 *
 * <p>NOTE(review): for a term appearing in several documents, each doc's score
 * overwrites the previous one in the map — presumably intentional for this
 * expansion heuristic, but worth confirming against the Rocchio formula.
 *
 * @param directory the index to score terms from
 * @return map entries of term text to {@code BETA * tf * idf}
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException on any other index read error
 */
private List<Entry<String, Float>> getTermScoreList(Directory directory)
        throws CorruptIndexException, IOException {

    Map<String, Float> termScoreMap = new HashMap<>();

    ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        for (LeafReaderContext context : idxReader.leaves()) {
            LeafReader reader = context.reader();

            Terms terms = reader.terms(Constants.DOC_CONTENT);
            if (terms == null) {
                // Segment has no postings for the content field.
                continue;
            }

            TermsEnum termsEnum = terms.iterator();
            PostingsEnum postings = null; // reused across terms for efficiency

            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                postings = termsEnum.postings(postings);

                while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    int freq = postings.freq();
                    float tf = sim.tf(freq);
                    float idf = sim.idf(termsEnum.docFreq(), idxReader.numDocs());
                    termScoreMap.put(text.utf8ToString(), BETA * (tf * idf));
                }
            }
        }
    }

    return new ArrayList<>(termScoreMap.entrySet());
}

From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License:Open Source License

/**
 * Returns the tf-idf score of {@code term} in the first segment/document where
 * it occurs (case-insensitive match), or 0 if the term is not found.
 *
 * <p>Fix over the previous revision: {@code indexReader.numDocs()} referenced
 * an undefined name; the try-with-resources reader is {@code idxReader}.
 *
 * @param directory the index to search
 * @param term the term whose score is wanted
 * @return {@code tf * idf} for the first matching posting, or 0 when absent
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException on index open failure (per-leaf read errors are logged)
 */
private float getScore(Directory directory, String term) throws CorruptIndexException, IOException {

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

        for (LeafReaderContext context : idxReader.leaves()) {
            LeafReader reader = context.reader();

            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null; // reused across terms for efficiency

                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    postings = termsEnum.postings(postings);
                    if (text.utf8ToString().equalsIgnoreCase(term)) {

                        // Return on the first posting of the first matching term.
                        while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                            int freq = postings.freq();
                            float tf = sim.tf(freq);
                            float idf = sim.idf(termsEnum.docFreq(), idxReader.numDocs());
                            return tf * idf;
                        }
                    }
                }

            } catch (IOException ex) {
                // Best-effort: log and move on to the next segment.
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

    }

    return 0;
}

From source file:com.github.flaxsearch.resources.PositionsResource.java

License:Apache License

/**
 * REST endpoint returning the positional data of one term in one document.
 *
 * @param segment optional segment ordinal to restrict the lookup to
 * @param field   the indexed field holding the term
 * @param term    the term to look up
 * @param docId   the document whose positions are wanted
 * @return per-position data for the term in the requested document
 * @throws Exception if the postings cannot be read; a 404 is raised when the
 *                   document has no posting for the term
 */
@GET
public DocTermData getDocTermData(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @PathParam("term") String term, @PathParam("docId") int docId) throws Exception {

    TermsEnum te = readerManager.findTermPostings(segment, field, term);
    PostingsEnum pe = te.postings(null, PostingsEnum.ALL);

    // advance() lands on docId only when the doc actually carries this term.
    if (pe.advance(docId) != docId) {
        String seg = (segment == null) ? "" : " in segment " + segment;
        String msg = String.format(Locale.ROOT, "No document %d%s in index", docId, seg);
        throw new WebApplicationException(msg, Response.Status.NOT_FOUND);
    }

    // One PositionData per occurrence; freq() bounds the iteration.
    List<PositionData> positions = new ArrayList<>();
    for (int occurrence = pe.freq(); occurrence > 0; occurrence--) {
        positions.add(new PositionData(pe));
    }

    return new DocTermData(docId, positions);
}

From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java

License:Apache License

/**
 * Verifies that a postings format may iterate the flushed/merged postings more
 * than once: a wrapping PostingsFormat gathers its own docFreq/totalTermFreq
 * stats for the "body" field while writing, then the final IndexReader's stats
 * are asserted to match.
 */
@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()

    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {

            // Unwrap per-field formats to find the concrete one in use:
            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;

            final Thread mainThread = Thread.currentThread();

            if (field.equals("body")) {

                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:

                return new PostingsFormat(defaultPostingsFormat.getName()) {

                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                fieldsConsumer.write(fields);

                                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;

                                // We iterate the provided TermsEnum
                                // twice, so we exercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();

                                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;

                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        // Consume only a random prefix of the positions:
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }

                                    String termString = term.utf8ToString();

                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));

                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }

                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }

                                        if (isMerge == false && addOnSecondPass) {
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }

                                        //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(
                                            TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                // Non-"body" fields use the unwrapped format directly.
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

    // Index roughly atLeast(100) KB worth of line-file docs:
    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    // The reader's aggregate stats must equal what we counted during write:
    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    // Per-term stats must also match; ords are checked where supported:
    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

/**
 * Checks docs + freqs + positions + payloads, sequentially.
 */
/**
 * Asserts that two PostingsEnums expose the same docs, freqs and positions
 * when iterated sequentially. Offsets and payloads are deliberately not
 * compared; implementations are allowed to differ there.
 */
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
    assertNotNull(leftDocs);
    assertNotNull(rightDocs);

    // Both enums must be unpositioned before the first nextDoc():
    assertEquals(-1, leftDocs.docID());
    assertEquals(-1, rightDocs.docID());

    for (int doc = leftDocs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = leftDocs.nextDoc()) {
        assertEquals(doc, rightDocs.nextDoc());
        int termFreq = leftDocs.freq();
        assertEquals(termFreq, rightDocs.freq());
        for (int pos = 0; pos < termFreq; pos++) {
            assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
            // we don't assert offsets/payloads, they are allowed to be different
        }
    }

    // The right enum must be exhausted at exactly the same point:
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

/**
 * Checks advancing docs + positions.
 */
/**
 * Asserts that two PostingsEnums agree under a random mix of nextDoc() and
 * advance() calls, comparing freq and all positions at every landed doc.
 * Payload contents are intentionally not compared.
 *
 * @param docFreq   document frequency of the term, used to size advance gaps
 * @param leftDocs  reference enum (may be null, then both must be null)
 * @param rightDocs enum under test (may be null, then both must be null)
 */
public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs)
        throws Exception {
    if (leftDocs == null || rightDocs == null) {
        // Either both are absent or it's a mismatch:
        assertNull(leftDocs);
        assertNull(rightDocs);
        return;
    }

    int doc = -1;
    int averageGap = MAXDOC / (1 + docFreq);
    int skipInterval = 16;

    while (true) {
        if (random().nextBoolean()) {
            // Step with nextDoc():
            doc = leftDocs.nextDoc();
            assertEquals(doc, rightDocs.nextDoc());
        } else {
            // Jump forward with advance(), gap drawn from a Gaussian:
            int target = doc + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
            doc = leftDocs.advance(target);
            assertEquals(doc, rightDocs.advance(target));
        }

        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            return;
        }

        int termFreq = leftDocs.freq();
        assertEquals(termFreq, rightDocs.freq());
        for (int pos = 0; pos < termFreq; pos++) {
            assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
            // we don't compare the payloads, it's allowed that one is empty etc
        }
    }
}

From source file:com.shaie.annots.AnnotationSearchExample.java

License:Apache License

/**
 * Demo: indexes one document with a parallel "annot" field produced by a
 * TeeSink over the same token stream, prints the color-annotation payloads,
 * then runs two span-within queries against the annotation.
 *
 * <p>Fix over the previous revision: {@code dir}, {@code writer} and
 * {@code reader} were closed only on the happy path; they are now managed by
 * try-with-resources so any exception still releases them. The writer is
 * closed before the reader opens, preserving the original ordering.
 */
public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory()) {
        IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());

        // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
        // IndexWriterConfig.
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
            TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
            TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
                    textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

            Document doc = new Document();
            doc.add(new TextField("text", textStream));
            doc.add(new TextField("annot", colorAnnotationStream));
            writer.addDocument(doc);
        } // writer closed (commits) before the reader is opened

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
            printFieldTerms(ar, "text");
            System.out.println();

            // Decode each payload as two VInts: annotation start + length.
            final ByteArrayDataInput in = new ByteArrayDataInput();
            PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
            int docID = dape.nextDoc();
            int freq = dape.freq();
            System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
            for (int i = 0; i < freq; i++) {
                dape.nextPosition();
                BytesRef payload = dape.getPayload();
                in.reset(payload.bytes, payload.offset, payload.length);
                System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
            }

            IndexSearcher searcher = new IndexSearcher(reader);

            System.out.println("\nsearching for 'red WITHIN color':");
            Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
                    new SpanInclusivePositionTermQuery(new Term("text", "red")));
            TopDocs td = searcher.search(q, 10);
            System.out.println("  num results: " + td.scoreDocs.length);

            System.out.println("\nsearching for 'ate WITHIN color':");
            q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
                    new SpanInclusivePositionTermQuery(new Term("text", "ate")));
            td = searcher.search(q, 10);
            System.out.println("  num results: " + td.scoreDocs.length);
        }
    }
}

From source file:com.shaie.annots.AnnotationsUtils.java

License:Apache License

/**
 * Prints every annotation posting for {@code term}: one line per document
 * (doc id and freq), then one line per position with the payload decoded as
 * two VInts (start, length).
 *
 * @param reader segment reader holding the annotations
 * @param term   annotation term whose postings are printed
 * @throws IOException on index read failure
 */
public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    int docID;
    while ((docID = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int occurrence = 0; occurrence < freq; occurrence++) {
            postings.nextPosition();
            // Payload layout: VInt start followed by VInt length.
            final BytesRef payload = postings.getPayload();
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}

From source file:com.shaie.utils.IndexUtils.java

License:Apache License

/** Prints the terms indexed under the given fields with full postings information. */
public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException {
    for (final String field : fields) {
        System.out.println(format("Terms for field [%s], with positional info:", field));
        final TermsEnum te = reader.terms(field).iterator();
        BytesRef scratch;/*from   w w w.j  a  v a  2  s  . com*/
        PostingsEnum postings = null;
        while ((scratch = te.next()) != null) {
            System.out.println(format("  %s", scratch.utf8ToString()));
            postings = te.postings(postings, PostingsEnum.ALL);
            for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) {
                final Map<Integer, BytesRef> positions = Maps.newTreeMap();
                boolean addedPayload = false;
                for (int i = 0; i < postings.freq(); i++) {
                    final int pos = postings.nextPosition();
                    final BytesRef payload = postings.getPayload();
                    if (payload != null) {
                        positions.put(pos, BytesRef.deepCopyOf(payload));
                        addedPayload = true;
                    } else {
                        positions.put(pos, null);
                    }
                }
                if (addedPayload) {
                    System.out.println(
                            format("    doc=%d, freq=%d", postings.docID(), postings.freq(), positions));
                    for (final Entry<Integer, BytesRef> e : positions.entrySet()) {
                        System.out.println(format("      pos=%d, payload=%s", e.getKey(), e.getValue()));
                    }
                } else {
                    System.out.println(format("    doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(),
                            positions.keySet()));
                }
            }
        }
    }
}

From source file:edu.upenn.library.solrplugins.ProofOfConceptPayloadHandler.java

License:Apache License

/**
 * Builds a NamedList describing a term: its count, then one nested entry per
 * live document listing the payload string at each position.
 *
 * @param count    aggregate count for the term
 * @param postings postings (with payloads) for the term
 * @param liveDocs liveness bits; deleted documents are skipped
 * @return the assembled entry
 * @throws IOException on index read failure
 */
private NamedList<Object> buildEntryValue(long count, PostingsEnum postings, Bits liveDocs) throws IOException {
    NamedList<Object> entry = new NamedList<>();
    entry.add("count", count);
    int liveDocIndex = -1;
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        if (!liveDocs.get(postings.docID())) {
            continue; // skip deleted docs
        }
        liveDocIndex++;
        NamedList<Object> documentEntry = new NamedList<>();
        entry.add("doc" + liveDocIndex, documentEntry);
        for (int pos = 0; pos < postings.freq(); pos++) {
            postings.nextPosition();
            String extra = postings.getPayload().utf8ToString();
            documentEntry.add("position" + pos, extra);
        }
    }
    return entry;
}