Example usage for org.apache.lucene.index PostingsEnum OFFSETS

Introduction

On this page you can find usage examples for org.apache.lucene.index PostingsEnum.OFFSETS.

Prototype

public static final short OFFSETS

Document

Flag to pass to TermsEnum#postings(PostingsEnum,int) if you require offsets in the returned enum.
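
Before the project examples below, here is a minimal sketch of how the flag is typically used: walk every term of a field and print the character offsets of each occurrence. It assumes the field was indexed with offsets (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); otherwise startOffset() and endOffset() return -1. The helper name printFieldOffsets is hypothetical, not part of the Lucene API.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper: prints [start,end) character offsets for every
// occurrence of every term in one field of a leaf reader.
public static void printFieldOffsets(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
        return; // field not present in this segment
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    PostingsEnum postings = null;
    while ((term = termsEnum.next()) != null) {
        // Request offsets; OFFSETS implies positions and freqs. Passing the
        // previous enum back lets the codec reuse it instead of reallocating.
        postings = termsEnum.postings(postings, PostingsEnum.OFFSETS);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            for (int i = 0; i < postings.freq(); i++) {
                postings.nextPosition(); // must be called before reading offsets
                System.out.printf("%s doc=%d [%d,%d)%n", term.utf8ToString(),
                        postings.docID(), postings.startOffset(), postings.endOffset());
            }
        }
    }
}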

Usage

From source file: com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License: Apache License

/**
 * checks the terms enum sequentially
 * if deep is false, it does a 'shallow' test that doesn't go down to the docsenums
 */
public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep,
        boolean hasPositions) throws Exception {
    BytesRef term;
    PostingsEnum leftPositions = null;
    PostingsEnum rightPositions = null;
    PostingsEnum leftDocs = null;
    PostingsEnum rightDocs = null;

    while ((term = leftTermsEnum.next()) != null) {
        assertEquals(term, rightTermsEnum.next());
        assertTermStats(leftTermsEnum, rightTermsEnum);
        if (deep) {
            if (hasPositions) {
                // with payloads + off
                assertDocsAndPositionsEnum(
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL));

                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL));
                // with payloads only
                assertDocsAndPositionsEnum(
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS));

                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS));

                // with offsets only
                assertDocsAndPositionsEnum(
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS));

                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS));

                // with positions only
                assertDocsAndPositionsEnum(
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS));

                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS));
            }

            // with freqs:
            assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs),
                    rightDocs = rightTermsEnum.postings(rightDocs));

            // w/o freqs:
            assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE),
                    rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE));

            // with freqs:
            assertDocsSkipping(leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.postings(leftDocs),
                    rightDocs = rightTermsEnum.postings(rightDocs));

            // w/o freqs:
            assertDocsSkipping(leftTermsEnum.docFreq(),
                    leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE),
                    rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE));
        }
    }
    assertNull(rightTermsEnum.next());
}
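
Note the reuse idiom above: each call passes the previous PostingsEnum back into postings(reuse, flags) so the codec can recycle it rather than allocate a new enum per term. The flags also form a hierarchy in current Lucene versions: POSITIONS implies FREQS, and OFFSETS and PAYLOADS each imply POSITIONS, with ALL requesting everything, which is why the test can exercise nextPosition() under every flag combination shown.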

From source file: org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java

License: Open Source License

private void fillTermsOfInterest(LeafReader leafReader, int luceneDoc, Map<Integer, TermInfo> termsOfInterest)
        throws IOException {
    // fill in terms of interest
    Terms terms = leafReader.getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        String termString = term.utf8ToString();
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postingsEnum != null) {
            postingsEnum.nextDoc();
            for (int i = 0, len = postingsEnum.freq(); i < len; i++) {
                int pos = postingsEnum.nextPosition();
                if (termsOfInterest.containsKey(pos)) {
                    termsOfInterest.put(pos, new TermInfo(termString, postingsEnum.startOffset(),
                            postingsEnum.endOffset(), pos, 1));
                }
            }
        }
    }
}
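
The single nextDoc() call works because getTermVector(luceneDoc, field) exposes one document's term vector as a miniature inverted index: each PostingsEnum contains exactly one document. Offsets are only available here if the field stored them at index time (e.g. via FieldType.setStoreTermVectorOffsets(true)); otherwise startOffset() and endOffset() return -1.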

From source file: org.voyanttools.trombone.tool.corpus.DocumentNgrams.java

License: Open Source License

private SimplifiedTermInfo[] getSparseSimplifiedTermInfoArray(CorpusMapper corpusMapper, int luceneDoc,
        int lastTokenOffset) throws IOException {

    Keywords stopwords = this.getStopwords(corpusMapper.getCorpus());
    Terms terms = corpusMapper.getLeafReader().getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    SimplifiedTermInfo[] simplifiedTermInfoArray = new SimplifiedTermInfo[lastTokenOffset + 1];
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        String termString = term.utf8ToString();
        //if (stopwords.isKeyword(termString)) {continue;} // treat as whitespace or punctuation
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        while (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
            int freq = postingsEnum.freq();
            for (int i = 0; i < freq; i++) {
                int pos = postingsEnum.nextPosition();
                simplifiedTermInfoArray[pos] = freq > 1
                        ? new SimplifiedTermInfo(termString, pos, 1, freq, postingsEnum.startOffset(),
                                postingsEnum.endOffset())
                        : new SimplifiedTermInfo(""); // empty string if not repeating
            }
        }
    }
    return simplifiedTermInfoArray;
}
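
PostingsEnum.NO_MORE_DOCS in this example is the sentinel inherited from DocIdSetIterator, so it is the same constant as DocIdSetIterator.NO_MORE_DOCS. Because the enum comes from a term vector, the outer loop runs at most once per term, but testing against NO_MORE_DOCS keeps the loop correct for any PostingsEnum.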

From source file: org.voyanttools.trombone.tool.corpus.DocumentTerms.java

License: Open Source License

private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords)
        throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper
            .getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef;
                while ((bytesRef = termsEnum.next()) != null) {
                    String termString = bytesRef.utf8ToString();
                    if (!whiteList.isEmpty() && !whiteList.isKeyword(termString)) {
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}
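
This example requests PostingsEnum.OFFSETS even when only positions are needed; since OFFSETS implies POSITIONS, one flag covers both the isNeedsPositions and isNeedsOffsets cases. When neither is needed, it skips the postings walk entirely and reads the frequency from termsEnum.totalTermFreq(), which for a single-document term vector equals the within-document frequency.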