Example usage for org.apache.lucene.search.spans Spans nextStartPosition

List of usage examples for org.apache.lucene.search.spans Spans nextStartPosition

Introduction

In this page you can find the example usage for org.apache.lucene.search.spans Spans nextStartPosition.

Prototype

public abstract int nextStartPosition() throws IOException;

Source Link

Document

Returns the next start position for the current doc.

Usage

From source file:it.cnr.ilc.lc.clavius.search.Tester.java

/**
 * Searches the "content" field for the given term and, for every span match,
 * logs the span positions plus the surrounding terms (3 positions of context
 * on each side, recovered from the document's term vectors).
 *
 * @param term the term to search for
 */
private static void searchWithContext(String term) {

    try {
        logger.info("searchWithContext(" + term + ")");
        SpanQuery spanQuery = new SpanTermQuery(new Term("content", term));
        Directory indexDirectory = FSDirectory.open(
                Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText"));
        // try-with-resources closes the reader; the previous version leaked it
        try (DirectoryReader indexReader = DirectoryReader.open(indexDirectory)) {
            IndexSearcher searcher = new IndexSearcher(indexReader);
            IndexReader reader = searcher.getIndexReader();
            Spans spans = spanQuery.createWeight(searcher, false)
                    .getSpans(reader.leaves().get(0), SpanWeight.Postings.POSITIONS);
            ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs;

            logger.info("hits :" + sc.length);

            if (null != spans) {
                for (int k = 0; k < sc.length; k++) {
                    int docId = sc[k].doc;
                    logger.info("docID: " + docId);
                    // advance() positions the spans on the first doc >= docId;
                    // every hit came from the same query, so this lands on docId
                    int newDocID = spans.advance(docId);
                    logger.info("newDocID: " + newDocID);

                    int nextSpan;
                    while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) {
                        logger.info("nextSpan             : " + nextSpan);
                        logger.info("spans.startPosition(): " + spans.startPosition());
                        logger.info("spans.endPosition()  : " + spans.endPosition());
                        logger.info("spans.width()        : " + spans.width());
                        logTermsInWindow(reader, docId, spans.startPosition() - 3, spans.endPosition() + 3);
                    }
                }
            } else {
                logger.info("no " + term + " found!");
            }
        }
    } catch (IOException e) {
        // log the full exception; logging only getMessage() loses the stack trace
        logger.error("searchWithContext(" + term + ") failed", e);
    }
    logger.info("End.");
}

/**
 * Logs every term of the "content" field whose position lies within
 * [start, end], using the stored term vectors of the given document.
 *
 * @param reader the index reader to pull term vectors from
 * @param docId the Lucene document id
 * @param start lowest position (inclusive) to report
 * @param end highest position (inclusive) to report
 * @throws IOException if the term vectors cannot be read
 */
private static void logTermsInWindow(IndexReader reader, int docId, int start, int end) throws IOException {
    Fields fields = reader.getTermVectors(docId);
    Terms terms = fields.terms("content");
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    PostingsEnum postingEnum = null;
    while ((text = termsEnum.next()) != null) {
        // BytesRef holds UTF-8; utf8ToString() decodes it correctly, whereas
        // new String(bytes, offset, length) used the platform default charset
        String s = text.utf8ToString();
        postingEnum = termsEnum.postings(postingEnum);
        if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int i = 0;
            int position;
            while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) {
                if (position >= start && position <= end) {
                    logger.info("pos: " + position + ", term: " + s + " offset: " + text.offset
                            + " length: " + text.length);
                }
                i++;
            }
        }
    }
}

From source file:nl.inl.blacklab.search.lucene.BLSpans.java

License:Apache License

/**
 * Advance the start position in the current doc to target or beyond.
 *
 * Always at least advances to the next hit, even if the current start
 * position is already at or beyond the target.
 *
 * @param spans the spans to operate on/*from w w w  .j  a  va 2s .  c  o m*/
 * @param target target start position to advance to
 * @return new start position, or Spans.NO_MORE_POSITIONS if we're done with this document
 * @throws IOException
 */
public static int advanceStartPosition(Spans spans, int target) throws IOException {
    if (spans instanceof BLSpans) {
        return ((BLSpans) spans).advanceStartPosition(target);
    }
    // Naive implementations; subclasses may provide a faster version.
    int pos;
    do {
        pos = spans.nextStartPosition();
    } while (pos < target && pos != NO_MORE_POSITIONS);
    return pos;
}

From source file:nl.inl.blacklab.TestUtil.java

License:Apache License

/**
 * Asserts that {@code expected} and {@code actual} enumerate exactly the same
 * sequence of documents and, within each document, the same hit positions,
 * and that {@code actual} obeys the Spans contract (positions are -1 before
 * the first hit and NO_MORE_POSITIONS stays sticky once exhausted).
 *
 * @param expected the reference Spans
 * @param actual the Spans under test
 * @param skipFirstNextDoc if true, {@code actual} is assumed to already be
 *        positioned on its first document, so the first nextDoc() is skipped
 * @throws IOException if reading from either Spans fails
 */
public static void assertEquals(Spans expected, Spans actual, boolean skipFirstNextDoc) throws IOException {
    int docNumber = 0, hitNumber;
    boolean firstDoc = true;
    while (true) {
        int actualDocId;
        if (firstDoc && skipFirstNextDoc) {
            // Actual Spans already skipped to document for testing. Don't .nextDoc() this time.
            firstDoc = false;
            actualDocId = actual.docID();
        } else {
            actualDocId = actual.nextDoc();
        }
        docNumber++;
        hitNumber = 0;
        // Both spans must agree on the doc id, and docID() must report what nextDoc() returned.
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.nextDoc(), actualDocId);
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.docID(), actual.docID());
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
        if (actualDocId == DocIdSetIterator.NO_MORE_DOCS)
            break;
        // Before the first nextStartPosition() on a new doc, positions must be unset (-1).
        Assert.assertEquals(-1, actual.startPosition());
        Assert.assertEquals(-1, actual.endPosition());
        boolean first = true;
        while (true) {
            int actualStartPos = actual.nextStartPosition();
            if (first) {
                // .nextDoc() should always place us in a document with at least 1 hit
                first = false;
                Assert.assertFalse(actualStartPos == Spans.NO_MORE_POSITIONS);
            }
            hitNumber++;
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.nextStartPosition(),
                    actualStartPos);
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.startPosition(),
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", actualStartPos,
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", expected.endPosition(),
                    actual.endPosition());
            if (actualStartPos == Spans.NO_MORE_POSITIONS) {
                // Once exhausted, the doc id must not change and the position
                // accessors must keep reporting NO_MORE_POSITIONS.
                Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.nextStartPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.startPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", Spans.NO_MORE_POSITIONS,
                        actual.endPosition());
                break;
            }
        }
    }
}

From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java

License:Apache License

/** Counts the total number of span matches for the query across the (single-segment) index. */
long countSpans(String field, Query q) throws Exception {
    // This helper assumes the test index has exactly one segment.
    List<LeafReaderContext> contexts = reader.leaves();
    assert (contexts.size() == 1);
    LeafReaderContext context = contexts.get(0);

    SpanQuery spanQuery = convert(field, q);
    spanQuery = (SpanQuery) spanQuery.rewrite(reader);
    SpanWeight weight = spanQuery.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = weight.getSpans(context, SpanWeight.Postings.POSITIONS);

    // Walk every document and every position within it, counting as we go.
    long total = 0;
    if (spans != null) {
        for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
            for (int pos = spans.nextStartPosition(); pos != Spans.NO_MORE_POSITIONS; pos = spans.nextStartPosition()) {
                total++;
            }
        }
    }
    return total;
}

From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java

License:Apache License

/** Counts the distinct documents containing at least one span match, cross-checked against a plain search. */
long countDocs(String field, Query q) throws Exception {
    // Records the id of every document that yields at least one span.
    BitSet matchedDocs = new BitSet();
    List<LeafReaderContext> contexts = reader.leaves();
    assert (contexts.size() == 1);
    LeafReaderContext context = contexts.get(0);
    SpanQuery spanQuery = convert(field, q);
    spanQuery = (SpanQuery) spanQuery.rewrite(reader);
    SpanWeight weight = spanQuery.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = weight.getSpans(context, SpanWeight.Postings.POSITIONS);
    if (spans != null) {
        for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
            for (int pos = spans.nextStartPosition(); pos != Spans.NO_MORE_POSITIONS; pos = spans.nextStartPosition()) {
                matchedDocs.set(spans.docID());
            }
        }
    }
    long spanDocHits = matchedDocs.cardinality();
    // Sanity check: a regular search on the original query must hit the same number of docs.
    TotalHitCountCollector collector = new TotalHitCountCollector();
    searcher.search(q, collector);
    assertEquals(collector.getTotalHits(), spanDocHits);
    return spanDocHits;
}

From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java

License:Apache License

/** Parses the query, asserts it matches exactly one span, and checks its doc id and offsets. */
private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, int trueDocID, int trueSpanStart,
        int trueSpanEnd) throws Exception {
    // Parse and rewrite the query against the single leaf of the test index.
    SpanQuery spanQuery = (SpanQuery) p.parse(s);
    List<LeafReaderContext> contexts = reader.leaves();
    assert (contexts.size() == 1);
    LeafReaderContext context = contexts.get(0);
    spanQuery = (SpanQuery) spanQuery.rewrite(context.reader());
    SpanWeight weight = spanQuery.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = weight.getSpans(context, SpanWeight.Postings.POSITIONS);

    int matchCount = 0;
    int lastStart = -1;
    int lastEnd = -1;
    int lastDocID = -1;

    // Exhaust all spans, remembering the last (and expectedly only) match.
    for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
        for (int pos = spans.nextStartPosition(); pos != Spans.NO_MORE_POSITIONS; pos = spans.nextStartPosition()) {
            lastStart = spans.startPosition();
            lastEnd = spans.endPosition();
            lastDocID = spans.docID();
            matchCount++;
        }
    }
    assertEquals("should only be one matching span", 1, matchCount);
    assertEquals("doc id", trueDocID, lastDocID);
    assertEquals("span start", trueSpanStart, lastStart);
    assertEquals("span end", trueSpanEnd, lastEnd);
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java

License:Apache License

/** Collects every span offset in the spans' current document and hands them to the visitor. */
static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
        throws IOException, TargetTokenNotFoundException {
    // Load only the stored fields the visitor declared an interest in.
    Document doc = leafCtx.reader().document(spans.docID(), visitor.getFields());
    DocTokenOffsets offsets = visitor.getDocTokenOffsets();
    offsets.reset(leafCtx.docBase, spans.docID(), doc);
    // Record the start/end of every span in the current document.
    for (int pos = spans.nextStartPosition(); pos != Spans.NO_MORE_POSITIONS; pos = spans.nextStartPosition()) {
        offsets.addOffset(spans.startPosition(), spans.endPosition());
    }
    return visitor.visit(offsets);
}

From source file:org.voyanttools.trombone.lucene.search.SpanQueryParserTest.java

License:Open Source License

/**
 * Exercises SpanQueryParser against a tiny two-document in-memory index:
 * single terms, case folding, quote handling, term lists (';' keeps queries
 * separate, ',' collapses them), wildcards, phrases, and phrases with slop.
 *
 * @throws IOException if the in-memory index cannot be built or read
 */
@Test
public void test() throws IOException {

    Storage storage = new MemoryStorage();
    Document document;
    LuceneManager luceneManager = storage.getLuceneManager();

    document = new Document();
    document.add(new TextField("lexical", "It was a dark and stormy night.", Field.Store.YES));
    luceneManager.addDocument(document);
    document = new Document();
    document.add(
            new TextField("lexical", "It was the best of times it was the worst of times.", Field.Store.YES));
    luceneManager.addDocument(document);

    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
    IndexSearcher indexSearcher = new IndexSearcher(atomicReader);

    SpanQueryParser spanQueryParser = new SpanQueryParser(atomicReader,
            storage.getLuceneManager().getAnalyzer());

    Map<String, SpanQuery> queriesMap;
    SpanQuery query;
    SpanWeight weight;
    Spans spans;

    // single term
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    spans.nextDoc();
    assertEquals(0, spans.docID());
    spans.nextStartPosition();
    assertEquals(3, spans.startPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // single term with case (this gets converted to lower case)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "It" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("It");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // single term (ignore quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark\"" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark", "best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed), semicolon-separated in one string
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark;best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed), with spaces
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { " dark ; best " }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // comma-separated terms (collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark,best" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());

    query = queriesMap.get("dark,best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*,b*t" }, TokenType.lexical, true); // dark and best
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar*,b*t");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate wildcards (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*;bes*" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dar*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("bes*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark and" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark and");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase occurring in both documents (twice in the second)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "it was" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("it was");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // phrase with wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar* an*" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar* an*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase with slop
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark stormy~2" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase with slop (ignored quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark stormy\"~2" }, TokenType.lexical,
            true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    storage.destroy();
}

From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java

License:Open Source License

/**
 * Parses the given queries into span queries and collects, per Lucene
 * document, the start/end positions of every span match, grouped by document
 * so that downstream KWIC extraction can be parallelized.
 *
 * @param corpusMapper mapper between corpus documents and Lucene doc ids
 * @param queries raw query strings to parse into span queries
 * @return map from Lucene document id to the span data of each query that matched it
 * @throws IOException if the index cannot be read
 * @throws IllegalArgumentException if any query string cannot be parsed
 */
protected Map<Integer, List<DocumentSpansData>> getDocumentSpansData(CorpusMapper corpusMapper,
        String[] queries) throws IOException {

    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }

    Collection<DocumentSpansData> documentSpansDataList = new ArrayList<DocumentSpansData>();

    List<String> ids = this.getCorpusStoredDocumentIdsFromParameters(corpusMapper.getCorpus());
    BitSet bitSet = corpusMapper.getBitSetFromDocumentIds(ids);

    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        String queryString = spanQueryEntry.getKey();
        SpanQuery spanQuery = spanQueryEntry.getValue();
        Spans spans = corpusMapper.getFilteredSpans(spanQuery, bitSet);
        if (spans != null) {
            // start/end offsets of each span within the current document
            List<int[]> spansDocDataList = new ArrayList<int[]>();

            // walk every span of every document up front so that KWIC
            // extraction can later be done in parallel per document
            int doc = spans.nextDoc();
            // access the constants via the class, not the instance
            while (doc != Spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                while (pos != Spans.NO_MORE_POSITIONS) {
                    spansDocDataList.add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                if (!spansDocDataList.isEmpty()) {
                    int[][] data = spansDocDataList.toArray(new int[0][]);
                    documentSpansDataList.add(new DocumentSpansData(doc, data, queryString));
                    spansDocDataList.clear();
                }
                doc = spans.nextDoc();
            }
        }
    }

    // group by document for efficient per-document processing
    Map<Integer, List<DocumentSpansData>> documentSpansDataMap = new HashMap<Integer, List<DocumentSpansData>>();
    for (DocumentSpansData dsd : documentSpansDataList) {
        if (!documentSpansDataMap.containsKey(dsd.luceneDoc)) {
            documentSpansDataMap.put(dsd.luceneDoc, new ArrayList<DocumentSpansData>());
        }
        documentSpansDataMap.get(dsd.luceneDoc).add(dsd);
    }

    return documentSpansDataMap;
}

From source file:org.voyanttools.trombone.tool.corpus.CorpusTerms.java

License:Open Source License

/**
 * Tallies span matches per corpus document and offers a CorpusTerm carrying
 * raw and relative per-document frequencies to the queue.
 *
 * @param corpusMapper mapper between corpus documents and Lucene doc ids
 * @param queue queue receiving the resulting CorpusTerm
 * @param queryString the query the spans were produced from (used as the term label)
 * @param spans span matches to tally; fully consumed by this method
 * @throws IOException if reading the spans fails
 */
private void addToQueueFromSpansWithDistributions(CorpusMapper corpusMapper, FlexibleQueue<CorpusTerm> queue,
        String queryString, Spans spans) throws IOException {
    Corpus corpus = corpusMapper.getCorpus();
    int docIndexInCorpus = -1; // this should always be changed on the first span
    int[] tokensCounts = corpus.getTokensCounts(tokenType);
    // corpus-document index -> number of span matches in that document
    Map<Integer, AtomicInteger> positionsMap = new HashMap<Integer, AtomicInteger>();
    int totalTokens = corpus.getTokensCount(tokenType);
    int doc = spans.nextDoc();
    while (doc != Spans.NO_MORE_DOCS) {
        docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc);
        int pos = spans.nextStartPosition();
        while (pos != Spans.NO_MORE_POSITIONS) {
            // create the counter lazily on the first match in this document
            AtomicInteger counter = positionsMap.get(docIndexInCorpus);
            if (counter == null) {
                counter = new AtomicInteger(0);
                positionsMap.put(docIndexInCorpus, counter);
            }
            counter.incrementAndGet();
            pos = spans.nextStartPosition();
        }
        doc = spans.nextDoc();
    }
    int[] rawFreqs = new int[corpus.size()];
    float[] relativeFreqs = new float[corpus.size()];
    int freq = 0;
    int inDocumentsCount = 0;
    for (Map.Entry<Integer, AtomicInteger> entry : positionsMap.entrySet()) {
        int f = entry.getValue().intValue();
        int documentPosition = entry.getKey();
        if (f > 0) {
            freq += f;
            inDocumentsCount++;
        }
        rawFreqs[documentPosition] = f;
        relativeFreqs[documentPosition] = (float) f / tokensCounts[documentPosition];
    }
    CorpusTerm corpusTerm = new CorpusTerm(queryString, freq, totalTokens, inDocumentsCount, corpus.size(),
            rawFreqs, relativeFreqs, parameters.getParameterIntValue("bins", corpus.size()));
    offer(queue, corpusTerm);
}