Example usage for org.apache.lucene.search.spans Spans nextDoc

List of usage examples for org.apache.lucene.search.spans Spans nextDoc

Introduction

In this page you can find the example usage for org.apache.lucene.search.spans Spans nextDoc.

Prototype

public abstract int nextDoc() throws IOException;

Source Link

Document

Advances to the next document in the set and returns the doc it is currently on, or NO_MORE_DOCS if there are no more docs in the set.
NOTE: after the iterator has been exhausted you should not call this method, as it may result in unpredictable behavior.

Usage

From source file:nl.inl.blacklab.TestUtil.java

License:Apache License

/**
 * Walks two Spans in lock-step and asserts that they produce identical sequences of
 * documents and hit positions. When skipFirstNextDoc is true, the caller has already
 * positioned {@code actual} on its first document, so the first nextDoc() is skipped.
 */
public static void assertEquals(Spans expected, Spans actual, boolean skipFirstNextDoc) throws IOException {
    int docCount = 0;
    int hitCount;
    boolean skipPending = skipFirstNextDoc;
    while (true) {
        int docId;
        if (skipPending) {
            // 'actual' is already on the first doc for this test; read it without advancing.
            skipPending = false;
            docId = actual.docID();
        } else {
            docId = actual.nextDoc();
        }
        docCount++;
        hitCount = 0;
        Assert.assertEquals(StringUtil.ordinal(docCount) + " doc id", expected.nextDoc(), docId);
        Assert.assertEquals(StringUtil.ordinal(docCount) + " doc id", expected.docID(), actual.docID());
        Assert.assertEquals(StringUtil.ordinal(docCount) + " doc id", docId, actual.docID());
        if (docId == DocIdSetIterator.NO_MORE_DOCS) {
            break;
        }
        // Before the first nextStartPosition() call, positions must still be unset (-1).
        Assert.assertEquals(-1, actual.startPosition());
        Assert.assertEquals(-1, actual.endPosition());
        boolean expectHit = true;
        while (true) {
            int startPos = actual.nextStartPosition();
            if (expectHit) {
                // nextDoc() must only land on documents containing at least one hit.
                expectHit = false;
                Assert.assertFalse(startPos == Spans.NO_MORE_POSITIONS);
            }
            hitCount++;
            Assert.assertEquals(hitDesc(docCount, hitCount) + ": start pos", expected.nextStartPosition(),
                    startPos);
            Assert.assertEquals(hitDesc(docCount, hitCount) + ": start pos", expected.startPosition(),
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docCount, hitCount) + ": start pos", startPos,
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docCount, hitCount) + ": end pos", expected.endPosition(),
                    actual.endPosition());
            if (startPos == Spans.NO_MORE_POSITIONS) {
                // Once exhausted, the spans must stay exhausted and keep reporting the same doc.
                Assert.assertEquals(StringUtil.ordinal(docCount) + " doc id", docId, actual.docID());
                Assert.assertEquals(hitDesc(docCount, hitCount) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.nextStartPosition());
                Assert.assertEquals(hitDesc(docCount, hitCount) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.startPosition());
                Assert.assertEquals(hitDesc(docCount, hitCount) + ": end pos", Spans.NO_MORE_POSITIONS,
                        actual.endPosition());
                break;
            }
        }
    }
}

From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java

License:Apache License

/**
 * Converts q to a SpanQuery on the given field and counts every span match
 * across the (single) leaf of the reader.
 */
long countSpans(String field, Query q) throws Exception {
    List<LeafReaderContext> leaves = reader.leaves();
    assert (leaves.size() == 1);
    LeafReaderContext leaf = leaves.get(0);

    SpanQuery spanQuery = convert(field, q);
    spanQuery = (SpanQuery) spanQuery.rewrite(reader);
    SpanWeight weight = spanQuery.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = weight.getSpans(leaf, SpanWeight.Postings.POSITIONS);

    long total = 0;
    if (spans == null) {
        return total;
    }
    // Iterate doc by doc, then position by position within each doc.
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            total++;
        }
    }
    return total;
}

From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java

License:Apache License

/**
 * Converts q to a SpanQuery on the given field and counts the distinct documents
 * it matches, double-checking the total against a regular search of the original query.
 *
 * @return number of distinct matching documents
 */
long countDocs(String field, Query q) throws Exception {
    BitSet docs = new BitSet();
    List<LeafReaderContext> ctxs = reader.leaves();
    assert (ctxs.size() == 1);
    LeafReaderContext leafReaderContext = ctxs.get(0);
    SpanQuery sq = convert(field, q);
    sq = (SpanQuery) sq.rewrite(reader);
    SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = sw.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS);
    if (spans != null) {
        // FIX: the original also iterated every start position inside each doc, which only
        // re-set the same BitSet bit repeatedly. nextDoc() already lands only on matching
        // docs, so one set() per doc is equivalent and avoids O(total positions) work.
        while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
            docs.set(spans.docID());
        }
    }
    long spanDocHits = docs.cardinality();
    // double check with a regular searcher and original query
    TotalHitCountCollector coll = new TotalHitCountCollector();
    searcher.search(q, coll);
    assertEquals(coll.getTotalHits(), spanDocHits);
    return spanDocHits;
}

From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java

License:Apache License

/**
 * Parses s with the given parser, runs the resulting span query, and asserts that it
 * matches exactly once — at the expected document, start position, and end position.
 */
private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, int trueDocID, int trueSpanStart,
        int trueSpanEnd) throws Exception {
    SpanQuery sq = (SpanQuery) p.parse(s);
    List<LeafReaderContext> leaves = reader.leaves();
    assert (leaves.size() == 1);
    LeafReaderContext leaf = leaves.get(0);
    sq = (SpanQuery) sq.rewrite(leaf.reader());
    SpanWeight weight = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = weight.getSpans(leaf, SpanWeight.Postings.POSITIONS);

    int matches = 0;
    int lastStart = -1;
    int lastEnd = -1;
    int lastDoc = -1;
    // Record every match; there should end up being exactly one.
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            lastStart = spans.startPosition();
            lastEnd = spans.endPosition();
            lastDoc = spans.docID();
            matches++;
        }
    }
    assertEquals("should only be one matching span", 1, matches);
    assertEquals("doc id", trueDocID, lastDoc);
    assertEquals("span start", trueSpanStart, lastStart);
    assertEquals("span end", trueSpanEnd, lastEnd);
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java

License:Apache License

/**
 * Walks the spans and the filter iterator in lock-step and calls visit() for every
 * document that appears in both. Assumes filterItr is a forward-only iterator.
 *
 * @return false if the visitor aborted the crawl (visit() returned false);
 *         true when either iterator is exhausted
 */
static boolean visitLeafReader(LeafReaderContext leafCtx, Spans spans, DocIdSetIterator filterItr,
        DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException {
    int filterDoc = -1;
    int spansDoc = spans.nextDoc();
    while (true) {
        if (spansDoc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
        }
        // Jump the filter forward to at least the current spans doc.
        filterDoc = filterItr.advance(spansDoc);
        if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
        } else if (filterDoc > spansDoc) {
            // Filter is ahead: advance spans until it reaches (or passes) the filter doc.
            while (spansDoc <= filterDoc) {
                spansDoc = spans.nextDoc();
                if (spansDoc == filterDoc) {
                    boolean cont = visit(leafCtx, spans, visitor);
                    if (!cont) {
                        return false;
                    }

                } else {
                    // Not there yet (or overshot, which ends the inner loop); keep advancing.
                    continue;
                }
            }
        } else if (filterDoc == spansDoc) {
            // Both iterators agree on this doc: visit it.
            boolean cont = visit(leafCtx, spans, visitor);
            if (!cont) {
                return false;
            }
            //then iterate spans
            spansDoc = spans.nextDoc();
        } else if (filterDoc < spansDoc) {
            // advance(target) must return >= target, so this branch should be unreachable.
            throw new IllegalArgumentException("FILTER doc is < spansdoc!!!");
        } else {
            throw new IllegalArgumentException("Something horrible happened");
        }
    }
    return true;
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java

License:Apache License

/**
 * Visits every document the spans iterator matches; stops early if the visitor
 * signals it is done.
 *
 * @return false if the visitor aborted the crawl, true when the spans are exhausted
 */
static boolean visitLeafReader(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
        throws IOException, TargetTokenNotFoundException {
    for (int doc = spans.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = spans.nextDoc()) {
        if (!visit(leafCtx, spans, visitor)) {
            return false;
        }
    }
    return true;
}

From source file:org.voyanttools.trombone.lucene.search.SpanQueryParserTest.java

License:Open Source License

/**
 * Exercises SpanQueryParser.getSpanQueriesMap() against a tiny two-document index:
 * single terms (with/without quotes and case), multiple terms separated by ";" (kept
 * as separate queries) or "," (collapsed into one), wildcards, phrases, and "near"
 * phrases with slop (~N). Each resulting query is verified by walking its Spans.
 *
 * Fixes over the original: static constants are referenced via the Spans class
 * (not an instance), assertEquals arguments are consistently (expected, actual),
 * and the unused bits/termsMap locals were removed.
 */
@Test
public void test() throws IOException {

    Storage storage = new MemoryStorage();
    Document document;
    LuceneManager luceneManager = storage.getLuceneManager();

    // index two tiny documents
    document = new Document();
    document.add(new TextField("lexical", "It was a dark and stormy night.", Field.Store.YES));
    luceneManager.addDocument(document);
    document = new Document();
    document.add(
            new TextField("lexical", "It was the best of times it was the worst of times.", Field.Store.YES));
    luceneManager.addDocument(document);

    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
    IndexSearcher indexSearcher = new IndexSearcher(atomicReader);

    SpanQueryParser spanQueryParser = new SpanQueryParser(atomicReader,
            storage.getLuceneManager().getAnalyzer());

    Map<String, SpanQuery> queriesMap;
    SpanQuery query;
    SpanWeight weight;
    Spans spans;

    // single term
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    spans.nextDoc();
    assertEquals(0, spans.docID());
    spans.nextStartPosition();
    assertEquals(3, spans.startPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // single term with case (this gets converted to lower case)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "It" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("It");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // single term (ignore quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark\"" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark", "best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed), semicolon-separated in one string
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark;best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed), with spaces
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { " dark ; best " }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // comma-separated terms (collapsed into a single query matching either term)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark,best" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());

    query = queriesMap.get("dark,best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // wildcards, collapsed ("dark" and "best")
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*,b*t" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar*,b*t");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate wildcards (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*;bes*" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dar*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("bes*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark and" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark and");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "it was" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("it was");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // phrase with wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar* an*" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar* an*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // "near" phrase with slop
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark stormy~2" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // "near" phrase with slop (quotes are ignored)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark stormy\"~2" }, TokenType.lexical,
            true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    storage.destroy();
}

From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java

License:Open Source License

/**
 * Runs each parsed span query against the corpus and gathers, per Lucene document,
 * the token start/end offsets of every match, grouped by document id so downstream
 * KWIC processing can be parallelized.
 *
 * @param corpusMapper provides the leaf reader and filtered spans
 * @param queries raw query strings to parse into span queries
 * @return map from Lucene doc id to the DocumentSpansData of each query matching it
 * @throws IllegalArgumentException if any query fails to parse
 */
protected Map<Integer, List<DocumentSpansData>> getDocumentSpansData(CorpusMapper corpusMapper,
        String[] queries) throws IOException {

    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }

    Collection<DocumentSpansData> documentSpansDataList = new ArrayList<DocumentSpansData>();

    List<String> ids = this.getCorpusStoredDocumentIdsFromParameters(corpusMapper.getCorpus());
    BitSet bitSet = corpusMapper.getBitSetFromDocumentIds(ids);

    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        String queryString = spanQueryEntry.getKey();
        SpanQuery spanQuery = spanQueryEntry.getValue();
        Spans spans = corpusMapper.getFilteredSpans(spanQuery, bitSet);
        if (spans != null) {
            // start/end token offsets for the current document's matches
            List<int[]> spansDocDataList = new ArrayList<int[]>();

            // walk all spans for all documents up front so KWIC searching can be parallelized
            int doc = spans.nextDoc();
            while (doc != Spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                while (pos != Spans.NO_MORE_POSITIONS) {
                    spansDocDataList.add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                if (!spansDocDataList.isEmpty()) {
                    int[][] data = spansDocDataList.toArray(new int[spansDocDataList.size()][]);
                    documentSpansDataList.add(new DocumentSpansData(doc, data, queryString));
                    spansDocDataList.clear();
                }
                doc = spans.nextDoc();
            }
        }
    }

    // group by Lucene document id for efficiency
    Map<Integer, List<DocumentSpansData>> documentSpansDataMap = new HashMap<Integer, List<DocumentSpansData>>();
    for (DocumentSpansData dsd : documentSpansDataList) {
        List<DocumentSpansData> forDoc = documentSpansDataMap.get(dsd.luceneDoc);
        if (forDoc == null) {
            forDoc = new ArrayList<DocumentSpansData>();
            documentSpansDataMap.put(dsd.luceneDoc, forDoc);
        }
        forDoc.add(dsd);
    }

    return documentSpansDataMap;
}

From source file:org.voyanttools.trombone.tool.corpus.CorpusTerms.java

License:Open Source License

/**
 * Tallies the given spans into a per-document frequency distribution and offers a
 * single CorpusTerm (with raw and relative per-document frequencies) to the queue.
 *
 * @param corpusMapper maps Lucene doc ids to corpus document positions
 * @param queue destination queue for the resulting term
 * @param queryString the query string the spans were built from
 * @param spans matches to tally; fully consumed by this method
 */
private void addToQueueFromSpansWithDistributions(CorpusMapper corpusMapper, FlexibleQueue<CorpusTerm> queue,
        String queryString, Spans spans) throws IOException {
    Corpus corpus = corpusMapper.getCorpus();
    int docIndexInCorpus = -1; // this should always be changed on the first span
    int tokensCounts[] = corpus.getTokensCounts(tokenType);
    Map<Integer, AtomicInteger> positionsMap = new HashMap<Integer, AtomicInteger>();
    int totalTokens = corpus.getTokensCount(tokenType);
    int doc = spans.nextDoc();
    while (doc != Spans.NO_MORE_DOCS) {
        docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc);
        int pos = spans.nextStartPosition();
        while (pos != Spans.NO_MORE_POSITIONS) {
            // single map lookup instead of the original containsKey + get pair
            AtomicInteger count = positionsMap.get(docIndexInCorpus);
            if (count == null) {
                positionsMap.put(docIndexInCorpus, new AtomicInteger(1));
            } else {
                count.incrementAndGet();
            }
            pos = spans.nextStartPosition();
        }
        doc = spans.nextDoc();
    }
    int[] rawFreqs = new int[corpus.size()];
    float[] relativeFreqs = new float[corpus.size()];
    int freq = 0;
    int inDocumentsCount = 0;
    for (Map.Entry<Integer, AtomicInteger> entry : positionsMap.entrySet()) {
        int f = entry.getValue().intValue();
        int documentPosition = entry.getKey();
        if (f > 0) {
            freq += f;
            inDocumentsCount++;
        }
        rawFreqs[documentPosition] = f;
        relativeFreqs[documentPosition] = (float) f / tokensCounts[documentPosition];
    }
    CorpusTerm corpusTerm = new CorpusTerm(queryString, freq, totalTokens, inDocumentsCount, corpus.size(),
            rawFreqs, relativeFreqs, parameters.getParameterIntValue("bins", corpus.size()));
    offer(queue, corpusTerm);
}

From source file:org.voyanttools.trombone.tool.corpus.DocumentNgrams.java

License:Open Source License

/**
 * Builds DocumentNgrams for the given queries: collects match positions per document,
 * reconstructs the literal matched text from term infos, expands matches into longer
 * ngrams, then filters by overlap and by the configured min/max length.
 *
 * @param corpusMapper corpus access and Lucene-id/document-position mapping
 * @param stopwords currently unused here; kept for interface compatibility
 * @param queries raw query strings to parse into span queries
 * @return ngrams ordered by the configured comparator, starting at offset {@code start}
 * @throws IllegalArgumentException if any query fails to parse
 */
List<DocumentNgram> getNgrams(CorpusMapper corpusMapper, Keywords stopwords, String[] queries)
        throws IOException {
    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }

    Corpus corpus = corpusMapper.getCorpus();
    int docIndexInCorpus; // always assigned before use
    Map<Integer, Map<String, List<int[]>>> docTermPositionsMap = new HashMap<Integer, Map<String, List<int[]>>>();

    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        Spans spans = corpusMapper.getFilteredSpans(spanQueryEntry.getValue());
        if (spans != null) {
            // corpus document position -> [start, end) token offsets of each match
            Map<Integer, List<int[]>> documentAndPositionsMap = new HashMap<Integer, List<int[]>>();
            int doc = spans.nextDoc();
            while (doc != Spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc);
                documentAndPositionsMap.put(docIndexInCorpus, new ArrayList<int[]>());
                while (pos != Spans.NO_MORE_POSITIONS) {
                    documentAndPositionsMap.get(docIndexInCorpus)
                            .add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                doc = spans.nextDoc();
            }
            String queryString = spanQueryEntry.getKey();
            for (Map.Entry<Integer, List<int[]>> entry : documentAndPositionsMap.entrySet()) {
                doc = entry.getKey(); // note: here "doc" holds the corpus position, not a Lucene id
                Map<String, List<int[]>> termPositions = docTermPositionsMap.get(doc);
                if (termPositions == null) {
                    termPositions = new HashMap<String, List<int[]>>();
                    docTermPositionsMap.put(doc, termPositions);
                }
                termPositions.put(queryString, entry.getValue());
            }
            documentAndPositionsMap.clear();
        }
    }

    int[] totalTokens = corpus.getLastTokenPositions(tokenType);
    StringBuilder realTermBuilder = new StringBuilder();
    String realTerm;
    List<DocumentNgram> allNgrams = new ArrayList<DocumentNgram>();
    OverlapFilter filter = getDocumentNgramsOverlapFilter(parameters);
    for (Map.Entry<Integer, Map<String, List<int[]>>> docEntry : docTermPositionsMap.entrySet()) {
        docIndexInCorpus = docEntry.getKey();
        SimplifiedTermInfo[] sparseSimplifiedTermInfoArray = getSparseSimplifiedTermInfoArray(corpusMapper,
                corpusMapper.getLuceneIdFromDocumentPosition(docIndexInCorpus), totalTokens[docIndexInCorpus]);
        Map<String, List<int[]>> realStringsMap = new HashMap<String, List<int[]>>();
        for (Map.Entry<String, List<int[]>> termEntry : docEntry.getValue().entrySet()) {
            for (int[] positions : termEntry.getValue()) {
                // rebuild the literal text of the match from the per-token term infos
                // NOTE(review): assumes entries in [start, end) are non-null in the sparse array — confirm
                for (int i = positions[0]; i < positions[1]; i++) {
                    realTermBuilder.append(sparseSimplifiedTermInfoArray[i].term).append(" ");
                }
                realTerm = realTermBuilder.toString().trim();
                realTermBuilder.setLength(0);
                List<int[]> positionsForTerm = realStringsMap.get(realTerm);
                if (positionsForTerm == null) {
                    positionsForTerm = new ArrayList<int[]>();
                    realStringsMap.put(realTerm, positionsForTerm);
                }
                // stored as inclusive [start, end] token range
                positionsForTerm.add(new int[] { positions[0], positions[1] - 1 });
            }
        }
        List<DocumentNgram> ngrams = new ArrayList<DocumentNgram>();
        for (Map.Entry<String, List<int[]>> realTermMap : realStringsMap.entrySet()) {
            List<int[]> values = realTermMap.getValue();
            // FIX: the original also built an identical DocumentNgram into an unused local first
            ngrams.add(new DocumentNgram(docIndexInCorpus, realTermMap.getKey(), values,
                    values.get(0)[1] + 1 - values.get(0)[0]));
        }

        // expand to longer ngrams, then keep any initial ngrams already within the length bounds
        List<DocumentNgram> nextNgrams = getNextNgrams(ngrams, sparseSimplifiedTermInfoArray, docIndexInCorpus,
                2);
        for (DocumentNgram ngram : ngrams) {
            if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
                nextNgrams.add(ngram);
            }
        }

        allNgrams.addAll(filter.getFilteredNgrams(nextNgrams, totalTokens[docIndexInCorpus]));
    }

    FlexibleQueue<DocumentNgram> queue = new FlexibleQueue<DocumentNgram>(comparator, start + limit);
    for (DocumentNgram ngram : allNgrams) {
        if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
            queue.offer(ngram);
        }
    }
    return queue.getOrderedList(start);
}