Example usage for org.apache.lucene.search.spans Spans endPosition

Introduction

On this page you can find example usage for org.apache.lucene.search.spans Spans endPosition.

Prototype

public abstract int endPosition();

Document

Returns the end position for the current start position, or -1 when #nextStartPosition was not yet called on the current doc.
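
The end position is exclusive: it points one past the last matching token, so a single-term match with start position 3 has end position 4. Below is a minimal sketch of the full iteration contract (the field name, term, and searcher variable are illustrative; the two-argument createWeight form matches most of the examples on this page, though the exact signature varies across Lucene versions):

SpanQuery query = new SpanTermQuery(new Term("content", "lucene"));
SpanWeight weight = query.createWeight(searcher, false);
for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
    Spans spans = weight.getSpans(ctx, SpanWeight.Postings.POSITIONS);
    if (spans == null) {
        continue; // the query matches nothing in this segment
    }
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
        // before nextStartPosition() is called, startPosition() and
        // endPosition() both return -1 for the current document
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            int start = spans.startPosition(); // first token position of the match
            int end = spans.endPosition();     // one past the last token position
        }
    }
}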

Usage

From source file:it.cnr.ilc.lc.clavius.search.Tester.java

private static void searchWithContext(String term) {

    try {
        logger.info("searchWithContext(" + term + ")");
        SpanQuery spanQuery = new SpanTermQuery(new Term("content", term));
        Directory indexDirectory = FSDirectory.open(
                Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText"));
        DirectoryReader indexReader = DirectoryReader.open(indexDirectory);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        IndexReader reader = searcher.getIndexReader();
        //spanQuery = (SpanQuery) spanQuery.rewrite(reader);
        //SpanWeight weight = (SpanWeight) searcher.createWeight(spanQuery, false);
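        // Note: only the first index segment (leaves().get(0)) is searched below;
        // a multi-segment index would need a loop over all leaf contexts.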
        Spans spans = spanQuery.createWeight(searcher, false)
                .getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
        //            Spans spans2 = weight.getSpans(reader.leaves().get(0),
        //                    SpanWeight.Postings.OFFSETS);
        //Spans spans = weight.getSpans(reader.leaves().get(0), SpanWeight.Postings.POSITIONS);
        ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs;

        logger.info("hits :" + sc.length);

        int i;
        if (null != spans) {
            //                while ((nextDoc = spans.nextDoc()) != Spans.NO_MORE_DOCS) {
            for (int k = 0; k < sc.length; k++) {
                int docId = sc[k].doc;
                logger.info("docID: " + docId);
                int newDocID = spans.advance(docId);
                logger.info("newDocID: " + newDocID);

                int nextSpan = -1;
                while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) {
                    logger.info("nextSpan             : " + nextSpan);
                    logger.info("spans.startPosition(): " + spans.startPosition());
                    logger.info("spans.endPosition()  : " + spans.endPosition());
                    logger.info("spans.width()        : " + spans.width());

                    Fields fields = reader.getTermVectors(docId);
                    Terms terms = fields.terms("content");

                    TermsEnum termsEnum = terms.iterator();
                    BytesRef text;
                    PostingsEnum postingEnum = null;
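                    // context window: three token positions on either side of the match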
                    int start = spans.startPosition() - 3;
                    int end = spans.endPosition() + 3;
                    while ((text = termsEnum.next()) != null) {
                        //could store the BytesRef here, but String is easier for this example
                        String s = new String(text.bytes, text.offset, text.length);
                        //                DocsAndPositionsEnum positionsEnum = termsEnum.docsAndPositions(null, null);
                        postingEnum = termsEnum.postings(postingEnum);
                        if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            i = 0;
                            int position = -1;
                            while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) {
                                if (position >= start && position <= end) {
                                    logger.info("pos: " + position + ", term: " + s + " offset: " + text.offset
                                            + " length: " + text.length);
                                }
                                i++;
                            }

                        }

                    }
                }
            }
        } else {
            logger.info("no " + term + " found!");
        }
    } catch (IOException e) {
        logger.error(e.getMessage());
    }
    logger.info("End.");
}

From source file:nl.inl.blacklab.TestUtil.java

License:Apache License

public static void assertEquals(Spans expected, Spans actual, boolean skipFirstNextDoc) throws IOException {
    int docNumber = 0, hitNumber;
    boolean firstDoc = true;
    while (true) {
        int actualDocId;
        if (firstDoc && skipFirstNextDoc) {
            // Actual Spans already skipped to document for testing. Don't .nextDoc() this time.
            firstDoc = false;
            actualDocId = actual.docID();
        } else {
            actualDocId = actual.nextDoc();
        }
        docNumber++;
        hitNumber = 0;
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.nextDoc(), actualDocId);
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.docID(), actual.docID());
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
        if (actualDocId == DocIdSetIterator.NO_MORE_DOCS)
            break;
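        // per the Spans contract, start and end positions are -1 until
        // nextStartPosition() is called on the newly positioned document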
        Assert.assertEquals(-1, actual.startPosition());
        Assert.assertEquals(-1, actual.endPosition());
        boolean first = true;
        while (true) {
            int actualStartPos = actual.nextStartPosition();
            if (first) {
                // .nextDoc() should always place us in a document with at least 1 hit
                first = false;
                Assert.assertFalse(actualStartPos == Spans.NO_MORE_POSITIONS);
            }
            hitNumber++;
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.nextStartPosition(),
                    actualStartPos);
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.startPosition(),
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", actualStartPos,
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", expected.endPosition(),
                    actual.endPosition());
            if (actualStartPos == Spans.NO_MORE_POSITIONS) {
                Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.nextStartPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.startPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", Spans.NO_MORE_POSITIONS,
                        actual.endPosition());
                break;
            }
        }
    }
}

From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java

License:Apache License

private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, int trueDocID, int trueSpanStart,
        int trueSpanEnd) throws Exception {
    SpanQuery sq = (SpanQuery) p.parse(s);
    List<LeafReaderContext> ctxs = reader.leaves();
    assert (ctxs.size() == 1);
    LeafReaderContext ctx = ctxs.get(0);
    sq = (SpanQuery) sq.rewrite(ctx.reader());
    SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);

    final Spans spans = sw.getSpans(ctx, SpanWeight.Postings.POSITIONS);

    int i = 0;
    int spanStart = -1;
    int spanEnd = -1;
    int docID = -1;

    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            spanStart = spans.startPosition();
            spanEnd = spans.endPosition();
            docID = spans.docID();
            i++;
        }
    }
    assertEquals("should only be one matching span", 1, i);
    assertEquals("doc id", trueDocID, docID);
    assertEquals("span start", trueSpanStart, spanStart);
    assertEquals("span end", trueSpanEnd, spanEnd);
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java

License:Apache License

static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
        throws IOException, TargetTokenNotFoundException {
    Document document = leafCtx.reader().document(spans.docID(), visitor.getFields());
    DocTokenOffsets offsets = visitor.getDocTokenOffsets();
    offsets.reset(leafCtx.docBase, spans.docID(), document);
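    // record the start/end token positions of every match in this document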
    while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        offsets.addOffset(spans.startPosition(), spans.endPosition());
    }
    return visitor.visit(offsets);
}

From source file:org.voyanttools.trombone.lucene.search.SpanQueryParserTest.java

License:Open Source License

@Test
public void test() throws IOException {

    //      File storageDirectory = TestHelper.getTemporaryTestStorageDirectory();
    //      Storage storage = new FileStorage(storageDirectory);
    Storage storage = new MemoryStorage();
    Document document;
    LuceneManager luceneManager = storage.getLuceneManager();
    Bits bits = new Bits.MatchAllBits(2);
    Map<Term, TermContext> termsMap = new HashMap<Term, TermContext>();

    document = new Document();
    document.add(new TextField("lexical", "It was a dark and stormy night.", Field.Store.YES));
    luceneManager.addDocument(document);
    document = new Document();
    document.add(
            new TextField("lexical", "It was the best of times it was the worst of times.", Field.Store.YES));
    luceneManager.addDocument(document);

    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
    IndexSearcher indexSearcher = new IndexSearcher(atomicReader);

    SpanQueryParser spanQueryParser = new SpanQueryParser(atomicReader,
            storage.getLuceneManager().getAnalyzer());

    Map<String, SpanQuery> queriesMap;
    SpanQuery query;
    SpanWeight weight;
    Spans spans;

    // single term
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    spans.nextDoc();
    assertEquals(0, spans.docID());
    spans.nextStartPosition();
    assertEquals(3, spans.startPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // single term with case (this gets converted to lower case)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "It" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("It");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // single term (ignore quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark\"" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark", "best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms joined by a semicolon (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark;best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed), with spaces
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { " dark ; best " }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // comma-separated terms (collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark,best" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());

    query = queriesMap.get("dark,best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*,b*t" }, TokenType.lexical, true); // dark and best
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar*,b*t");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate wildcards (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*;bes*" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());

    query = queriesMap.get("dar*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    query = queriesMap.get("bes*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark and" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark and");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "it was" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("it was");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // phrase with wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar* an*" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar* an*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase with slop
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark stormy~2" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase with slop (quotes ignored)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark stormy\"~2" }, TokenType.lexical,
            true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());

    storage.destroy();
}

From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java

License:Open Source License

protected Map<Integer, List<DocumentSpansData>> getDocumentSpansData(CorpusMapper corpusMapper,
        String[] queries) throws IOException {

    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }

    Collection<DocumentSpansData> documentSpansDataList = new ArrayList<DocumentSpansData>();

    List<String> ids = this.getCorpusStoredDocumentIdsFromParameters(corpusMapper.getCorpus());
    BitSet bitSet = corpusMapper.getBitSetFromDocumentIds(ids);

    //      CorpusTermsQueue queue = new CorpusTermsQueue(size, corpusTermSort);
    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        String queryString = spanQueryEntry.getKey();
        SpanQuery spanQuery = spanQueryEntry.getValue();
        Spans spans = corpusMapper.getFilteredSpans(spanQuery, bitSet);
        if (spans != null) {
            // map lucene document id to span offset information
            List<int[]> spansDocDataList = new ArrayList<int[]>();

            // we're going to go through all the spans for all documents so that we can then
            // parallelize the searching of kwics
            int doc = spans.nextDoc();
            while (doc != spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                while (pos != spans.NO_MORE_POSITIONS) {
                    spansDocDataList.add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                if (!spansDocDataList.isEmpty()) {
                    int[][] data = new int[spansDocDataList.size()][2];
                    for (int i = 0, len = data.length; i < len; i++) {
                        data[i] = spansDocDataList.get(i);
                    }
                    documentSpansDataList.add(new DocumentSpansData(doc, data, queryString));
                    spansDocDataList.clear();
                    //                  total++;
                }
                doc = spans.nextDoc();
            }
        }
    }

    // build a map to organize by document for efficiency
    Map<Integer, List<DocumentSpansData>> documentSpansDataMap = new HashMap<Integer, List<DocumentSpansData>>();
    for (DocumentSpansData dsd : documentSpansDataList) {
        if (!documentSpansDataMap.containsKey(dsd.luceneDoc)) {
            documentSpansDataMap.put(dsd.luceneDoc, new ArrayList<DocumentSpansData>());
        }
        documentSpansDataMap.get(dsd.luceneDoc).add(dsd);
    }

    return documentSpansDataMap;
}

From source file:org.voyanttools.trombone.tool.corpus.DocumentNgrams.java

License:Open Source License

List<DocumentNgram> getNgrams(CorpusMapper corpusMapper, Keywords stopwords, String[] queries)
        throws IOException {
    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }

    Corpus corpus = corpusMapper.getCorpus();
    int docIndexInCorpus; // this should always be changed on the first span
    Map<Integer, Map<String, List<int[]>>> docTermPositionsMap = new HashMap<Integer, Map<String, List<int[]>>>();

    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        //         CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(queryString);
        Spans spans = corpusMapper.getFilteredSpans(spanQueryEntry.getValue());
        if (spans != null) {
            Map<Integer, List<int[]>> documentAndPositionsMap = new HashMap<Integer, List<int[]>>();
            int doc = spans.nextDoc();
            while (doc != spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc);
                documentAndPositionsMap.put(docIndexInCorpus, new ArrayList<int[]>());
                while (pos != spans.NO_MORE_POSITIONS) {
                    documentAndPositionsMap.get(docIndexInCorpus)
                            .add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                doc = spans.nextDoc();
            }
            String queryString = spanQueryEntry.getKey();
            for (Map.Entry<Integer, List<int[]>> entry : documentAndPositionsMap.entrySet()) {
                doc = entry.getKey();
                if (docTermPositionsMap.containsKey(doc) == false) {
                    docTermPositionsMap.put(doc, new HashMap<String, List<int[]>>());
                }
                docTermPositionsMap.get(doc).put(queryString, entry.getValue());
            }
            documentAndPositionsMap.clear();
        }
    }

    int[] totalTokens = corpus.getLastTokenPositions(tokenType);
    StringBuilder realTermBuilder = new StringBuilder();
    String realTerm;
    List<DocumentNgram> allNgrams = new ArrayList<DocumentNgram>();
    OverlapFilter filter = getDocumentNgramsOverlapFilter(parameters);
    for (Map.Entry<Integer, Map<String, List<int[]>>> docEntry : docTermPositionsMap.entrySet()) {
        docIndexInCorpus = docEntry.getKey();
        SimplifiedTermInfo[] sparseSimplifiedTermInfoArray = getSparseSimplifiedTermInfoArray(corpusMapper,
                corpusMapper.getLuceneIdFromDocumentPosition(docIndexInCorpus), totalTokens[docIndexInCorpus]);
        Map<String, List<int[]>> realStringsMap = new HashMap<String, List<int[]>>();
        for (Map.Entry<String, List<int[]>> termEntry : docEntry.getValue().entrySet()) {
            //            new Ngram(docIndexInCorpus, term, positions, length)
            for (int[] positions : termEntry.getValue()) {
                for (int i = positions[0]; i < positions[1]; i++) {
                    realTermBuilder.append(sparseSimplifiedTermInfoArray[i].term).append(" ");
                }
                realTerm = realTermBuilder.toString().trim();
                realTermBuilder.setLength(0);
                if (realStringsMap.containsKey(realTerm) == false) {
                    realStringsMap.put(realTerm, new ArrayList<int[]>());
                }
                realStringsMap.get(realTerm).add(new int[] { positions[0], positions[1] - 1 });
            }
        }
        List<DocumentNgram> ngrams = new ArrayList<DocumentNgram>();
        for (Map.Entry<String, List<int[]>> realTermMap : realStringsMap.entrySet()) {
            List<int[]> values = realTermMap.getValue();
            ngrams.add(new DocumentNgram(docIndexInCorpus, realTermMap.getKey(), values,
                    values.get(0)[1] + 1 - values.get(0)[0]));
        }

        // we need to go through our first list to see if any of them are long enough
        List<DocumentNgram> nextNgrams = getNextNgrams(ngrams, sparseSimplifiedTermInfoArray, docIndexInCorpus,
                2);
        for (DocumentNgram ngram : ngrams) {
            if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
                nextNgrams.add(ngram);
            }
        }

        //ngrams = getFilteredNgrams(ngrams, totalTokens[docIndexInCorpus]);
        allNgrams.addAll(filter.getFilteredNgrams(nextNgrams, totalTokens[docIndexInCorpus]));
    }

    FlexibleQueue<DocumentNgram> queue = new FlexibleQueue<DocumentNgram>(comparator, start + limit);
    for (DocumentNgram ngram : allNgrams) {
        if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
            queue.offer(ngram);
        }
    }
    return queue.getOrderedList(start);
}

From source file:tw.com.kyle.luminance.LumQuery.java

public List<Integer[]> query_for_offsets(String term, String field, boolean useNearQuery) throws IOException {
    if (term.length() == 0) {
        return null;
    }

    SpanQuery sq = null;
    if (!useNearQuery) {
        sq = new SpanTermQuery(new Term(field, term));
    } else {

        SpanNearQuery.Builder builder = new SpanNearQuery.Builder(field, true);
        for (int i = 0; i < term.length(); ++i) {
            builder.addClause(new SpanTermQuery(new Term(field, term.substring(i, i + 1))));
        }
        sq = builder.build();
    }

    IndexSearcher searcher = new IndexSearcher(idx_reader);
    List<Integer[]> offs = new ArrayList<>();
    for (LeafReaderContext ctx : idx_reader.leaves()) {

        SpanWeight weights = sq.createWeight(searcher, false);
        if (weights == null) {
            continue;
        }
        Spans spans = weights.getSpans(ctx, Postings.OFFSETS);
        if (spans == null) {
            System.out.printf("Nothing found for %s%n", term);
            continue;
        }

        int nxtDoc = -1;
        while ((nxtDoc = spans.nextDoc()) != Spans.NO_MORE_DOCS) {
            final int doc_id = nxtDoc;
            while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                final int start_pos = spans.startPosition();
                final int end_pos = spans.endPosition();
                Integer[] off_x = new Integer[] { doc_id, -1, -1 };
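                // map the span's token positions to character offsets by collecting
                // start/end offsets from the matching terms' postings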
                spans.collect(new SpanCollector() {
                    @Override
                    public void collectLeaf(PostingsEnum pe, int i, Term term) throws IOException {
                        int s_off = pe.startOffset();
                        int e_off = pe.endOffset();
                        if (i == start_pos)
                            off_x[1] = s_off;
                        if (i + 1 == end_pos)
                            off_x[2] = e_off;
                    }

                    @Override
                    public void reset() {
                        throw new UnsupportedOperationException("Not supported yet.");
                    }
                });
                offs.add(off_x);
            }

        }

    }

    return offs;
}

From source file:uk.co.flax.luwak.util.XNearSpansOrdered.java

License:Apache License

/**
 * Order the subSpans within the same document by using nextStartPosition on all subSpans
 * after the first as little as necessary.
 * Return true when the subSpans could be ordered in this way,
 * otherwise at least one is exhausted in the current doc.
 */
private boolean stretchToOrder() throws IOException {
    Spans prevSpans = subSpans[0];
    matchStart = prevSpans.startPosition();
    assert prevSpans.startPosition() != NO_MORE_POSITIONS : "prevSpans no start position " + prevSpans;
    assert prevSpans.endPosition() != NO_MORE_POSITIONS;
    matchWidth = 0;
    for (int i = 1; i < subSpans.length; i++) {
        Spans spans = subSpans[i];
        assert spans.startPosition() != NO_MORE_POSITIONS;
        assert spans.endPosition() != NO_MORE_POSITIONS;
        if (advancePosition(spans, prevSpans.endPosition()) == NO_MORE_POSITIONS) {
            oneExhaustedInCurrentDoc = true;
            return false;
        }
        matchWidth += (spans.startPosition() - prevSpans.endPosition());
        prevSpans = spans;
    }
    matchEnd = subSpans[subSpans.length - 1].endPosition();
    return true; // all subSpans ordered and non overlapping
}