Usage examples for org.apache.lucene.search.spans.Spans.nextDoc()
public abstract int nextDoc() throws IOException;
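All of the examples below share a two-level iteration idiom: nextDoc() advances to the next document containing at least one match, returning DocIdSetIterator.NO_MORE_DOCS when the iterator is exhausted, and nextStartPosition() then steps through the matches within the current document. Here is a minimal sketch of that idiom; the class and method names are illustrative only, not taken from any of the sources below.

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.spans.Spans;

class SpansIterationSketch { // hypothetical helper, for illustration only
    static void visitAll(Spans spans) throws IOException {
        // Outer loop: nextDoc() moves between documents that contain a match.
        while (spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // Inner loop: nextStartPosition() moves between matches in the current doc.
            while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                int doc = spans.docID();
                int start = spans.startPosition(); // inclusive token position
                int end = spans.endPosition();     // exclusive token position
                // ... consume (doc, start, end) here ...
            }
        }
    }
}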
From source file:nl.inl.blacklab.TestUtil.java
License:Apache License
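This BlackLab test helper iterates two Spans in lockstep: it calls nextDoc() on both (optionally skipping the first call on the actual Spans if the caller has already positioned it) and asserts at each step that document ids, start positions, and end positions agree.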
public static void assertEquals(Spans expected, Spans actual, boolean skipFirstNextDoc) throws IOException {
    int docNumber = 0, hitNumber;
    boolean firstDoc = true;
    while (true) {
        int actualDocId;
        if (firstDoc && skipFirstNextDoc) {
            // Actual Spans already skipped to document for testing. Don't nextDoc() this time.
            firstDoc = false;
            actualDocId = actual.docID();
        } else {
            actualDocId = actual.nextDoc();
        }
        docNumber++;
        hitNumber = 0;
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.nextDoc(), actualDocId);
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.docID(), actual.docID());
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
        if (actualDocId == DocIdSetIterator.NO_MORE_DOCS)
            break;
        Assert.assertEquals(-1, actual.startPosition());
        Assert.assertEquals(-1, actual.endPosition());
        boolean first = true;
        while (true) {
            int actualStartPos = actual.nextStartPosition();
            if (first) {
                // nextDoc() should always place us in a document with at least 1 hit
                first = false;
                Assert.assertFalse(actualStartPos == Spans.NO_MORE_POSITIONS);
            }
            hitNumber++;
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.nextStartPosition(), actualStartPos);
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.startPosition(), actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", actualStartPos, actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", expected.endPosition(), actual.endPosition());
            if (actualStartPos == Spans.NO_MORE_POSITIONS) {
                Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS, actual.nextStartPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS, actual.startPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", Spans.NO_MORE_POSITIONS, actual.endPosition());
                break;
            }
        }
    }
}
From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java
License:Apache License
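countSpans() rewrites the query, obtains Spans for the single leaf reader, and tallies one hit per start position across all documents returned by nextDoc().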
long countSpans(String field, Query q) throws Exception {
    List<LeafReaderContext> ctxs = reader.leaves();
    assert (ctxs.size() == 1);
    LeafReaderContext leafReaderContext = ctxs.get(0);
    SpanQuery sq = convert(field, q);
    sq = (SpanQuery) sq.rewrite(reader);
    SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
    final Spans spans = sw.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS);
    long i = 0;
    if (spans != null) {
        while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
            while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                i++;
            }
        }
    }
    return i;
}
From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java
License:Apache License
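countDocs() uses the same nested loop but records document ids in a BitSet, so multiple hits in one document count only once; the cardinality is then cross-checked against a TotalHitCountCollector run of the original query.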
long countDocs(String field, Query q) throws Exception {
    BitSet docs = new BitSet();
    List<LeafReaderContext> ctxs = reader.leaves();
    assert (ctxs.size() == 1);
    LeafReaderContext leafReaderContext = ctxs.get(0);
    SpanQuery sq = convert(field, q);
    sq = (SpanQuery) sq.rewrite(reader);
    SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
    final Spans spans = sw.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS);
    if (spans != null) {
        while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
            while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                docs.set(spans.docID());
            }
        }
    }
    long spanDocHits = docs.cardinality();
    // double-check with a regular searcher and the original query
    TotalHitCountCollector coll = new TotalHitCountCollector();
    searcher.search(q, coll);
    assertEquals(coll.getTotalHits(), spanDocHits);
    return spanDocHits;
}
From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java
License:Apache License
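This test expects exactly one span match for the parsed query and verifies its document id and start/end positions.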
private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, int trueDocID, int trueSpanStart,
        int trueSpanEnd) throws Exception {
    SpanQuery sq = (SpanQuery) p.parse(s);
    List<LeafReaderContext> ctxs = reader.leaves();
    assert (ctxs.size() == 1);
    LeafReaderContext ctx = ctxs.get(0);
    sq = (SpanQuery) sq.rewrite(ctx.reader());
    SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
    final Spans spans = sw.getSpans(ctx, SpanWeight.Postings.POSITIONS);
    int i = 0;
    int spanStart = -1;
    int spanEnd = -1;
    int docID = -1;
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            spanStart = spans.startPosition();
            spanEnd = spans.endPosition();
            docID = spans.docID();
            i++;
        }
    }
    assertEquals("should only be one matching span", 1, i);
    assertEquals("doc id", trueDocID, docID);
    assertEquals("span start", trueSpanStart, spanStart);
    assertEquals("span end", trueSpanEnd, spanEnd);
}
From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java
License:Apache License
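This crawler interleaves spans.nextDoc() with a filter DocIdSetIterator: the filter is advanced to each spans document, and the visitor is applied only to documents on which both iterators agree.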
static boolean visitLeafReader(LeafReaderContext leafCtx, Spans spans, DocIdSetIterator filterItr,
        DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException {
    int filterDoc = -1;
    int spansDoc = spans.nextDoc();
    while (true) {
        if (spansDoc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
        }
        // advance the filter to the first doc at or beyond the current spans doc
        filterDoc = filterItr.advance(spansDoc);
        if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
        } else if (filterDoc > spansDoc) {
            // catch the spans up to the filter, visiting only the doc both agree on
            while (spansDoc <= filterDoc) {
                spansDoc = spans.nextDoc();
                if (spansDoc == filterDoc) {
                    boolean cont = visit(leafCtx, spans, visitor);
                    if (!cont) {
                        return false;
                    }
                }
            }
        } else if (filterDoc == spansDoc) {
            boolean cont = visit(leafCtx, spans, visitor);
            if (!cont) {
                return false;
            }
            // then iterate spans
            spansDoc = spans.nextDoc();
        } else if (filterDoc < spansDoc) {
            // advance() must return a doc >= its target, so this should never happen
            throw new IllegalArgumentException("FILTER doc is < spansdoc!!!");
        } else {
            throw new IllegalArgumentException("Something horrible happened");
        }
    }
    return true;
}
From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java
License:Apache License
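The unfiltered overload simply visits every matching document until nextDoc() is exhausted.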
static boolean visitLeafReader(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
        throws IOException, TargetTokenNotFoundException {
    while (spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        boolean cont = visit(leafCtx, spans, visitor);
        if (!cont) {
            return false;
        }
    }
    return true;
}
From source file:org.voyanttools.trombone.lucene.search.SpanQueryParserTest.java
License:Open Source License
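A long-form test of Voyant's SpanQueryParser: each parsed SpanQuery is turned into Spans via SpanWeight.getSpans(), and the test asserts the exact sequence of nextDoc() and nextStartPosition() results against two small indexed documents.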
@Test
public void test() throws IOException {
    // File storageDirectory = TestHelper.getTemporaryTestStorageDirectory();
    // Storage storage = new FileStorage(storageDirectory);
    Storage storage = new MemoryStorage();
    Document document;
    LuceneManager luceneManager = storage.getLuceneManager();
    Bits bits = new Bits.MatchAllBits(2);
    Map<Term, TermContext> termsMap = new HashMap<Term, TermContext>();

    document = new Document();
    document.add(new TextField("lexical", "It was a dark and stormy night.", Field.Store.YES));
    luceneManager.addDocument(document);
    document = new Document();
    document.add(new TextField("lexical", "It was the best of times it was the worst of times.", Field.Store.YES));
    luceneManager.addDocument(document);

    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
    IndexSearcher indexSearcher = new IndexSearcher(atomicReader);
    SpanQueryParser spanQueryParser = new SpanQueryParser(atomicReader, storage.getLuceneManager().getAnalyzer());
    Map<String, SpanQuery> queriesMap;
    SpanQuery query;
    SpanWeight weight;
    Spans spans;

    // single term
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    spans.nextDoc();
    assertEquals(0, spans.docID());
    spans.nextStartPosition();
    assertEquals(3, spans.startPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // single term with case (this gets converted to lower case)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "It" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("It");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // single term (ignore quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark\"" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark", "best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark;best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate terms (not collapsed), with spaces
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { " dark ; best " }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // comma-separated terms (collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark,best" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark,best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*,b*t" }, TokenType.lexical, true); // dark and best
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar*,b*t");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // two separate wildcards (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*;bes*" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dar*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("bes*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark and" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark and");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "it was" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("it was");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());

    // phrase with wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar* an*" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar* an*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase with slop
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark stormy~2" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    // phrase with slop (ignored quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark stormy\"~2" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(Spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(Spans.NO_MORE_DOCS, spans.nextDoc());

    storage.destroy();
}
From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java
License:Open Source License
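This method collects the start and end positions of every span match, grouped by Lucene document id, so that downstream KWIC extraction can be parallelized per document.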
protected Map<Integer, List<DocumentSpansData>> getDocumentSpansData(CorpusMapper corpusMapper, String[] queries)
        throws IOException {
    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }
    Collection<DocumentSpansData> documentSpansDataList = new ArrayList<DocumentSpansData>();
    List<String> ids = this.getCorpusStoredDocumentIdsFromParameters(corpusMapper.getCorpus());
    BitSet bitSet = corpusMapper.getBitSetFromDocumentIds(ids);
    // CorpusTermsQueue queue = new CorpusTermsQueue(size, corpusTermSort);
    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        String queryString = spanQueryEntry.getKey();
        SpanQuery spanQuery = spanQueryEntry.getValue();
        Spans spans = corpusMapper.getFilteredSpans(spanQuery, bitSet);
        if (spans != null) {
            // map Lucene document id to span offset information
            List<int[]> spansDocDataList = new ArrayList<int[]>();
            // we're going to go through all the spans for all documents so that we can then
            // parallelize the searching of KWICs
            int doc = spans.nextDoc();
            while (doc != Spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                while (pos != Spans.NO_MORE_POSITIONS) {
                    spansDocDataList.add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                if (!spansDocDataList.isEmpty()) {
                    int[][] data = new int[spansDocDataList.size()][2];
                    for (int i = 0, len = data.length; i < len; i++) {
                        data[i] = spansDocDataList.get(i);
                    }
                    documentSpansDataList.add(new DocumentSpansData(doc, data, queryString));
                    spansDocDataList.clear();
                    // total++;
                }
                doc = spans.nextDoc();
            }
        }
    }
    // build a map to organize by document for efficiency
    Map<Integer, List<DocumentSpansData>> documentSpansDataMap = new HashMap<Integer, List<DocumentSpansData>>();
    for (DocumentSpansData dsd : documentSpansDataList) {
        if (!documentSpansDataMap.containsKey(dsd.luceneDoc)) {
            documentSpansDataMap.put(dsd.luceneDoc, new ArrayList<DocumentSpansData>());
        }
        documentSpansDataMap.get(dsd.luceneDoc).add(dsd);
    }
    return documentSpansDataMap;
}
From source file:org.voyanttools.trombone.tool.corpus.CorpusTerms.java
License:Open Source License
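This method tallies span hits per document while iterating nextDoc(), then converts the counts into raw and relative frequencies for a CorpusTerm.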
private void addToQueueFromSpansWithDistributions(CorpusMapper corpusMapper, FlexibleQueue<CorpusTerm> queue,
        String queryString, Spans spans) throws IOException {
    Corpus corpus = corpusMapper.getCorpus();
    int docIndexInCorpus = -1; // this should always be changed on the first span
    int[] tokensCounts = corpus.getTokensCounts(tokenType);
    Map<Integer, AtomicInteger> positionsMap = new HashMap<Integer, AtomicInteger>();
    int totalTokens = corpus.getTokensCount(tokenType);
    int doc = spans.nextDoc();
    while (doc != Spans.NO_MORE_DOCS) {
        docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc);
        int pos = spans.nextStartPosition();
        while (pos != Spans.NO_MORE_POSITIONS) {
            if (positionsMap.containsKey(docIndexInCorpus) == false) {
                positionsMap.put(docIndexInCorpus, new AtomicInteger(1));
            } else {
                positionsMap.get(docIndexInCorpus).incrementAndGet();
            }
            pos = spans.nextStartPosition();
        }
        doc = spans.nextDoc();
    }
    int[] rawFreqs = new int[corpus.size()];
    float[] relativeFreqs = new float[corpus.size()];
    int freq = 0;
    int inDocumentsCount = 0;
    for (Map.Entry<Integer, AtomicInteger> entry : positionsMap.entrySet()) {
        int f = entry.getValue().intValue();
        int documentPosition = entry.getKey();
        if (f > 0) {
            freq += f;
            inDocumentsCount++;
        }
        rawFreqs[documentPosition] = f;
        relativeFreqs[documentPosition] = (float) f / tokensCounts[documentPosition];
    }
    CorpusTerm corpusTerm = new CorpusTerm(queryString, freq, totalTokens, inDocumentsCount, corpus.size(),
            rawFreqs, relativeFreqs, parameters.getParameterIntValue("bins", corpus.size()));
    offer(queue, corpusTerm);
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentNgrams.java
License:Open Source License
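The n-gram tool walks nextDoc()/nextStartPosition() to gather span positions per document, maps Lucene ids to corpus positions, and later expands the positions into the underlying term strings.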
List<DocumentNgram> getNgrams(CorpusMapper corpusMapper, Keywords stopwords, String[] queries)
        throws IOException {
    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }
    Corpus corpus = corpusMapper.getCorpus();
    int docIndexInCorpus; // this should always be changed on the first span
    Map<Integer, Map<String, List<int[]>>> docTermPositionsMap = new HashMap<Integer, Map<String, List<int[]>>>();
    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        // CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(queryString);
        Spans spans = corpusMapper.getFilteredSpans(spanQueryEntry.getValue());
        if (spans != null) {
            Map<Integer, List<int[]>> documentAndPositionsMap = new HashMap<Integer, List<int[]>>();
            int doc = spans.nextDoc();
            while (doc != Spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc);
                documentAndPositionsMap.put(docIndexInCorpus, new ArrayList<int[]>());
                while (pos != Spans.NO_MORE_POSITIONS) {
                    documentAndPositionsMap.get(docIndexInCorpus)
                            .add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                doc = spans.nextDoc();
            }
            String queryString = spanQueryEntry.getKey();
            for (Map.Entry<Integer, List<int[]>> entry : documentAndPositionsMap.entrySet()) {
                doc = entry.getKey();
                if (docTermPositionsMap.containsKey(doc) == false) {
                    docTermPositionsMap.put(doc, new HashMap<String, List<int[]>>());
                }
                docTermPositionsMap.get(doc).put(queryString, entry.getValue());
            }
            documentAndPositionsMap.clear();
        }
    }
    int[] totalTokens = corpus.getLastTokenPositions(tokenType);
    StringBuilder realTermBuilder = new StringBuilder();
    String realTerm;
    List<DocumentNgram> allNgrams = new ArrayList<DocumentNgram>();
    OverlapFilter filter = getDocumentNgramsOverlapFilter(parameters);
    for (Map.Entry<Integer, Map<String, List<int[]>>> docEntry : docTermPositionsMap.entrySet()) {
        docIndexInCorpus = docEntry.getKey();
        SimplifiedTermInfo[] sparseSimplifiedTermInfoArray = getSparseSimplifiedTermInfoArray(corpusMapper,
                corpusMapper.getLuceneIdFromDocumentPosition(docIndexInCorpus), totalTokens[docIndexInCorpus]);
        Map<String, List<int[]>> realStringsMap = new HashMap<String, List<int[]>>();
        for (Map.Entry<String, List<int[]>> termEntry : docEntry.getValue().entrySet()) {
            // new Ngram(docIndexInCorpus, term, positions, length)
            for (int[] positions : termEntry.getValue()) {
                for (int i = positions[0]; i < positions[1]; i++) {
                    realTermBuilder.append(sparseSimplifiedTermInfoArray[i].term).append(" ");
                }
                realTerm = realTermBuilder.toString().trim();
                realTermBuilder.setLength(0);
                if (realStringsMap.containsKey(realTerm) == false) {
                    realStringsMap.put(realTerm, new ArrayList<int[]>());
                }
                realStringsMap.get(realTerm).add(new int[] { positions[0], positions[1] - 1 });
            }
        }
        List<DocumentNgram> ngrams = new ArrayList<DocumentNgram>();
        for (Map.Entry<String, List<int[]>> realTermMap : realStringsMap.entrySet()) {
            List<int[]> values = realTermMap.getValue();
            ngrams.add(new DocumentNgram(docIndexInCorpus, realTermMap.getKey(), values,
                    values.get(0)[1] + 1 - values.get(0)[0]));
        }
        // we need to go through our first list to see if any of them are long enough
        List<DocumentNgram> nextNgrams = getNextNgrams(ngrams, sparseSimplifiedTermInfoArray, docIndexInCorpus, 2);
        for (DocumentNgram ngram : ngrams) {
            if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
                nextNgrams.add(ngram);
            }
        }
        // ngrams = getFilteredNgrams(ngrams, totalTokens[docIndexInCorpus]);
        allNgrams.addAll(filter.getFilteredNgrams(nextNgrams, totalTokens[docIndexInCorpus]));
    }
    FlexibleQueue<DocumentNgram> queue = new FlexibleQueue<DocumentNgram>(comparator, start + limit);
    for (DocumentNgram ngram : allNgrams) {
        if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
            queue.offer(ngram);
        }
    }
    return queue.getOrderedList(start);
}