List of usage examples for the org.apache.lucene.search.spans.Spans#endPosition() method.
public abstract int endPosition();
From source file:it.cnr.ilc.lc.clavius.search.Tester.java
private static void searchWithContext(String term) { try {/* ww w . j av a2 s. c om*/ logger.info("searchWithContext(" + term + ")"); SpanQuery spanQuery = new SpanTermQuery(new Term("content", term)); Directory indexDirectory = FSDirectory.open( Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText")); DirectoryReader indexReader = DirectoryReader.open(indexDirectory); IndexSearcher searcher = new IndexSearcher(indexReader); IndexReader reader = searcher.getIndexReader(); //spanQuery = (SpanQuery) spanQuery.rewrite(reader); //SpanWeight weight = (SpanWeight) searcher.createWeight(spanQuery, false); Spans spans = spanQuery.createWeight(searcher, false) .getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS); // Spans spans2 = weight.getSpans(reader.leaves().get(0), // SpanWeight.Postings.OFFSETS); //Spans spans = weight.getSpans(reader.leaves().get(0), SpanWeight.Postings.POSITIONS); ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs; logger.info("hits :" + sc.length); int i; if (null != spans) { // while ((nextDoc = spans.nextDoc()) != Spans.NO_MORE_DOCS) { for (int k = 0; k < sc.length; k++) { int docId = sc[k].doc; logger.info("docID: " + docId); int newDocID = spans.advance(docId); logger.info("newDocID: " + newDocID); int nextSpan = -1; while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) { logger.info("nextSpan : " + nextSpan); logger.info("spans.startPosition(): " + spans.startPosition()); logger.info("spans.endPosition() : " + spans.endPosition()); logger.info("spans.width() : " + spans.width()); Fields fields = reader.getTermVectors(docId); Terms terms = fields.terms("content"); TermsEnum termsEnum = terms.iterator(); BytesRef text; PostingsEnum postingEnum = null; int start = spans.startPosition() - 3; int end = spans.endPosition() + 3; while ((text = termsEnum.next()) != null) { //could store the BytesRef here, but String is easier for this example String s = new 
String(text.bytes, text.offset, text.length); // DocsAndPositionsEnum positionsEnum = termsEnum.docsAndPositions(null, null); postingEnum = termsEnum.postings(postingEnum); if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { i = 0; int position = -1; while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) { if (position >= start && position <= end) { logger.info("pos: " + position + ", term: " + s + " offset: " + text.offset + " length: " + text.length); } i++; } } } } } } else { logger.info("no " + term + " found!"); } } catch (IOException e) { logger.error(e.getMessage()); } logger.info("End."); }
From source file:nl.inl.blacklab.TestUtil.java
License:Apache License
/**
 * Asserts that two Spans enumerate exactly the same documents and the same
 * sequence of (start, end) positions within each document, advancing both in
 * lockstep and failing with a doc/hit-numbered message on the first mismatch.
 *
 * @param expected         the Spans producing the expected doc ids and positions
 * @param actual           the Spans under test
 * @param skipFirstNextDoc if true, {@code actual} is assumed to already be
 *                         positioned on its first document, so the very first
 *                         {@code nextDoc()} call is skipped on it
 * @throws IOException if either Spans fails while advancing
 */
public static void assertEquals(Spans expected, Spans actual, boolean skipFirstNextDoc) throws IOException {
    int docNumber = 0, hitNumber;
    boolean firstDoc = true;
    while (true) {
        int actualDocId;
        if (firstDoc && skipFirstNextDoc) {
            // Actual Spans already skipped to document for testing. Don't .nextDoc() this time.
            firstDoc = false;
            actualDocId = actual.docID();
        } else {
            actualDocId = actual.nextDoc();
        }
        docNumber++;
        hitNumber = 0;
        // Both Spans must agree on the next doc id, and docID() must be
        // consistent with the value nextDoc() just returned.
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.nextDoc(), actualDocId);
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.docID(), actual.docID());
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
        if (actualDocId == DocIdSetIterator.NO_MORE_DOCS)
            break;
        // Before the first nextStartPosition() in a doc, both accessors must report -1.
        Assert.assertEquals(-1, actual.startPosition());
        Assert.assertEquals(-1, actual.endPosition());
        boolean first = true;
        while (true) {
            int actualStartPos = actual.nextStartPosition();
            if (first) {
                // .nextDoc() should always place us in a document with at least 1 hit
                first = false;
                Assert.assertFalse(actualStartPos == Spans.NO_MORE_POSITIONS);
            }
            hitNumber++;
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.nextStartPosition(),
                    actualStartPos);
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.startPosition(),
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", actualStartPos,
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", expected.endPosition(),
                    actual.endPosition());
            if (actualStartPos == Spans.NO_MORE_POSITIONS) {
                // Once exhausted within a doc, the Spans must stay exhausted:
                // every subsequent position accessor keeps reporting NO_MORE_POSITIONS.
                Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.nextStartPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.startPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", Spans.NO_MORE_POSITIONS,
                        actual.endPosition());
                break;
            }
        }
    }
}
From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java
License:Apache License
/**
 * Parses {@code s}, walks every span match in the (single-leaf) index, and
 * asserts there is exactly one match with the expected doc id and start/end
 * positions.
 */
private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, int trueDocID, int trueSpanStart,
        int trueSpanEnd) throws Exception {
    SpanQuery sq = (SpanQuery) p.parse(s);
    List<LeafReaderContext> ctxs = reader.leaves();
    assert (ctxs.size() == 1);
    LeafReaderContext ctx = ctxs.get(0);
    sq = (SpanQuery) sq.rewrite(ctx.reader());
    SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
    final Spans spans = sw.getSpans(ctx, SpanWeight.Postings.POSITIONS);
    int matchCount = 0;
    int lastStart = -1;
    int lastEnd = -1;
    int lastDocID = -1;
    // Record the last match seen; the asserts below require it to be the only one.
    for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
        for (int pos = spans.nextStartPosition(); pos != Spans.NO_MORE_POSITIONS;
                pos = spans.nextStartPosition()) {
            lastStart = spans.startPosition();
            lastEnd = spans.endPosition();
            lastDocID = spans.docID();
            matchCount++;
        }
    }
    assertEquals("should only be one matching span", 1, matchCount);
    assertEquals("doc id", trueDocID, lastDocID);
    assertEquals("span start", trueSpanStart, lastStart);
    assertEquals("span end", trueSpanEnd, lastEnd);
}
From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java
License:Apache License
/**
 * Feeds every span position in the current document (the one {@code spans} is
 * positioned on) to the visitor's DocTokenOffsets and then invokes the visitor.
 *
 * @return whatever {@link DocTokenOffsetsVisitor#visit} returns for this doc
 */
static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
        throws IOException, TargetTokenNotFoundException {
    // Load only the stored fields the visitor cares about for the current doc.
    Document doc = leafCtx.reader().document(spans.docID(), visitor.getFields());
    DocTokenOffsets docOffsets = visitor.getDocTokenOffsets();
    docOffsets.reset(leafCtx.docBase, spans.docID(), doc);
    // Accumulate every (start, end) position pair in this document.
    for (int pos = spans.nextStartPosition(); pos != Spans.NO_MORE_POSITIONS;
            pos = spans.nextStartPosition()) {
        docOffsets.addOffset(spans.startPosition(), spans.endPosition());
    }
    return visitor.visit(docOffsets);
}
From source file:org.voyanttools.trombone.lucene.search.SpanQueryParserTest.java
License:Open Source License
@Test public void test() throws IOException { // File storageDirectory = TestHelper.getTemporaryTestStorageDirectory(); // Storage storage = new FileStorage(storageDirectory); Storage storage = new MemoryStorage(); Document document;/* w w w .j av a 2 s . c om*/ LuceneManager luceneManager = storage.getLuceneManager(); Bits bits = new Bits.MatchAllBits(2); Map<Term, TermContext> termsMap = new HashMap<Term, TermContext>(); document = new Document(); document.add(new TextField("lexical", "It was a dark and stormy night.", Field.Store.YES)); luceneManager.addDocument(document); document = new Document(); document.add( new TextField("lexical", "It was the best of times it was the worst of times.", Field.Store.YES)); luceneManager.addDocument(document); LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader()); IndexSearcher indexSearcher = new IndexSearcher(atomicReader); SpanQueryParser spanQueryParser = new SpanQueryParser(atomicReader, storage.getLuceneManager().getAnalyzer()); Map<String, SpanQuery> queriesMap; SpanQuery query; SpanWeight weight; Spans spans; // single term queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dark"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); spans.nextDoc(); assertEquals(0, spans.docID()); spans.nextStartPosition(); assertEquals(3, spans.startPosition()); assertEquals(spans.nextStartPosition(), Spans.NO_MORE_POSITIONS); assertEquals(spans.nextDoc(), Spans.NO_MORE_DOCS); // single term with case (this gets converted to lower case) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "It" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("It"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), 
SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(0, spans.nextStartPosition()); assertEquals(1, spans.nextDoc()); assertEquals(0, spans.nextStartPosition()); assertEquals(6, spans.nextStartPosition()); // single term (ignore quotes) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark\"" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dark"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // two separate terms (not collapsed) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark", "best" }, TokenType.lexical, true); assertEquals(2, queriesMap.size()); query = queriesMap.get("dark"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); query = queriesMap.get("best"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(1, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // two separate terms (not collapsed) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark;best" }, TokenType.lexical, true); assertEquals(2, queriesMap.size()); query = queriesMap.get("dark"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), 
SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); query = queriesMap.get("best"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(1, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // two separate terms (not collapsed), with spaces queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { " dark ; best " }, TokenType.lexical, true); assertEquals(2, queriesMap.size()); query = queriesMap.get("dark"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); query = queriesMap.get("best"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(1, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // comma-separated terms (collapased) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark,best" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dark,best"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); 
assertEquals(1, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // wildcards queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*,b*t" }, TokenType.lexical, true); // dark and best assertEquals(1, queriesMap.size()); query = queriesMap.get("dar*,b*t"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(1, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // two separate wildcards (not collapsed) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*;bes*" }, TokenType.lexical, true); assertEquals(2, queriesMap.size()); query = queriesMap.get("dar*"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); query = queriesMap.get("bes*"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(1, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // phrase queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark and" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dark and"); weight = 
query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(5, spans.endPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "it was" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("it was"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(0, spans.nextStartPosition()); assertEquals(1, spans.nextDoc()); assertEquals(0, spans.nextStartPosition()); assertEquals(6, spans.nextStartPosition()); // phrase with wildcards queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar* an*" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dar* an*"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(5, spans.endPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); // phrase with wildcards queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark stormy~2" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dark stormy~2"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(6, spans.endPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, 
spans.nextDoc()); // phrase with wildcards (ignored quotes) queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark stormy\"~2" }, TokenType.lexical, true); assertEquals(1, queriesMap.size()); query = queriesMap.get("dark stormy~2"); weight = query.createWeight(indexSearcher, false); spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS); assertEquals(0, spans.nextDoc()); assertEquals(3, spans.nextStartPosition()); assertEquals(6, spans.endPosition()); assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition()); assertEquals(spans.NO_MORE_DOCS, spans.nextDoc()); storage.destroy(); }
From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java
License:Open Source License
protected Map<Integer, List<DocumentSpansData>> getDocumentSpansData(CorpusMapper corpusMapper, String[] queries) throws IOException { FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser( corpusMapper.getLeafReader(), storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()), tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name()); Map<String, SpanQuery> queriesMap; try {//w w w.j a va 2 s.com queriesMap = parser.getSpanQueriesMap(queries, false); } catch (Exception e) { throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e); } Collection<DocumentSpansData> documentSpansDataList = new ArrayList<DocumentSpansData>(); List<String> ids = this.getCorpusStoredDocumentIdsFromParameters(corpusMapper.getCorpus()); BitSet bitSet = corpusMapper.getBitSetFromDocumentIds(ids); // CorpusTermsQueue queue = new CorpusTermsQueue(size, corpusTermSort); for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) { String queryString = spanQueryEntry.getKey(); SpanQuery spanQuery = spanQueryEntry.getValue(); Spans spans = corpusMapper.getFilteredSpans(spanQuery, bitSet); if (spans != null) { // map lucene document id to span offset information List<int[]> spansDocDataList = new ArrayList<int[]>(); // we're going to go through all the span for all documents so that we can then // parallelize the searching of kwics int doc = spans.nextDoc(); while (doc != spans.NO_MORE_DOCS) { int pos = spans.nextStartPosition(); while (pos != spans.NO_MORE_POSITIONS) { spansDocDataList.add(new int[] { spans.startPosition(), spans.endPosition() }); pos = spans.nextStartPosition(); } if (!spansDocDataList.isEmpty()) { int[][] data = new int[spansDocDataList.size()][2]; for (int i = 0, len = data.length; i < len; i++) { data[i] = spansDocDataList.get(i); } documentSpansDataList.add(new DocumentSpansData(doc, data, queryString)); spansDocDataList.clear(); // 
total++; } doc = spans.nextDoc(); } } } // build a map to organize by document for efficiency Map<Integer, List<DocumentSpansData>> documentSpansDataMap = new HashMap<Integer, List<DocumentSpansData>>(); for (DocumentSpansData dsd : documentSpansDataList) { if (!documentSpansDataMap.containsKey(dsd.luceneDoc)) { documentSpansDataMap.put(dsd.luceneDoc, new ArrayList<DocumentSpansData>()); } documentSpansDataMap.get(dsd.luceneDoc).add(dsd); } return documentSpansDataMap; }
From source file:org.voyanttools.trombone.tool.corpus.DocumentNgrams.java
License:Open Source License
List<DocumentNgram> getNgrams(CorpusMapper corpusMapper, Keywords stopwords, String[] queries) throws IOException { FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser( corpusMapper.getLeafReader(), storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()), tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name()); Map<String, SpanQuery> queriesMap; try {/*from w w w .j a v a2s.c o m*/ queriesMap = parser.getSpanQueriesMap(queries, false); } catch (Exception e) { throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e); } Corpus corpus = corpusMapper.getCorpus(); int docIndexInCorpus; // this should always be changed on the first span Map<Integer, Map<String, List<int[]>>> docTermPositionsMap = new HashMap<Integer, Map<String, List<int[]>>>(); for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) { // CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(queryString); Spans spans = corpusMapper.getFilteredSpans(spanQueryEntry.getValue()); if (spans != null) { Map<Integer, List<int[]>> documentAndPositionsMap = new HashMap<Integer, List<int[]>>(); int doc = spans.nextDoc(); while (doc != spans.NO_MORE_DOCS) { int pos = spans.nextStartPosition(); docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc); documentAndPositionsMap.put(docIndexInCorpus, new ArrayList<int[]>()); while (pos != spans.NO_MORE_POSITIONS) { documentAndPositionsMap.get(docIndexInCorpus) .add(new int[] { spans.startPosition(), spans.endPosition() }); pos = spans.nextStartPosition(); } doc = spans.nextDoc(); } String queryString = spanQueryEntry.getKey(); for (Map.Entry<Integer, List<int[]>> entry : documentAndPositionsMap.entrySet()) { doc = entry.getKey(); if (docTermPositionsMap.containsKey(doc) == false) { docTermPositionsMap.put(doc, new HashMap<String, List<int[]>>()); } docTermPositionsMap.get(doc).put(queryString, 
entry.getValue()); } documentAndPositionsMap.clear(); } } int[] totalTokens = corpus.getLastTokenPositions(tokenType); StringBuilder realTermBuilder = new StringBuilder(); String realTerm; List<DocumentNgram> allNgrams = new ArrayList<DocumentNgram>(); OverlapFilter filter = getDocumentNgramsOverlapFilter(parameters); for (Map.Entry<Integer, Map<String, List<int[]>>> docEntry : docTermPositionsMap.entrySet()) { docIndexInCorpus = docEntry.getKey(); SimplifiedTermInfo[] sparseSimplifiedTermInfoArray = getSparseSimplifiedTermInfoArray(corpusMapper, corpusMapper.getLuceneIdFromDocumentPosition(docIndexInCorpus), totalTokens[docIndexInCorpus]); Map<String, List<int[]>> realStringsMap = new HashMap<String, List<int[]>>(); for (Map.Entry<String, List<int[]>> termEntry : docEntry.getValue().entrySet()) { // new Ngram(docIndexInCorpus, term, positions, length) for (int[] positions : termEntry.getValue()) { for (int i = positions[0]; i < positions[1]; i++) { realTermBuilder.append(sparseSimplifiedTermInfoArray[i].term).append(" "); } realTerm = realTermBuilder.toString().trim(); realTermBuilder.setLength(0); if (realStringsMap.containsKey(realTerm) == false) { realStringsMap.put(realTerm, new ArrayList<int[]>()); } realStringsMap.get(realTerm).add(new int[] { positions[0], positions[1] - 1 }); } } List<DocumentNgram> ngrams = new ArrayList<DocumentNgram>(); for (Map.Entry<String, List<int[]>> realTermMap : realStringsMap.entrySet()) { List<int[]> values = realTermMap.getValue(); DocumentNgram ngram = new DocumentNgram(docIndexInCorpus, realTermMap.getKey(), values, values.get(0)[1] + 1 - values.get(0)[0]); ngrams.add(new DocumentNgram(docIndexInCorpus, realTermMap.getKey(), values, values.get(0)[1] + 1 - values.get(0)[0])); } // we need to go through our first list to see if any of them are long enough List<DocumentNgram> nextNgrams = getNextNgrams(ngrams, sparseSimplifiedTermInfoArray, docIndexInCorpus, 2); for (DocumentNgram ngram : ngrams) { if (ngram.getLength() >= 
minLength && ngram.getLength() <= maxLength) { nextNgrams.add(ngram); } } //ngrams = getFilteredNgrams(ngrams, totalTokens[docIndexInCorpus]); allNgrams.addAll(filter.getFilteredNgrams(nextNgrams, totalTokens[docIndexInCorpus])); } FlexibleQueue<DocumentNgram> queue = new FlexibleQueue<DocumentNgram>(comparator, start + limit); for (DocumentNgram ngram : allNgrams) { if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) { queue.offer(ngram); } } return queue.getOrderedList(start); }
From source file:tw.com.kyle.luminance.LumQuery.java
public List<Integer[]> query_for_offsets(String term, String field, boolean useNearQuery) throws IOException { if (term.length() == 0) { return null; }/*from w w w . jav a 2 s . c o m*/ SpanQuery sq = null; if (!useNearQuery) { sq = new SpanTermQuery(new Term(field, term)); } else { SpanNearQuery.Builder builder = new SpanNearQuery.Builder(field, true); for (int i = 0; i < term.length(); ++i) { builder.addClause(new SpanTermQuery(new Term(field, term.substring(i, i + 1)))); } sq = builder.build(); } IndexSearcher searcher = new IndexSearcher(idx_reader); List<Integer[]> offs = new ArrayList<>(); for (LeafReaderContext ctx : idx_reader.leaves()) { SpanWeight weights = sq.createWeight(searcher, false); if (weights == null) { continue; } Spans spans = weights.getSpans(ctx, Postings.OFFSETS); if (spans == null) { System.out.printf("Nothing found for %s%n", term); continue; } int nxtDoc = -1; while ((nxtDoc = spans.nextDoc()) != Spans.NO_MORE_DOCS) { final int doc_id = nxtDoc; while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { final int start_pos = spans.startPosition(); final int end_pos = spans.endPosition(); Integer[] off_x = new Integer[] { doc_id, -1, -1 }; spans.collect(new SpanCollector() { @Override public void collectLeaf(PostingsEnum pe, int i, Term term) throws IOException { int s_off = pe.startOffset(); int e_off = pe.endOffset(); if (i == start_pos) off_x[1] = s_off; if (i + 1 == end_pos) off_x[2] = e_off; } @Override public void reset() { throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. } }); offs.add(off_x); } } } return offs; }
From source file:uk.co.flax.luwak.util.XNearSpansOrdered.java
License:Apache License
/**
 * Order the subSpans within the same document by using nextStartPosition on all subSpans
 * after the first as little as necessary.
 * Return true when the subSpans could be ordered in this way,
 * otherwise at least one is exhausted in the current doc.
 */
private boolean stretchToOrder() throws IOException {
    // subSpans[0] anchors the match; every later sub-span is advanced until it
    // starts at or after the previous one's end position.
    Spans prevSpans = subSpans[0];
    matchStart = prevSpans.startPosition();
    assert prevSpans.startPosition() != NO_MORE_POSITIONS : "prevSpans no start position " + prevSpans;
    assert prevSpans.endPosition() != NO_MORE_POSITIONS;
    // matchWidth accumulates the total gap between consecutive sub-spans.
    matchWidth = 0;
    for (int i = 1; i < subSpans.length; i++) {
        Spans spans = subSpans[i];
        assert spans.startPosition() != NO_MORE_POSITIONS;
        assert spans.endPosition() != NO_MORE_POSITIONS;
        // Advance this sub-span to start no earlier than the previous end;
        // if it runs out of positions, the doc cannot produce an ordered match.
        if (advancePosition(spans, prevSpans.endPosition()) == NO_MORE_POSITIONS) {
            oneExhaustedInCurrentDoc = true;
            return false;
        }
        matchWidth += (spans.startPosition() - prevSpans.endPosition());
        prevSpans = spans;
    }
    // The overall match ends where the last sub-span ends.
    matchEnd = subSpans[subSpans.length - 1].endPosition();
    return true; // all subSpans ordered and non overlapping
}