List of usage examples for org.apache.lucene.search.spans Spans nextStartPosition
public abstract int nextStartPosition() throws IOException;
From source file:it.cnr.ilc.lc.clavius.search.Tester.java
private static void searchWithContext(String term) { try {//from w w w . j ava 2 s .c om logger.info("searchWithContext(" + term + ")"); SpanQuery spanQuery = new SpanTermQuery(new Term("content", term)); Directory indexDirectory = FSDirectory.open( Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText")); DirectoryReader indexReader = DirectoryReader.open(indexDirectory); IndexSearcher searcher = new IndexSearcher(indexReader); IndexReader reader = searcher.getIndexReader(); //spanQuery = (SpanQuery) spanQuery.rewrite(reader); //SpanWeight weight = (SpanWeight) searcher.createWeight(spanQuery, false); Spans spans = spanQuery.createWeight(searcher, false) .getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS); // Spans spans2 = weight.getSpans(reader.leaves().get(0), // SpanWeight.Postings.OFFSETS); //Spans spans = weight.getSpans(reader.leaves().get(0), SpanWeight.Postings.POSITIONS); ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs; logger.info("hits :" + sc.length); int i; if (null != spans) { // while ((nextDoc = spans.nextDoc()) != Spans.NO_MORE_DOCS) { for (int k = 0; k < sc.length; k++) { int docId = sc[k].doc; logger.info("docID: " + docId); int newDocID = spans.advance(docId); logger.info("newDocID: " + newDocID); int nextSpan = -1; while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) { logger.info("nextSpan : " + nextSpan); logger.info("spans.startPosition(): " + spans.startPosition()); logger.info("spans.endPosition() : " + spans.endPosition()); logger.info("spans.width() : " + spans.width()); Fields fields = reader.getTermVectors(docId); Terms terms = fields.terms("content"); TermsEnum termsEnum = terms.iterator(); BytesRef text; PostingsEnum postingEnum = null; int start = spans.startPosition() - 3; int end = spans.endPosition() + 3; while ((text = termsEnum.next()) != null) { //could store the BytesRef here, but String is easier for this example String s = new 
String(text.bytes, text.offset, text.length); // DocsAndPositionsEnum positionsEnum = termsEnum.docsAndPositions(null, null); postingEnum = termsEnum.postings(postingEnum); if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { i = 0; int position = -1; while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) { if (position >= start && position <= end) { logger.info("pos: " + position + ", term: " + s + " offset: " + text.offset + " length: " + text.length); } i++; } } } } } } else { logger.info("no " + term + " found!"); } } catch (IOException e) { logger.error(e.getMessage()); } logger.info("End."); }
From source file:nl.inl.blacklab.search.lucene.BLSpans.java
License:Apache License
/** * Advance the start position in the current doc to target or beyond. * * Always at least advances to the next hit, even if the current start * position is already at or beyond the target. * * @param spans the spans to operate on/*from w w w .j a va 2s . c o m*/ * @param target target start position to advance to * @return new start position, or Spans.NO_MORE_POSITIONS if we're done with this document * @throws IOException */ public static int advanceStartPosition(Spans spans, int target) throws IOException { if (spans instanceof BLSpans) { return ((BLSpans) spans).advanceStartPosition(target); } // Naive implementations; subclasses may provide a faster version. int pos; do { pos = spans.nextStartPosition(); } while (pos < target && pos != NO_MORE_POSITIONS); return pos; }
From source file:nl.inl.blacklab.TestUtil.java
License:Apache License
/**
 * Asserts that two Spans produce identical sequences of documents and hit
 * positions, advancing both in lockstep and reporting the first mismatch
 * with a human-readable doc/hit description.
 *
 * @param expected the reference Spans
 * @param actual the Spans under test
 * @param skipFirstNextDoc if true, {@code actual} is assumed to already be
 *        positioned on its first document, so the first nextDoc() call is
 *        made only on {@code expected}
 * @throws IOException on index access error
 */
public static void assertEquals(Spans expected, Spans actual, boolean skipFirstNextDoc) throws IOException {
    int docNumber = 0, hitNumber;
    boolean firstDoc = true;
    while (true) {
        int actualDocId;
        if (firstDoc && skipFirstNextDoc) {
            // Actual Spans already skipped to document for testing. Don't .nextDoc() this time.
            firstDoc = false;
            actualDocId = actual.docID();
        } else {
            actualDocId = actual.nextDoc();
        }
        docNumber++;
        hitNumber = 0;
        // The three checks together verify nextDoc()'s return value and that
        // docID() agrees with it on both sides.
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.nextDoc(), actualDocId);
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", expected.docID(), actual.docID());
        Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
        if (actualDocId == DocIdSetIterator.NO_MORE_DOCS)
            break;
        // Before the first nextStartPosition() in a doc, both positions must be -1.
        Assert.assertEquals(-1, actual.startPosition());
        Assert.assertEquals(-1, actual.endPosition());
        boolean first = true;
        while (true) {
            int actualStartPos = actual.nextStartPosition();
            if (first) {
                // .nextDoc() should always place us in a document with at least 1 hit
                first = false;
                Assert.assertFalse(actualStartPos == Spans.NO_MORE_POSITIONS);
            }
            hitNumber++;
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.nextStartPosition(),
                    actualStartPos);
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", expected.startPosition(),
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", actualStartPos,
                    actual.startPosition());
            Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", expected.endPosition(),
                    actual.endPosition());
            if (actualStartPos == Spans.NO_MORE_POSITIONS) {
                // Once exhausted, the Spans must stay exhausted and keep reporting
                // the same doc id.
                Assert.assertEquals(StringUtil.ordinal(docNumber) + " doc id", actualDocId, actual.docID());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.nextStartPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": start pos", Spans.NO_MORE_POSITIONS,
                        actual.startPosition());
                Assert.assertEquals(hitDesc(docNumber, hitNumber) + ": end pos", Spans.NO_MORE_POSITIONS,
                        actual.endPosition());
                break;
            }
        }
    }
}
From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java
License:Apache License
long countSpans(String field, Query q) throws Exception { List<LeafReaderContext> ctxs = reader.leaves(); assert (ctxs.size() == 1); LeafReaderContext leafReaderContext = ctxs.get(0); SpanQuery sq = convert(field, q);//from w w w. j ava 2 s . c o m sq = (SpanQuery) sq.rewrite(reader); SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f); final Spans spans = sw.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS); long i = 0; if (spans != null) { while (spans.nextDoc() != Spans.NO_MORE_DOCS) { while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { i++; } } } return i; }
From source file:org.tallison.lucene.queryparser.spans.SQPTestBase.java
License:Apache License
long countDocs(String field, Query q) throws Exception { BitSet docs = new BitSet(); List<LeafReaderContext> ctxs = reader.leaves(); assert (ctxs.size() == 1); LeafReaderContext leafReaderContext = ctxs.get(0); SpanQuery sq = convert(field, q);//from w w w. j ava 2 s. c o m sq = (SpanQuery) sq.rewrite(reader); SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f); final Spans spans = sw.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS); if (spans != null) { while (spans.nextDoc() != Spans.NO_MORE_DOCS) { while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { docs.set(spans.docID()); } } } long spanDocHits = docs.cardinality(); // double check with a regular searcher and original query TotalHitCountCollector coll = new TotalHitCountCollector(); searcher.search(q, coll); assertEquals(coll.getTotalHits(), spanDocHits); return spanDocHits; }
From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java
License:Apache License
private void testOffsetForSingleSpanMatch(SpanOnlyParser p, String s, int trueDocID, int trueSpanStart, int trueSpanEnd) throws Exception { SpanQuery sq = (SpanQuery) p.parse(s); List<LeafReaderContext> ctxs = reader.leaves(); assert (ctxs.size() == 1); LeafReaderContext ctx = ctxs.get(0); sq = (SpanQuery) sq.rewrite(ctx.reader()); SpanWeight sw = sq.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f); final Spans spans = sw.getSpans(ctx, SpanWeight.Postings.POSITIONS); int i = 0;/*from w w w . j a v a 2 s . co m*/ int spanStart = -1; int spanEnd = -1; int docID = -1; while (spans.nextDoc() != Spans.NO_MORE_DOCS) { while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { spanStart = spans.startPosition(); spanEnd = spans.endPosition(); docID = spans.docID(); i++; } } assertEquals("should only be one matching span", 1, i); assertEquals("doc id", trueDocID, docID); assertEquals("span start", trueSpanStart, spanStart); assertEquals("span end", trueSpanEnd, spanEnd); }
From source file:org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java
License:Apache License
static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException { Document document = leafCtx.reader().document(spans.docID(), visitor.getFields()); DocTokenOffsets offsets = visitor.getDocTokenOffsets(); offsets.reset(leafCtx.docBase, spans.docID(), document); while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { offsets.addOffset(spans.startPosition(), spans.endPosition()); }//from ww w .j a v a2 s.co m return visitor.visit(offsets); }
From source file:org.voyanttools.trombone.lucene.search.SpanQueryParserTest.java
License:Open Source License
/**
 * Exercises SpanQueryParser.getSpanQueriesMap() against a tiny two-document
 * in-memory index (doc 0: "It was a dark and stormy night."; doc 1: "It was
 * the best of times it was the worst of times.") and walks the resulting
 * Spans to verify document ids and match positions for: single terms, quoted
 * terms, semicolon-separated term lists, comma-collapsed term lists,
 * wildcards, phrases and slop phrases.
 */
@Test
public void test() throws IOException {
    Storage storage = new MemoryStorage();
    Document document;
    LuceneManager luceneManager = storage.getLuceneManager();
    Bits bits = new Bits.MatchAllBits(2);
    Map<Term, TermContext> termsMap = new HashMap<Term, TermContext>();
    // doc 0
    document = new Document();
    document.add(new TextField("lexical", "It was a dark and stormy night.", Field.Store.YES));
    luceneManager.addDocument(document);
    // doc 1
    document = new Document();
    document.add(
            new TextField("lexical", "It was the best of times it was the worst of times.", Field.Store.YES));
    luceneManager.addDocument(document);
    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
    IndexSearcher indexSearcher = new IndexSearcher(atomicReader);
    SpanQueryParser spanQueryParser = new SpanQueryParser(atomicReader,
            storage.getLuceneManager().getAnalyzer());
    Map<String, SpanQuery> queriesMap;
    SpanQuery query;
    SpanWeight weight;
    Spans spans;
    // single term
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    spans.nextDoc();
    assertEquals(0, spans.docID());
    spans.nextStartPosition();
    assertEquals(3, spans.startPosition());
    assertEquals(spans.nextStartPosition(), Spans.NO_MORE_POSITIONS);
    assertEquals(spans.nextDoc(), Spans.NO_MORE_DOCS);
    // single term with case (this gets converted to lower case)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "It" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("It");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());
    // single term (ignore quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark\"" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark", "best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // two separate terms (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark;best" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // two separate terms (not collapsed), with spaces
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { " dark ; best " }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dark");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // comma-separated terms (collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark,best" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark,best");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*,b*t" }, TokenType.lexical, true); // dark and best
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar*,b*t");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // two separate wildcards (not collapsed)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar*;bes*" }, TokenType.lexical, true);
    assertEquals(2, queriesMap.size());
    query = queriesMap.get("dar*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    query = queriesMap.get("bes*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(1, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // phrase
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark and" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark and");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // phrase occurring in both documents
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "it was" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("it was");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(1, spans.nextDoc());
    assertEquals(0, spans.nextStartPosition());
    assertEquals(6, spans.nextStartPosition());
    // phrase with wildcards
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dar* an*" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dar* an*");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(5, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // phrase with slop ("dark ... stormy" within 2 positions)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "dark stormy~2" }, TokenType.lexical, true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    // phrase with slop (ignored quotes)
    queriesMap = spanQueryParser.getSpanQueriesMap(new String[] { "\"dark stormy\"~2" }, TokenType.lexical,
            true);
    assertEquals(1, queriesMap.size());
    query = queriesMap.get("dark stormy~2");
    weight = query.createWeight(indexSearcher, false);
    spans = weight.getSpans(atomicReader.getContext(), SpanWeight.Postings.POSITIONS);
    assertEquals(0, spans.nextDoc());
    assertEquals(3, spans.nextStartPosition());
    assertEquals(6, spans.endPosition());
    assertEquals(spans.NO_MORE_POSITIONS, spans.nextStartPosition());
    assertEquals(spans.NO_MORE_DOCS, spans.nextDoc());
    storage.destroy();
}
From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java
License:Open Source License
/**
 * Parses the given query strings into span queries, runs each against the
 * corpus (restricted to the documents selected by the request parameters),
 * and gathers every match's [startPosition, endPosition] pair.
 *
 * @param corpusMapper mapper providing the index reader and doc-id filtering
 * @param queries raw query strings to parse
 * @return map from Lucene doc id to the list of DocumentSpansData for that
 *         document (one entry per query that matched it)
 * @throws IOException on index access error
 * @throws IllegalArgumentException if any query string fails to parse
 */
protected Map<Integer, List<DocumentSpansData>> getDocumentSpansData(CorpusMapper corpusMapper,
        String[] queries) throws IOException {
    FieldPrefixAwareSimpleSpanQueryParser parser = new FieldPrefixAwareSimpleSpanQueryParser(
            corpusMapper.getLeafReader(),
            storage.getLuceneManager().getAnalyzer(corpusMapper.getCorpus().getId()),
            tokenType == TokenType.other ? parameters.getParameterValue("tokenType") : tokenType.name());
    Map<String, SpanQuery> queriesMap;
    try {
        queriesMap = parser.getSpanQueriesMap(queries, false);
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to parse queries: " + StringUtils.join(queries, "; "), e);
    }
    Collection<DocumentSpansData> documentSpansDataList = new ArrayList<DocumentSpansData>();
    List<String> ids = this.getCorpusStoredDocumentIdsFromParameters(corpusMapper.getCorpus());
    BitSet bitSet = corpusMapper.getBitSetFromDocumentIds(ids);
    for (Map.Entry<String, SpanQuery> spanQueryEntry : queriesMap.entrySet()) {
        String queryString = spanQueryEntry.getKey();
        SpanQuery spanQuery = spanQueryEntry.getValue();
        Spans spans = corpusMapper.getFilteredSpans(spanQuery, bitSet);
        if (spans != null) {
            // map lucene document id to span offset information
            // Reused per-document buffer; emptied (clear()) after each doc is flushed.
            List<int[]> spansDocDataList = new ArrayList<int[]>();
            // we're going to go through all the spans for all documents so that we can then
            // parallelize the searching of kwics
            int doc = spans.nextDoc();
            while (doc != spans.NO_MORE_DOCS) {
                int pos = spans.nextStartPosition();
                while (pos != spans.NO_MORE_POSITIONS) {
                    spansDocDataList.add(new int[] { spans.startPosition(), spans.endPosition() });
                    pos = spans.nextStartPosition();
                }
                if (!spansDocDataList.isEmpty()) {
                    // snapshot the buffered pairs into a fixed array for this doc
                    int[][] data = new int[spansDocDataList.size()][2];
                    for (int i = 0, len = data.length; i < len; i++) {
                        data[i] = spansDocDataList.get(i);
                    }
                    documentSpansDataList.add(new DocumentSpansData(doc, data, queryString));
                    spansDocDataList.clear();
                }
                doc = spans.nextDoc();
            }
        }
    }
    // build a map to organize by document for efficiency
    Map<Integer, List<DocumentSpansData>> documentSpansDataMap = new HashMap<Integer, List<DocumentSpansData>>();
    for (DocumentSpansData dsd : documentSpansDataList) {
        if (!documentSpansDataMap.containsKey(dsd.luceneDoc)) {
            documentSpansDataMap.put(dsd.luceneDoc, new ArrayList<DocumentSpansData>());
        }
        documentSpansDataMap.get(dsd.luceneDoc).add(dsd);
    }
    return documentSpansDataMap;
}
From source file:org.voyanttools.trombone.tool.corpus.CorpusTerms.java
License:Open Source License
private void addToQueueFromSpansWithDistributions(CorpusMapper corpusMapper, FlexibleQueue<CorpusTerm> queue, String queryString, Spans spans) throws IOException { Corpus corpus = corpusMapper.getCorpus(); int docIndexInCorpus = -1; // this should always be changed on the first span int tokensCounts[] = corpus.getTokensCounts(tokenType); Map<Integer, AtomicInteger> positionsMap = new HashMap<Integer, AtomicInteger>(); int totalTokens = corpus.getTokensCount(tokenType); int doc = spans.nextDoc(); while (doc != Spans.NO_MORE_DOCS) { docIndexInCorpus = corpusMapper.getDocumentPositionFromLuceneId(doc); int pos = spans.nextStartPosition(); while (pos != Spans.NO_MORE_POSITIONS) { if (positionsMap.containsKey(docIndexInCorpus) == false) { positionsMap.put(docIndexInCorpus, new AtomicInteger(1)); } else { positionsMap.get(docIndexInCorpus).incrementAndGet(); }/* w ww.j av a 2 s .c o m*/ pos = spans.nextStartPosition(); } doc = spans.nextDoc(); } int[] rawFreqs = new int[corpus.size()]; float[] relativeFreqs = new float[corpus.size()]; int freq = 0; int inDocumentsCount = 0; for (Map.Entry<Integer, AtomicInteger> entry : positionsMap.entrySet()) { int f = entry.getValue().intValue(); int documentPosition = entry.getKey(); if (f > 0) { freq += f; inDocumentsCount++; } rawFreqs[documentPosition] = f; relativeFreqs[documentPosition] = (float) f / tokensCounts[documentPosition]; } CorpusTerm corpusTerm = new CorpusTerm(queryString, freq, totalTokens, inDocumentsCount, corpus.size(), rawFreqs, relativeFreqs, parameters.getParameterIntValue("bins", corpus.size())); offer(queue, corpusTerm); }