Usage examples for org.apache.lucene.search.DocIdSetIterator.nextDoc()
public abstract int nextDoc() throws IOException;
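nextDoc() advances the iterator to the next matching document and returns its doc ID, or DocIdSetIterator.NO_MORE_DOCS once the set is exhausted. Before looking at the examples below, here is a minimal sketch of the standard consumption loop; it wraps a FixedBitSet in a BitSetIterator purely for illustration, and the class name, bit positions, and cost hint are arbitrary assumptions, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;

public class NextDocLoopSketch {
    public static void main(String[] args) throws IOException {
        // Arbitrary example data: a bit set over 16 docs with three bits set.
        FixedBitSet bits = new FixedBitSet(16);
        bits.set(1);
        bits.set(3);
        bits.set(9);

        // BitSetIterator exposes the bit set as a DocIdSetIterator; the second
        // argument is only a cost hint (3 here, since three bits are set).
        DocIdSetIterator it = new BitSetIterator(bits, 3);

        // The canonical nextDoc() loop: advance until NO_MORE_DOCS is returned,
        // and never call nextDoc() again after that.
        int doc;
        while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.println("matched doc " + doc); // prints 1, 3, 9
        }
    }
}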
From source file: org.neo4j.kernel.api.impl.index.collector.DocValuesCollectorTest.java
License: Open Source License

@Test
public void shouldCollectAllHitsPerSegment() throws Exception {
    // given
    DocValuesCollector collector = new DocValuesCollector();
    IndexReaderStub readerStub = indexReaderWithMaxDocs(42);

    // when
    collector.doSetNextReader(readerStub.getContext());
    collector.collect(1);
    collector.collect(3);
    collector.collect(5);
    collector.collect(9);

    // then
    assertEquals(4, collector.getTotalHits());
    List<DocValuesCollector.MatchingDocs> allMatchingDocs = collector.getMatchingDocs();
    assertEquals(1, allMatchingDocs.size());
    DocValuesCollector.MatchingDocs matchingDocs = allMatchingDocs.get(0);
    assertSame(readerStub.getContext(), matchingDocs.context);
    assertEquals(4, matchingDocs.totalHits);
    DocIdSetIterator idIterator = matchingDocs.docIdSet.iterator();
    assertEquals(1, idIterator.nextDoc());
    assertEquals(3, idIterator.nextDoc());
    assertEquals(5, idIterator.nextDoc());
    assertEquals(9, idIterator.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, idIterator.nextDoc());
}
From source file: org.neo4j.kernel.api.impl.index.collector.DocValuesCollectorTest.java
License: Open Source License

@Test
public void shouldCollectOneMatchingDocsPerSegment() throws Exception {
    // given
    DocValuesCollector collector = new DocValuesCollector();
    IndexReaderStub readerStub = indexReaderWithMaxDocs(42);

    // when
    collector.doSetNextReader(readerStub.getContext());
    collector.collect(1);
    collector.collect(3);
    collector.doSetNextReader(readerStub.getContext());
    collector.collect(5);
    collector.collect(9);

    // then
    assertEquals(4, collector.getTotalHits());
    List<DocValuesCollector.MatchingDocs> allMatchingDocs = collector.getMatchingDocs();
    assertEquals(2, allMatchingDocs.size());

    DocValuesCollector.MatchingDocs matchingDocs = allMatchingDocs.get(0);
    assertSame(readerStub.getContext(), matchingDocs.context);
    assertEquals(2, matchingDocs.totalHits);
    DocIdSetIterator idIterator = matchingDocs.docIdSet.iterator();
    assertEquals(1, idIterator.nextDoc());
    assertEquals(3, idIterator.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, idIterator.nextDoc());

    matchingDocs = allMatchingDocs.get(1);
    assertSame(readerStub.getContext(), matchingDocs.context);
    assertEquals(2, matchingDocs.totalHits);
    idIterator = matchingDocs.docIdSet.iterator();
    assertEquals(5, idIterator.nextDoc());
    assertEquals(9, idIterator.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, idIterator.nextDoc());
}
From source file: org.neo4j.kernel.api.impl.index.DocValuesCollector.java
License: Open Source License

private void replayTo(Collector collector) throws IOException {
    for (MatchingDocs docs : getMatchingDocs()) {
        LeafCollector leafCollector = collector.getLeafCollector(docs.context);
        Scorer scorer;
        DocIdSetIterator disi = docs.docIdSet.iterator();
        if (isKeepScores()) {
            scorer = new ReplayingScorer(docs.scores);
        } else {
            scorer = new ConstantScoreScorer(null, Float.NaN, disi);
        }
        leafCollector.setScorer(scorer);
        int doc;
        while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            leafCollector.collect(doc);
        }
    }
}
From source file: org.neo4j.kernel.api.impl.index.DocValuesCollectorTest.java
License: Open Source License

@Test
public void shouldCollectAllHitsPerSegment() throws Exception {
    // given
    DocValuesCollector collector = new DocValuesCollector();
    IndexReaderStub readerStub = indexReaderWithMaxDocs(42);

    // when
    collector.doSetNextReader(readerStub.getContext());
    collector.collect(1);
    collector.collect(3);
    collector.collect(5);
    collector.collect(9);

    // then
    assertEquals(4, collector.getTotalHits());
    List<DocValuesCollector.MatchingDocs> allMatchingDocs = collector.getMatchingDocs();
    assertEquals(1, allMatchingDocs.size());
    DocValuesCollector.MatchingDocs matchingDocs = allMatchingDocs.get(0);
    assertSame(readerStub.getContext(), matchingDocs.context);
    assertEquals(4, matchingDocs.totalHits);
    DocIdSetIterator disi = matchingDocs.docIdSet.iterator();
    assertEquals(1, disi.nextDoc());
    assertEquals(3, disi.nextDoc());
    assertEquals(5, disi.nextDoc());
    assertEquals(9, disi.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
}
From source file: org.neo4j.kernel.api.impl.index.DocValuesCollectorTest.java
License: Open Source License

@Test
public void shouldCollectOneMatchingDocsPerSegment() throws Exception {
    // given
    DocValuesCollector collector = new DocValuesCollector();
    IndexReaderStub readerStub = indexReaderWithMaxDocs(42);

    // when
    collector.doSetNextReader(readerStub.getContext());
    collector.collect(1);
    collector.collect(3);
    collector.doSetNextReader(readerStub.getContext());
    collector.collect(5);
    collector.collect(9);

    // then
    assertEquals(4, collector.getTotalHits());
    List<DocValuesCollector.MatchingDocs> allMatchingDocs = collector.getMatchingDocs();
    assertEquals(2, allMatchingDocs.size());

    DocValuesCollector.MatchingDocs matchingDocs = allMatchingDocs.get(0);
    assertSame(readerStub.getContext(), matchingDocs.context);
    assertEquals(2, matchingDocs.totalHits);
    DocIdSetIterator disi = matchingDocs.docIdSet.iterator();
    assertEquals(1, disi.nextDoc());
    assertEquals(3, disi.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());

    matchingDocs = allMatchingDocs.get(1);
    assertSame(readerStub.getContext(), matchingDocs.context);
    assertEquals(2, matchingDocs.totalHits);
    disi = matchingDocs.docIdSet.iterator();
    assertEquals(5, disi.nextDoc());
    assertEquals(9, disi.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
}
From source file: org.opengrok.indexer.search.context.OGKUnifiedHighlighter.java
License: Apache License

/**
 * Produces original text by reading from OpenGrok source content relative
 * to {@link RuntimeEnvironment#getSourceRootPath()} and returns the content
 * for each document if the timestamp matches -- or else just {@code null}
 * for a missing file or a timestamp mismatch (as "the returned Strings must
 * be identical to what was indexed.")
 * <p>
 * "This method must load fields for at least one document from the given
 * {@link DocIdSetIterator} but need not return all of them; by default the
 * character lengths are summed and this method will return early when
 * {@code cacheCharsThreshold} is exceeded. Specifically if that number is
 * 0, then only one document is fetched no matter what. Values in the array
 * of {@link CharSequence} will be {@code null} if no value was found."
 * @return a defined instance
 * @throws IOException if an I/O error occurs
 */
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
        int cacheCharsThreshold) throws IOException {
    List<CharSequence[]> docListOfFields = new ArrayList<>(
            cacheCharsThreshold == 0 ? 1 : (int) Math.min(64, docIter.cost()));
    int sumChars = 0;
    do {
        int docId = docIter.nextDoc();
        if (docId == DocIdSetIterator.NO_MORE_DOCS) {
            break;
        }
        Document doc = searcher.doc(docId);
        String path = doc.get(QueryBuilder.PATH);
        String storedU = doc.get(QueryBuilder.U);
        String content = getRepoFileContent(path, storedU);

        CharSequence[] seqs = new CharSequence[fields.length];
        Arrays.fill(seqs, content);
        docListOfFields.add(seqs);

        if (content != null) {
            sumChars += content.length();
        }
    } while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);

    return docListOfFields;
}
From source file: org.opensextant.solrtexttagger.TaggerRequestHandler.java
License: Open Source License

private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException {
    // Now we must supply a Solr DocList and add it to the response.
    //  Typically this is gotten via a SolrIndexSearcher.search(), but in this case we
    //  know exactly what documents to return, the order doesn't matter nor does
    //  scoring.
    //  Ideally an implementation of DocList could be directly implemented off
    //  of a BitSet, but there are way too many methods to implement for a minor
    //  payoff.
    int matchDocs = matchDocIdsBS.cardinality();
    int[] docIds = new int[Math.min(rows, matchDocs)];
    DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1);
    for (int i = 0; i < docIds.length; i++) {
        docIds[i] = docIdIter.nextDoc();
    }
    return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f);
}
From source file: org.voyanttools.trombone.tool.corpus.DocumentNgrams.java
License: Open Source License

List<DocumentNgram> getNgrams(CorpusMapper corpusMapper, Keywords stopwords) throws IOException {
    Corpus corpus = corpusMapper.getCorpus();
    int[] totalTokens = corpus.getLastTokenPositions(tokenType);
    FlexibleQueue<DocumentNgram> queue = new FlexibleQueue<DocumentNgram>(comparator, start + limit);
    Set<String> validIds = new HashSet<String>();
    validIds.addAll(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    OverlapFilter filter = getDocumentNgramsOverlapFilter(parameters);
    DocIdSetIterator it = corpusMapper.getDocIdSet().iterator();
    while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int luceneDoc = it.docID();
        String docId = corpusMapper.getDocumentIdFromLuceneId(luceneDoc);
        if (validIds.contains(docId) == false) {
            continue;
        }
        int corpusDocumentIndex = corpusMapper.getDocumentPositionFromLuceneId(luceneDoc);
        int lastToken = totalTokens[corpusDocumentIndex];

        // build single grams as seed for ngrams
        SimplifiedTermInfo[] sparseSimplifiedTermInfoArray = getSparseSimplifiedTermInfoArray(corpusMapper,
                luceneDoc, lastToken);

        Map<String, List<int[]>> stringPositionsMap = new HashMap<String, List<int[]>>();
        for (int i = 0, len = sparseSimplifiedTermInfoArray.length; i < len; i++) {
            if (sparseSimplifiedTermInfoArray[i] != null
                    && sparseSimplifiedTermInfoArray[i].term.isEmpty() == false) {
                if (stringPositionsMap.containsKey(sparseSimplifiedTermInfoArray[i].term) == false) {
                    List<int[]> l = new ArrayList<int[]>();
                    l.add(new int[] { i, i });
                    stringPositionsMap.put(sparseSimplifiedTermInfoArray[i].term, l);
                } else {
                    stringPositionsMap.get(sparseSimplifiedTermInfoArray[i].term).add(new int[] { i, i });
                }
            }
        }

        List<DocumentNgram> ngrams = getNgramsFromStringPositions(stringPositionsMap, corpusDocumentIndex, 1);
        ngrams = getNextNgrams(ngrams, sparseSimplifiedTermInfoArray, corpusDocumentIndex, 2);
        ngrams = filter.getFilteredNgrams(ngrams, lastToken);
        for (DocumentNgram ngram : ngrams) {
            if (ngram.getLength() >= minLength && ngram.getLength() <= maxLength) {
                queue.offer(ngram);
            }
        }
    }
    return queue.getOrderedList(start);
}
From source file: org.zenoss.zep.index.impl.lucene.LuceneEventIndexBackend.java
License: Open Source License

protected void searchEventTagSeverities(EventFilter filter, EventTagSeverityCounter counter)
        throws ZepException {
    final boolean hasTagsFilter = filter.getTagFilterCount() > 0;
    IndexSearcher searcher = null;
    try {
        searcher = getSearcher();
        final Query query = buildQueryFromFilter(searcher.getIndexReader(), filter);
        final OpenBitSet docs = new OpenBitSet(searcher.getIndexReader().maxDoc());
        searcher.search(query, new Collector() {
            private int docBase;

            @Override
            public void setScorer(Scorer scorer) throws IOException {
            }

            @Override
            public void collect(int doc) throws IOException {
                docs.set(docBase + doc);
            }

            @Override
            public void setNextReader(AtomicReaderContext atomicReaderContext) throws IOException {
                this.docBase = atomicReaderContext.docBase;
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }
        });

        int docId;
        final DocIdSetIterator it = docs.iterator();
        while ((docId = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            final EventSummary summary;
            if (this.archive) {
                // TODO: This isn't very cheap - would be better to batch by UUID in separate calls
                // This doesn't get called on the event archive right now, so leave it until need to optimize.
                Document doc = searcher.doc(docId, UUID_FIELDS);
                summary = this.eventSummaryBaseDao.findByUuid(doc.get(FIELD_UUID));
            } else {
                Document doc = searcher.doc(docId);
                // this is an optimization for getting the non-archived tags from an organizer for ticket
                // see ZEN-7239. For this ticket we updated the index to store what we needed for generating the
                // tags severities. Since we do not want a migrate of completely deleting the index this
                // method is backwards compatible by uncompressing the protobuf
                if (doc.get(FIELD_SEVERITY) != null) {
                    int count = Integer.parseInt(doc.get(FIELD_COUNT));
                    boolean acknowledged = EventStatus.STATUS_ACKNOWLEDGED
                            .equals(EventStatus.valueOf(Integer.parseInt(doc.get(FIELD_STATUS))));
                    EventSeverity severity = EventSeverity.valueOf(Integer.parseInt(doc.get(FIELD_SEVERITY)));
                    // get the map for each filter and update the count
                    for (String tag : doc.getValues(FIELD_TAGS))
                        counter.update(tag, severity, count, acknowledged);
                    continue;
                } else {
                    summary = LuceneEventIndexMapper.toEventSummary(doc);
                }
            }
            boolean acknowledged = EventStatus.STATUS_ACKNOWLEDGED == summary.getStatus();
            Event occurrence = summary.getOccurrence(0);
            EventSeverity severity = occurrence.getSeverity();
            int count = occurrence.getCount();
            EventActor actor = occurrence.getActor();

            // Build tags from element_uuids - no tags specified in filter
            if (!hasTagsFilter) {
                if (actor.hasElementUuid())
                    counter.update(actor.getElementUuid(), severity, count, acknowledged);
            }
            // Build tag severities from passed in filter
            else {
                for (String uuid : Arrays.asList(actor.getElementUuid(), actor.getElementSubUuid()))
                    counter.update(uuid, severity, count, acknowledged);
                for (EventTag tag : occurrence.getTagsList())
                    for (String tagUuid : tag.getUuidList())
                        counter.update(tagUuid, severity, count, acknowledged);
            }
        }
    } catch (IOException e) {
        throw new ZepException(e);
    } catch (OutOfMemoryError e) {
        closeSearcherManager();
        throw e;
    } finally {
        returnSearcher(searcher);
    }
}
From source file: perf.DiskUsage.java
License: Apache License

static Set<FieldStats> analyzeFields(SegmentReader reader) throws Exception {
    Map<String, FieldStats> stats = new HashMap<>();
    Map<String, String> dvSuffixes = new HashMap<>();
    Map<String, String> postingsSuffixes = new HashMap<>();

    for (FieldInfo field : reader.getFieldInfos()) {
        FieldStats fieldStats = new FieldStats(field.name);
        stats.put(field.name, fieldStats);
        Map<String, String> attributes = field.attributes();
        if (attributes != null) {
            String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY);
            if (postingsSuffix != null) {
                postingsSuffixes.put(postingsSuffix, field.name);
            }
            String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY);
            if (dvSuffix != null) {
                dvSuffixes.put(dvSuffix, field.name);
            }
        }

        DocIdSetIterator docsWithField;
        switch (field.getDocValuesType()) {
        case NUMERIC:
            docsWithField = reader.getNumericDocValues(field.name);
            break;
        case BINARY:
            docsWithField = reader.getBinaryDocValues(field.name);
            break;
        case SORTED:
            docsWithField = reader.getSortedDocValues(field.name);
            break;
        case SORTED_NUMERIC:
            docsWithField = reader.getSortedNumericDocValues(field.name);
            break;
        case SORTED_SET:
            docsWithField = reader.getSortedSetDocValues(field.name);
            break;
        case NONE:
            docsWithField = null;
            break;
        default:
            docsWithField = null;
            break;
        }

        if (docsWithField != null) {
            int count = 0;
            while (docsWithField.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                count++;
            }
            fieldStats.docCountWithField = count;
        }
    }

    Directory directory = reader.directory();
    for (String file : directory.listAll()) {
        String suffix = parseSuffix(file);
        long bytes = directory.fileLength(file);
        if (suffix != null) {
            switch (IndexFileNames.getExtension(file)) {
            case "dvd":
            case "dvm":
                stats.get(dvSuffixes.get(suffix)).dvBytes += bytes;
                break;
            case "tim":
            case "tip":
                stats.get(postingsSuffixes.get(suffix)).termsBytes += bytes;
                break;
            case "doc":
                stats.get(postingsSuffixes.get(suffix)).postingsBytes += bytes;
                break;
            case "pos":
            case "pay":
                stats.get(postingsSuffixes.get(suffix)).proxBytes += bytes;
                break;
            default:
                throw new AssertionError("unexpected suffixed file: " + file);
            }
        } else {
            // not a per-field file, but we can hackishly do this for the points case.
            if ("dii".equals(IndexFileNames.getExtension(file))) {
                System.err.println(
                        "retrieving per-field point usage, if you see a scary corruption error, its probably just this tool!!!!");
                try (ChecksumIndexInput in = directory.openChecksumInput(file, IOContext.READONCE)) {
                    // fail hard if its not exactly the version we do this hack for.
                    CodecUtil.checkIndexHeader(in, "Lucene60PointsFormatMeta", 0, 0,
                            reader.getSegmentInfo().info.getId(), "");
                    int fieldCount = in.readVInt();
                    // strangely, bkd offsets are not in any guaranteed order
                    TreeMap<Long, String> offsetToField = new TreeMap<>();
                    for (int i = 0; i < fieldCount; i++) {
                        int field = in.readVInt();
                        long offset = in.readVLong();
                        offsetToField.put(offset, reader.getFieldInfos().fieldInfo(field).name);
                    }
                    // now we can traverse in order
                    long previousOffset = 0;
                    for (Map.Entry<Long, String> entry : offsetToField.entrySet()) {
                        long offset = entry.getKey();
                        String field = entry.getValue();
                        stats.get(field).pointsBytes += (offset - previousOffset);
                        previousOffset = offset;
                    }
                    CodecUtil.checkFooter(in);
                }
            }
        }
    }
    return new TreeSet<FieldStats>(stats.values());
}