List of usage examples for org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS
Field: public static final int NO_MORE_DOCS, the sentinel value returned by nextDoc(), advance(int), and docID() when there are no more documents in the iterator.
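Every example below follows the same pattern: call nextDoc() in a loop and stop when the iterator returns NO_MORE_DOCS. Here is a minimal sketch of that loop; the class NoMoreDocsSketch, the consume method, and its iterator argument are illustrative placeholders and are not taken from any of the source files listed below.

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

public class NoMoreDocsSketch {

    // Exhaust any DocIdSetIterator: nextDoc() returns document ids in increasing
    // order and NO_MORE_DOCS (Integer.MAX_VALUE) once the iterator is exhausted.
    static void consume(DocIdSetIterator iterator) throws IOException {
        int doc;
        while ((doc = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // "doc" is a valid document id here; collect, count, or store it as needed.
        }
        // After the loop, iterator.docID() also reports NO_MORE_DOCS.
    }
}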
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
    Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {

  if (wpvws.size() < minNumIds) {
    log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
    return null;
  }

  log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
  Directory dir = FSDirectory.open(new File(this.indexDir));
  IndexReader reader = DirectoryReader.open(dir);

  log.info("# of documents in the index {}", reader.numDocs());

  Collection<String> idSet = Sets.newHashSet();
  for (WeightedPropertyVectorWritable wpvw : wpvws) {
    Vector vector = wpvw.getVector();
    if (vector instanceof NamedVector) {
      idSet.add(((NamedVector) vector).getName());
    }
  }

  int numDocs = reader.numDocs();

  OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

  log.info("Populating term infos from the index");

  /**
   * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
   *
   * Since we have deleted the documents out of the cluster, the document frequency for a term should only
   * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
   * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
   * frequencies in each document. The number of results of this call will be the in-cluster document
   * frequency.
   */
  Terms t = MultiFields.getTerms(reader, contentField);
  TermsEnum te = t.iterator(null);
  Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
  Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if there are no deletions

  int count = 0;
  BytesRef term;
  while ((term = te.next()) != null) {
    OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
    DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
    int docID;
    while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      // Record the document only if it is live (liveDocs == null means there are no deletions).
      if (liveDocs == null || liveDocs.get(docID)) {
        termBitset.set(docID);
      }
    }
    // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
    // This modifies the termBitset, but that's fine as we are not using it anywhere else.
    termBitset.and(clusterDocBitset);
    int inclusterDF = (int) termBitset.cardinality();

    TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
    termEntryMap.put(entry.getTerm(), entry);
  }

  List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();

  int clusterSize = wpvws.size();

  for (TermEntry termEntry : termEntryMap.values()) {
    int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
    int outDF = corpusDF - termEntry.getDocFreq();
    int inDF = termEntry.getDocFreq();
    double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
    TermInfoClusterInOut termInfoCluster =
        new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
    clusteredTermInfo.add(termInfoCluster);
  }

  Collections.sort(clusteredTermInfo);
  // Cleanup
  Closeables.close(reader, true);
  termEntryMap.clear();

  return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
From source file:org.apache.solr.analytics.AnalyticsDriver.java
License:Apache License
/**
 * Drive the collection of reduction data. This includes overall data as well as faceted data.
 *
 * @param manager of the request to drive
 * @param searcher the results of the query
 * @param filter that represents the overall query
 * @param queryRequest used for the search request
 * @throws IOException if an error occurs while reading from Solr
 */
public static void drive(AnalyticsRequestManager manager, SolrIndexSearcher searcher, Filter filter,
    SolrQueryRequest queryRequest) throws IOException {
  StreamingInfo streamingInfo = manager.getStreamingFacetInfo();
  Iterable<StreamingFacet> streamingFacets = streamingInfo.streamingFacets;
  ReductionCollectionManager collectionManager = streamingInfo.streamingCollectionManager;

  Iterable<FacetValueQueryExecuter> facetExecuters = manager.getFacetExecuters(filter, queryRequest);

  // Streaming phase (Overall results & Value/Pivot Facets)
  // Loop through all documents and collect reduction data for streaming facets and overall results
  if (collectionManager.needsCollection()) {
    List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
    for (int leafNum = 0; leafNum < contexts.size(); leafNum++) {
      LeafReaderContext context = contexts.get(leafNum);
      DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs
      if (dis == null) {
        continue;
      }
      DocIdSetIterator disi = dis.iterator();
      if (disi != null) {
        collectionManager.doSetNextReader(context);
        int doc = disi.nextDoc();
        while (doc != DocIdSetIterator.NO_MORE_DOCS) {
          // Add a document to the statistics being generated
          collectionManager.collect(doc);
          streamingFacets.forEach(facet -> facet.addFacetValueCollectionTargets());
          collectionManager.apply();

          doc = disi.nextDoc();
        }
      }
    }
  }

  // Executing phase (Query/Range Facets)
  // Send additional Solr Queries to compute facet values
  for (FacetValueQueryExecuter executer : facetExecuters) {
    executer.execute(searcher);
  }
}
From source file:org.apache.solr.analytics.function.field.AbstractAnalyticsFieldTest.java
License:Apache License
protected Set<String> collectFieldValues(AnalyticsField testField, Predicate<String> valuesFiller)
    throws IOException {
  StringField idField = new StringField("id");
  Filter filter = new QueryWrapperFilter(new MatchAllDocsQuery());
  Set<String> missing = new HashSet<>();

  List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
  for (int leafNum = 0; leafNum < contexts.size(); leafNum++) {
    LeafReaderContext context = contexts.get(leafNum);
    DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs
    if (dis == null) {
      continue;
    }
    DocIdSetIterator disi = dis.iterator();
    if (disi != null) {
      testField.doSetNextReader(context);
      idField.doSetNextReader(context);
      int doc = disi.nextDoc();
      while (doc != DocIdSetIterator.NO_MORE_DOCS) {
        // Add a document to the statistics being generated
        testField.collect(doc);
        idField.collect(doc);
        String id = idField.getString();
        if (!valuesFiller.test(id)) {
          missing.add(id);
        }
        doc = disi.nextDoc();
      }
    }
  }
  return missing;
}
From source file:org.apache.solr.analytics.request.AnalyticsStats.java
License:Apache License
/**
 * Calculates the analytics requested in the Parameters.
 *
 * @return List of results formatted to mirror the input XML.
 * @throws IOException if execution fails
 */
public NamedList<?> execute() throws IOException {
  statsCollector.startRequest();
  NamedList<Object> res = new NamedList<>();
  List<AnalyticsRequest> requests;

  requests = AnalyticsRequestFactory.parse(searcher.getSchema(), params);

  if (requests == null || requests.size() == 0) {
    return res;
  }
  statsCollector.addRequests(requests.size());

  // Get filter to all docs
  Filter filter = docs.getTopFilter();

  // Computing each Analytics Request Separately
  for (AnalyticsRequest areq : requests) {
    // The Accumulator which will control the statistics generation
    // for the entire analytics request
    ValueAccumulator accumulator;

    // The number of total facet requests
    int facets = areq.getFieldFacets().size() + areq.getRangeFacets().size() + areq.getQueryFacets().size();
    try {
      if (facets == 0) {
        accumulator = BasicAccumulator.create(searcher, docs, areq);
      } else {
        accumulator = FacetingAccumulator.create(searcher, docs, areq, req);
      }
    } catch (IOException e) {
      log.warn("Analytics request '" + areq.getName() + "' failed", e);
      continue;
    }

    statsCollector.addStatsCollected(((BasicAccumulator) accumulator).getNumStatsCollectors());
    statsCollector.addStatsRequests(areq.getExpressions().size());
    statsCollector.addFieldFacets(areq.getFieldFacets().size());
    statsCollector.addRangeFacets(areq.getRangeFacets().size());
    statsCollector.addQueryFacets(areq.getQueryFacets().size());
    statsCollector.addQueries(((BasicAccumulator) accumulator).getNumQueries());

    // Loop through the documents returned by the query and add to accumulator
    List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
    for (int leafNum = 0; leafNum < contexts.size(); leafNum++) {
      LeafReaderContext context = contexts.get(leafNum);
      DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        accumulator.getLeafCollector(context);
        int doc = disi.nextDoc();
        while (doc != DocIdSetIterator.NO_MORE_DOCS) {
          // Add a document to the statistics being generated
          accumulator.collect(doc);
          doc = disi.nextDoc();
        }
      }
    }

    // do some post-processing
    accumulator.postProcess();

    // compute the stats
    accumulator.compute();

    res.add(areq.getName(), accumulator.export());
  }

  statsCollector.endRequest();
  return res;
}
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
private static Document getFirstLiveDoc(Terms terms, AtomicReader reader) throws IOException {
  DocsEnum docsEnum = null;
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000 && docsEnum == null; ++idx) {
    text = termsEnum.next();
    if (text == null) {
      // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
    if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      return reader.document(docsEnum.docID());
    }
  }
  return null;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.next() == null) {
    // Ran off the end of the terms enum without finding any live docs with that field in them.
    return null;
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  final Bits liveDocs = reader.getLiveDocs();
  // Give up if the first term has no postings, or if its first document has been deleted.
  if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
      || (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
    return null;
  }
  return reader.document(postingsEnum.docID());
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandlerTest.java
License:Open Source License
/** Check the behaviour when there are no more documents matching the terms. */
@Test
public void testNoMoreDocs() throws IOException {
  // There is a search term but no matching documents.
  when(mockTermsEnum.next()).thenReturn(TERM_TEXT);
  when(mockPostingsEnum.nextDoc()).thenReturn(DocIdSetIterator.NO_MORE_DOCS);

  // Call the method under test.
  Document firstLiveDoc = AlfrescoLukeRequestHandler.getFirstLiveDoc(mockTerms, mockReader);

  // Check the returned value.
  assertNull("Expected no document to be returned.", firstLiveDoc);
}
From source file:org.apache.solr.handler.ExportWriter.java
License:Apache License
protected void writeDocs(SolrQueryRequest req, IteratorWriter.ItemWriter writer, Sort sort) throws IOException {
  // Write the data.
  List<LeafReaderContext> leaves = req.getSearcher().getTopReaderContext().leaves();
  SortDoc sortDoc = getSortDoc(req.getSearcher(), sort.getSort());
  int count = 0;
  int queueSize = 30000;
  SortQueue queue = new SortQueue(queueSize, sortDoc);
  SortDoc[] outDocs = new SortDoc[queueSize];

  while (count < totalHits) {
    //long begin = System.nanoTime();
    queue.reset();
    SortDoc top = queue.top();
    for (int i = 0; i < leaves.size(); i++) {
      sortDoc.setNextReader(leaves.get(i));
      DocIdSetIterator it = new BitSetIterator(sets[i], 0); // cost is not useful here
      int docId = -1;
      while ((docId = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        sortDoc.setValues(docId);
        if (top.lessThan(sortDoc)) {
          top.setValues(sortDoc);
          top = queue.updateTop();
        }
      }
    }

    int outDocsIndex = -1;
    for (int i = 0; i < queueSize; i++) {
      SortDoc s = queue.pop();
      if (s.docId > -1) {
        outDocs[++outDocsIndex] = s;
      }
    }

    //long end = System.nanoTime();
    count += (outDocsIndex + 1);

    try {
      for (int i = outDocsIndex; i >= 0; --i) {
        SortDoc s = outDocs[i];
        writer.add((MapWriter) ew -> {
          writeDoc(s, leaves, ew);
          s.reset();
        });
      }
    } catch (Throwable e) {
      Throwable ex = e;
      e.printStackTrace();
      while (ex != null) {
        String m = ex.getMessage();
        if (m != null && m.contains("Broken pipe")) {
          throw new IgnoreException();
        }
        ex = ex.getCause();
      }
      if (e instanceof IOException) {
        throw ((IOException) e);
      } else {
        throw new IOException(e);
      }
    }
  }
}
From source file:org.apache.solr.request.DocValuesFacets.java
License:Apache License
/** accumulates per-segment single-valued facet counts, mapping to global ordinal space */
// specialized since the single-valued case is different
static void accumSingle(int counts[], int startTermIndex, SortedDocValues si, DocIdSetIterator disi,
    int subIndex, OrdinalMap map) throws IOException {
  int doc;
  while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    int term = si.getOrd(doc);
    if (map != null && term >= 0) {
      term = (int) map.getGlobalOrd(subIndex, term);
    }
    int arrIdx = term - startTermIndex;
    if (arrIdx >= 0 && arrIdx < counts.length) {
      counts[arrIdx]++;
    }
  }
}
From source file:org.apache.solr.request.DocValuesFacets.java
License:Apache License
/** accumulates per-segment multi-valued facet counts, mapping to global ordinal space */
static void accumMulti(int counts[], int startTermIndex, SortedSetDocValues si, DocIdSetIterator disi,
    int subIndex, OrdinalMap map) throws IOException {
  int doc;
  while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    si.setDocument(doc);
    // strange do-while to collect the missing count (first ord is NO_MORE_ORDS)
    int term = (int) si.nextOrd();
    if (term < 0) {
      if (startTermIndex == -1) {
        counts[0]++; // missing count
      }
      continue;
    }

    do {
      if (map != null) {
        term = (int) map.getGlobalOrd(subIndex, term);
      }
      int arrIdx = term - startTermIndex;
      if (arrIdx >= 0 && arrIdx < counts.length) {
        counts[arrIdx]++;
      }
    } while ((term = (int) si.nextOrd()) >= 0);
  }
}