List of usage examples for org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS
Field: public static final int NO_MORE_DOCS, the sentinel value returned by nextDoc(), advance(int), and docID() when there are no more documents in the iterator.
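Every example below follows the same pattern: call nextDoc() in a loop and stop when the iterator returns NO_MORE_DOCS. Here is a minimal sketch of that loop; the class NoMoreDocsSketch, the consume method, and its iterator argument are illustrative placeholders and are not taken from any of the source files listed below.

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

public class NoMoreDocsSketch {

    // Exhaust any DocIdSetIterator: nextDoc() returns document ids in increasing
    // order and NO_MORE_DOCS (Integer.MAX_VALUE) once the iterator is exhausted.
    static void consume(DocIdSetIterator iterator) throws IOException {
        int doc;
        while ((doc = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // "doc" is a valid document id here; collect, count, or store it as needed.
        }
        // After the loop, iterator.docID() also reports NO_MORE_DOCS.
    }
}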
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
    Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {

  if (wpvws.size() < minNumIds) {
    log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
    return null;
  }

  log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
  Directory dir = FSDirectory.open(new File(this.indexDir));
  IndexReader reader = DirectoryReader.open(dir);

  log.info("# of documents in the index {}", reader.numDocs());

  Collection<String> idSet = Sets.newHashSet();
  for (WeightedPropertyVectorWritable wpvw : wpvws) {
    Vector vector = wpvw.getVector();
    if (vector instanceof NamedVector) {
      idSet.add(((NamedVector) vector).getName());
    }
  }

  int numDocs = reader.numDocs();

  OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

  log.info("Populating term infos from the index");

  /**
   * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
   *
   * Since we have deleted the documents out of the cluster, the document frequency for a term should only
   * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
   * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
   * frequencies in each document. The number of results of this call will be the in-cluster document
   * frequency.
   */
  Terms t = MultiFields.getTerms(reader, contentField);
  TermsEnum te = t.iterator(null);
  Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
  Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if there are no deletions

  int count = 0;
  BytesRef term;
  while ((term = te.next()) != null) {
    OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
    DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
    int docID;
    while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      // Record the document only if it is live (liveDocs == null means there are no deletions).
      if (liveDocs == null || liveDocs.get(docID)) {
        termBitset.set(docID);
      }
    }
    // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
    // This modifies the termBitset, but that's fine as we are not using it anywhere else.
    termBitset.and(clusterDocBitset);
    int inclusterDF = (int) termBitset.cardinality();

    TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
    termEntryMap.put(entry.getTerm(), entry);
  }

  List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();

  int clusterSize = wpvws.size();

  for (TermEntry termEntry : termEntryMap.values()) {
    int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
    int outDF = corpusDF - termEntry.getDocFreq();
    int inDF = termEntry.getDocFreq();
    double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
    TermInfoClusterInOut termInfoCluster =
        new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
    clusteredTermInfo.add(termInfoCluster);
  }

  Collections.sort(clusteredTermInfo);
  // Cleanup
  Closeables.close(reader, true);
  termEntryMap.clear();

  return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
From source file:org.apache.solr.analytics.AnalyticsDriver.java
License:Apache License
/**
 * Drive the collection of reduction data. This includes overall data as well as faceted data.
 *
 * @param manager of the request to drive
 * @param searcher the results of the query
 * @param filter that represents the overall query
 * @param queryRequest used for the search request
 * @throws IOException if an error occurs while reading from Solr
 */
public static void drive(AnalyticsRequestManager manager, SolrIndexSearcher searcher, Filter filter,
    SolrQueryRequest queryRequest) throws IOException {
  StreamingInfo streamingInfo = manager.getStreamingFacetInfo();
  Iterable<StreamingFacet> streamingFacets = streamingInfo.streamingFacets;
  ReductionCollectionManager collectionManager = streamingInfo.streamingCollectionManager;

  Iterable<FacetValueQueryExecuter> facetExecuters = manager.getFacetExecuters(filter, queryRequest);

  // Streaming phase (Overall results & Value/Pivot Facets)
  // Loop through all documents and collect reduction data for streaming facets and overall results
  if (collectionManager.needsCollection()) {
    List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
    for (int leafNum = 0; leafNum < contexts.size(); leafNum++) {
      LeafReaderContext context = contexts.get(leafNum);
      DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs
      if (dis == null) {
        continue;
      }
      DocIdSetIterator disi = dis.iterator();
      if (disi != null) {
        collectionManager.doSetNextReader(context);
        int doc = disi.nextDoc();
        while (doc != DocIdSetIterator.NO_MORE_DOCS) {
          // Add a document to the statistics being generated
          collectionManager.collect(doc);
          streamingFacets.forEach(facet -> facet.addFacetValueCollectionTargets());
          collectionManager.apply();

          doc = disi.nextDoc();
        }
      }
    }
  }

  // Executing phase (Query/Range Facets)
  // Send additional Solr Queries to compute facet values
  for (FacetValueQueryExecuter executer : facetExecuters) {
    executer.execute(searcher);
  }
}
From source file:org.apache.solr.analytics.function.field.AbstractAnalyticsFieldTest.java
License:Apache License
protected Set<String> collectFieldValues(AnalyticsField testField, Predicate<String> valuesFiller)
    throws IOException {
  StringField idField = new StringField("id");
  Filter filter = new QueryWrapperFilter(new MatchAllDocsQuery());
  Set<String> missing = new HashSet<>();

  List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
  for (int leafNum = 0; leafNum < contexts.size(); leafNum++) {
    LeafReaderContext context = contexts.get(leafNum);
    DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs
    if (dis == null) {
      continue;
    }
    DocIdSetIterator disi = dis.iterator();
    if (disi != null) {
      testField.doSetNextReader(context);
      idField.doSetNextReader(context);
      int doc = disi.nextDoc();
      while (doc != DocIdSetIterator.NO_MORE_DOCS) {
        // Add a document to the statistics being generated
        testField.collect(doc);
        idField.collect(doc);
        String id = idField.getString();
        if (!valuesFiller.test(id)) {
          missing.add(id);
        }
        doc = disi.nextDoc();
      }
    }
  }
  return missing;
}
From source file:org.apache.solr.analytics.request.AnalyticsStats.java
License:Apache License
/**
 * Calculates the analytics requested in the Parameters.
 *
 * @return List of results formatted to mirror the input XML.
 * @throws IOException if execution fails
 */
public NamedList<?> execute() throws IOException {
  statsCollector.startRequest();
  NamedList<Object> res = new NamedList<>();
  List<AnalyticsRequest> requests;

  requests = AnalyticsRequestFactory.parse(searcher.getSchema(), params);

  if (requests == null || requests.size() == 0) {
    return res;
  }
  statsCollector.addRequests(requests.size());

  // Get filter to all docs
  Filter filter = docs.getTopFilter();

  // Computing each Analytics Request Separately
  for (AnalyticsRequest areq : requests) {
    // The Accumulator which will control the statistics generation
    // for the entire analytics request
    ValueAccumulator accumulator;

    // The number of total facet requests
    int facets = areq.getFieldFacets().size() + areq.getRangeFacets().size() + areq.getQueryFacets().size();
    try {
      if (facets == 0) {
        accumulator = BasicAccumulator.create(searcher, docs, areq);
      } else {
        accumulator = FacetingAccumulator.create(searcher, docs, areq, req);
      }
    } catch (IOException e) {
      log.warn("Analytics request '" + areq.getName() + "' failed", e);
      continue;
    }

    statsCollector.addStatsCollected(((BasicAccumulator) accumulator).getNumStatsCollectors());
    statsCollector.addStatsRequests(areq.getExpressions().size());
    statsCollector.addFieldFacets(areq.getFieldFacets().size());
    statsCollector.addRangeFacets(areq.getRangeFacets().size());
    statsCollector.addQueryFacets(areq.getQueryFacets().size());
    statsCollector.addQueries(((BasicAccumulator) accumulator).getNumQueries());

    // Loop through the documents returned by the query and add to accumulator
    List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
    for (int leafNum = 0; leafNum < contexts.size(); leafNum++) {
      LeafReaderContext context = contexts.get(leafNum);
      DocIdSet dis = filter.getDocIdSet(context, null); // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        accumulator.getLeafCollector(context);
        int doc = disi.nextDoc();
        while (doc != DocIdSetIterator.NO_MORE_DOCS) {
          // Add a document to the statistics being generated
          accumulator.collect(doc);
          doc = disi.nextDoc();
        }
      }
    }

    // do some post-processing
    accumulator.postProcess();

    // compute the stats
    accumulator.compute();

    res.add(areq.getName(), accumulator.export());
  }

  statsCollector.endRequest();
  return res;
}
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
private static Document getFirstLiveDoc(Terms terms, AtomicReader reader) throws IOException {
  DocsEnum docsEnum = null;
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000 && docsEnum == null; ++idx) {
    text = termsEnum.next();
    if (text == null) {
      // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
    if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      return reader.document(docsEnum.docID());
    }
  }
  return null;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.next() == null) {
    // Ran off the end of the terms enum without finding any live docs with that field in them.
    return null;
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  final Bits liveDocs = reader.getLiveDocs();
  // Give up if the first term has no postings, or if its first document has been deleted.
  if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
      || (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
    return null;
  }
  return reader.document(postingsEnum.docID());
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandlerTest.java
License:Open Source License
/** Check the behaviour when there are no more documents matching the terms. */
@Test
public void testNoMoreDocs() throws IOException {
  // There is a search term but no matching documents.
  when(mockTermsEnum.next()).thenReturn(TERM_TEXT);
  when(mockPostingsEnum.nextDoc()).thenReturn(DocIdSetIterator.NO_MORE_DOCS);

  // Call the method under test.
  Document firstLiveDoc = AlfrescoLukeRequestHandler.getFirstLiveDoc(mockTerms, mockReader);

  // Check the returned value.
  assertNull("Expected no document to be returned.", firstLiveDoc);
}
From source file:org.apache.solr.handler.ExportWriter.java
License:Apache License
protected void writeDocs(SolrQueryRequest req, IteratorWriter.ItemWriter writer, Sort sort) throws IOException {
  // Write the data.
  List<LeafReaderContext> leaves = req.getSearcher().getTopReaderContext().leaves();
  SortDoc sortDoc = getSortDoc(req.getSearcher(), sort.getSort());
  int count = 0;
  int queueSize = 30000;
  SortQueue queue = new SortQueue(queueSize, sortDoc);
  SortDoc[] outDocs = new SortDoc[queueSize];

  while (count < totalHits) {
    //long begin = System.nanoTime();
    queue.reset();
    SortDoc top = queue.top();
    for (int i = 0; i < leaves.size(); i++) {
      sortDoc.setNextReader(leaves.get(i));
      DocIdSetIterator it = new BitSetIterator(sets[i], 0); // cost is not useful here
      int docId = -1;
      while ((docId = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        sortDoc.setValues(docId);
        if (top.lessThan(sortDoc)) {
          top.setValues(sortDoc);
          top = queue.updateTop();
        }
      }
    }

    int outDocsIndex = -1;
    for (int i = 0; i < queueSize; i++) {
      SortDoc s = queue.pop();
      if (s.docId > -1) {
        outDocs[++outDocsIndex] = s;
      }
    }

    //long end = System.nanoTime();
    count += (outDocsIndex + 1);

    try {
      for (int i = outDocsIndex; i >= 0; --i) {
        SortDoc s = outDocs[i];
        writer.add((MapWriter) ew -> {
          writeDoc(s, leaves, ew);
          s.reset();
        });
      }
    } catch (Throwable e) {
      Throwable ex = e;
      e.printStackTrace();
      while (ex != null) {
        String m = ex.getMessage();
        if (m != null && m.contains("Broken pipe")) {
          throw new IgnoreException();
        }
        ex = ex.getCause();
      }
      if (e instanceof IOException) {
        throw ((IOException) e);
      } else {
        throw new IOException(e);
      }
    }
  }
}
From source file:org.apache.solr.request.DocValuesFacets.java
License:Apache License
/** accumulates per-segment single-valued facet counts, mapping to global ordinal space */
// specialized since the single-valued case is different
static void accumSingle(int counts[], int startTermIndex, SortedDocValues si, DocIdSetIterator disi,
    int subIndex, OrdinalMap map) throws IOException {
  int doc;
  while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    int term = si.getOrd(doc);
    if (map != null && term >= 0) {
      term = (int) map.getGlobalOrd(subIndex, term);
    }
    int arrIdx = term - startTermIndex;
    if (arrIdx >= 0 && arrIdx < counts.length) {
      counts[arrIdx]++;
    }
  }
}
From source file:org.apache.solr.request.DocValuesFacets.java
License:Apache License
/** accumulates per-segment multi-valued facet counts, mapping to global ordinal space */
static void accumMulti(int counts[], int startTermIndex, SortedSetDocValues si, DocIdSetIterator disi,
    int subIndex, OrdinalMap map) throws IOException {
  int doc;
  while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    si.setDocument(doc);
    // strange do-while to collect the missing count (first ord is NO_MORE_ORDS)
    int term = (int) si.nextOrd();
    if (term < 0) {
      if (startTermIndex == -1) {
        counts[0]++; // missing count
      }
      continue;
    }

    do {
      if (map != null) {
        term = (int) map.getGlobalOrd(subIndex, term);
      }
      int arrIdx = term - startTermIndex;
      if (arrIdx >= 0 && arrIdx < counts.length) {
        counts[arrIdx]++;
      }
    } while ((term = (int) si.nextOrd()) >= 0);
  }
}