org.apache.solr.search.facet.FacetFieldProcessorByEnumTermsStream.java Source code

Introduction

Here is the source code for org.apache.solr.search.facet.FacetFieldProcessorByEnumTermsStream.java, Apache Solr's streaming terms-facet processor.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search.facet;

import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiPostingsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.HashDocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SortedIntDocSet;

/**
 * Enumerates indexed terms in order in a streaming fashion.
 * It's able to stream since no data needs to be accumulated, so long as the bucket sort is index order.
 */
class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implements Closeable {
    long bucketsToSkip;
    long bucketsReturned;

    boolean closed;
    boolean countOnly;
    boolean hasSubFacets; // true if there are subfacets
    int minDfFilterCache;
    DocSet docs;
    DocSet fastForRandomSet;
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    PostingsEnum postingsEnum;
    BytesRef startTermBytes;
    BytesRef term;
    LeafReaderContext[] leaves;

    FacetFieldProcessorByEnumTermsStream(FacetContext fcontext, FacetField freq, SchemaField sf) {
        super(fcontext, freq, sf);
    }

    @Override
    public void close() throws IOException {
        if (!closed) {
            closed = true;
            // fcontext.base.decref();  // OFF-HEAP
        }
    }

    @Override
    public void process() throws IOException {
        super.process();

        // We need to keep the fcontext open after processing is done (since we will be streaming in the response writer).
        // But if the connection is broken, we want to clean up.
        // fcontext.base.incref();  // OFF-HEAP
        fcontext.qcontext.addCloseHook(this);

        setup();
        response = new SimpleOrderedMap<>();
        response.add("buckets", new Iterator() {
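            // 'val' caches the bucket computed by hasNext(); 'retrieveNext' marks when the
            // next call must advance the stream instead of returning the cached bucket.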
            boolean retrieveNext = true;
            Object val;

            @Override
            public boolean hasNext() {
                if (retrieveNext) {
                    val = nextBucket();
                }
                retrieveNext = false;
                return val != null;
            }

            @Override
            public Object next() {
                if (retrieveNext) {
                    val = nextBucket();
                }
                retrieveNext = true;
                if (val == null) {
                    // Last value, so clean up.  In the case that we are doing streaming facets within streaming facets,
                    // the number of close hooks could grow very large, so we want to remove ourselves.
                    boolean removed = fcontext.qcontext.removeCloseHook(FacetFieldProcessorByEnumTermsStream.this);
                    assert removed;
                    try {
                        close();
                    } catch (IOException e) {
                        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                                "Error during facet streaming close", e);
                    }
                }
                return val;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        });
    }

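    /**
     * Prepares the stream: decides whether only counts are needed, computes the
     * docFreq threshold for using the filterCache, positions the TermsEnum on the
     * first matching term (honoring any prefix), and captures the leaf reader
     * contexts used for per-segment collection.
     */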
    private void setup() throws IOException {

        countOnly = freq.facetStats.size() == 0 || freq.facetStats.values().iterator().next() instanceof CountAgg;
        hasSubFacets = freq.subFacets.size() > 0;
        bucketsToSkip = freq.offset;

        createAccs(-1, 1);

        // Minimum term docFreq in order to use the filterCache for that term.
        if (freq.cacheDf == -1) { // -1 means never cache
            minDfFilterCache = Integer.MAX_VALUE;
        } else if (freq.cacheDf == 0) { // default; compute as fraction of maxDoc
            minDfFilterCache = Math.max(fcontext.searcher.maxDoc() >> 4, 3); // (minimum of 3 is for test coverage purposes)
        } else {
            minDfFilterCache = freq.cacheDf;
        }

        docs = fcontext.base;
        fastForRandomSet = null;

        if (freq.prefix != null) {
            String indexedPrefix = sf.getType().toInternal(freq.prefix);
            startTermBytes = new BytesRef(indexedPrefix);
        } else if (sf.getType().getNumericType() != null) {
            String triePrefix = TrieField.getMainValuePrefix(sf.getType());
            if (triePrefix != null) {
                startTermBytes = new BytesRef(triePrefix);
            }
        }

        Fields fields = fcontext.searcher.getSlowAtomicReader().fields();
        Terms terms = fields == null ? null : fields.terms(sf.getName());

        termsEnum = null;
        deState = null;
        term = null;

        if (terms != null) {

            termsEnum = terms.iterator();

            // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
            // facet.offset when sorting by index order.

            if (startTermBytes != null) {
                if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
                    termsEnum = null;
                } else {
                    term = termsEnum.term();
                }
            } else {
                // position termsEnum on first term
                term = termsEnum.next();
            }
        }

        List<LeafReaderContext> leafList = fcontext.searcher.getTopReaderContext().leaves();
        leaves = leafList.toArray(new LeafReaderContext[leafList.size()]);
    }

    private SimpleOrderedMap<Object> nextBucket() {
        try {
            return _nextBucket();
        } catch (Exception e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming", e);
        }
    }

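    /**
     * Advances the TermsEnum and returns the next bucket that survives the prefix,
     * mincount, offset and limit checks, or null when the enumeration is exhausted.
     * Counts are computed either through the filterCache (frequent terms, or whenever
     * sub-facets need the per-term DocSet) or by walking the postings directly against
     * a random-access view of the base DocSet.
     */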
    private SimpleOrderedMap<Object> _nextBucket() throws IOException {
        DocSet termSet = null;

        try {
            while (term != null) {

                if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) {
                    break;
                }

                int df = termsEnum.docFreq();
                if (df < effectiveMincount) {
                    term = termsEnum.next();
                    continue;
                }

                if (termSet != null) {
                    // termSet.decref(); // OFF-HEAP
                    termSet = null;
                }

                int c = 0;

                if (hasSubFacets || df >= minDfFilterCache) {
                    // use the filter cache
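                    // Frequent terms (df >= minDfFilterCache) are worth materializing as cached
                    // DocSets, and sub-facets need the per-term DocSet regardless of frequency.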

                    if (deState == null) {
                        deState = new SolrIndexSearcher.DocsEnumState();
                        deState.fieldName = sf.getName();
                        deState.liveDocs = fcontext.searcher.getSlowAtomicReader().getLiveDocs();
                        deState.termsEnum = termsEnum;
                        deState.postingsEnum = postingsEnum;
                        deState.minSetSizeCached = minDfFilterCache;
                    }

                    if (hasSubFacets || !countOnly) {
                        DocSet termsAll = fcontext.searcher.getDocSet(deState);
                        termSet = docs.intersection(termsAll);
                        // termsAll.decref(); // OFF-HEAP
                        c = termSet.size();
                    } else {
                        c = fcontext.searcher.numDocs(docs, deState);
                    }
                    postingsEnum = deState.postingsEnum;

                    resetStats();

                    if (!countOnly) {
                        collect(termSet, 0);
                    }

                } else {
                    // We don't need the docset here (meaning no sub-facets).
                    // If not countOnly, we also collect per-document stats inline below.
                    resetStats();

                    // lazy convert to fastForRandomSet
                    if (fastForRandomSet == null) {
                        fastForRandomSet = docs;
                        if (docs instanceof SortedIntDocSet) { // OFF-HEAP todo: also check for native version
                            SortedIntDocSet sset = (SortedIntDocSet) docs;
                            fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
                        }
                    }
                    // iterate over TermDocs to calculate the intersection
                    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);

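                    // A MultiPostingsEnum spans several leaf readers; each sub-enum reports
                    // segment-local doc ids, so the slice's start offset is added back before
                    // testing membership in fastForRandomSet.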
                    if (postingsEnum instanceof MultiPostingsEnum) {
                        MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
                        int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                        for (int subindex = 0; subindex < numSubs; subindex++) {
                            MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                            if (sub.postingsEnum == null)
                                continue;
                            int base = sub.slice.start;
                            int docid;

                            if (countOnly) {
                                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                    if (fastForRandomSet.exists(docid + base))
                                        c++;
                                }
                            } else {
                                setNextReader(leaves[sub.slice.readerIndex]);
                                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                    if (fastForRandomSet.exists(docid + base)) {
                                        c++;
                                        collect(docid, 0);
                                    }
                                }
                            }

                        }
                    } else {
                        int docid;
                        if (countOnly) {
                            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid))
                                    c++;
                            }
                        } else {
                            setNextReader(leaves[0]);
                            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid)) {
                                    c++;
                                    collect(docid, 0);
                                }
                            }
                        }
                    }

                }

                if (c < effectiveMincount) {
                    term = termsEnum.next();
                    continue;
                }

                // handle offset and limit
                if (bucketsToSkip > 0) {
                    bucketsToSkip--;
                    term = termsEnum.next();
                    continue;
                }

                if (freq.limit >= 0 && ++bucketsReturned > freq.limit) {
                    return null;
                }

                // set count in case other stats depend on it
                countAcc.incrementCount(0, c);

                // OK, we have a good bucket to return... first get bucket value before moving to next term
                Object bucketVal = sf.getType().toObject(sf, term);
                TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null;
                term = termsEnum.next();

                SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
                bucket.add("val", bucketVal);
                addStats(bucket, 0);
                if (hasSubFacets) {
                    processSubs(bucket, bucketQuery, termSet);
                }

                // TODO... termSet needs to stick around for streaming sub-facets?

                return bucket;

            }

        } finally {
            if (termSet != null) {
                // termSet.decref();  // OFF-HEAP
                termSet = null;
            }
        }

        // end of the iteration
        return null;
    }

}
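
The structural idea worth noting in process() is that the "buckets" entry of the response is a lazy Iterator: a bucket is computed only when the response writer pulls it, which is what lets the facet stream without accumulating results (this processor is typically selected when a JSON terms facet sorted in index order is evaluated with the streaming method). The standalone sketch below reproduces just that iterator pattern with a fake bucket producer; StreamingBucketsSketch, produce and the hard-coded limit are illustrative names for this sketch, not part of the Solr API.

import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

// Minimal sketch of the lazy "pull" iterator used in process() above.
// produce() stands in for nextBucket(): it returns one bucket per call
// and null once the stream is exhausted.
public class StreamingBucketsSketch {

    static Map<String, Object> produce(int i, int limit) {
        if (i >= limit) {
            return null; // end of the stream, like nextBucket() after the last term
        }
        Map<String, Object> bucket = new LinkedHashMap<>();
        bucket.put("val", "term" + i);
        bucket.put("count", 1);
        return bucket;
    }

    public static void main(String[] args) {
        final int limit = 3;

        Iterator<Map<String, Object>> buckets = new Iterator<Map<String, Object>>() {
            int produced = 0;
            boolean retrieveNext = true; // same guard as in the Solr iterator
            Map<String, Object> val;

            @Override
            public boolean hasNext() {
                if (retrieveNext) {
                    val = produce(produced++, limit);
                }
                retrieveNext = false; // cache the bucket until next() consumes it
                return val != null;
            }

            @Override
            public Map<String, Object> next() {
                if (retrieveNext) {
                    val = produce(produced++, limit);
                }
                retrieveNext = true; // the following call must advance the stream
                return val;
            }
        };

        // The consumer pulls buckets one at a time; nothing is accumulated up front.
        while (buckets.hasNext()) {
            System.out.println(buckets.next());
        }
    }
}

Because hasNext() caches the produced bucket, repeated hasNext() calls between two next() calls do not advance the stream; that is exactly what the retrieveNext flag guards in the Solr code. In Solr, the final pull additionally removes the close hook and closes the processor so that nested streaming facets do not leak hooks.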