org.apache.solr.search.DocSetBuilder.java Source code

Introduction

Here is the source code for org.apache.solr.search.DocSetBuilder.java. Adapted from Lucene's DocIdSetBuilder, the class accumulates document ids in an int[] buffer while the set is small and upgrades to a FixedBitSet once the set grows past a threshold of roughly maxDoc / 128, producing either a SortedIntDocSet or a BitDocSet.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search;

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LSBRadixSorter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Adapted from DocIdSetBuilder to build DocSets
 *
 * @lucene.internal
 */
public final class DocSetBuilder {

    private final int maxDoc;
    private final int threshold;

    private int[] buffer;
    private int pos;

    private FixedBitSet bitSet;

    public DocSetBuilder(int maxDoc, long costEst) {
        this.maxDoc = maxDoc;
        // For ridiculously small sets, we'll just use a sorted int[]
        // maxDoc >>> 7 is a good value if you want to save memory, lower values
        // such as maxDoc >>> 11 should provide faster building but at the expense
        // of using a full bitset even for quite sparse data
        this.threshold = (maxDoc >>> 7) + 4; // the +4 is for better testing on small indexes
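        // For example (illustrative figures): with maxDoc = 1,000,000 the threshold is 7,816
        // buffered ints (~31 KB), versus ~125 KB for a full FixedBitSet over the same index.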

        if (costEst > threshold) {
            bitSet = new FixedBitSet(maxDoc);
        } else {
            this.buffer = new int[Math.max((int) costEst, 1)];
        }
    }

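    /** Copies the buffered document ids into a full FixedBitSet and releases the int buffer. */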
    private void upgradeToBitSet() {
        assert bitSet == null;
        bitSet = new FixedBitSet(maxDoc);
        for (int i = 0; i < pos; ++i) {
            bitSet.set(buffer[i]);
        }
        this.buffer = null;
        this.pos = 0;
    }

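    /** Grows the buffer by doubling until it can hold at least {@code minSize} entries, capped at the threshold. */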
    private void growBuffer(int minSize) {
        if (minSize < buffer.length) return;

        int newSize = buffer.length;
        while (newSize < minSize) {
            newSize = newSize << 1;
        }
        newSize = Math.min(newSize, threshold);

        int[] newBuffer = new int[newSize];
        System.arraycopy(buffer, 0, newBuffer, 0, pos);
        buffer = newBuffer;
    }

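    /** Adds every document produced by the iterator, offset by {@code base} (typically a segment's docBase). */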
    public void add(DocIdSetIterator iter, int base) throws IOException {
        grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));

        if (bitSet != null) {
            add(bitSet, iter, base);
        } else {
            while (true) {
                for (int i = pos; i < buffer.length; ++i) {
                    final int doc = iter.nextDoc();
                    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                        pos = i; // update pos
                        return;
                    }
                    buffer[i] = doc + base; // using the loop counter may help with removal of bounds checking
                }

                pos = buffer.length; // update pos
                if (pos + 1 >= threshold) {
                    break;
                }

                growBuffer(pos + 1);
            }

            upgradeToBitSet();
            add(bitSet, iter, base);
        }
    }

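    /** Sets the bit {@code doc + base} for every document produced by the iterator. */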
    public static void add(FixedBitSet bitSet, DocIdSetIterator iter, int base) throws IOException {
        for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
            bitSet.set(doc + base);
        }
    }

    /** Returns the number of terms visited */
    public int add(TermsEnum te, int base) throws IOException {
        PostingsEnum postings = null;

        int termCount = 0;
        for (;;) {
            BytesRef term = te.next();
            if (term == null) break;
            termCount++;
            postings = te.postings(postings, PostingsEnum.NONE);
            add(postings, base);
        }

        return termCount;
    }

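    /** Reserves room for {@code numDocs} additional documents, upgrading to a bitset if the buffer would reach the threshold. */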
    public void grow(int numDocs) {
        if (bitSet == null) {
            final long newLength = pos + numDocs;
            if (newLength < threshold) {
                growBuffer((int) newLength);
            } else {
                upgradeToBitSet();
            }
        }
    }

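    /** Adds a single document id; any segment offset must already have been applied by the caller. */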
    public void add(int doc) {
        if (bitSet != null) {
            bitSet.set(doc);
        } else {
            if (pos >= buffer.length) {
                if (pos + 1 >= threshold) {
                    upgradeToBitSet();
                    bitSet.set(doc);
                    return;
                }
                growBuffer(pos + 1);
            }
            buffer[pos++] = doc;
        }
    }

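    // Removes duplicates from the sorted array and drops documents not present in acceptDocs; returns the new length.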
    private static int dedup(int[] arr, int length, FixedBitSet acceptDocs) {
        int pos = 0;
        int previous = -1;
        for (int i = 0; i < length; ++i) {
            final int value = arr[i];
            // assert value >= previous;
            if (value != previous && (acceptDocs == null || acceptDocs.get(value))) {
                arr[pos++] = value;
                previous = value;
            }
        }
        return pos;
    }

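    /** Builds the final DocSet: the bitset is intersected with the optional filter, or the buffer is sorted, deduplicated and filtered. */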
    public DocSet build(FixedBitSet filter) {
        if (bitSet != null) {
            if (filter != null) {
                bitSet.and(filter);
            }
            return new BitDocSet(bitSet);
            // TODO - if this set will be cached, should we make it smaller if it's below DocSetUtil.smallSetSize?
        } else {
            LSBRadixSorter sorter = new LSBRadixSorter();
            sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, pos);
            final int l = dedup(buffer, pos, filter);
            assert l <= pos;
            return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set
        }
    }

    /** Only use this if you know there were no duplicates and that docs were collected in-order! */
    public DocSet buildUniqueInOrder(FixedBitSet filter) {
        if (bitSet != null) {
            if (filter != null) {
                bitSet.and(filter);
            }
            return new BitDocSet(bitSet);
        } else {
            // don't need to sort, but still need to remove non accepted docs
            int l = pos;
            if (filter != null) {
                l = dedup(buffer, pos, filter);
            }
            return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set
        }
    }

}
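
Usage example

The sketch below is not part of the Solr source; it is a minimal illustration of how the builder might be driven from Lucene index structures. The DocSetBuilderExample class, the docsWithField helper and the cost estimate of maxDoc >>> 6 are all hypothetical; only DocSetBuilder, DocSet and the Lucene classes come from the code above, and the Lucene 5+ API (the PostingsEnum era the listing imports) is assumed.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.DocSetBuilder;

public class DocSetBuilderExample {

    // Hypothetical helper: collects every document that has at least one term
    // in the given field, one segment at a time.
    public static DocSet docsWithField(IndexReader reader, String field) throws IOException {
        // The cost estimate is a rough guess (assumption); an estimate above the threshold
        // makes the builder start with a FixedBitSet, otherwise it starts with the int[] buffer.
        DocSetBuilder builder = new DocSetBuilder(reader.maxDoc(), reader.maxDoc() >>> 6);
        for (LeafReaderContext leaf : reader.leaves()) {
            Terms terms = leaf.reader().terms(field);
            if (terms == null) {
                continue; // the field is absent from this segment
            }
            TermsEnum te = terms.iterator();
            builder.add(te, leaf.docBase); // per-segment doc ids are offset by docBase
        }
        // build(null) sorts and deduplicates the buffered ids (or just wraps the bitset);
        // a non-null FixedBitSet would be intersected as a filter.
        return builder.build(null);
    }
}

When the caller can guarantee that documents were added in order and without duplicates, buildUniqueInOrder(filter) skips the sort, as its javadoc in the listing above notes.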