org.apache.lucene.util.SparseFixedBitSet.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.util.SparseFixedBitSet.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

/**
 * A bit set that only stores longs that have at least one bit which is set.
 * The way it works is that the space of bits is divided into blocks of
 * 4096 bits, which is 64 longs. Then for each block, we have:<ul>
 * <li>a long[] which stores the non-zero longs for that block</li>
 * <li>a long so that bit <tt>i</tt> being set means that the <code>i-th</code>
 *     long of the block is non-null, and its offset in the array of longs is
 *     the number of one bits on the right of the <code>i-th</code> bit.</li></ul>
 *
 * @lucene.internal
 */
public class SparseFixedBitSet extends BitSet implements Bits, Accountable {

    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator
            .shallowSizeOfInstance(SparseFixedBitSet.class);
    private static final long SINGLE_ELEMENT_ARRAY_BYTES_USED = RamUsageEstimator.sizeOf(new long[1]);
    private static final int MASK_4096 = (1 << 12) - 1;

    private static int blockCount(int length) {
        int blockCount = length >>> 12;
        if ((blockCount << 12) < length) {
            ++blockCount;
        }
        assert (blockCount << 12) >= length;
        return blockCount;
    }

    final long[] indices;
    final long[][] bits;
    final int length;
    int nonZeroLongCount;
    long ramBytesUsed;

    /** Create a {@link SparseFixedBitSet} that can contain bits between
     *  <code>0</code> included and <code>length</code> excluded. */
    public SparseFixedBitSet(int length) {
        if (length < 1) {
            throw new IllegalArgumentException("length needs to be >= 1");
        }
        this.length = length;
        final int blockCount = blockCount(length);
        indices = new long[blockCount];
        bits = new long[blockCount][];
        ramBytesUsed = BASE_RAM_BYTES_USED + RamUsageEstimator.shallowSizeOf(indices)
                + RamUsageEstimator.shallowSizeOf(bits);
    }

    @Override
    public int length() {
        return length;
    }

    private boolean consistent(int index) {
        assert index >= 0 && index < length : "index=" + index + ",length=" + length;
        return true;
    }

    @Override
    public int cardinality() {
        int cardinality = 0;
        for (long[] bitArray : bits) {
            if (bitArray != null) {
                for (long bits : bitArray) {
                    cardinality += Long.bitCount(bits);
                }
            }
        }
        return cardinality;
    }

    @Override
    public int approximateCardinality() {
        // we are assuming that bits are uniformly set and use the linear counting
        // algorithm to estimate the number of bits that are set based on the number
        // of longs that are different from zero
        final int totalLongs = (length + 63) >>> 6; // total number of longs in the space
        assert totalLongs >= nonZeroLongCount;
        final int zeroLongs = totalLongs - nonZeroLongCount; // number of longs that are zeros
        // No need to guard against division by zero, it will return +Infinity and things will work as expected
        final long estimate = Math.round(totalLongs * Math.log((double) totalLongs / zeroLongs));
        return (int) Math.min(length, estimate);
    }

    @Override
    public boolean get(int i) {
        assert consistent(i);
        final int i4096 = i >>> 12;
        final long index = indices[i4096];
        final int i64 = i >>> 6;
        // first check the index, if the i64-th bit is not set, then i is not set
        // note: this relies on the fact that shifts are mod 64 in java
        if ((index & (1L << i64)) == 0) {
            return false;
        }

        // if it is set, then we count the number of bits that are set on the right
        // of i64, and that gives us the index of the long that stores the bits we
        // are interested in
        final long bits = this.bits[i4096][Long.bitCount(index & ((1L << i64) - 1))];
        return (bits & (1L << i)) != 0;
    }

    private static int oversize(int s) {
        int newSize = s + (s >>> 1);
        if (newSize > 50) {
            newSize = 64;
        }
        return newSize;
    }

    /**
     * Set the bit at index <tt>i</tt>.
     */
    public void set(int i) {
        assert consistent(i);
        final int i4096 = i >>> 12;
        final long index = indices[i4096];
        final int i64 = i >>> 6;
        if ((index & (1L << i64)) != 0) {
            // in that case the sub 64-bits block we are interested in already exists,
            // we just need to set a bit in an existing long: the number of ones on
            // the right of i64 gives us the index of the long we need to update
            bits[i4096][Long.bitCount(index & ((1L << i64) - 1))] |= 1L << i; // shifts are mod 64 in java
        } else if (index == 0) {
            // if the index is 0, it means that we just found a block of 4096 bits
            // that has no bit that is set yet. So let's initialize a new block:
            insertBlock(i4096, i64, i);
        } else {
            // in that case we found a block of 4096 bits that has some values, but
            // the sub-block of 64 bits that we are interested in has no value yet,
            // so we need to insert a new long
            insertLong(i4096, i64, i, index);
        }
    }

    private void insertBlock(int i4096, int i64, int i) {
        indices[i4096] = 1L << i64; // shifts are mod 64 in java
        assert bits[i4096] == null;
        bits[i4096] = new long[] { 1L << i }; // shifts are mod 64 in java
        ++nonZeroLongCount;
        ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED;
    }

    private void insertLong(int i4096, int i64, int i, long index) {
        indices[i4096] |= 1L << i64; // shifts are mod 64 in java
        // we count the number of bits that are set on the right of i64
        // this gives us the index at which to perform the insertion
        final int o = Long.bitCount(index & ((1L << i64) - 1));
        final long[] bitArray = bits[i4096];
        if (bitArray[bitArray.length - 1] == 0) {
            // since we only store non-zero longs, if the last value is 0, it means
            // that we alreay have extra space, make use of it
            System.arraycopy(bitArray, o, bitArray, o + 1, bitArray.length - o - 1);
            bitArray[o] = 1L << i;
        } else {
            // we don't have extra space so we need to resize to insert the new long
            final int newSize = oversize(bitArray.length + 1);
            final long[] newBitArray = new long[newSize];
            System.arraycopy(bitArray, 0, newBitArray, 0, o);
            newBitArray[o] = 1L << i;
            System.arraycopy(bitArray, o, newBitArray, o + 1, bitArray.length - o);
            bits[i4096] = newBitArray;
            ramBytesUsed += RamUsageEstimator.sizeOf(newBitArray) - RamUsageEstimator.sizeOf(bitArray);
        }
        ++nonZeroLongCount;
    }

    /**
     * Clear the bit at index <tt>i</tt>.
     */
    public void clear(int i) {
        assert consistent(i);
        final int i4096 = i >>> 12;
        final int i64 = i >>> 6;
        and(i4096, i64, ~(1L << i));
    }

    private void and(int i4096, int i64, long mask) {
        final long index = indices[i4096];
        if ((index & (1L << i64)) != 0) {
            // offset of the long bits we are interested in in the array
            final int o = Long.bitCount(index & ((1L << i64) - 1));
            long bits = this.bits[i4096][o] & mask;
            if (bits == 0) {
                removeLong(i4096, i64, index, o);
            } else {
                this.bits[i4096][o] = bits;
            }
        }
    }

    private void removeLong(int i4096, int i64, long index, int o) {
        index &= ~(1L << i64);
        indices[i4096] = index;
        if (index == 0) {
            // release memory, there is nothing in this block anymore
            this.bits[i4096] = null;
        } else {
            final int length = Long.bitCount(index);
            final long[] bitArray = bits[i4096];
            System.arraycopy(bitArray, o + 1, bitArray, o, length - o);
            bitArray[length] = 0L;
        }
        nonZeroLongCount -= 1;
    }

    @Override
    public void clear(int from, int to) {
        assert from >= 0;
        assert to <= length;
        if (from >= to) {
            return;
        }
        final int firstBlock = from >>> 12;
        final int lastBlock = (to - 1) >>> 12;
        if (firstBlock == lastBlock) {
            clearWithinBlock(firstBlock, from & MASK_4096, (to - 1) & MASK_4096);
        } else {
            clearWithinBlock(firstBlock, from & MASK_4096, MASK_4096);
            for (int i = firstBlock + 1; i < lastBlock; ++i) {
                nonZeroLongCount -= Long.bitCount(indices[i]);
                indices[i] = 0;
                bits[i] = null;
            }
            clearWithinBlock(lastBlock, 0, (to - 1) & MASK_4096);
        }
    }

    // create a long that has bits set to one between from and to
    private static long mask(int from, int to) {
        return ((1L << (to - from) << 1) - 1) << from;
    }

    private void clearWithinBlock(int i4096, int from, int to) {
        int firstLong = from >>> 6;
        int lastLong = to >>> 6;

        if (firstLong == lastLong) {
            and(i4096, firstLong, ~mask(from, to));
        } else {
            assert firstLong < lastLong;
            and(i4096, lastLong, ~mask(0, to));
            for (int i = lastLong - 1; i >= firstLong + 1; --i) {
                and(i4096, i, 0L);
            }
            and(i4096, firstLong, ~mask(from, 63));
        }
    }

    /** Return the first document that occurs on or after the provided block index. */
    private int firstDoc(int i4096) {
        long index = 0;
        while (i4096 < indices.length) {
            index = indices[i4096];
            if (index != 0) {
                final int i64 = Long.numberOfTrailingZeros(index);
                return (i4096 << 12) | (i64 << 6) | Long.numberOfTrailingZeros(bits[i4096][0]);
            }
            i4096 += 1;
        }
        return DocIdSetIterator.NO_MORE_DOCS;
    }

    @Override
    public int nextSetBit(int i) {
        assert i < length;
        final int i4096 = i >>> 12;
        final long index = indices[i4096];
        final long[] bitArray = this.bits[i4096];
        int i64 = i >>> 6;
        int o = Long.bitCount(index & ((1L << i64) - 1));
        if ((index & (1L << i64)) != 0) {
            // There is at least one bit that is set in the current long, check if
            // one of them is after i
            final long bits = bitArray[o] >>> i; // shifts are mod 64
            if (bits != 0) {
                return i + Long.numberOfTrailingZeros(bits);
            }
            o += 1;
        }
        final long indexBits = index >>> i64 >>> 1;
        if (indexBits == 0) {
            // no more bits are set in the current block of 4096 bits, go to the next one
            return firstDoc(i4096 + 1);
        }
        // there are still set bits
        i64 += 1 + Long.numberOfTrailingZeros(indexBits);
        final long bits = bitArray[o];
        return (i64 << 6) | Long.numberOfTrailingZeros(bits);
    }

    /** Return the last document that occurs on or before the provided block index. */
    private int lastDoc(int i4096) {
        long index;
        while (i4096 >= 0) {
            index = indices[i4096];
            if (index != 0) {
                final int i64 = 63 - Long.numberOfLeadingZeros(index);
                final long bits = this.bits[i4096][Long.bitCount(index) - 1];
                return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
            }
            i4096 -= 1;
        }
        return -1;
    }

    @Override
    public int prevSetBit(int i) {
        assert i >= 0;
        final int i4096 = i >>> 12;
        final long index = indices[i4096];
        final long[] bitArray = this.bits[i4096];
        int i64 = i >>> 6;
        final long indexBits = index & ((1L << i64) - 1);
        final int o = Long.bitCount(indexBits);
        if ((index & (1L << i64)) != 0) {
            // There is at least one bit that is set in the same long, check if there
            // is one bit that is set that is lower than i
            final long bits = bitArray[o] & ((1L << i << 1) - 1);
            if (bits != 0) {
                return (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
            }
        }
        if (indexBits == 0) {
            // no more bits are set in this block, go find the last bit in the
            // previous block
            return lastDoc(i4096 - 1);
        }
        // go to the previous long
        i64 = 63 - Long.numberOfLeadingZeros(indexBits);
        final long bits = bitArray[o - 1];
        return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
    }

    /** Return the long bits at the given <code>i64</code> index. */
    private long longBits(long index, long[] bits, int i64) {
        if ((index & (1L << i64)) == 0) {
            return 0L;
        } else {
            return bits[Long.bitCount(index & ((1L << i64) - 1))];
        }
    }

    private void or(final int i4096, final long index, long[] bits, int nonZeroLongCount) {
        assert Long.bitCount(index) == nonZeroLongCount;
        final long currentIndex = indices[i4096];
        if (currentIndex == 0) {
            // fast path: if we currently have nothing in the block, just copy the data
            // this especially happens all the time if you call OR on an empty set
            indices[i4096] = index;
            this.bits[i4096] = ArrayUtil.copyOfSubArray(bits, 0, nonZeroLongCount);
            this.nonZeroLongCount += nonZeroLongCount;
            return;
        }
        final long[] currentBits = this.bits[i4096];
        final long[] newBits;
        final long newIndex = currentIndex | index;
        final int requiredCapacity = Long.bitCount(newIndex);
        if (currentBits.length >= requiredCapacity) {
            newBits = currentBits;
        } else {
            newBits = new long[oversize(requiredCapacity)];
        }
        // we iterate backwards in order to not override data we might need on the next iteration if the
        // array is reused
        for (int i = Long.numberOfLeadingZeros(newIndex), newO = Long.bitCount(newIndex) - 1; i < 64; i += 1
                + Long.numberOfLeadingZeros(newIndex << (i + 1)), newO -= 1) {
            // bitIndex is the index of a bit which is set in newIndex and newO is the number of 1 bits on its right
            final int bitIndex = 63 - i;
            assert newO == Long.bitCount(newIndex & ((1L << bitIndex) - 1));
            newBits[newO] = longBits(currentIndex, currentBits, bitIndex) | longBits(index, bits, bitIndex);
        }
        indices[i4096] = newIndex;
        this.bits[i4096] = newBits;
        this.nonZeroLongCount += nonZeroLongCount - Long.bitCount(currentIndex & index);
    }

    private void or(SparseFixedBitSet other) {
        for (int i = 0; i < other.indices.length; ++i) {
            final long index = other.indices[i];
            if (index != 0) {
                or(i, index, other.bits[i], Long.bitCount(index));
            }
        }
    }

    /**
     * {@link #or(DocIdSetIterator)} impl that works best when <code>it</code> is dense
     */
    private void orDense(DocIdSetIterator it) throws IOException {
        checkUnpositioned(it);
        // The goal here is to try to take advantage of the ordering of documents
        // to build the data-structure more efficiently
        // NOTE: this heavily relies on the fact that shifts are mod 64
        final int firstDoc = it.nextDoc();
        if (firstDoc == DocIdSetIterator.NO_MORE_DOCS) {
            return;
        }
        int i4096 = firstDoc >>> 12;
        int i64 = firstDoc >>> 6;
        long index = 1L << i64;
        long currentLong = 1L << firstDoc;
        // we store at most 64 longs per block so preallocate in order never to have to resize
        long[] longs = new long[64];
        int numLongs = 0;

        for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
            final int doc64 = doc >>> 6;
            if (doc64 == i64) {
                // still in the same long, just set the bit
                currentLong |= 1L << doc;
            } else {
                longs[numLongs++] = currentLong;

                final int doc4096 = doc >>> 12;
                if (doc4096 == i4096) {
                    index |= 1L << doc64;
                } else {
                    // we are on a new block, flush what we buffered
                    or(i4096, index, longs, numLongs);
                    // and reset state for the new block
                    i4096 = doc4096;
                    index = 1L << doc64;
                    numLongs = 0;
                }

                // we are on a new long, reset state
                i64 = doc64;
                currentLong = 1L << doc;
            }
        }

        // flush
        longs[numLongs++] = currentLong;
        or(i4096, index, longs, numLongs);
    }

    @Override
    public void or(DocIdSetIterator it) throws IOException {
        {
            // specialize union with another SparseFixedBitSet
            final SparseFixedBitSet other = BitSetIterator.getSparseFixedBitSetOrNull(it);
            if (other != null) {
                checkUnpositioned(it);
                or(other);
                return;
            }
        }

        // We do not specialize the union with a FixedBitSet since FixedBitSets are
        // supposed to be used for dense data and sparse fixed bit sets for sparse
        // data, so a sparse set would likely get upgraded by DocIdSetBuilder before
        // being or'ed with a FixedBitSet

        if (it.cost() < indices.length) {
            // the default impl is good for sparse iterators
            super.or(it);
        } else {
            orDense(it);
        }
    }

    @Override
    public long ramBytesUsed() {
        return ramBytesUsed;
    }

    @Override
    public String toString() {
        return "SparseFixedBitSet(size=" + length + ",cardinality=~" + approximateCardinality();
    }
}