org.apache.solr.legacy.LegacyNumericUtils.java Source code

Introduction

Here is the source code for org.apache.solr.legacy.LegacyNumericUtils.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.legacy;

import java.io.IOException;

import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

/**
 * This is a helper class to generate prefix-encoded representations for numerical values
 * and supplies converters to represent float/double values as sortable integers/longs.
 *
 * <p>To quickly execute range queries in Apache Lucene, a range is divided recursively
 * into multiple intervals for searching: The center of the range is searched only with
 * the lowest possible precision in the trie, while the boundaries are matched
 * more exactly. This reduces the number of terms dramatically.
 *
 * <p>This class generates terms to achieve this: First the numerical integer values need to
 * be converted to bytes. For that integer values (32 bit or 64 bit) are made unsigned
 * and the bits are converted to ASCII chars with each 7 bit. The resulting byte[] is
 * sortable like the original integer value (even using UTF-8 sort order). Each value is also
 * prefixed (in the first char) by the <code>shift</code> value (number of bits removed) used
 * during encoding.
 *
 * <p>For easy usage, the trie algorithm is implemented for indexing inside
 * {@link org.apache.solr.legacy.LegacyNumericTokenStream} that can index <code>int</code>, <code>long</code>,
 * <code>float</code>, and <code>double</code>. For querying,
 * {@link org.apache.solr.legacy.LegacyNumericRangeQuery} implements the query part
 * for the same data types.
 *
 * @lucene.internal
 *
 * @deprecated Please use {@link org.apache.lucene.index.PointValues} instead.
 *
 * @since 2.9, API changed non backwards-compliant in 4.0
 */

@Deprecated
public final class LegacyNumericUtils {

    private LegacyNumericUtils() {
    } // no instance!

    /**
     * The default precision step used by {@link org.apache.solr.legacy.LegacyLongField},
     * {@link org.apache.solr.legacy.LegacyDoubleField}, {@link org.apache.solr.legacy.LegacyNumericTokenStream}, {@link
     * org.apache.solr.legacy.LegacyNumericRangeQuery}.
     */
    public static final int PRECISION_STEP_DEFAULT = 16;

    /**
     * The default precision step used by {@link org.apache.solr.legacy.LegacyIntField} and
     * {@link org.apache.solr.legacy.LegacyFloatField}.
     */
    public static final int PRECISION_STEP_DEFAULT_32 = 8;

    /**
     * Longs are stored at lower precision by shifting off lower bits. The shift count is
     * stored as <code>SHIFT_START_LONG+shift</code> in the first byte
     */
    public static final byte SHIFT_START_LONG = 0x20;

    /**
     * The maximum term length (used for <code>byte[]</code> buffer size)
     * for encoding <code>long</code> values.
     * @see #longToPrefixCoded
     */
    public static final int BUF_SIZE_LONG = 63 / 7 + 2;

    /**
     * Integers are stored at lower precision by shifting off lower bits. The shift count is
     * stored as <code>SHIFT_START_INT+shift</code> in the first byte
     */
    public static final byte SHIFT_START_INT = 0x60;

    /**
     * The maximum term length (used for <code>byte[]</code> buffer size)
     * for encoding <code>int</code> values.
     * @see #intToPrefixCoded
     */
    public static final int BUF_SIZE_INT = 31 / 7 + 2;

    /**
     * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
     * This is method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
     * After encoding, {@code bytes.offset} will always be 0. 
     * @param val the numeric value
     * @param shift how many bits to strip from the right
     * @param bytes will contain the encoded value
     */
    public static void longToPrefixCoded(final long val, final int shift, final BytesRefBuilder bytes) {
        // ensure shift is 0..63
        if ((shift & ~0x3f) != 0) {
            throw new IllegalArgumentException("Illegal shift value, must be 0..63; got shift=" + shift);
        }
        int nChars = (((63 - shift) * 37) >> 8) + 1; // i/7 is the same as (i*37)>>8 for i in 0..63
        bytes.setLength(nChars + 1); // one extra for the byte that contains the shift info
        bytes.grow(BUF_SIZE_LONG);
        bytes.setByteAt(0, (byte) (SHIFT_START_LONG + shift));
        long sortableBits = val ^ 0x8000000000000000L;
        sortableBits >>>= shift;
        while (nChars > 0) {
            // Store 7 bits per byte for compatibility
            // with UTF-8 encoding of terms
            bytes.setByteAt(nChars--, (byte) (sortableBits & 0x7f));
            sortableBits >>>= 7;
        }
    }

    /**
     * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
     * This is method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
     * After encoding, {@code bytes.offset} will always be 0.
     * @param val the numeric value
     * @param shift how many bits to strip from the right
     * @param bytes will contain the encoded value
     */
    public static void intToPrefixCoded(final int val, final int shift, final BytesRefBuilder bytes) {
        // ensure shift is 0..31
        if ((shift & ~0x1f) != 0) {
            throw new IllegalArgumentException("Illegal shift value, must be 0..31; got shift=" + shift);
        }
        int nChars = (((31 - shift) * 37) >> 8) + 1; // i/7 is the same as (i*37)>>8 for i in 0..63
        bytes.setLength(nChars + 1); // one extra for the byte that contains the shift info
        bytes.grow(LegacyNumericUtils.BUF_SIZE_LONG); // use the max
        bytes.setByteAt(0, (byte) (SHIFT_START_INT + shift));
        int sortableBits = val ^ 0x80000000;
        sortableBits >>>= shift;
        while (nChars > 0) {
            // Store 7 bits per byte for compatibility
            // with UTF-8 encoding of terms
            bytes.setByteAt(nChars--, (byte) (sortableBits & 0x7f));
            sortableBits >>>= 7;
        }
    }

    /**
     * Returns the shift value from a prefix encoded {@code long}.
     * @throws NumberFormatException if the supplied {@link BytesRef} is
     * not correctly prefix encoded.
     */
    public static int getPrefixCodedLongShift(final BytesRef val) {
        final int shift = val.bytes[val.offset] - SHIFT_START_LONG;
        if (shift > 63 || shift < 0)
            throw new NumberFormatException(
                    "Invalid shift value (" + shift + ") in prefixCoded bytes (is encoded value really an INT?)");
        return shift;
    }

    /**
     * Returns the shift value from a prefix encoded {@code int}.
     * @throws NumberFormatException if the supplied {@link BytesRef} is
     * not correctly prefix encoded.
     */
    public static int getPrefixCodedIntShift(final BytesRef val) {
        final int shift = val.bytes[val.offset] - SHIFT_START_INT;
        if (shift > 31 || shift < 0)
            throw new NumberFormatException(
                    "Invalid shift value in prefixCoded bytes (is encoded value really an INT?)");
        return shift;
    }

    /**
     * Returns a long from prefixCoded bytes.
     * Rightmost bits will be zero for lower precision codes.
     * This method can be used to decode a term's value.
     * @throws NumberFormatException if the supplied {@link BytesRef} is
     * not correctly prefix encoded.
     * @see #longToPrefixCoded
     */
    public static long prefixCodedToLong(final BytesRef val) {
        long sortableBits = 0L;
        for (int i = val.offset + 1, limit = val.offset + val.length; i < limit; i++) {
            sortableBits <<= 7;
            final byte b = val.bytes[i];
            if (b < 0) {
                throw new NumberFormatException("Invalid prefixCoded numerical value representation (byte "
                        + Integer.toHexString(b & 0xff) + " at position " + (i - val.offset) + " is invalid)");
            }
            sortableBits |= b;
        }
        return (sortableBits << getPrefixCodedLongShift(val)) ^ 0x8000000000000000L;
    }

    /**
     * Returns an int from prefixCoded bytes.
     * Rightmost bits will be zero for lower precision codes.
     * This method can be used to decode a term's value.
     * @throws NumberFormatException if the supplied {@link BytesRef} is
     * not correctly prefix encoded.
     * @see #intToPrefixCoded
     */
    public static int prefixCodedToInt(final BytesRef val) {
        int sortableBits = 0;
        for (int i = val.offset + 1, limit = val.offset + val.length; i < limit; i++) {
            sortableBits <<= 7;
            final byte b = val.bytes[i];
            if (b < 0) {
                throw new NumberFormatException("Invalid prefixCoded numerical value representation (byte "
                        + Integer.toHexString(b & 0xff) + " at position " + (i - val.offset) + " is invalid)");
            }
            sortableBits |= b;
        }
        return (sortableBits << getPrefixCodedIntShift(val)) ^ 0x80000000;
    }

    /**
     * Splits a long range recursively.
     * You may implement a builder that adds clauses to a
     * {@link org.apache.lucene.search.BooleanQuery} for each call to its
     * {@link LongRangeBuilder#addRange(BytesRef,BytesRef)}
     * method.
     * <p>This method is used by {@link org.apache.solr.legacy.LegacyNumericRangeQuery}.
     */
    public static void splitLongRange(final LongRangeBuilder builder, final int precisionStep, final long minBound,
            final long maxBound) {
        splitRange(builder, 64, precisionStep, minBound, maxBound);
    }

    /**
     * Splits an int range recursively.
     * You may implement a builder that adds clauses to a
     * {@link org.apache.lucene.search.BooleanQuery} for each call to its
     * {@link IntRangeBuilder#addRange(BytesRef,BytesRef)}
     * method.
     * <p>This method is used by {@link org.apache.solr.legacy.LegacyNumericRangeQuery}.
     */
    public static void splitIntRange(final IntRangeBuilder builder, final int precisionStep, final int minBound,
            final int maxBound) {
        splitRange(builder, 32, precisionStep, minBound, maxBound);
    }

    /** This helper does the splitting for both 32 and 64 bit. */
    private static void splitRange(final Object builder, final int valSize, final int precisionStep, long minBound,
            long maxBound) {
        if (precisionStep < 1)
            throw new IllegalArgumentException("precisionStep must be >=1");
        if (minBound > maxBound)
            return;
        for (int shift = 0;; shift += precisionStep) {
            // calculate new bounds for inner precision
            final long diff = 1L << (shift + precisionStep), mask = ((1L << precisionStep) - 1L) << shift;
            final boolean hasLower = (minBound & mask) != 0L, hasUpper = (maxBound & mask) != mask;
            final long nextMinBound = (hasLower ? (minBound + diff) : minBound) & ~mask,
                    nextMaxBound = (hasUpper ? (maxBound - diff) : maxBound) & ~mask;
            final boolean lowerWrapped = nextMinBound < minBound, upperWrapped = nextMaxBound > maxBound;

            if (shift + precisionStep >= valSize || nextMinBound > nextMaxBound || lowerWrapped || upperWrapped) {
                // We are in the lowest precision or the next precision is not available.
                addRange(builder, valSize, minBound, maxBound, shift);
                // exit the split recursion loop
                break;
            }

            if (hasLower)
                addRange(builder, valSize, minBound, minBound | mask, shift);
            if (hasUpper)
                addRange(builder, valSize, maxBound & ~mask, maxBound, shift);

            // recurse to next precision
            minBound = nextMinBound;
            maxBound = nextMaxBound;
        }
    }

    /** Helper that delegates to correct range builder */
    private static void addRange(final Object builder, final int valSize, long minBound, long maxBound,
            final int shift) {
        // for the max bound set all lower bits (that were shifted away):
        // this is important for testing or other usages of the splitted range
        // (e.g. to reconstruct the full range). The prefixEncoding will remove
        // the bits anyway, so they do not hurt!
        maxBound |= (1L << shift) - 1L;
        // delegate to correct range builder
        switch (valSize) {
        case 64:
            ((LongRangeBuilder) builder).addRange(minBound, maxBound, shift);
            break;
        case 32:
            ((IntRangeBuilder) builder).addRange((int) minBound, (int) maxBound, shift);
            break;
        default:
            // Should not happen!
            throw new IllegalArgumentException("valSize must be 32 or 64.");
        }
    }

    /**
     * Callback for {@link #splitLongRange}.
     * You need to overwrite only one of the methods.
     * @lucene.internal
     * @since 2.9, API changed non backwards-compliant in 4.0
     */
    public static abstract class LongRangeBuilder {

        /**
         * Overwrite this method, if you like to receive the already prefix encoded range bounds.
         * You can directly build classical (inclusive) range queries from them.
         */
        public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
            throw new UnsupportedOperationException();
        }

        /**
         * Overwrite this method, if you like to receive the raw long range bounds.
         * You can use this for e.g. debugging purposes (print out range bounds).
         */
        public void addRange(final long min, final long max, final int shift) {
            final BytesRefBuilder minBytes = new BytesRefBuilder(), maxBytes = new BytesRefBuilder();
            longToPrefixCoded(min, shift, minBytes);
            longToPrefixCoded(max, shift, maxBytes);
            addRange(minBytes.get(), maxBytes.get());
        }

    }

    /**
     * Callback for {@link #splitIntRange}.
     * You need to overwrite only one of the methods.
     * @lucene.internal
     * @since 2.9, API changed non backwards-compliant in 4.0
     */
    public static abstract class IntRangeBuilder {

        /**
         * Overwrite this method, if you like to receive the already prefix encoded range bounds.
         * You can directly build classical range (inclusive) queries from them.
         */
        public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
            throw new UnsupportedOperationException();
        }

        /**
         * Overwrite this method, if you like to receive the raw int range bounds.
         * You can use this for e.g. debugging purposes (print out range bounds).
         */
        public void addRange(final int min, final int max, final int shift) {
            final BytesRefBuilder minBytes = new BytesRefBuilder(), maxBytes = new BytesRefBuilder();
            intToPrefixCoded(min, shift, minBytes);
            intToPrefixCoded(max, shift, maxBytes);
            addRange(minBytes.get(), maxBytes.get());
        }

    }

    /**
     * Filters the given {@link TermsEnum} by accepting only prefix coded 64 bit
     * terms with a shift value of <tt>0</tt>.
     * 
     * @param termsEnum
     *          the terms enum to filter
     * @return a filtered {@link TermsEnum} that only returns prefix coded 64 bit
     *         terms with a shift value of <tt>0</tt>.
     */
    public static TermsEnum filterPrefixCodedLongs(TermsEnum termsEnum) {
        return new SeekingNumericFilteredTermsEnum(termsEnum) {

            @Override
            protected AcceptStatus accept(BytesRef term) {
                return LegacyNumericUtils.getPrefixCodedLongShift(term) == 0 ? AcceptStatus.YES : AcceptStatus.END;
            }
        };
    }

    /**
     * Filters the given {@link TermsEnum} by accepting only prefix coded 32 bit
     * terms with a shift value of <tt>0</tt>.
     * 
     * @param termsEnum
     *          the terms enum to filter
     * @return a filtered {@link TermsEnum} that only returns prefix coded 32 bit
     *         terms with a shift value of <tt>0</tt>.
     */
    public static TermsEnum filterPrefixCodedInts(TermsEnum termsEnum) {
        return new SeekingNumericFilteredTermsEnum(termsEnum) {

            @Override
            protected AcceptStatus accept(BytesRef term) {
                return LegacyNumericUtils.getPrefixCodedIntShift(term) == 0 ? AcceptStatus.YES : AcceptStatus.END;
            }
        };
    }

    /** Just like FilteredTermsEnum, except it adds a limited
     *  seekCeil implementation that only works with {@link
     *  #filterPrefixCodedInts} and {@link
     *  #filterPrefixCodedLongs}. */
    private static abstract class SeekingNumericFilteredTermsEnum extends FilteredTermsEnum {
        public SeekingNumericFilteredTermsEnum(final TermsEnum tenum) {
            super(tenum, false);
        }

        @Override
        @SuppressWarnings("fallthrough")
        public SeekStatus seekCeil(BytesRef term) throws IOException {

            // NOTE: This is not general!!  It only handles YES
            // and END, because that's all we need for the numeric
            // case here

            SeekStatus status = tenum.seekCeil(term);
            if (status == SeekStatus.END) {
                return SeekStatus.END;
            }

            actualTerm = tenum.term();

            if (accept(actualTerm) == AcceptStatus.YES) {
                return status;
            } else {
                return SeekStatus.END;
            }
        }
    }

    private static Terms intTerms(Terms terms) {
        return new FilterLeafReader.FilterTerms(terms) {
            @Override
            public TermsEnum iterator() throws IOException {
                return filterPrefixCodedInts(in.iterator());
            }
        };
    }

    private static Terms longTerms(Terms terms) {
        return new FilterLeafReader.FilterTerms(terms) {
            @Override
            public TermsEnum iterator() throws IOException {
                return filterPrefixCodedLongs(in.iterator());
            }
        };
    }

    /**
     * Returns the minimum int value indexed into this
     * numeric field or null if no terms exist.
     */
    public static Integer getMinInt(Terms terms) throws IOException {
        // All shift=0 terms are sorted first, so we don't need
        // to filter the incoming terms; we can just get the
        // min:
        BytesRef min = terms.getMin();
        return (min != null) ? LegacyNumericUtils.prefixCodedToInt(min) : null;
    }

    /**
     * Returns the maximum int value indexed into this
     * numeric field or null if no terms exist.
     */
    public static Integer getMaxInt(Terms terms) throws IOException {
        BytesRef max = intTerms(terms).getMax();
        return (max != null) ? LegacyNumericUtils.prefixCodedToInt(max) : null;
    }

    /**
     * Returns the minimum long value indexed into this
     * numeric field or null if no terms exist.
     */
    public static Long getMinLong(Terms terms) throws IOException {
        // All shift=0 terms are sorted first, so we don't need
        // to filter the incoming terms; we can just get the
        // min:
        BytesRef min = terms.getMin();
        return (min != null) ? LegacyNumericUtils.prefixCodedToLong(min) : null;
    }

    /**
     * Returns the maximum long value indexed into this
     * numeric field or null if no terms exist.
     */
    public static Long getMaxLong(Terms terms) throws IOException {
        BytesRef max = longTerms(terms).getMax();
        return (max != null) ? LegacyNumericUtils.prefixCodedToLong(max) : null;
    }

}