org.apache.lucene.util.StringHelper.java Source code

Introduction

Here is the source code for org.apache.lucene.util.StringHelper.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.io.DataInputStream;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;

/**
 * Methods for manipulating strings.
 *
 * @lucene.internal
 */
public abstract class StringHelper {

    /**
     * Compares two {@link BytesRef}, element by element, and returns the
     * number of elements common to both arrays (from the start of each).
     * This method assumes currentTerm comes after priorTerm.
     *
     * @param priorTerm The first {@link BytesRef} to compare
     * @param currentTerm The second {@link BytesRef} to compare
     * @return The number of common elements (from the start of each).
     */
    public static int bytesDifference(BytesRef priorTerm, BytesRef currentTerm) {
        int mismatch = FutureArrays.mismatch(priorTerm.bytes, priorTerm.offset, priorTerm.offset + priorTerm.length,
                currentTerm.bytes, currentTerm.offset, currentTerm.offset + currentTerm.length);
        if (mismatch < 0) {
            throw new IllegalArgumentException(
                    "terms out of order: priorTerm=" + priorTerm + ",currentTerm=" + currentTerm);
        }
        return mismatch;
    }

    /** 
     * Returns the length of {@code currentTerm} needed for use as a sort key.
     * so that {@link BytesRef#compareTo(BytesRef)} still returns the same result.
     * This method assumes currentTerm comes after priorTerm.
     */
    public static int sortKeyLength(final BytesRef priorTerm, final BytesRef currentTerm) {
        return bytesDifference(priorTerm, currentTerm) + 1;
    }

    private StringHelper() {
    }

    /**
     * Returns <code>true</code> iff the ref starts with the given prefix.
     * Otherwise <code>false</code>.
     * 
     * @param ref
     *         the {@code byte[]} to test
     * @param prefix
     *         the expected prefix
     * @return Returns <code>true</code> iff the ref starts with the given prefix.
     *         Otherwise <code>false</code>.
     */
    public static boolean startsWith(byte[] ref, BytesRef prefix) {
        // not long enough to start with the prefix
        if (ref.length < prefix.length) {
            return false;
        }
        return FutureArrays.equals(ref, 0, prefix.length, prefix.bytes, prefix.offset,
                prefix.offset + prefix.length);
    }

    /**
     * Returns <code>true</code> iff the ref starts with the given prefix.
     * Otherwise <code>false</code>.
     * 
     * @param ref
     *          the {@link BytesRef} to test
     * @param prefix
     *          the expected prefix
     * @return Returns <code>true</code> iff the ref starts with the given prefix.
     *         Otherwise <code>false</code>.
     */
    public static boolean startsWith(BytesRef ref, BytesRef prefix) {
        // not long enough to start with the prefix
        if (ref.length < prefix.length) {
            return false;
        }
        return FutureArrays.equals(ref.bytes, ref.offset, ref.offset + prefix.length, prefix.bytes, prefix.offset,
                prefix.offset + prefix.length);
    }

    /**
     * Returns <code>true</code> iff the ref ends with the given suffix. Otherwise
     * <code>false</code>.
     * 
     * @param ref
     *          the {@link BytesRef} to test
     * @param suffix
     *          the expected suffix
     * @return Returns <code>true</code> iff the ref ends with the given suffix.
     *         Otherwise <code>false</code>.
     */
    public static boolean endsWith(BytesRef ref, BytesRef suffix) {
        int startAt = ref.length - suffix.length;
        // not long enough to start with the suffix
        if (startAt < 0) {
            return false;
        }
        return FutureArrays.equals(ref.bytes, ref.offset + startAt, ref.offset + startAt + suffix.length,
                suffix.bytes, suffix.offset, suffix.offset + suffix.length);
    }

    /** Pass this as the seed to {@link #murmurhash3_x86_32}. */

    // Poached from Guava: set a different salt/seed
    // for each JVM instance, to frustrate hash key collision
    // denial of service attacks, and to catch any places that
    // somehow rely on hash function/order across JVM
    // instances:
    public static final int GOOD_FAST_HASH_SEED;

    static {
        String prop = System.getProperty("tests.seed");
        if (prop != null) {
            // So if there is a test failure that relied on hash
            // order, we remain reproducible based on the test seed:
            GOOD_FAST_HASH_SEED = prop.hashCode();
        } else {
            GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
        }
    }

    /** Returns the MurmurHash3_x86_32 hash.
     * Original source/tests at https://github.com/yonik/java_util/
     */
    @SuppressWarnings("fallthrough")
    public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {

        final int c1 = 0xcc9e2d51;
        final int c2 = 0x1b873593;

        int h1 = seed;
        int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block

        for (int i = offset; i < roundedEnd; i += 4) {
            // little endian load order
            int k1 = (data[i] & 0xff) | ((data[i + 1] & 0xff) << 8) | ((data[i + 2] & 0xff) << 16)
                    | (data[i + 3] << 24);
            k1 *= c1;
            k1 = Integer.rotateLeft(k1, 15);
            k1 *= c2;

            h1 ^= k1;
            h1 = Integer.rotateLeft(h1, 13);
            h1 = h1 * 5 + 0xe6546b64;
        }

        // tail
        int k1 = 0;

        switch (len & 0x03) {
        case 3:
            k1 = (data[roundedEnd + 2] & 0xff) << 16;
            // fallthrough
        case 2:
            k1 |= (data[roundedEnd + 1] & 0xff) << 8;
            // fallthrough
        case 1:
            k1 |= (data[roundedEnd] & 0xff);
            k1 *= c1;
            k1 = Integer.rotateLeft(k1, 15);
            k1 *= c2;
            h1 ^= k1;
        }

        // finalization
        h1 ^= len;

        // fmix(h1);
        h1 ^= h1 >>> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >>> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >>> 16;

        return h1;
    }

    public static int murmurhash3_x86_32(BytesRef bytes, int seed) {
        return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
    }

    // Holds 128 bit unsigned value:
    private static BigInteger nextId;
    private static final BigInteger mask128;
    private static final Object idLock = new Object();

    static {
        // 128 bit unsigned mask
        byte[] maskBytes128 = new byte[16];
        Arrays.fill(maskBytes128, (byte) 0xff);
        mask128 = new BigInteger(1, maskBytes128);

        String prop = System.getProperty("tests.seed");

        // State for xorshift128:
        long x0;
        long x1;

        if (prop != null) {
            // So if there is a test failure that somehow relied on this id,
            // we remain reproducible based on the test seed:
            if (prop.length() > 8) {
                prop = prop.substring(prop.length() - 8);
            }
            x0 = Long.parseLong(prop, 16);
            x1 = x0;
        } else {
            // seed from /dev/urandom, if its available
            try (DataInputStream is = new DataInputStream(Files.newInputStream(Paths.get("/dev/urandom")))) {
                x0 = is.readLong();
                x1 = is.readLong();
            } catch (Exception unavailable) {
                // may not be available on this platform
                // fall back to lower quality randomness from 3 different sources:
                x0 = System.nanoTime();
                x1 = StringHelper.class.hashCode() << 32;

                StringBuilder sb = new StringBuilder();
                // Properties can vary across JVM instances:
                try {
                    Properties p = System.getProperties();
                    for (String s : p.stringPropertyNames()) {
                        sb.append(s);
                        sb.append(p.getProperty(s));
                    }
                    x1 |= sb.toString().hashCode();
                } catch (SecurityException notallowed) {
                    // getting Properties requires wildcard read-write: may not be allowed
                    x1 |= StringBuffer.class.hashCode();
                }
            }
        }

        // Use a few iterations of xorshift128 to scatter the seed
        // in case multiple Lucene instances starting up "near" the same
        // nanoTime, since we use ++ (mod 2^128) for full period cycle:
        for (int i = 0; i < 10; i++) {
            long s1 = x0;
            long s0 = x1;
            x0 = s0;
            s1 ^= s1 << 23; // a
            x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
        }

        // 64-bit unsigned mask
        byte[] maskBytes64 = new byte[8];
        Arrays.fill(maskBytes64, (byte) 0xff);
        BigInteger mask64 = new BigInteger(1, maskBytes64);

        // First make unsigned versions of x0, x1:
        BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
        BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);

        // Concatentate bits of x0 and x1, as unsigned 128 bit integer:
        nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
    }

    /** length in bytes of an ID */
    public static final int ID_LENGTH = 16;

    /** Generates a non-cryptographic globally unique id. */
    public static byte[] randomId() {

        // NOTE: we don't use Java's UUID.randomUUID() implementation here because:
        //
        //   * It's overkill for our usage: it tries to be cryptographically
        //     secure, whereas for this use we don't care if someone can
        //     guess the IDs.
        //
        //   * It uses SecureRandom, which on Linux can easily take a long time
        //     (I saw ~ 10 seconds just running a Lucene test) when entropy
        //     harvesting is falling behind.
        //
        //   * It loses a few (6) bits to version and variant and it's not clear
        //     what impact that has on the period, whereas the simple ++ (mod 2^128)
        //     we use here is guaranteed to have the full period.

        byte bits[];
        synchronized (idLock) {
            bits = nextId.toByteArray();
            nextId = nextId.add(BigInteger.ONE).and(mask128);
        }

        // toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
        if (bits.length > ID_LENGTH) {
            assert bits.length == ID_LENGTH + 1;
            assert bits[0] == 0;
            return ArrayUtil.copyOfSubArray(bits, 1, bits.length);
        } else {
            byte[] result = new byte[ID_LENGTH];
            System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
            return result;
        }
    }

    /** 
     * Helper method to render an ID as a string, for debugging
     * <p>
     * Returns the string {@code (null)} if the id is null.
     * Otherwise, returns a string representation for debugging.
     * Never throws an exception. The returned string may
     * indicate if the id is definitely invalid.
     */
    public static String idToString(byte id[]) {
        if (id == null) {
            return "(null)";
        } else {
            StringBuilder sb = new StringBuilder();
            sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
            if (id.length != ID_LENGTH) {
                sb.append(" (INVALID FORMAT)");
            }
            return sb.toString();
        }
    }

    /** Just converts each int in the incoming {@link IntsRef} to each byte
     *  in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
     *  if any int value is out of bounds for a byte. */
    public static BytesRef intsRefToBytesRef(IntsRef ints) {
        byte[] bytes = new byte[ints.length];
        for (int i = 0; i < ints.length; i++) {
            int x = ints.ints[ints.offset + i];
            if (x < 0 || x > 255) {
                throw new IllegalArgumentException(
                        "int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
            }
            bytes[i] = (byte) x;
        }

        return new BytesRef(bytes);
    }
}