edu.umd.marbl.mhap.sketch.HashUtils.java Source code

Java tutorial

Introduction

Here is the source code for edu.umd.marbl.mhap.sketch.HashUtils.java

Source

/* 
 * MHAP package
 * 
 * This  software is distributed "as is", without any warranty, including 
 * any implied warranty of merchantability or fitness for a particular
 * use. The authors assume no responsibility for, and shall not be liable
 * for, any special, indirect, or consequential damages, or any damages
 * whatsoever, arising out of or in connection with the use of this
 * software.
 * 
 * Copyright (c) 2015 by Konstantin Berlin and Sergey Koren
 * 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */
package edu.umd.marbl.mhap.sketch;

import java.nio.ByteBuffer;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

import edu.umd.marbl.mhap.math.BasicMath;
import edu.umd.marbl.mhap.utils.MersenneTwisterFast;

public class HashUtils {
    public static long[] computeHashes(String item, int numWords, int seed) {
        long[] hashes = new long[numWords];

        for (int word = 0; word < numWords; word += 2) {
            HashFunction hashFunc = Hashing.murmur3_128(seed + word);
            Hasher hasher = hashFunc.newHasher();
            hasher.putUnencodedChars(item);

            // get the two longs out
            HashCode hc = hasher.hash();
            ByteBuffer bb = ByteBuffer.wrap(hc.asBytes());
            hashes[word] = bb.getLong(0);
            if (word + 1 < numWords)
                hashes[word + 1] = bb.getLong(8);
        }

        return hashes;
    }

    public final static int[] computeHashesInt(Object obj, int numWords, int seed) {
        if (obj instanceof Integer)
            return computeHashesIntInt((Integer) obj, numWords, seed);
        if (obj instanceof Long)
            return computeHashesIntLong((Long) obj, numWords, seed);
        if (obj instanceof Double)
            return computeHashesIntDouble((Double) obj, numWords, seed);
        if (obj instanceof Float)
            return computeHashesIntFloat((Float) obj, numWords, seed);
        if (obj instanceof String)
            return computeHashesIntString((String) obj, numWords, seed);

        throw new SketchRuntimeException("Cannot hash class type " + obj.getClass().getCanonicalName());
    }

    public final static int[] computeHashesIntDouble(double obj, int numWords, int seed) {
        int[] hashes = new int[numWords];

        HashFunction hf = Hashing.murmur3_32(seed);

        for (int iter = 0; iter < numWords; iter++) {
            HashCode hc = hf.newHasher().putDouble(obj).putInt(iter).hash();

            hashes[iter] = hc.asInt();
        }

        return hashes;
    }

    public final static int[] computeHashesIntFloat(float obj, int numWords, int seed) {
        int[] hashes = new int[numWords];

        HashFunction hf = Hashing.murmur3_32(seed);

        for (int iter = 0; iter < numWords; iter++) {
            HashCode hc = hf.newHasher().putFloat(obj).putInt(iter).hash();

            hashes[iter] = hc.asInt();
        }

        return hashes;
    }

    public final static int[] computeHashesIntInt(int obj, int numWords, int seed) {
        int[] hashes = new int[numWords];

        HashFunction hf = Hashing.murmur3_32(seed);

        for (int iter = 0; iter < numWords; iter++) {
            HashCode hc = hf.newHasher().putInt(obj).putInt(iter).hash();

            hashes[iter] = hc.asInt();
        }

        return hashes;
    }

    public final static int[] computeHashesIntLong(long obj, int numWords, int seed) {
        int[] hashes = new int[numWords];

        HashFunction hf = Hashing.murmur3_32(seed);

        for (int iter = 0; iter < numWords; iter++) {
            HashCode hc = hf.newHasher().putLong(obj).putInt(iter).hash();

            hashes[iter] = hc.asInt();
        }

        return hashes;
    }

    public final static int[] computeHashesIntString(String obj, int numWords, int seed) {
        int[] hashes = new int[numWords];

        HashFunction hf = Hashing.murmur3_32(seed);

        for (int iter = 0; iter < numWords; iter++) {
            HashCode hc = hf.newHasher().putUnencodedChars(obj).putInt(iter).hash();

            hashes[iter] = hc.asInt();
        }

        return hashes;
    }

    public final static long[][] computeNGramHashes(final String seq, final int nGramSize, final int numWords,
            final int seed) {
        final int numberNGrams = seq.length() - nGramSize + 1;

        if (numberNGrams < 1)
            throw new SketchRuntimeException("N-gram size bigger than string length.");

        // get the rabin hashes
        final long[] rabinHashes = computeSequenceHashesLong(seq, nGramSize, seed);

        final long[][] hashes = new long[rabinHashes.length][numWords];

        // Random rand = new Random(0);
        for (int iter = 0; iter < rabinHashes.length; iter++) {
            // rand.setSeed(rabinHashes[iter]);
            long x = rabinHashes[iter];

            for (int word = 0; word < numWords; word++) {
                // hashes[iter][word] = rand.nextLong();

                // XORShift Random Number Generators
                x ^= (x << 21);
                x ^= (x >>> 35);
                x ^= (x << 4);
                hashes[iter][word] = x;
            }
        }

        return hashes;
    }

    public final static long[][] computeNGramHashesExact(final String seq, final int nGramSize, final int numWords,
            final int seed) {
        HashFunction hf = Hashing.murmur3_128(seed);

        long[][] hashes = new long[seq.length() - nGramSize + 1][numWords];
        for (int iter = 0; iter < hashes.length; iter++) {
            String subStr = seq.substring(iter, iter + nGramSize);

            for (int word = 0; word < numWords; word++) {
                HashCode hc = hf.newHasher().putUnencodedChars(subStr).putInt(word).hash();
                hashes[iter][word] = hc.asLong();
            }
        }

        return hashes;
    }

    public final static int[] computeSequenceHashes(final String seq, final int nGramSize) {
        HashFunction hf = Hashing.murmur3_32(0);

        int[] hashes = new int[seq.length() - nGramSize + 1];
        for (int iter = 0; iter < hashes.length; iter++) {
            HashCode hc = hf.newHasher().putUnencodedChars(seq.substring(iter, iter + nGramSize)).hash();
            hashes[iter] = hc.asInt();
        }

        return hashes;
    }

    public final static long[] computeSequenceHashesLong(final String seq, final int nGramSize, final int seed) {
        HashFunction hf = Hashing.murmur3_128(seed);

        long[] hashes = new long[seq.length() - nGramSize + 1];
        for (int iter = 0; iter < hashes.length; iter++) {
            HashCode hc = hf.newHasher().putUnencodedChars(seq.substring(iter, iter + nGramSize)).hash();
            hashes[iter] = hc.asLong();
        }

        return hashes;
    }

    public static double[] randomGuassianVector(int n, int seed) {
        //now generate the guassian
        MersenneTwisterFast rand = new MersenneTwisterFast(seed);

        double[] vec = new double[n];
        for (int iter = 0; iter < n; iter++) {
            vec[iter] = rand.nextGaussian();
        }

        //normalize
        double norm = BasicMath.norm(vec);
        if (norm < 1.0e-10)
            return vec;

        return BasicMath.mult(vec, 1.0 / norm);
    }

    public static double[] randomStringGuassianVector(String str, int n, int seed) {
        int[] seeds = new int[4];
        for (int iter = 0; iter < 4; iter++) {
            HashFunction hf = Hashing.murmur3_32(seed * 4 + iter);
            HashCode hc = hf.newHasher().putUnencodedChars(str).hash();

            seeds[iter] = hc.asInt();
        }

        //now generate the guassian
        MersenneTwisterFast rand = new MersenneTwisterFast(seeds);

        double[] vec = new double[n];
        for (int iter = 0; iter < n; iter++) {
            vec[iter] = rand.nextGaussian();
        }

        //normalize
        double norm = BasicMath.norm(vec);
        if (norm < 1.0e-10)
            return vec;

        return BasicMath.mult(vec, 1.0 / norm);
    }

    private HashUtils() {
    }

}