org.apache.accumulo.core.client.sample.AbstractHashSampler.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.accumulo.core.client.sample.AbstractHashSampler.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.accumulo.core.client.sample;

import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;

import java.io.DataOutput;
import java.io.IOException;
import java.util.Set;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.sample.impl.DataoutputHasher;

import com.google.common.collect.ImmutableSet;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;

/**
 * A base class that can be used to create Samplers based on hashing. This class offers consistent
 * options for configuring the hash function. The subclass decides which parts of the key to hash.
 *
 * <p>
 * This class supports two options passed into {@link #init(SamplerConfiguration)}. One option is
 * {@code hasher} which specifies a hashing algorithm. Valid values for this option are {@code md5},
 * {@code sha1}, and {@code murmur3_32}. If you are not sure, then choose {@code murmur3_32}.
 *
 * <p>
 * The second option is {@code modulus} which can have any positive integer as a value. Zero and
 * negative values are rejected by {@link #init(SamplerConfiguration)}.
 *
 * <p>
 * Any data where {@code hash(data) % modulus == 0} will be selected for the sample.
 *
 * @since 1.8.0
 */
public abstract class AbstractHashSampler implements Sampler {

    private HashFunction hashFunction;
    private int modulus;

    private static final Set<String> VALID_OPTIONS = ImmutableSet.of("hasher", "modulus");

    /**
     * Subclasses with options should override this method and return true if the option is valid for
     * the subclass or if {@code super.isValidOption(opt)} returns true.
     *
     * @param option
     *          name of a configuration option
     * @return true if this class recognizes the option ({@code hasher} or {@code modulus})
     */
    protected boolean isValidOption(String option) {
        return VALID_OPTIONS.contains(option);
    }

    /**
     * Configures the hash function and modulus from the sampler options. Subclasses with options
     * should override this method and call {@code super.init(config)}.
     *
     * <p>
     * The fields of this sampler are only updated once every option has been validated, so a failed
     * call leaves any earlier configuration untouched.
     *
     * @param config
     *          configuration whose options must contain {@code hasher} and {@code modulus}
     * @throws NullPointerException
     *           if the {@code hasher} or {@code modulus} option is missing
     * @throws IllegalArgumentException
     *           if an option is not accepted by {@link #isValidOption(String)}, the hasher name is
     *           unknown, or the modulus is not a positive integer (a non-numeric modulus surfaces as
     *           {@link NumberFormatException})
     */
    @SuppressFBWarnings(value = "UNSAFE_HASH_EQUALS", justification = "these hashes don't protect any secrets, just used for binning")
    @Override
    public void init(SamplerConfiguration config) {
        String hasherOpt = config.getOptions().get("hasher");
        String modulusOpt = config.getOptions().get("modulus");

        requireNonNull(hasherOpt, "Hasher not specified");
        requireNonNull(modulusOpt, "Modulus not specified");

        for (String option : config.getOptions().keySet()) {
            checkArgument(isValidOption(option), "Unknown option : %s", option);
        }

        // Resolve into locals first; fields are assigned only after all validation succeeds.
        final HashFunction selectedFunction;
        switch (hasherOpt) {
        case "murmur3_32":
            selectedFunction = Hashing.murmur3_32();
            break;
        case "md5":
            @SuppressWarnings("deprecation")
            HashFunction deprecatedMd5 = Hashing.md5();
            selectedFunction = deprecatedMd5;
            break;
        case "sha1":
            @SuppressWarnings("deprecation")
            HashFunction deprecatedSha1 = Hashing.sha1();
            selectedFunction = deprecatedSha1;
            break;
        default:
            throw new IllegalArgumentException("Unknown hasher " + hasherOpt);
        }

        int parsedModulus = Integer.parseInt(modulusOpt);
        // A zero modulus would make accept() fail with ArithmeticException on every key; reject
        // anything outside the documented "positive integer" contract up front.
        checkArgument(parsedModulus > 0, "Modulus must be a positive integer : %s", modulusOpt);

        hashFunction = selectedFunction;
        modulus = parsedModulus;
    }

    /**
     * Subclass must override this method and hash some portion of the key.
     *
     * @param hasher
     *          Data written to this will be used to compute the hash for the key.
     * @param k
     *          the key to take data from
     * @throws IOException
     *           if writing to {@code hasher} fails
     */
    protected abstract void hash(DataOutput hasher, Key k) throws IOException;

    /**
     * Decides whether a key belongs to the sample by hashing the portion of the key chosen by
     * {@link #hash(DataOutput, Key)} and testing {@code hash % modulus == 0}.
     *
     * @param k
     *          the key to test
     * @return true if the key's hash is evenly divisible by the configured modulus
     * @throws RuntimeException
     *           wrapping any {@link IOException} thrown by {@link #hash(DataOutput, Key)}
     */
    @Override
    public boolean accept(Key k) {
        Hasher hasher = hashFunction.newHasher();
        try {
            hash(new DataoutputHasher(hasher), k);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return hasher.hash().asInt() % modulus == 0;
    }
}