graphene.util.ooab.ByteArrayFilter.java Source code

Java tutorial

Introduction

Here is the source code for graphene.util.ooab.ByteArrayFilter.java

Source

/*
 *
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *       http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

// Copyright 2012 Jeff Hodges. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package graphene.util.ooab;

import java.math.RoundingMode;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicReferenceArray;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.math.IntMath;

/**
 * ByteArrayFilter is used to filter out duplicate byte arrays from a given dataset or stream. It is
 * guaranteed to never return a false positive (that is, it will never say that an item has already
 * been seen by the filter when it has not) but may return a false negative.
 *
 * <p>Internally the filter keeps one slot per hash bucket; a newly checked byte array replaces
 * whatever previously occupied its slot, which is where the false negatives come from.
 *
 * <p>ByteArrayFilter is thread-safe: each check-and-insert is a single atomic swap on the
 * underlying {@link AtomicReferenceArray}.
 */
public class ByteArrayFilter {
    private static final HashFunction HASH_FUNC = Hashing.murmur3_32();

    /** Largest supported slot count; the biggest power of two that fits in a positive int. */
    private static final int MAX_SIZE = 1 << 30;

    /** Slot count minus one; because the slot count is a power of two this acts as a bit mask. */
    private final int sizeMask;
    private final AtomicReferenceArray<byte[]> array;

    /**
     * Constructs a ByteArrayFilter with an underlying array of the given size, rounded up to the next
     * power of two.
     *
     * <p>This rounding occurs because mapping a hash to a slot is a cheap bit mask when the array
     * size is a power of two.
     *
     * @param size The requested size of the underlying array; must be in the range [1, 2**30].
     * @throws IllegalArgumentException if size is not positive or is larger than 2**30.
     */
    public ByteArrayFilter(int size) {
        if (size <= 0) {
            throw new IllegalArgumentException("array size must be greater than zero, was " + size);
        }
        if (size > MAX_SIZE) {
            // Anything above 2**30 would round up to 2**31, which does not fit in an int.
            throw new IllegalArgumentException(
                    "array size may not be larger than 2**30 (sizes are rounded up to a power of two), was "
                            + size);
        }
        // round to the next largest power of two
        int poweredSize = 1 << IntMath.log2(size, RoundingMode.CEILING);
        this.sizeMask = poweredSize - 1;
        this.array = new AtomicReferenceArray<byte[]>(poweredSize);
    }

    /**
     * Returns whether the given byte array has been previously seen by this filter, and records it
     * as seen. That is, returns true if a byte array with the same bytes as id currently occupies
     * the slot that id hashes to.
     *
     * <p>This method may return false when it has seen an id before. This occurs if another id that
     * hashes to the same index in the underlying array was checked in between. On the flip side,
     * this method will never return true incorrectly.
     *
     * <p>The filter stores its own copy of id, so the caller is free to reuse or modify the array
     * after this method returns.
     *
     * @param id The byte array that may have been previously seen; must not be null.
     * @return Whether the byte array is contained in the ByteArrayFilter.
     */
    public boolean containsAndAdd(byte[] id) {
        HashCode code = HASH_FUNC.hashBytes(id);
        // sizeMask is non-negative, so masking alone yields a valid index even for negative hashes.
        int index = code.asInt() & sizeMask;
        // Store a private copy: if the caller's array were kept and later mutated or reused as a
        // buffer, a colliding lookup could compare the buffer against itself and report a
        // duplicate that was never seen.
        byte[] oldId = array.getAndSet(index, id.clone());
        return Arrays.equals(id, oldId);
    }

    /**
     * Returns the size of the underlying array, i.e. the requested size rounded up to a power of
     * two.
     *
     * @return The size of the underlying array.
     */
    public int getSize() {
        return array.length();
    }
}