Example usage for org.apache.hadoop.util.bloom BloomFilter BloomFilter

List of usage examples for org.apache.hadoop.util.bloom BloomFilter BloomFilter

Introduction

On this page you can find an example usage of the org.apache.hadoop.util.bloom BloomFilter constructor.

Prototype

public BloomFilter(int vectorSize, int nbHash, int hashType) 

Source Link

Document

Constructor

Usage

From source file:brickhouse.udf.bloom.BloomFactory.java

License:Apache License

/**
 * Builds a new Bloom filter sized at ceil(c * n) bits.
 *
 * @param c bits-per-inserted-element factor
 * @param n expected number of inserted elements
 * @param k number of hash functions
 * @return a freshly constructed {@link BloomFilter} using DEFAULT_HASH_TYPE
 */
public static Filter NewBloomInstance(double c, int n, int k) {
    LOG.info("Creating new Bloom filter C = " + c + " N =  " + n + " K = " + k);
    final int vectorSize = (int) Math.ceil(c * n);
    return new BloomFilter(vectorSize, k, DEFAULT_HASH_TYPE);
}

From source file:com.cloudera.util.bloom.BloomSet.java

License:Apache License

/**
 * Create empty BloomSet.
 * 
 * Wikipedia uses the following size and # of hashes to achieve particular
 * false positives rates.
 * 
 * m bits needed, n inserted elements, p false positive rate, k hash
 * functions, e is max false positive error rate.
 * 
 * k = (m/n) ln 2
 * 
 * m = - n ln p / (ln 2)^2
 * 
 * 1.44 log_2 (1/e) = # of bits per inserted element
 *
 * @param nbits  size of the underlying bit vector, in bits
 * @param hashes number of hash functions applied per key
 **/
public BloomSet(int nbits, int hashes) {
    // hashType is a field of the enclosing class (not visible here) — TODO confirm its value
    bloom = new BloomFilter(nbits, hashes, hashType);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesBetweenSets.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not leak into the results of
 * getGraphElementsWithStatisticsBetweenSets(): deliberately finds a value that
 * false-positives against a filter built with the same parameters the retriever
 * uses, inserts an edge to that value, then verifies the query filters it out.
 *
 * @param loadIntoMemory whether the retriever should load the seed set into memory
 * @throws GraphAccessException if the graph cannot be accessed
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    // size = -n ln(p) / (ln 2)^2 with p = 0.0001
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // NOTE(review): size / numItemsToBeAdded is integer division — presumably this mirrors the
    // retriever's own calculation; confirm against GraphElementsWithStatisticsWithinSetRetriever.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    int count = 0;
    int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    while (count < maxNumberOfTries) {
        count++;
        if (filter.membershipTest(new Key(("" + count).getBytes()))) {
            break;
        }
    }
    if (count == maxNumberOfTries) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Now query for all edges in set - shouldn't get the false positive
    CloseableIterable<GraphElementWithStatistics> retriever = graph.getGraphElementsWithStatisticsBetweenSets(
            Collections.singleton(new TypeValue("customer", "A0")), seeds, loadIntoMemory);
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    for (GraphElementWithStatistics gews : retriever) {
        results.add(gews);
    }
    retriever.close();

    // Check results are as expected: only the genuine A0 -> A23 edge and the A0 entity,
    // not the edge to the false-positive vertex added above.
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not leak into the results of
 * getGraphElementsWithStatisticsWithinSet(): deliberately finds a value that
 * false-positives against a filter built with the same parameters the retriever
 * uses, inserts an edge to that value, then verifies the query filters it out.
 *
 * @param loadIntoMemory whether the retriever should load the seed set into memory
 * @throws GraphAccessException if the graph cannot be accessed
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    // Query for all edges in set {customer|A0, customer|A23}
    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    // size = -n ln(p) / (ln 2)^2 with p = 0.0001
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // NOTE(review): size / numItemsToBeAdded is integer division — presumably this mirrors the
    // retriever's own calculation; confirm against GraphElementsWithStatisticsWithinSetRetriever.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    int count = 0;
    int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    while (count < maxNumberOfTries) {
        count++;
        if (filter.membershipTest(new Key(("" + count).getBytes()))) {
            break;
        }
    }
    if (count == maxNumberOfTries) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Now query for all edges in set - shouldn't get the false positive
    CloseableIterable<GraphElementWithStatistics> retriever = graph
            .getGraphElementsWithStatisticsWithinSet(seeds, loadIntoMemory);
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    for (GraphElementWithStatistics gews : retriever) {
        results.add(gews);
    }
    retriever.close();

    // Check results are as expected: the genuine A0 -> A23 edge plus both entities,
    // not the edge to the false-positive vertex added above.
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    GraphElement expectedElement3 = new GraphElement(new Entity("customer", "A23", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics3 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement3, expectedStatistics3));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Used to sanity check the calculation used in {@link BloomFilterUtilities}'s <code>calculateBloomFilterSize()</code>
 * and <code>calculateNumHashes()</code> methods.
 *
 * For each candidate set size it builds a filter with the computed parameters, adds that many
 * items, then prints the theoretical false positive rate (1 - e^(-kn/m))^k alongside an
 * empirically measured rate.
 *
 * @param args  No arguments necessary
 */
public static void main(String[] args) {
    int[] numItems = new int[] { 10, 100, 1000, 10000, 100000 };
    for (int num : numItems) {
        System.out.println("Num items to add = " + num);
        // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
        // 1MB.  size = -n ln(p) / (ln 2)^2 with p = 0.0001
        int size = (int) (-num * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
        size = Math.min(size, 1024 * 1024);
        System.out.println("Size = " + size + " bits (=" + (size / 8) + " bytes)");
        // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
        // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
        // (Integer division size / num is kept deliberately so this matches the production calculation.)
        int numHashes = Math.max(1, (int) ((size / num) * Math.log(2)));
        System.out.println("Num hashes = " + numHashes);
        // Create Bloom filter
        BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
        // Add num items to it
        for (int i = 0; i < num; i++) {
            filter.add(new Key(("" + i).getBytes()));
        }
        // Theoretical probability of false positive is (1 - e^(-kn/m)) ^ k (as long as size hasn't been
        // truncated to 1MB).
        System.out.println("Theoretical probability of false positive = "
                + Math.pow(1.0 - Math.exp(-(double) numHashes * num / (double) size), numHashes));
        // Test false positive rate - should be approx 1 in 10000.
        // Items num..999999 were never added, so every membershipTest hit is a false positive.
        int numPass = 0;
        int numTrials = 1000000 - num;
        for (int i = num; i < 1000000; i++) {
            if (filter.membershipTest(new Key(("" + i).getBytes()))) {
                numPass++;
            }
        }
        // Bug fix: divide by the actual number of trials (1000000 - num) rather than a flat
        // 1000000.0 — the loop starts at i = num, so the old denominator understated the rate
        // (by ~10% at num = 100000).
        System.out.println("Measured probability of false positive " + ((double) numPass / numTrials));
    }
}

From source file:gaffer.accumulostore.retriever.impl.AccumuloIDBetweenSetsRetrieverTest.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not leak into the results of a
 * GetElementsBetweenSets operation: deliberately finds a value that false-positives
 * against a filter built with the same parameters the retriever uses, inserts an
 * edge to that value, then verifies the query filters it out.
 *
 * @param loadIntoMemory whether the retriever should load the seed set into memory
 * @param store the AccumuloStore under test
 * @throws StoreException if the store cannot be accessed
 * @throws AccumuloElementConversionException if vertex serialisation fails
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
            throws StoreException, AccumuloElementConversionException {
        final Set<EntitySeed> seeds = new HashSet<>();
        seeds.add(AccumuloTestData.SEED_A0);
        seeds.add(AccumuloTestData.SEED_A23);
        // Add a bunch of items that are not in the data to make the probability of being able to find a false
        // positive sensible.
        for (int i = 0; i < 10; i++) {
            seeds.add(new EntitySeed("abc" + i));
        }

        // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
        // one that GraphElementsWithStatisticsWithinSetRetriever creates.
        final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
        if (!loadIntoMemory) {
            store.getProperties().setMaxEntriesForBatchScanner("20");
        }

        // Find something that will give a false positive
        // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
        // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
        // maxBloomFilterToPassToAnIterator bytes.
        // size = -n ln(p) / (ln 2)^2 with p = 0.0001
        int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
        size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

        // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
        // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
        // NOTE(review): size / numItemsToBeAdded is integer division — presumably this mirrors the
        // retriever's own calculation; confirm against the retriever implementation.
        final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
        // Create Bloom filter and add seeds to it
        final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
        for (final EntitySeed seed : seeds) {
            filter.add(new org.apache.hadoop.util.bloom.Key(
                    store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
        }

        // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
        // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
        int count = 0;
        int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
        while (count < maxNumberOfTries) {
            count++;
            if (filter.membershipTest(new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()))) {
                break;
            }
        }
        if (count == maxNumberOfTries) {
            fail("Didn't find a false positive");
        }

        // False positive is "" + count so create an edge from seeds to that
        final Edge edge = new Edge(TestGroups.EDGE, "A0", "" + count, true);
        edge.putProperty(AccumuloPropertyNames.COUNT, 1000000);
        Set<Element> data = new HashSet<>();
        data.add(edge);
        final User user = new User();
        addElements(data, store, user);
        // Now query for all edges in set - shouldn't get the false positive
        AbstractAccumuloTwoSetSeededOperation<EntitySeed, Element> op = new GetElementsBetweenSets<>(
                AccumuloTestData.SEED_A0_SET, seeds, defaultView);
        final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);
        // Check results are as expected: only the genuine A0-A23 edge and the A0 entity,
        // not the edge to the false-positive vertex added above.

        assertEquals(2, results.size());
        assertThat(results,
                IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY));
    }

From source file:gaffer.accumulostore.retriever.impl.AccumuloIDWithinSetRetrieverTest.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not leak into the results of a
 * within-set query: deliberately finds a value that false-positives against a
 * filter built with the same parameters the retriever uses, then verifies the
 * query results contain only the genuine elements.
 *
 * @param loadIntoMemory whether the retriever should load the seed set into memory
 * @param store the AccumuloStore under test
 * @throws StoreException if the store cannot be accessed
 * @throws AccumuloElementConversionException if vertex serialisation fails
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    // Query for all edges in set {A0, A23}
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    // size = -n ln(p) / (ln 2)^2 with p = 0.0001
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // NOTE(review): size / numItemsToBeAdded is integer division — presumably this mirrors the
    // retriever's own calculation; confirm against the retriever implementation.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new org.apache.hadoop.util.bloom.Key(
                store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    int count = 0;
    int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (count < maxNumberOfTries) {
        count++;
        if (filter.membershipTest(new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()))) {
            break;
        }
    }
    if (count == maxNumberOfTries) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    final GetElements<EntitySeed, ?> op = new GetRelatedElements<>(defaultView, seeds);
    // Now query for all edges in set - shouldn't get the false positive
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);

    // Check results are as expected: the genuine A0-A23 edge and both entities.
    assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23,
            AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY));
}

From source file:gaffer.accumulostore.utils.BloomFilterUtils.java

License:Apache License

/**
 * Creates a {@link org.apache.hadoop.util.bloom.BloomFilter} sized to achieve
 * the requested false positive rate (capped at the supplied maximum size),
 * configured with the optimal number of hash functions for that size.
 *
 * @param falsePositiveRate the target false positive rate
 * @param numItemsToBeAdded the number of items expected to be added
 * @param maximumSize       the upper bound on the filter size
 * @return A new BloomFilter with the desired Settings
 */
public static BloomFilter getBloomFilter(final double falsePositiveRate, final int numItemsToBeAdded,
        final int maximumSize) {
    final int vectorSize = calculateBloomFilterSize(falsePositiveRate, numItemsToBeAdded, maximumSize);
    return new BloomFilter(vectorSize, calculateNumHashes(vectorSize, numItemsToBeAdded), Hash.MURMUR_HASH);
}

From source file:gaffer.accumulostore.utils.BloomFilterUtils.java

License:Apache License

/**
 * Returns a {@link org.apache.hadoop.util.bloom.BloomFilter} of the given
 * size./*from   w ww  .  j ava  2s .  c  o  m*/
 *
 * @param size the size of the bloom filter to create
 * @return A new BloomFilter of the desired size
 */
public static BloomFilter getBloomFilter(final int size) {
    return new BloomFilter(size, 13, Hash.MURMUR_HASH);
}

From source file:gaffer.predicate.typevalue.impl.TestValueInBloomFilterPredicate.java

License:Apache License

@Test
public void testAccept() {
    // Build a small filter containing exactly two values.
    final BloomFilter bloom = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (final String value : new String[] { "ABC", "DEF" }) {
        bloom.add(new Key(value.getBytes()));
    }
    final ValueInBloomFilterPredicate predicate = new ValueInBloomFilterPredicate(bloom);
    // Added values are accepted; an unseen value is rejected. The first argument
    // (the type) appears not to affect acceptance here — TODO confirm in predicate impl.
    assertTrue(predicate.accept("X", "ABC"));
    assertTrue(predicate.accept("X", "DEF"));
    assertFalse(predicate.accept("Y", "lkjhgfdsa"));
}