Example usage for org.apache.hadoop.util.hash Hash MURMUR_HASH

Introduction

In this page you can find the example usage for org.apache.hadoop.util.hash Hash MURMUR_HASH.

Prototype

int MURMUR_HASH

To view the source code for org.apache.hadoop.util.hash Hash MURMUR_HASH, click Source Link.

Document

Constant to denote MurmurHash.

Usage

From source file:boa.aggregators.DistinctAggregator.java

License:Apache License

/** {@inheritDoc} */
@Override
public void start(final EmitKey key) {
    super.start(key);

    // Replace the filter with a fresh MurmurHash-based dynamic Bloom filter. The
    // aggregator's argument is narrowed to int and handed over as the last
    // constructor argument (NOTE(review): per the Hadoop API this is the per-row
    // key threshold - confirm that is what getArg() is meant to supply).
    final int size = this.vectorSize;
    final int threshold = (int) this.getArg();
    this.filter = new DynamicBloomFilter(size, HASH_COUNT, Hash.MURMUR_HASH, threshold);
}

From source file:com.eincs.athens.handler.AthensBlockFilter.java

License:Apache License

/**
 * Creates the block filter: keeps a reference to the supplied block database and
 * allocates a new MurmurHash-based counting Bloom filter sized by the class constants.
 *
 * @param blockDB the block database stored in this instance's {@code blockDB} field
 */
@Inject
public AthensBlockFilter(BlockDB blockDB) {
    this.bloomFilter = new CountingBloomFilter(
            BLOOM_FILTER_VECTOR_SIZE,
            BLOOM_FILTER_NUMBER_HASH,
            Hash.MURMUR_HASH);
    this.blockDB = blockDB;
}

From source file:com.uber.hoodie.common.BloomFilter.java

License:Apache License

/**
 * Creates a Bloom filter by delegating to the three-argument constructor, passing
 * {@code Hash.MURMUR_HASH} as the third argument.
 *
 * @param numEntries forwarded unchanged to the delegate constructor
 *                   (presumably the expected number of entries - confirm against the delegate)
 * @param errorRate  forwarded unchanged to the delegate constructor
 *                   (presumably the target false-positive rate - confirm against the delegate)
 */
public BloomFilter(int numEntries, double errorRate) {
    this(numEntries, errorRate, Hash.MURMUR_HASH);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesBetweenSets.java

License:Apache License

/**
 * Checks that an element reachable only through a Bloom filter false positive is not
 * returned by a between-sets query.
 *
 * <p>The test rebuilds a Bloom filter with the same sizing arithmetic as the code under
 * test, searches for a value that the filter wrongly reports as present, stores an edge
 * to that value, and then asserts that the query results contain only the expected
 * elements.
 *
 * @param loadIntoMemory forwarded to the query; when {@code false} the batch scanner
 *                       limit is set to 20 and the filter is sized for 20 items
 * @throws GraphAccessException declared for the graph operations used by the test
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // The integer division is kept as-is so the result stays identical to the code being mirrored.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    // An explicit flag is used so that a hit on the very last permitted try is not mistaken for a miss.
    int count = 0;
    int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    boolean foundFalsePositive = false;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Now query for all edges in set - shouldn't get the false positive
    CloseableIterable<GraphElementWithStatistics> retriever = graph.getGraphElementsWithStatisticsBetweenSets(
            Collections.singleton(new TypeValue("customer", "A0")), seeds, loadIntoMemory);
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    try {
        for (GraphElementWithStatistics gews : retriever) {
            results.add(gews);
        }
    } finally {
        // Release the retriever even if iteration fails part-way through.
        retriever.close();
    }

    // Check results are as expected
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Checks that an element reachable only through a Bloom filter false positive is not
 * returned by a within-set query.
 *
 * <p>The test rebuilds a Bloom filter with the same sizing arithmetic as the code under
 * test, searches for a value that the filter wrongly reports as present, stores an edge
 * to that value, and then asserts that the query results contain only the expected
 * elements.
 *
 * @param loadIntoMemory forwarded to the query; when {@code false} the batch scanner
 *                       limit is set to 20 and the filter is sized for 20 items
 * @throws GraphAccessException declared for the graph operations used by the test
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    // Query for all edges in set {customer|A0, customer|A23}
    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // The integer division is kept as-is so the result stays identical to the code being mirrored.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    // An explicit flag is used so that a hit on the very last permitted try is not mistaken for a miss.
    int count = 0;
    int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    boolean foundFalsePositive = false;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Now query for all edges in set - shouldn't get the false positive
    CloseableIterable<GraphElementWithStatistics> retriever = graph
            .getGraphElementsWithStatisticsWithinSet(seeds, loadIntoMemory);
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    try {
        for (GraphElementWithStatistics gews : retriever) {
            results.add(gews);
        }
    } finally {
        // Release the retriever even if iteration fails part-way through.
        retriever.close();
    }

    // Check results are as expected
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    GraphElement expectedElement3 = new GraphElement(new Entity("customer", "A23", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics3 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement3, expectedStatistics3));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Used to sanity check the calculation used in {@link BloomFilterUtilities}'s <code>calculateBloomFilterSize()</code>
 * and <code>calculateNumHashes()</code> methods.
 *
 * @param args  No arguments necessary/*w w  w. ja v a  2s  .  c o m*/
 */
public static void main(String[] args) {
    int[] numItems = new int[] { 10, 100, 1000, 10000, 100000 };
    for (int num : numItems) {
        System.out.println("Num items to add = " + num);
        // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
        // 1MB.
        int size = (int) (-num * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
        size = Math.min(size, 1024 * 1024);
        System.out.println("Size = " + size + " bits (=" + (size / 8) + " bytes)");
        // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
        // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
        int numHashes = Math.max(1, (int) ((size / num) * Math.log(2)));
        System.out.println("Num hashes = " + numHashes);
        // Create Bloom filter
        BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
        // Add num items to it
        for (int i = 0; i < num; i++) {
            filter.add(new Key(("" + i).getBytes()));
        }
        // Theoretical probability of false positive is (1 - e^(-kn/m)) ^ k (as long as size hasn't been
        // truncated to 1MB).
        System.out.println("Theoretical probability of false positive = "
                + Math.pow(1.0 - Math.exp(-(double) numHashes * num / (double) size), numHashes));
        // Test false positive rate - should be approx 1 in 10000
        int numPass = 0;
        for (int i = num; i < 1000000; i++) {
            if (filter.membershipTest(new Key(("" + i).getBytes()))) {
                numPass++;
            }
        }
        System.out.println("Measured probability of false positive " + (numPass / 1000000.0));
    }
}

From source file:gaffer.accumulostore.retriever.impl.AccumuloIDBetweenSetsRetrieverTest.java

License:Apache License

/**
 * Checks that an element reachable only through a Bloom filter false positive is not
 * returned by a between-sets operation.
 *
 * <p>The test rebuilds a Bloom filter with the same sizing arithmetic as the code under
 * test, searches for a value that the filter wrongly reports as present, stores an edge
 * from A0 to that value, and then asserts that exactly the two expected elements are
 * returned.
 *
 * @param loadIntoMemory forwarded to the operation runner; when {@code false} the batch
 *                       scanner limit is set to "20" and the filter is sized for 20 items
 * @param store          the store that is populated and queried
 * @throws StoreException                     declared for the store operations used by the test
 * @throws AccumuloElementConversionException declared for the vertex serialisation used by the test
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // The integer division is kept as-is so the result stays identical to the code being mirrored.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new org.apache.hadoop.util.bloom.Key(
                store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    // An explicit flag is used so that a hit on the very last permitted try is not mistaken for a miss.
    int count = 0;
    final int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    boolean foundFalsePositive = false;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(
                new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    final Edge edge = new Edge(TestGroups.EDGE, "A0", "" + count, true);
    edge.putProperty(AccumuloPropertyNames.COUNT, 1000000);
    Set<Element> data = new HashSet<>();
    data.add(edge);
    final User user = new User();
    addElements(data, store, user);
    // Now query for all edges in set - shouldn't get the false positive
    AbstractAccumuloTwoSetSeededOperation<EntitySeed, Element> op = new GetElementsBetweenSets<>(
            AccumuloTestData.SEED_A0_SET, seeds, defaultView);
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);
    // Check results are as expected

    assertEquals(2, results.size());
    assertThat(results,
            IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY));
}

From source file:gaffer.accumulostore.retriever.impl.AccumuloIDWithinSetRetrieverTest.java

License:Apache License

/**
 * Exercises a within-set query over a seed set for which a Bloom filter false positive
 * is known to exist, and checks that the expected elements are returned.
 *
 * <p>The test rebuilds a Bloom filter with the same sizing arithmetic as the code under
 * test and searches for a value that the filter wrongly reports as present before
 * running the query.
 *
 * @param loadIntoMemory forwarded to the operation runner; when {@code false} the batch
 *                       scanner limit is set to "20" and the filter is sized for 20 items
 * @param store          the store that is queried
 * @throws StoreException                     declared for the store operations used by the test
 * @throws AccumuloElementConversionException declared for the vertex serialisation used by the test
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    // Query for all edges in set {A0, A23}
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // The integer division is kept as-is so the result stays identical to the code being mirrored.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new org.apache.hadoop.util.bloom.Key(
                store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    // An explicit flag is used so that a hit on the very last permitted try is not mistaken for a miss.
    int count = 0;
    final int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    boolean foundFalsePositive = false;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(
                new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // NOTE(review): the false positive is "" + count, but no edge from a seed to that value is
    // added to the store here, so the query below cannot actually return the false positive.
    // Confirm whether an edge should be created before querying.
    final GetElements<EntitySeed, ?> op = new GetRelatedElements<>(defaultView, seeds);
    // Now query for all edges in set - shouldn't get the false positive
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);

    // Check results are as expected
    assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23,
            AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY));
}

From source file:gaffer.accumulostore.test.bloom.FilterWritabilityTest.java

License:Apache License

/**
 * Verifies that a freshly built filter reports the keys that were added to it as
 * members and does not report a key that was never added.
 */
@Test
public void shouldAcceptValidFilter() {
    // Given
    final String[] addedValues = { "ABC", "DEF" };
    final BloomFilter bloom = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (final String value : addedValues) {
        bloom.add(new Key(value.getBytes()));
    }

    // Then
    for (final String value : addedValues) {
        assertTrue(bloom.membershipTest(new Key(value.getBytes())));
    }
    final Key neverAdded = new Key("lkjhgfdsa".getBytes());
    assertFalse(bloom.membershipTest(neverAdded));
}

From source file:gaffer.accumulostore.test.bloom.FilterWritabilityTest.java

License:Apache License

/**
 * Verifies that a filter keeps its membership answers after being written out,
 * converted to a string and back using the store's Bloom filter charset, and read
 * into a new filter instance.
 *
 * @throws IOException if writing or reading the filter fails
 */
@Test
public void shouldWriteAndReadFilter() throws IOException {
    // Given
    final BloomFilter original = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    original.add(new Key("ABC".getBytes()));
    original.add(new Key("DEF".getBytes()));
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    original.write(new DataOutputStream(sink));
    // Round-trip the serialised bytes through a String in the Bloom filter charset.
    final String asText = new String(sink.toByteArray(), AccumuloStoreConstants.BLOOM_FILTER_CHARSET);
    final byte[] roundTripped = asText.getBytes(AccumuloStoreConstants.BLOOM_FILTER_CHARSET);
    final ByteArrayInputStream source = new ByteArrayInputStream(roundTripped);

    // When
    final BloomFilter restored = new BloomFilter();
    restored.readFields(new DataInputStream(source));

    // Then
    assertTrue(restored.membershipTest(new Key("ABC".getBytes())));
    assertTrue(restored.membershipTest(new Key("DEF".getBytes())));
    assertFalse(restored.membershipTest(new Key("lkjhgfdsa".getBytes())));
}

Example usage for org.apache.hadoop.util.hash Hash MURMUR_HASH

Introduction

Prototype

Document

Usage