Example usage for org.apache.hadoop.util.bloom BloomFilter membershipTest

List of usage examples for org.apache.hadoop.util.bloom BloomFilter membershipTest

Introduction

On this page you can find example usages of org.apache.hadoop.util.bloom.BloomFilter.membershipTest.

Prototype

@Override
    public boolean membershipTest(Key key) 

Source Link

Usage

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesBetweenSets.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not appear in the results of a between-sets query.
 *
 * <p>The test builds its own Bloom filter over the seed values, searches for a value that was never
 * added but that the filter still reports as present, stores an edge from seed A0 to that value, and
 * then asserts that the query returns only the genuine elements.
 *
 * @param loadIntoMemory forwarded to the query; when {@code false} the batch scanner limit is set to
 *                       20 and the filter is sized for 20 items instead of for the number of seeds
 * @throws GraphAccessException declared by the original signature for the graph calls made here
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Pad the seed set with values that are absent from the data, so that the filter is full enough for a
    // false positive to be found within a sensible number of tries.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // The filter built here is intended to have the same size and number of hashes as the one created by the
    // retriever under test, so the sizing arithmetic below is deliberately a copy of that logic.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Size the filter for a false positive rate of 1 in 10000, capped at Constants.MAX_SIZE_BLOOM_FILTER.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Optimal number of hashes is (m/n)ln 2, where m is the filter size and n the number of items added.
    // NOTE(review): size / numItemsToBeAdded is an integer division; it is left untouched because the filter
    // has to match the one built by the code under test - confirm that code truncates in the same way.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));

    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Probe "1", "2", ... until the filter claims to contain one of them. The cap is arbitrary and only exists
    // to avoid an endless loop if something is wrong. Success is tracked with a flag, so a hit on the very
    // last permitted try is not mistaken for exhausting the tries.
    int count = 0;
    boolean foundFalsePositive = false;
    final int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // The false positive is "" + count: store an edge from seed A0 to it.
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Query for everything between {customer|A0} and the seeds - the false positive edge must not come back.
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    CloseableIterable<GraphElementWithStatistics> retriever = graph.getGraphElementsWithStatisticsBetweenSets(
            Collections.singleton(new TypeValue("customer", "A0")), seeds, loadIntoMemory);
    try {
        for (GraphElementWithStatistics gews : retriever) {
            results.add(gews);
        }
    } finally {
        // Close the retriever even if iterating over it throws.
        retriever.close();
    }

    // Expected: the A0 -> A23 edge and the A0 entity, nothing else.
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not appear in the results of a within-set query.
 *
 * <p>The test builds its own Bloom filter over the seed values, searches for a value that was never
 * added but that the filter still reports as present, stores an edge from seed A0 to that value, and
 * then asserts that the query over the seed set returns only the genuine elements.
 *
 * @param loadIntoMemory forwarded to the query; when {@code false} the batch scanner limit is set to
 *                       20 and the filter is sized for 20 items instead of for the number of seeds
 * @throws GraphAccessException declared by the original signature for the graph calls made here
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    // The query will be for all edges in a set that contains customer|A0 and customer|A23.
    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Pad the seed set with values that are absent from the data, so that the filter is full enough for a
    // false positive to be found within a sensible number of tries.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // The filter built here is intended to have the same size and number of hashes as the one created by the
    // retriever under test, so the sizing arithmetic below is deliberately a copy of that logic.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Size the filter for a false positive rate of 1 in 10000, capped at Constants.MAX_SIZE_BLOOM_FILTER.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Optimal number of hashes is (m/n)ln 2, where m is the filter size and n the number of items added.
    // NOTE(review): size / numItemsToBeAdded is an integer division; it is left untouched because the filter
    // has to match the one built by the code under test - confirm that code truncates in the same way.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));

    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Probe "1", "2", ... until the filter claims to contain one of them. The cap is arbitrary and only exists
    // to avoid an endless loop if something is wrong. Success is tracked with a flag, so a hit on the very
    // last permitted try is not mistaken for exhausting the tries.
    int count = 0;
    boolean foundFalsePositive = false;
    final int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // The false positive is "" + count: store an edge from seed A0 to it.
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Query for all edges within the seed set - the false positive edge must not come back.
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    CloseableIterable<GraphElementWithStatistics> retriever = graph
            .getGraphElementsWithStatisticsWithinSet(seeds, loadIntoMemory);
    try {
        for (GraphElementWithStatistics gews : retriever) {
            results.add(gews);
        }
    } finally {
        // Close the retriever even if iterating over it throws.
        retriever.close();
    }

    // Expected: the A0 -> A23 edge plus the A0 and A23 entities, nothing else.
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    GraphElement expectedElement3 = new GraphElement(new Entity("customer", "A23", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics3 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement3, expectedStatistics3));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Used to sanity check the calculation used in {@link BloomFilterUtilities}'s <code>calculateBloomFilterSize()</code>
 * and <code>calculateNumHashes()</code> methods.
 *
 * <p>For each of several item counts this sizes a Bloom filter, adds the keys <code>"0"</code> to
 * <code>"num - 1"</code>, prints the theoretical false positive probability, and then measures the actual
 * rate by probing the keys <code>"num"</code> to <code>"999999"</code>, none of which were added.
 *
 * @param args  No arguments necessary
 */
public static void main(String[] args) {
    // Exclusive upper bound of the keys probed when measuring the false positive rate.
    final int probeUpperBound = 1000000;
    int[] numItems = new int[] { 10, 100, 1000, 10000, 100000 };
    for (int num : numItems) {
        System.out.println("Num items to add = " + num);
        // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
        // 1024 * 1024 bits (the size is reported in bits below).
        int size = (int) (-num * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
        size = Math.min(size, 1024 * 1024);
        System.out.println("Size = " + size + " bits (=" + (size / 8) + " bytes)");
        // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
        // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
        // NOTE(review): size / num is an integer division; kept as is because this method is meant to mirror the
        // calculation being sanity checked - confirm against BloomFilterUtilities.
        int numHashes = Math.max(1, (int) ((size / num) * Math.log(2)));
        System.out.println("Num hashes = " + numHashes);
        // Create Bloom filter
        BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
        // Add num items to it
        for (int i = 0; i < num; i++) {
            filter.add(new Key(("" + i).getBytes()));
        }
        // Theoretical probability of false positive is (1 - e^(-kn/m)) ^ k (as long as size hasn't been
        // truncated to the maximum).
        System.out.println("Theoretical probability of false positive = "
                + Math.pow(1.0 - Math.exp(-(double) numHashes * num / (double) size), numHashes));
        // Test false positive rate - should be approx 1 in 10000. Only keys that were never added are probed,
        // so every hit is a false positive.
        int numPass = 0;
        for (int i = num; i < probeUpperBound; i++) {
            if (filter.membershipTest(new Key(("" + i).getBytes()))) {
                numPass++;
            }
        }
        // Divide by the number of keys actually probed (probeUpperBound - num), not by probeUpperBound;
        // otherwise the measured rate is understated, noticeably so for the larger values of num.
        int numProbed = probeUpperBound - num;
        System.out.println("Measured probability of false positive " + ((double) numPass / numProbed));
    }
}

From source file:gaffer.accumulostore.retriever.impl.AccumuloIDBetweenSetsRetrieverTest.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not appear in the results of a
 * {@code GetElementsBetweenSets} operation.
 *
 * <p>The test builds its own Bloom filter over the seeds, searches for a value the filter wrongly
 * reports as present, adds an edge from A0 to that value, and then asserts that only the two genuine
 * elements are returned.
 *
 * @param loadIntoMemory forwarded to {@code returnElementsFromOperation}; when {@code false} the batch
 *                       scanner limit is set to "20" and the filter is sized for 20 items
 * @param store          the store that is configured, populated and queried
 * @throws StoreException                      declared by the original signature
 * @throws AccumuloElementConversionException  declared by the original signature
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Pad the seed set with values that are absent from the data, so that the filter is full enough for a
    // false positive to be found within a sensible number of tries.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // The filter built here is intended to have the same size and number of hashes as the one created by the
    // retriever under test, so the sizing arithmetic below is deliberately a copy of that logic.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Size the filter for a false positive rate of 1 in 10000, capped at the store's
    // maxBloomFilterToPassToAnIterator property.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Optimal number of hashes is (m/n)ln 2, where m is the filter size and n the number of items added.
    // NOTE(review): size / numItemsToBeAdded is an integer division; it is left untouched because the filter
    // has to match the one built by the code under test - confirm that code truncates in the same way.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));

    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new org.apache.hadoop.util.bloom.Key(
                store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Probe "1", "2", ... until the filter claims to contain one of them. The cap is arbitrary and only exists
    // to avoid an endless loop if something is wrong. Success is tracked with a flag, so a hit on the very
    // last permitted try is not mistaken for exhausting the tries.
    // NOTE(review): the filter is filled with serialised vertices but probed with the raw bytes of the
    // candidate string - confirm these are the same byte sequences for String vertices.
    int count = 0;
    boolean foundFalsePositive = false;
    final int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter
                .membershipTest(new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // The false positive is "" + count: add an edge from A0 to it.
    final Edge edge = new Edge(TestGroups.EDGE, "A0", "" + count, true);
    edge.putProperty(AccumuloPropertyNames.COUNT, 1000000);
    final Set<Element> data = new HashSet<>();
    data.add(edge);
    final User user = new User();
    addElements(data, store, user);

    // Query between {A0} and the seeds - the false positive edge must not come back.
    final AbstractAccumuloTwoSetSeededOperation<EntitySeed, Element> op = new GetElementsBetweenSets<>(
            AccumuloTestData.SEED_A0_SET, seeds, defaultView);
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);

    // Exactly two results: the A0 -> A23 edge and the A0 entity.
    assertEquals(2, results.size());
    assertThat(results,
            IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY));
}

From source file:gaffer.accumulostore.retriever.impl.AccumuloIDWithinSetRetrieverTest.java

License:Apache License

/**
 * Exercises a within-set query ({@code GetRelatedElements} over the seeds) after locating a Bloom
 * filter false positive, and checks that the genuine elements are returned.
 *
 * @param loadIntoMemory forwarded to {@code returnElementsFromOperation}; when {@code false} the batch
 *                       scanner limit is set to "20" and the filter is sized for 20 items
 * @param store          the store that is configured and queried
 * @throws StoreException                      declared by the original signature
 * @throws AccumuloElementConversionException  declared by the original signature
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    // The query will be for all edges in a set that contains A0 and A23.
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Pad the seed set with values that are absent from the data, so that the filter is full enough for a
    // false positive to be found within a sensible number of tries.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // The filter built here is intended to have the same size and number of hashes as the one created by the
    // retriever under test, so the sizing arithmetic below is deliberately a copy of that logic.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Size the filter for a false positive rate of 1 in 10000, capped at the store's
    // maxBloomFilterToPassToAnIterator property.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Optimal number of hashes is (m/n)ln 2, where m is the filter size and n the number of items added.
    // NOTE(review): size / numItemsToBeAdded is an integer division; it is left untouched because the filter
    // has to match the one built by the code under test - confirm that code truncates in the same way.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));

    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new org.apache.hadoop.util.bloom.Key(
                store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Probe "1", "2", ... until the filter claims to contain one of them. The cap is arbitrary and only exists
    // to avoid an endless loop if something is wrong. Success is tracked with a flag, so a hit on the very
    // last permitted try is not mistaken for exhausting the tries.
    // NOTE(review): the filter is filled with serialised vertices but probed with the raw bytes of the
    // candidate string - confirm these are the same byte sequences for String vertices.
    int count = 0;
    boolean foundFalsePositive = false;
    final int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter
                .membershipTest(new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // The false positive is "" + count.
    // NOTE(review): unlike the between-sets variant of this test, no edge from a seed to the false positive
    // is added to the store here, and hasItems() below does not check for unexpected extra elements, so this
    // test cannot currently detect a leaked false positive - TODO add the edge and assert the exact result set.
    final GetElements<EntitySeed, ?> op = new GetRelatedElements<>(defaultView, seeds);
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);

    // The genuine elements must be present: the A0 -> A23 edge plus the A0 and A23 entities.
    assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23,
            AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY));
}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

/**
 * Looks up combinations of query tokens in each field's Bloom filter and records every hit.
 *
 * <p>Only the first 15 tokens are considered. For each non-null array of token positions produced by
 * {@code OrderedChoiceIterable}, the corresponding tokens are joined with single spaces into a search
 * term; the term is then tested against the filter of every field in {@code filterSet}. Because Bloom
 * filters can report false positives, a returned classification is a candidate rather than a
 * confirmed match.
 *
 * @param queryTokens the tokens of the query, in order
 * @param filterSet   supplies the field names and, for each field, its Bloom filter and symbol
 * @return one {@code Classification} per (field, search term) pair whose filter reports the term as present
 */
private List<Classification> classify(List<String> queryTokens, FilterSet filterSet) {
    // Cap the number of tokens that take part in the combinations at 15.
    int maxToken = Math.min(queryTokens.size(), 15);

    Integer[] tokenPositions = new Integer[maxToken];
    for (int i = 0; i < maxToken; i++) {
        tokenPositions[i] = Integer.valueOf(i);
    }

    List<Classification> classificationsList = new ArrayList<Classification>();

    OrderedChoiceIterable orderedChoiceIterable = new OrderedChoiceIterable(tokenPositions);
    for (Integer[] tokenPos : orderedChoiceIterable) {
        if (tokenPos == null) {
            continue;
        }

        // Join the chosen tokens with spaces; the trailing space is removed by trim().
        StringBuilder sb = new StringBuilder();
        for (Integer position : tokenPos) {
            sb.append(queryTokens.get(position.intValue())).append(' ');
        }
        String searchTerm = sb.toString().trim();
        if (searchTerm.length() == 0) {
            continue;
        }

        // The key depends only on the search term, so build it once rather than once per field.
        // NOTE(review): getBytes() uses the platform default charset; this presumably has to match the
        // charset used when the filters were populated - confirm before changing it.
        Key searchKey = new Key(searchTerm.getBytes());
        for (String field : filterSet.getFilters()) {
            BloomFilter f = filterSet.getFilter(field);
            char symbol = filterSet.getSymbol(field);
            if (f.membershipTest(searchKey)) {
                classificationsList
                        .add(new Classification(field, symbol, searchTerm, Arrays.asList(tokenPos)));
            }
        }
    }
    return classificationsList;
}

From source file:org.apache.crunch.contrib.bloomfilter.BloomFiltersIT.java

License:Apache License

/**
 * Builds a Bloom filter from the words of the "shakes.txt" resource and checks that exactly one
 * filter is produced, keyed by the file name, and that it reports two sample words as present.
 *
 * @throws IOException declared by the original signature
 */
@Test
public void testFilterCreation() throws IOException {
    String inputPath = tempDir.copyResourceFileName("shakes.txt");

    // Turns each input string into one key per space-separated part.
    BloomFilterFn<String> wordKeyFn = new BloomFilterFn<String>() {
        @Override
        public Collection<Key> generateKeys(String input) {
            Collection<Key> wordKeys = new HashSet<Key>();
            List<String> words = Arrays.asList(StringUtils.split(input, " "));
            for (String word : words) {
                wordKeys.add(new Key(word.getBytes()));
            }
            return wordKeys;
        }
    };

    Map<String, BloomFilter> filtersByFile = BloomFilterFactory
            .createFilter(new Path(inputPath), wordKeyFn)
            .getValue();
    assertEquals(1, filtersByFile.size());

    BloomFilter shakesFilter = filtersByFile.get("shakes.txt");
    String[] expectedWords = { "Mcbeth", "apples" };
    for (String expectedWord : expectedWords) {
        assertTrue(shakesFilter.membershipTest(new Key(expectedWord.getBytes())));
    }
}

From source file:uk.gov.gchq.gaffer.accumulostore.retriever.impl.AccumuloIDBetweenSetsRetrieverTest.java

License:Apache License

/**
 * Checks that a Bloom filter false positive does not appear in the results of a
 * {@code GetElementsBetweenSets} operation.
 *
 * <p>The test builds its own Bloom filter over the seeds, searches for a value the filter wrongly
 * reports as present, adds an edge from A0 to that value, and then asserts that only the two genuine
 * elements are returned.
 *
 * @param loadIntoMemory forwarded to {@code returnElementsFromOperation}; when {@code false} the batch
 *                       scanner limit is set to "20" and the filter is sized for 20 items
 * @param store          the store that is configured, populated and queried
 * @throws StoreException                      declared by the original signature
 * @throws AccumuloElementConversionException  declared by the original signature
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Pad the seed set with values that are absent from the data, so that the filter is full enough for a
    // false positive to be found within a sensible number of tries.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // The filter built here is intended to have the same size and number of hashes as the one created by the
    // retriever under test, so the sizing arithmetic below is deliberately a copy of that logic.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Size the filter for a false positive rate of 1 in 10000, capped at the store's
    // maxBloomFilterToPassToAnIterator property.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Optimal number of hashes is (m/n)ln 2, where m is the filter size and n the number of items added.
    // NOTE(review): size / numItemsToBeAdded is an integer division; it is left untouched because the filter
    // has to match the one built by the code under test - confirm that code truncates in the same way.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));

    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Probe "1", "2", ... until the filter claims to contain one of them. The cap is arbitrary and only exists
    // to avoid an endless loop if something is wrong. Success is tracked with a flag, so a hit on the very
    // last permitted try is not mistaken for exhausting the tries.
    // NOTE(review): the filter is filled with serialised vertices but probed with the raw bytes of the
    // candidate string - confirm these are the same byte sequences for String vertices.
    int count = 0;
    boolean foundFalsePositive = false;
    final int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // The false positive is "" + count: add an edge from A0 to it.
    final Edge edge = new Edge(TestGroups.EDGE, "A0", "" + count, true);
    edge.putProperty(AccumuloPropertyNames.COUNT, 1000000);
    final Set<Element> data = new HashSet<>();
    data.add(edge);
    final User user = new User();
    addElements(data, store, user);

    // Query between {A0} and the seeds - the false positive edge must not come back.
    final AbstractAccumuloTwoSetSeededOperation<EntitySeed, Element> op = new GetElementsBetweenSets<>(
            AccumuloTestData.SEED_A0_SET, seeds, defaultView);
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);

    // Exactly two results: the A0 -> A23 edge and the A0 entity.
    assertEquals(2, results.size());
    assertThat(results,
            IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY));
}

From source file:uk.gov.gchq.gaffer.accumulostore.retriever.impl.AccumuloIDWithinSetRetrieverTest.java

License:Apache License

/**
 * Exercises a within-set query ({@code GetElements} over the seeds) after locating a Bloom filter
 * false positive, and checks that the genuine elements are returned.
 *
 * @param loadIntoMemory forwarded to {@code returnElementsFromOperation}; when {@code false} the batch
 *                       scanner limit is set to "20" and the filter is sized for 20 items
 * @param store          the store that is configured and queried
 * @throws StoreException                      declared by the original signature
 * @throws AccumuloElementConversionException  declared by the original signature
 */
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store)
        throws StoreException, AccumuloElementConversionException {
    // The query will be for all edges in a set that contains A0 and A23.
    final Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Pad the seed set with values that are absent from the data, so that the filter is full enough for a
    // false positive to be found within a sensible number of tries.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }

    // The filter built here is intended to have the same size and number of hashes as the one created by the
    // retriever under test, so the sizing arithmetic below is deliberately a copy of that logic.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }

    // Size the filter for a false positive rate of 1 in 10000, capped at the store's
    // maxBloomFilterToPassToAnIterator property.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());

    // Optimal number of hashes is (m/n)ln 2, where m is the filter size and n the number of items added.
    // NOTE(review): size / numItemsToBeAdded is an integer division; it is left untouched because the filter
    // has to match the one built by the code under test - confirm that code truncates in the same way.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));

    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntitySeed seed : seeds) {
        filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }

    // Probe "1", "2", ... until the filter claims to contain one of them. The cap is arbitrary and only exists
    // to avoid an endless loop if something is wrong. Success is tracked with a flag, so a hit on the very
    // last permitted try is not mistaken for exhausting the tries.
    // NOTE(review): the filter is filled with serialised vertices but probed with the raw bytes of the
    // candidate string - confirm these are the same byte sequences for String vertices.
    int count = 0;
    boolean foundFalsePositive = false;
    final int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // The false positive is "" + count.
    // NOTE(review): unlike the between-sets variant of this test, no edge from a seed to the false positive
    // is added to the store here, and hasItems() below does not check for unexpected extra elements, so this
    // test cannot currently detect a leaked false positive - TODO add the edge and assert the exact result set.
    final GetElements<EntitySeed, ?> op = new GetElements<>(defaultView, seeds);
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);

    // The genuine elements must be present: the A0 -> A23 edge plus the A0 and A23 entities.
    assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23,
            AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY));
}