List of usage examples for org.apache.hadoop.util.bloom BloomFilter membershipTest
@Override
public boolean membershipTest(Key key)
From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesBetweenSets.java
License:Apache License
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException { AccumuloBackedGraph graph = setupGraph(); Set<TypeValue> seeds = new HashSet<TypeValue>(); seeds.add(new TypeValue("customer", "A0")); seeds.add(new TypeValue("customer", "A23")); // Add a bunch of items that are not in the data to make the probability of being able to find a false // positive sensible. for (int i = 0; i < 10; i++) { seeds.add(new TypeValue("abc", "abc" + i)); }/*from w w w .ja va 2s .co m*/ // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the // one that GraphElementsWithStatisticsWithinSetRetriever creates. int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20; if (!loadIntoMemory) { graph.setMaxEntriesForBatchScanner(20); } // Find something that will give a false positive // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method. // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // maxBloomFilterToPassToAnIterator bytes. int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2))); // Create Bloom filter and add seeds to it BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); for (TypeValue entity : seeds) { filter.add(new Key(entity.getValue().getBytes())); } // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem). 
int count = 0; int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER; while (count < maxNumberOfTries) { count++; if (filter.membershipTest(new Key(("" + count).getBytes()))) { break; } } if (count == maxNumberOfTries) { fail("Didn't find a false positive"); } // False positive is "" + count so create an edge from seeds to that Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore); SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000)); graph.addGraphElementsWithStatistics( Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics))); // Now query for all edges in set - shouldn't get the false positive CloseableIterable<GraphElementWithStatistics> retriever = graph.getGraphElementsWithStatisticsBetweenSets( Collections.singleton(new TypeValue("customer", "A0")), seeds, loadIntoMemory); Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>(); for (GraphElementWithStatistics gews : retriever) { results.add(gews); } retriever.close(); // Check results are as expected Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>(); GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase", "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore)); SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23)); expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1)); GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore", visibilityString1, sevenDaysBefore, sixDaysBefore)); SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000)); expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2)); assertEquals(expectedResults, results); }
From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java
License:Apache License
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException { AccumuloBackedGraph graph = setupGraph(); // Query for all edges in set {customer|A0, customer|A23} Set<TypeValue> seeds = new HashSet<TypeValue>(); seeds.add(new TypeValue("customer", "A0")); seeds.add(new TypeValue("customer", "A23")); // Add a bunch of items that are not in the data to make the probability of being able to find a false // positive sensible. for (int i = 0; i < 10; i++) { seeds.add(new TypeValue("abc", "abc" + i)); }/*from w ww . ja v a 2 s . co m*/ // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the // one that GraphElementsWithStatisticsWithinSetRetriever creates. int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20; if (!loadIntoMemory) { graph.setMaxEntriesForBatchScanner(20); } // Find something that will give a false positive // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method. // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // maxBloomFilterToPassToAnIterator bytes. int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. 
int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2))); // Create Bloom filter and add seeds to it BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); for (TypeValue entity : seeds) { filter.add(new Key(entity.getValue().getBytes())); } // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem). int count = 0; int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER; while (count < maxNumberOfTries) { count++; if (filter.membershipTest(new Key(("" + count).getBytes()))) { break; } } if (count == maxNumberOfTries) { fail("Didn't find a false positive"); } // False positive is "" + count so create an edge from seeds to that Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore); SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000)); graph.addGraphElementsWithStatistics( Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics))); // Now query for all edges in set - shouldn't get the false positive CloseableIterable<GraphElementWithStatistics> retriever = graph .getGraphElementsWithStatisticsWithinSet(seeds, loadIntoMemory); Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>(); for (GraphElementWithStatistics gews : retriever) { results.add(gews); } retriever.close(); // Check results are as expected Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>(); GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase", "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore)); SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23)); expectedResults.add(new 
GraphElementWithStatistics(expectedElement1, expectedStatistics1)); GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore", visibilityString1, sevenDaysBefore, sixDaysBefore)); SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000)); expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2)); GraphElement expectedElement3 = new GraphElement(new Entity("customer", "A23", "purchase", "instore", visibilityString1, sevenDaysBefore, sixDaysBefore)); SetOfStatistics expectedStatistics3 = new SetOfStatistics("count", new Count(23)); expectedResults.add(new GraphElementWithStatistics(expectedElement3, expectedStatistics3)); assertEquals(expectedResults, results); }
From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java
License:Apache License
/** * Used to sanity check the calculation used in {@link BloomFilterUtilities}'s <code>calculateBloomFilterSize()</code> * and <code>calculateNumHashes()</code> methods. * * @param args No arguments necessary//w w w . j a v a 2s. co m */ public static void main(String[] args) { int[] numItems = new int[] { 10, 100, 1000, 10000, 100000 }; for (int num : numItems) { System.out.println("Num items to add = " + num); // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // 1MB. int size = (int) (-num * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, 1024 * 1024); System.out.println("Size = " + size + " bits (=" + (size / 8) + " bytes)"); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. int numHashes = Math.max(1, (int) ((size / num) * Math.log(2))); System.out.println("Num hashes = " + numHashes); // Create Bloom filter BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); // Add num items to it for (int i = 0; i < num; i++) { filter.add(new Key(("" + i).getBytes())); } // Theoretical probability of false positive is (1 - e^(-kn/m)) ^ k (as long as size hasn't been // truncated to 1MB). System.out.println("Theoretical probability of false positive = " + Math.pow(1.0 - Math.exp(-(double) numHashes * num / (double) size), numHashes)); // Test false positive rate - should be approx 1 in 10000 int numPass = 0; for (int i = num; i < 1000000; i++) { if (filter.membershipTest(new Key(("" + i).getBytes()))) { numPass++; } } System.out.println("Measured probability of false positive " + (numPass / 1000000.0)); } }
From source file:gaffer.accumulostore.retriever.impl.AccumuloIDBetweenSetsRetrieverTest.java
License:Apache License
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException, AccumuloElementConversionException { final Set<EntitySeed> seeds = new HashSet<>(); seeds.add(AccumuloTestData.SEED_A0); seeds.add(AccumuloTestData.SEED_A23); // Add a bunch of items that are not in the data to make the probability of being able to find a false // positive sensible. for (int i = 0; i < 10; i++) { seeds.add(new EntitySeed("abc" + i)); }/*from www. j a v a 2s . com*/ // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the // one that GraphElementsWithStatisticsWithinSetRetriever creates. final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20; if (!loadIntoMemory) { store.getProperties().setMaxEntriesForBatchScanner("20"); } // Find something that will give a false positive // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method. // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // maxBloomFilterToPassToAnIterator bytes. int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator()); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. 
final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2))); // Create Bloom filter and add seeds to it final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); for (final EntitySeed seed : seeds) { filter.add(new org.apache.hadoop.util.bloom.Key( store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex()))); } // Test random items against it - should only have to shouldRetieveElementsInRangeBetweenSeeds MAX_SIZE_BLOOM_FILTER / 2 on average before find a // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem). int count = 0; int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator(); while (count < maxNumberOfTries) { count++; if (filter.membershipTest(new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()))) { break; } } if (count == maxNumberOfTries) { fail("Didn't find a false positive"); } // False positive is "" + count so create an edge from seeds to that final Edge edge = new Edge(TestGroups.EDGE, "A0", "" + count, true); edge.putProperty(AccumuloPropertyNames.COUNT, 1000000); Set<Element> data = new HashSet<>(); data.add(edge); final User user = new User(); addElements(data, store, user); // Now query for all edges in set - shouldn't get the false positive AbstractAccumuloTwoSetSeededOperation<EntitySeed, Element> op = new GetElementsBetweenSets<>( AccumuloTestData.SEED_A0_SET, seeds, defaultView); final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory); // Check results are as expected assertEquals(2, results.size()); assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY)); }
From source file:gaffer.accumulostore.retriever.impl.AccumuloIDWithinSetRetrieverTest.java
License:Apache License
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException, AccumuloElementConversionException { // Query for all edges in set {A0, A23} final Set<EntitySeed> seeds = new HashSet<>(); seeds.add(AccumuloTestData.SEED_A0); seeds.add(AccumuloTestData.SEED_A23); // Add a bunch of items that are not in the data to make the probability of being able to find a false // positive sensible. for (int i = 0; i < 10; i++) { seeds.add(new EntitySeed("abc" + i)); }//from ww w. ja v a2s .c om // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the // one that GraphElementsWithStatisticsWithinSetRetriever creates. final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20; if (!loadIntoMemory) { store.getProperties().setMaxEntriesForBatchScanner("20"); } // Find something that will give a false positive // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method. // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // maxBloomFilterToPassToAnIterator bytes. int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator()); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. 
final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2))); // Create Bloom filter and add seeds to it final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); for (final EntitySeed seed : seeds) { filter.add(new org.apache.hadoop.util.bloom.Key( store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex()))); } // Test random items against it - should only have to shouldRetieveElementsInRangeBetweenSeeds MAX_SIZE_BLOOM_FILTER / 2 on average before find a // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem). int count = 0; int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator(); while (count < maxNumberOfTries) { count++; if (filter.membershipTest(new org.apache.hadoop.util.bloom.Key(("" + count).getBytes()))) { break; } } if (count == maxNumberOfTries) { fail("Didn't find a false positive"); } // False positive is "" + count so create an edge from seeds to that final GetElements<EntitySeed, ?> op = new GetRelatedElements<>(defaultView, seeds); // Now query for all edges in set - shouldn't get the false positive final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory); // Check results are as expected assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY)); }
From source file:in.geocoder.component.GeocodingComponent.java
License:Apache License
/**
 * Classifies phrases built from the query tokens by testing each phrase against every
 * Bloom filter in the filter set.
 *
 * <p>Only the first 15 tokens are considered. For each array of token positions
 * yielded by {@link OrderedChoiceIterable}, the corresponding tokens are joined with
 * single spaces; if the resulting phrase is non-empty and a field's filter reports it
 * as a (possible) member, a {@link Classification} is recorded for that field.
 *
 * @param queryTokens the tokens of the query, in order
 * @param filterSet   provides the field names with their Bloom filters and symbols
 * @return one classification per (phrase, field) pair whose filter accepted the
 *         phrase; empty if nothing matched
 */
private List<Classification> classify(List<String> queryTokens, FilterSet filterSet) {
    // Cap the number of tokens - presumably to bound the number of position
    // selections that are enumerated below (TODO confirm).
    int maxToken = Math.min(queryTokens.size(), 15);
    Integer[] tokenPositions = new Integer[maxToken];
    for (int i = 0; i < maxToken; i++) {
        tokenPositions[i] = Integer.valueOf(i);
    }

    List<Classification> classificationsList = new ArrayList<Classification>();
    OrderedChoiceIterable orderedChoiceIterable = new OrderedChoiceIterable(tokenPositions);
    for (Integer[] tokenPos : orderedChoiceIterable) {
        if (tokenPos == null) {
            continue;
        }

        // Build the candidate phrase from the selected token positions.
        StringBuilder sb = new StringBuilder();
        for (Integer position : tokenPos) {
            sb.append(queryTokens.get(position.intValue())).append(' ');
        }
        String searchTerm = sb.toString().trim();
        if (searchTerm.length() == 0) {
            continue;
        }

        // A Bloom filter hit means "possibly present"; false positives are possible.
        for (String field : filterSet.getFilters()) {
            BloomFilter f = filterSet.getFilter(field);
            char symbol = filterSet.getSymbol(field);
            if (f.membershipTest(new Key(searchTerm.getBytes()))) {
                classificationsList
                        .add(new Classification(field, symbol, searchTerm, Arrays.asList(tokenPos)));
            }
        }
    }
    return classificationsList;
}
From source file:org.apache.crunch.contrib.bloomfilter.BloomFiltersIT.java
License:Apache License
/**
 * Builds Bloom filters from the "shakes.txt" resource, keyed on space-separated
 * words, and verifies that a single filter is produced for that file and that it
 * reports the words "Mcbeth" and "apples" as members.
 *
 * @throws IOException declared for the file and filter-creation calls made here
 */
@Test
public void testFilterCreation() throws IOException {
    String inputPath = tempDir.copyResourceFileName("shakes.txt");

    // Key generator: one Bloom filter key per space-separated word of the input.
    BloomFilterFn<String> wordKeyFn = new BloomFilterFn<String>() {
        @Override
        public Collection<Key> generateKeys(String input) {
            Collection<Key> wordKeys = new HashSet<Key>();
            for (String word : StringUtils.split(input, " ")) {
                wordKeys.add(new Key(word.getBytes()));
            }
            return wordKeys;
        }
    };

    Map<String, BloomFilter> filtersByFile = BloomFilterFactory.createFilter(new Path(inputPath), wordKeyFn)
            .getValue();
    assertEquals(1, filtersByFile.size());

    BloomFilter shakesFilter = filtersByFile.get("shakes.txt");
    Key mcbethKey = new Key("Mcbeth".getBytes());
    assertTrue(shakesFilter.membershipTest(mcbethKey));
    Key applesKey = new Key("apples".getBytes());
    assertTrue(shakesFilter.membershipTest(applesKey));
}
From source file:uk.gov.gchq.gaffer.accumulostore.retriever.impl.AccumuloIDBetweenSetsRetrieverTest.java
License:Apache License
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException, AccumuloElementConversionException { final Set<EntitySeed> seeds = new HashSet<>(); seeds.add(AccumuloTestData.SEED_A0); seeds.add(AccumuloTestData.SEED_A23); // Add a bunch of items that are not in the data to make the probability of being able to find a false // positive sensible. for (int i = 0; i < 10; i++) { seeds.add(new EntitySeed("abc" + i)); }/*w w w .ja v a 2s. c o m*/ // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the // one that GraphElementsWithStatisticsWithinSetRetriever creates. final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20; if (!loadIntoMemory) { store.getProperties().setMaxEntriesForBatchScanner("20"); } // Find something that will give a false positive // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method. // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // maxBloomFilterToPassToAnIterator bytes. int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator()); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. 
final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2))); // Create Bloom filter and add seeds to it final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); for (final EntitySeed seed : seeds) { filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex()))); } // Test random items against it - should only have to shouldRetieveElementsInRangeBetweenSeeds MAX_SIZE_BLOOM_FILTER / 2 on average before find a // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem). int count = 0; int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator(); while (count < maxNumberOfTries) { count++; if (filter.membershipTest(new Key(("" + count).getBytes()))) { break; } } if (count == maxNumberOfTries) { fail("Didn't find a false positive"); } // False positive is "" + count so create an edge from seeds to that final Edge edge = new Edge(TestGroups.EDGE, "A0", "" + count, true); edge.putProperty(AccumuloPropertyNames.COUNT, 1000000); Set<Element> data = new HashSet<>(); data.add(edge); final User user = new User(); addElements(data, store, user); // Now query for all edges in set - shouldn't get the false positive AbstractAccumuloTwoSetSeededOperation<EntitySeed, Element> op = new GetElementsBetweenSets<>( AccumuloTestData.SEED_A0_SET, seeds, defaultView); final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory); // Check results are as expected assertEquals(2, results.size()); assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY)); }
From source file:uk.gov.gchq.gaffer.accumulostore.retriever.impl.AccumuloIDWithinSetRetrieverTest.java
License:Apache License
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException, AccumuloElementConversionException { // Query for all edges in set {A0, A23} final Set<EntitySeed> seeds = new HashSet<>(); seeds.add(AccumuloTestData.SEED_A0); seeds.add(AccumuloTestData.SEED_A23); // Add a bunch of items that are not in the data to make the probability of being able to find a false // positive sensible. for (int i = 0; i < 10; i++) { seeds.add(new EntitySeed("abc" + i)); }/*from w ww . j a va2s . co m*/ // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the // one that GraphElementsWithStatisticsWithinSetRetriever creates. final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20; if (!loadIntoMemory) { store.getProperties().setMaxEntriesForBatchScanner("20"); } // Find something that will give a false positive // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method. // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of // maxBloomFilterToPassToAnIterator bytes. int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0))); size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator()); // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set. 
final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2))); // Create Bloom filter and add seeds to it final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH); for (final EntitySeed seed : seeds) { filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex()))); } // Test random items against it - should only have to shouldRetieveElementsInRangeBetweenSeeds MAX_SIZE_BLOOM_FILTER / 2 on average before find a // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem). int count = 0; int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator(); while (count < maxNumberOfTries) { count++; if (filter.membershipTest(new Key(("" + count).getBytes()))) { break; } } if (count == maxNumberOfTries) { fail("Didn't find a false positive"); } // False positive is "" + count so create an edge from seeds to that final GetElements<EntitySeed, ?> op = new GetElements<>(defaultView, seeds); // Now query for all edges in set - shouldn't get the false positive final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory); // Check results are as expected assertThat(results, IsCollectionContaining.hasItems(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY)); }