Example usage for org.apache.mahout.common.iterator StringRecordIterator StringRecordIterator

List of usage examples for org.apache.mahout.common.iterator StringRecordIterator StringRecordIterator

Introduction

On this page you can find example usage for org.apache.mahout.common.iterator StringRecordIterator StringRecordIterator.

Prototype

public StringRecordIterator(Iterable<String> stringIterator, String pattern) 

Source Link

Usage

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.FPGrowthDriver.java

License:Apache License

/**
 * Runs the sequential (in-memory, non-MapReduce) FPGrowth over the input
 * transaction file, writes the top-k frequent patterns to a SequenceFile,
 * and finally logs every mined pattern.
 *
 * @param params run configuration: "input"/"output" paths, "encoding",
 *               "minSupport", "maxHeapSize", "splitPattern", and the
 *               PFPGrowth.USE_FPG2 switch selecting the implementation
 * @throws IOException on filesystem or stream failure
 */
private static void runFPGrowth(Parameters params) throws IOException {
    log.info("Starting Sequential FPGrowth");
    // parseInt avoids the needless boxing done by Integer.valueOf().
    int maxHeapSize = Integer.parseInt(params.get("maxHeapSize", "50"));
    int minSupport = Integer.parseInt(params.get("minSupport", "3"));

    Path output = new Path(params.get("output", "output.txt"));
    Path input = new Path(params.get("input"));

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);

    Charset encoding = Charset.forName(params.get("encoding"));

    String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class,
            TopKStringPatterns.class);

    FSDataInputStream inputStream = null;
    FSDataInputStream inputStreamAgain = null;

    Collection<String> features = Sets.newHashSet();

    if ("true".equals(params.get(PFPGrowth.USE_FPG2))) {
        com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String> fp = new com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String>();

        try {
            // Two independent streams over the same file: one feeds the
            // fList (frequency) pass, the other the pattern-growth pass.
            inputStream = fs.open(input);
            inputStreamAgain = fs.open(input);
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    } else {
        FPGrowth<String> fp = new FPGrowth<String>();

        // Open the streams inside the try block so that a failure opening
        // the second stream cannot leak the first (or the writer): the
        // finally block always runs and Closeables.close tolerates null.
        try {
            inputStream = fs.open(input);
            inputStreamAgain = fs.open(input);
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    }

    List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(conf, output);
    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
        log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
    }
}

From source file:com.skp.experiment.fpm.pfpgrowth.FPGrowthDriver.java

License:Apache License

/**
 * Runs the sequential (non-MapReduce) FPGrowth over a local input
 * transaction file, writes the top-k frequent patterns to a SequenceFile,
 * and finally logs every mined pattern.
 *
 * @param params run configuration: "input"/"output" paths, "encoding",
 *               "minSupport", "maxHeapSize", "splitPattern", and the
 *               "useFPG2" switch selecting the implementation
 * @throws IOException on filesystem or stream failure
 */
private static void runFPGrowth(Parameters params) throws IOException {
    log.info("Starting Sequential FPGrowth");
    // parseInt avoids the needless boxing done by Integer.valueOf().
    int maxHeapSize = Integer.parseInt(params.get("maxHeapSize", "50"));
    int minSupport = Integer.parseInt(params.get("minSupport", "3"));

    String output = params.get("output", "output.txt");

    Path path = new Path(output);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(path.toUri(), conf);

    Charset encoding = Charset.forName(params.get("encoding"));
    String input = params.get("input");

    String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, TopKStringPatterns.class);

    // Both branches used an identical local; declared once here instead.
    Collection<String> features = new HashSet<String>();

    if ("true".equals(params.get("useFPG2"))) {
        org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String> fp = new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String>();

        try {
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(new File(input), encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(new File(input), encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.closeQuietly(writer);
        }
    } else {
        org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String> fp = new org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String>();
        try {
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(new File(input), encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(new File(input), encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.closeQuietly(writer);
        }
    }

    List<Pair<String, TopKStringPatterns>> frequentPatterns = org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
            .readFrequentPattern(conf, path);
    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
        log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
    }
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthRetailDataTest.java

License:Apache License

@Test
public void testSpecificCaseFromRetailDataMinSup500() throws IOException {
    FPGrowth<String> fp = new FPGrowth<String>();

    // Establish the true support of {41, 36, 39} by scanning the raw data.
    int observedSupport = 0;
    StringRecordIterator transactions = new StringRecordIterator(
            new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+");
    while (transactions.hasNext()) {
        List<String> items = transactions.next().getFirst();
        if (items.contains("41") && items.contains("36") && items.contains("39")) {
            observedSupport++;
        }
    }

    final Map<Set<String>, Long> minedPatterns = Maps.newHashMap();

    Set<String> returnableFeatures = Sets.newHashSet("41", "36", "39");

    // Collects every mined pattern into minedPatterns, keyed by its itemset.
    OutputCollector<String, List<Pair<List<String>, Long>>> collector =
            new OutputCollector<String, List<Pair<List<String>, Long>>>() {
                @Override
                public void collect(String key, List<Pair<List<String>, Long>> patterns) {
                    for (Pair<List<String>, Long> pattern : patterns) {
                        minedPatterns.put(Sets.newHashSet(pattern.getFirst()), pattern.getSecond());
                    }
                }
            };

    // No-op status sink — this test has no progress reporting.
    StatusUpdater noOpUpdater = new StatusUpdater() {
        @Override
        public void update(String status) {
        }
    };

    fp.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()),
                    "\\s+"),
            fp.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"),
                    500),
            500, 1000, returnableFeatures, collector, noOpUpdater);

    // The mined support for the full itemset must match the raw count.
    assertEquals(Long.valueOf(observedSupport), minedPatterns.get(returnableFeatures));
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthRetailDataTest2.java

License:Apache License

@Test
public void testSpecificCaseFromRetailDataMinSup500() throws IOException {
    FPGrowthObj<String> fp = new FPGrowthObj<String>();

    // Establish the true support of {41, 36, 39} by scanning the raw data.
    int observedSupport = 0;
    StringRecordIterator transactions = new StringRecordIterator(
            new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+");
    while (transactions.hasNext()) {
        List<String> items = transactions.next().getFirst();
        if (items.contains("41") && items.contains("36") && items.contains("39")) {
            observedSupport++;
        }
    }

    final Map<Set<String>, Long> minedPatterns = Maps.newHashMap();

    Set<String> returnableFeatures = Sets.newHashSet("41", "36", "39");

    // Collects every mined pattern into minedPatterns, keyed by its itemset.
    OutputCollector<String, List<Pair<List<String>, Long>>> collector =
            new OutputCollector<String, List<Pair<List<String>, Long>>>() {
                @Override
                public void collect(String key, List<Pair<List<String>, Long>> patterns) {
                    for (Pair<List<String>, Long> pattern : patterns) {
                        minedPatterns.put(Sets.newHashSet(pattern.getFirst()), pattern.getSecond());
                    }
                }
            };

    // No-op status sink — this test has no progress reporting.
    StatusUpdater noOpUpdater = new StatusUpdater() {
        @Override
        public void update(String status) {
        }
    };

    fp.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()),
                    "\\s+"),
            fp.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"),
                    500),
            500, 1000, returnableFeatures, collector, noOpUpdater);

    // The mined support for the full itemset must match the raw count.
    assertEquals(Long.valueOf(observedSupport), minedPatterns.get(returnableFeatures));
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthRetailDataTestVs.java

License:Apache License

@Test
public void testVsWithRetailData() throws IOException {
    String inputFilename = "retail.dat";
    int minSupport = 500;
    Collection<String> returnableFeatures = Sets.newHashSet();

    // Mine the data with the original FPGrowth implementation.
    org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String> fp1 = new org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String>();
    Map<Set<String>, Long> results1 = Maps.newHashMap();
    fp1.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                    "\\s+"),
            fp1.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+"),
                    minSupport),
            minSupport, 100000, returnableFeatures, new MapCollector(results1), new DummyUpdater());

    // Mine the same data with FPGrowthObj, without restricting features.
    FPGrowthObj<String> fp2 = new FPGrowthObj<String>();
    Map<Set<String>, Long> initialResults2 = Maps.newHashMap();
    fp2.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                    "\\s+"),
            fp2.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+"),
                    minSupport),
            minSupport, 100000, Sets.<String>newHashSet(), new MapCollector(initialResults2),
            new DummyUpdater());

    // If specific features were requested, keep only the itemsets from run
    // #2 that mention at least one of them, so both runs cover the same
    // universe of patterns.
    Map<Set<String>, Long> results2;
    if (returnableFeatures.isEmpty()) {
        results2 = initialResults2;
    } else {
        results2 = new HashMap<Set<String>, Long>();
        for (Map.Entry<Set<String>, Long> candidate : initialResults2.entrySet()) {
            boolean mentionsRequestedFeature = false;
            for (String feature : returnableFeatures) {
                if (candidate.getKey().contains(feature)) {
                    mentionsRequestedFeature = true;
                    break;
                }
            }
            if (mentionsRequestedFeature) {
                results2.put(candidate.getKey(), candidate.getValue());
            }
        }
    }

    // Every itemset found by one run must be found by the other, in both
    // directions, with matching support.
    boolean allMatch = hasAll(results1, results2);
    log.info("checked " + results1.size() + " itemsets iterating through #1");

    allMatch &= hasAll(results2, results1);
    log.info("checked " + results2.size() + " itemsets iterating through #2");

    assertTrue("Had mismatches!", allMatch);
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthSyntheticDataTest.java

License:Apache License

@Test
public void testSpecificCasesFromSynthData() throws IOException {
    FPGrowthObj<String> fp = new FPGrowthObj<String>();

    String inputFilename = "FPGsynth.dat";

    // Establish the true supports of {10,13} and {10,13,1669} by scanning
    // the raw data.
    int count_10_13 = 0;
    int count_10_13_1669 = 0;
    StringRecordIterator transactions = new StringRecordIterator(
            new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+");
    while (transactions.hasNext()) {
        List<String> items = transactions.next().getFirst();
        if (!items.contains("10") || !items.contains("13")) {
            continue;
        }
        count_10_13++;
        if (items.contains("1669")) {
            count_10_13_1669++;
        }
    }

    int minSupport = 50;
    // Sanity check: the fixture must actually contain the pattern often
    // enough to be mined at this support threshold.
    if (count_10_13_1669 < minSupport) {
        throw new IllegalStateException("the test is broken or data is missing (" + count_10_13_1669 + ", "
                + count_10_13 + ')');
    }

    final Map<Set<String>, Long> results = Maps.newHashMap();

    Set<String> features_10_13 = Sets.newHashSet("10", "13");
    Set<String> returnableFeatures = Sets.newHashSet("10", "13", "1669");

    // Collects every mined pattern, keyed by its itemset, and echoes it.
    OutputCollector<String, List<Pair<List<String>, Long>>> collector =
            new OutputCollector<String, List<Pair<List<String>, Long>>>() {
                @Override
                public void collect(String key, List<Pair<List<String>, Long>> value) {
                    for (Pair<List<String>, Long> v : value) {
                        results.put(Sets.newHashSet(v.getFirst()), v.getSecond());
                        System.out.println("found pat [" + v.getSecond() + "]: " + v.getFirst());
                    }
                }
            };

    fp.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                    "\\s+"),
            fp.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+"),
                    minSupport),
            minSupport, 100000, returnableFeatures, collector, new StatusUpdater() {
                @Override
                public void update(String status) {
                }
            });

    // Mined supports must equal the raw counts for both itemsets.
    assertEquals(count_10_13, highestSupport(results, features_10_13));
    assertEquals(count_10_13_1669, highestSupport(results, returnableFeatures));
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthSyntheticDataTest.java

License:Apache License

/**
 * Cross-checks the two sequential FPGrowth implementations (FPGrowth vs
 * FPGrowthObj) on the synthetic dataset: every itemset mined by one run
 * must be found with the same support in the other, in both directions.
 */
@Test
public void testVsWithSynthData() throws IOException {
    Collection<String> returnableFeatures = Sets.newHashSet();

    // not limiting features (or including too many) can cause
    // the test to run a very long time
    returnableFeatures.add("10");
    returnableFeatures.add("13");
    //    returnableFeatures.add("1669");

    FPGrowth<String> fp1 = new FPGrowth<String>();

    final Map<Set<String>, Long> results1 = Maps.newHashMap();

    String inputFilename = "FPGsynth.dat";
    int minSupport = 100;
    // Run implementation #1, restricted to the requested features.
    fp1.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                    "\\s+"),

            fp1.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+"),
                    minSupport),
            minSupport, 1000000, returnableFeatures,
            new OutputCollector<String, List<Pair<List<String>, Long>>>() {

                @Override
                public void collect(String key, List<Pair<List<String>, Long>> value) {

                    for (Pair<List<String>, Long> v : value) {
                        List<String> l = v.getFirst();
                        results1.put(Sets.newHashSet(l), v.getSecond());
                        System.out.println("found pat [" + v.getSecond() + "]: " + v.getFirst());
                    }
                }

            }, new StatusUpdater() {

                @Override
                public void update(String status) {
                }
            });

    // Run implementation #2 with NO feature restriction; its output is
    // filtered down to the requested features afterwards.
    FPGrowthObj<String> fp2 = new FPGrowthObj<String>();
    final Map<Set<String>, Long> initialResults2 = Maps.newHashMap();
    fp2.generateTopKFrequentPatterns(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                    "\\s+"),

            fp2.generateFList(
                    new StringRecordIterator(
                            new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+"),
                    minSupport),
            minSupport, 1000000, Sets.<String>newHashSet(),
            new OutputCollector<String, List<Pair<List<String>, Long>>>() {

                @Override
                public void collect(String key, List<Pair<List<String>, Long>> value) {

                    for (Pair<List<String>, Long> v : value) {
                        List<String> l = v.getFirst();
                        initialResults2.put(Sets.newHashSet(l), v.getSecond());
                        System.out.println("found pat [" + v.getSecond() + "]: " + v.getFirst());
                    }
                }

            }, new StatusUpdater() {

                @Override
                public void update(String status) {
                }
            });

    // Keep only the itemsets from run #2 that mention at least one of the
    // requested features, so both result sets cover the same universe.
    Map<Set<String>, Long> results2;
    if (returnableFeatures.isEmpty()) {
        results2 = initialResults2;
    } else {
        Map<Set<String>, Long> tmpResult = new HashMap<Set<String>, Long>();
        for (Map.Entry<Set<String>, Long> result2 : initialResults2.entrySet()) {
            Set<String> r2feats = result2.getKey();
            boolean hasSome = false;
            for (String rf : returnableFeatures) {
                if (r2feats.contains(rf)) {
                    hasSome = true;
                    break;
                }
            }
            if (hasSome) {
                tmpResult.put(result2.getKey(), result2.getValue());
            }
        }
        results2 = tmpResult;
    }

    // Direction 1: every itemset mined by #1 must have the same support
    // in #2 (highestSupport tolerates supersets of the queried itemset).
    boolean allMatch = true;
    int itemsetsChecked = 0;
    for (Map.Entry<Set<String>, Long> result1 : results1.entrySet()) {
        itemsetsChecked++;
        Set<String> feats = result1.getKey();
        long supp1 = result1.getValue();
        long supp2 = highestSupport(results2, feats);
        if (supp1 != supp2) {
            allMatch = false;
            System.out.println("mismatch checking results1 [ " + supp1 + " vs " + supp2 + "]: " + feats);
        }
    }
    System.out.println("checked " + itemsetsChecked + " itemsets iterating through #1");

    // Direction 2: every (filtered) itemset mined by #2 must match #1.
    itemsetsChecked = 0;
    for (Map.Entry<Set<String>, Long> result2 : results2.entrySet()) {
        itemsetsChecked++;
        Set<String> feats = result2.getKey();
        long supp2 = result2.getValue();
        long supp1 = highestSupport(results1, feats);
        if (supp1 != supp2) {
            allMatch = false;
            System.out.println("mismatch checking results2 [ " + supp1 + " vs " + supp2 + "]: " + feats);
        }
    }
    System.out.println("checked " + itemsetsChecked + " itemsets iterating through #2");

    assertTrue("Had mismatches!", allMatch);
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.PFPGrowthRetailDataTest.java

License:Apache License

/**
 * Prepares the PFPGrowth test fixture: seeds the common run parameters and
 * rewrites the bundled whitespace-separated "retail.dat" resource as a
 * comma-separated transaction file in the test temp directory.
 *
 * @throws Exception if the resource cannot be read or the file written
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    params.set(PFPGrowth.MIN_SUPPORT, "100");
    params.set(PFPGrowth.MAX_HEAPSIZE, "10000");
    params.set(PFPGrowth.NUM_GROUPS, "50");
    params.set(PFPGrowth.ENCODING, "UTF-8");
    File inputDir = getTestTempDir("transactions");
    File outputDir = getTestTempDir("frequentpatterns");
    File input = new File(inputDir, "test.txt");
    params.set(PFPGrowth.INPUT, input.getAbsolutePath());
    params.set(PFPGrowth.OUTPUT, outputDir.getAbsolutePath());

    Writer writer = Files.newWriter(input, Charsets.UTF_8);
    try {
        StringRecordIterator it = new StringRecordIterator(
                new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+");

        // Stream each transaction straight to the writer instead of first
        // buffering the entire dataset in an intermediate collection; the
        // output (order and content) is identical.
        while (it.hasNext()) {
            List<String> transaction = it.next().getFirst();
            String sep = "";
            for (String item : transaction) {
                writer.write(sep + item);
                sep = ",";
            }
            writer.write("\n");
        }

    } finally {
        Closeables.close(writer, false);
    }
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.PFPGrowthRetailDataTest.java

License:Apache License

/**
 * Test Parallel FPGrowth on retail data using top-level runPFPGrowth() method
 */
@Test
public void testRetailDataMinSup100() throws Exception {
    // Load the reference results; the last token of each line is the
    // support wrapped in parentheses, e.g. "(870)".
    Map<Set<String>, Long> expectedResults = Maps.newHashMap();
    StringRecordIterator expectedIt = new StringRecordIterator(
            new FileLineIterable(Resources.getResource("retail_results_with_min_sup_100.dat").openStream()),
            "\\s+");
    while (expectedIt.hasNext()) {
        List<String> tokens = Lists.newArrayList(expectedIt.next().getFirst());
        String supportToken = tokens.remove(tokens.size() - 1);
        Long support = Long.parseLong(supportToken.substring(1, supportToken.length() - 1));
        expectedResults.put(Sets.newHashSet(tokens), support);
    }

    PFPGrowth.runPFPGrowth(params, getConfiguration());

    // Flatten the mined top-k pattern lists into an itemset -> support map.
    List<Pair<String, TopKStringPatterns>> frequentPatterns = PFPGrowth.readFrequentPattern(params);
    Map<Set<String>, Long> results = Maps.newHashMap();
    for (Pair<String, TopKStringPatterns> topK : frequentPatterns) {
        Iterator<Pair<List<String>, Long>> patterns = topK.getSecond().iterator();
        while (patterns.hasNext()) {
            Pair<List<String>, Long> pattern = patterns.next();
            results.put(Sets.newHashSet(pattern.getFirst()), pattern.getSecond());
        }
    }

    // Report each mined itemset as spurious, invalid, or matched.
    for (Entry<Set<String>, Long> entry : results.entrySet()) {
        Set<String> key = entry.getKey();
        Long expected = expectedResults.get(key);
        if (expected == null) {
            System.out.println("spurious (1): " + key + " with " + entry.getValue());
        } else if (!expected.equals(results.get(key))) {
            System.out.println("invalid (1): " + key + ", expected: " + expected + ", got: "
                    + results.get(key));
        } else {
            System.out.println("matched (1): " + key + ", with: " + expected);
        }
    }

    // Report expected itemsets that were never mined.
    for (Entry<Set<String>, Long> entry : expectedResults.entrySet()) {
        if (results.get(entry.getKey()) == null) {
            System.out.println("missing (1): " + entry.getKey() + " with " + entry.getValue());
        }
    }
    assertEquals(expectedResults.size(), results.size());
}

From source file:org.gpfvic.mahout.fpm.pfpgrowth.PFPGrowthRetailDataTest.java

License:Apache License

/**
 * Test Parallel FPG on retail data, running various stages individually:
 * parallel counting, fList generation/saving, parallel FPGrowth, and final
 * aggregation — instead of the single runPFPGrowth() entry point.
 */
@Test
public void testRetailDataMinSup100InSteps() throws Exception {
    // Load the reference results; the last token of each line is the
    // support wrapped in parentheses, e.g. "(870)".
    StringRecordIterator it = new StringRecordIterator(
            new FileLineIterable(Resources.getResource("retail_results_with_min_sup_100.dat").openStream()),
            "\\s+");
    Map<Set<String>, Long> expectedResults = Maps.newHashMap();
    while (it.hasNext()) {
        Pair<List<String>, Long> next = it.next();
        List<String> items = Lists.newArrayList(next.getFirst());
        String supportString = items.remove(items.size() - 1);
        Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
        expectedResults.put(Sets.newHashSet(items), support);
    }
    Configuration conf = getConfiguration();
    log.info("Starting Parallel Counting Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
    // Stage 1: parallel counting produces the per-feature frequency list.
    PFPGrowth.startParallelCounting(params, conf);

    // Stage 2: read the fList back and persist it for the next stage.
    List<Pair<String, Long>> fList = PFPGrowth.readFList(params);
    PFPGrowth.saveFList(fList, params, conf);
    // Partition the fList across groups, rounding up so every feature is
    // assigned to some group.
    int numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);
    int maxPerGroup = fList.size() / numGroups;
    if (fList.size() % numGroups != 0) {
        maxPerGroup++;
    }
    params.set(PFPGrowth.MAX_PER_GROUP, Integer.toString(maxPerGroup));

    // Stage 3: run parallel FPGrowth, then aggregate the per-group patterns.
    PFPGrowth.startParallelFPGrowth(params, conf);
    log.info("Starting Pattern Aggregation Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
    PFPGrowth.startAggregating(params, conf);
    List<Pair<String, TopKStringPatterns>> frequentPatterns = PFPGrowth.readFrequentPattern(params);

    // Flatten the mined top-k pattern lists into an itemset -> support map.
    Map<Set<String>, Long> results = Maps.newHashMap();
    for (Pair<String, TopKStringPatterns> topK : frequentPatterns) {
        Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
        while (topKIt.hasNext()) {
            Pair<List<String>, Long> entry = topKIt.next();
            results.put(Sets.newHashSet(entry.getFirst()), entry.getSecond());
        }
    }

    // Report each mined itemset as spurious, invalid, or matched.
    for (Entry<Set<String>, Long> entry : results.entrySet()) {
        Set<String> key = entry.getKey();
        if (expectedResults.get(key) == null) {
            System.out.println("spurious (2): " + key);
        } else {
            if (!expectedResults.get(key).equals(results.get(entry.getKey()))) {
                System.out.println("invalid (2): " + key + ", expected: " + expectedResults.get(key) + ", got: "
                        + results.get(entry.getKey()));
            } else {
                System.out.println("matched (2): " + key + ", with: " + expectedResults.get(key));
            }
        }
    }

    // Report expected itemsets that were never mined.
    for (Entry<Set<String>, Long> entry : expectedResults.entrySet()) {
        Set<String> key = entry.getKey();
        if (results.get(key) == null) {
            System.out.println("missing: " + key);
        }
    }
    assertEquals(expectedResults.size(), results.size());
}