List of usage examples for the org.apache.mahout.common.iterator.StringRecordIterator constructor
public StringRecordIterator(Iterable<String> stringIterator, String pattern)
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.FPGrowthDriver.java
License:Apache License
private static void runFPGrowth(Parameters params) throws IOException { log.info("Starting Sequential FPGrowth"); int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50")); int minSupport = Integer.valueOf(params.get("minSupport", "3")); Path output = new Path(params.get("output", "output.txt")); Path input = new Path(params.get("input")); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(output.toUri(), conf); Charset encoding = Charset.forName(params.get("encoding")); String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString()); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, TopKStringPatterns.class); FSDataInputStream inputStream = null; FSDataInputStream inputStreamAgain = null; Collection<String> features = Sets.newHashSet(); if ("true".equals(params.get(PFPGrowth.USE_FPG2))) { com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String> fp = new com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String>(); try {// ww w. j a v a 2 s . 
c o m inputStream = fs.open(input); inputStreamAgain = fs.open(input); fp.generateTopKFrequentPatterns( new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern), fp.generateFList(new StringRecordIterator( new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport), minSupport, maxHeapSize, features, new StringOutputConverter( new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)), new ContextStatusUpdater(null)); } finally { Closeables.close(writer, false); Closeables.close(inputStream, true); Closeables.close(inputStreamAgain, true); } } else { FPGrowth<String> fp = new FPGrowth<String>(); inputStream = fs.open(input); inputStreamAgain = fs.open(input); try { fp.generateTopKFrequentPatterns( new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern), fp.generateFList(new StringRecordIterator( new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport), minSupport, maxHeapSize, features, new StringOutputConverter( new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)), new ContextStatusUpdater(null)); } finally { Closeables.close(writer, false); Closeables.close(inputStream, true); Closeables.close(inputStreamAgain, true); } } List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(conf, output); for (Pair<String, TopKStringPatterns> entry : frequentPatterns) { log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond()); } }
From source file:com.skp.experiment.fpm.pfpgrowth.FPGrowthDriver.java
License:Apache License
/**
 * Runs the sequential FPGrowth algorithm over a local transaction file ("input")
 * and writes the top-k frequent patterns to the "output" sequence file.
 *
 * <p>Recognized parameters: maxHeapSize (default 50), minSupport (default 3),
 * output (default "output.txt"), input, encoding, splitPattern, and
 * useFPG2 ("true" selects the fpgrowth2 FPGrowthObj implementation).
 *
 * @param params job configuration parameters
 * @throws IOException if the input cannot be read or the output cannot be written
 */
private static void runFPGrowth(Parameters params) throws IOException {
    log.info("Starting Sequential FPGrowth");
    int maxHeapSize = Integer.parseInt(params.get("maxHeapSize", "50"));
    int minSupport = Integer.parseInt(params.get("minSupport", "3"));
    String output = params.get("output", "output.txt");
    Path path = new Path(output);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    Charset encoding = Charset.forName(params.get("encoding"));
    String input = params.get("input");
    String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class,
        TopKStringPatterns.class);
    if ("true".equals(params.get("useFPG2"))) {
        org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String> fp =
            new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String>();
        Collection<String> features = new HashSet<String>();
        try {
            // The input is read twice: once to build the frequency list, once to mine.
            fp.generateTopKFrequentPatterns(
                new StringRecordIterator(new FileLineIterable(new File(input), encoding, false), pattern),
                fp.generateFList(
                    new StringRecordIterator(new FileLineIterable(new File(input), encoding, false),
                        pattern),
                    minSupport),
                minSupport, maxHeapSize, features,
                new StringOutputConverter(
                    new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                new ContextStatusUpdater(null));
        } finally {
            // BUGFIX: closeQuietly() swallowed IOException from closing an output
            // writer, silently hiding lost/truncated output. Propagate it instead.
            Closeables.close(writer, false);
        }
    } else {
        org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String> fp =
            new org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String>();
        Collection<String> features = new HashSet<String>();
        try {
            fp.generateTopKFrequentPatterns(
                new StringRecordIterator(new FileLineIterable(new File(input), encoding, false), pattern),
                fp.generateFList(
                    new StringRecordIterator(new FileLineIterable(new File(input), encoding, false),
                        pattern),
                    minSupport),
                minSupport, maxHeapSize, features,
                new StringOutputConverter(
                    new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
        }
    }
    // Read the patterns back and dump them to the log for inspection.
    List<Pair<String, TopKStringPatterns>> frequentPatterns =
        org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth.readFrequentPattern(conf, path);
    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
        log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
    }
}
From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthRetailDataTest.java
License:Apache License
@Test public void testSpecificCaseFromRetailDataMinSup500() throws IOException { FPGrowth<String> fp = new FPGrowth<String>(); StringRecordIterator it = new StringRecordIterator( new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"); int pattern_41_36_39 = 0; while (it.hasNext()) { Pair<List<String>, Long> next = it.next(); List<String> items = next.getFirst(); if (items.contains("41") && items.contains("36") && items.contains("39")) { pattern_41_36_39++;// w w w . ja v a 2 s .co m } } final Map<Set<String>, Long> results = Maps.newHashMap(); Set<String> returnableFeatures = Sets.newHashSet(); returnableFeatures.add("41"); returnableFeatures.add("36"); returnableFeatures.add("39"); fp.generateTopKFrequentPatterns( new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"), fp.generateFList( new StringRecordIterator( new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"), 500), 500, 1000, returnableFeatures, new OutputCollector<String, List<Pair<List<String>, Long>>>() { @Override public void collect(String key, List<Pair<List<String>, Long>> value) { for (Pair<List<String>, Long> v : value) { List<String> l = v.getFirst(); results.put(Sets.newHashSet(l), v.getSecond()); } } }, new StatusUpdater() { @Override public void update(String status) { } }); assertEquals(Long.valueOf(pattern_41_36_39), results.get(returnableFeatures)); }
From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthRetailDataTest2.java
License:Apache License
@Test public void testSpecificCaseFromRetailDataMinSup500() throws IOException { FPGrowthObj<String> fp = new FPGrowthObj<String>(); StringRecordIterator it = new StringRecordIterator( new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"); int pattern_41_36_39 = 0; while (it.hasNext()) { Pair<List<String>, Long> next = it.next(); List<String> items = next.getFirst(); if (items.contains("41") && items.contains("36") && items.contains("39")) { pattern_41_36_39++;//from ww w. j a va 2s . c o m } } final Map<Set<String>, Long> results = Maps.newHashMap(); Set<String> returnableFeatures = Sets.newHashSet(); returnableFeatures.add("41"); returnableFeatures.add("36"); returnableFeatures.add("39"); fp.generateTopKFrequentPatterns( new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"), fp.generateFList( new StringRecordIterator( new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"), 500), 500, 1000, returnableFeatures, new OutputCollector<String, List<Pair<List<String>, Long>>>() { @Override public void collect(String key, List<Pair<List<String>, Long>> value) { for (Pair<List<String>, Long> v : value) { List<String> l = v.getFirst(); results.put(Sets.newHashSet(l), v.getSecond()); } } }, new StatusUpdater() { @Override public void update(String status) { } }); assertEquals(Long.valueOf(pattern_41_36_39), results.get(returnableFeatures)); }
From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthRetailDataTestVs.java
License:Apache License
/**
 * Cross-checks the two FPGrowth implementations (fpgrowth.FPGrowth vs
 * fpgrowth2.FPGrowthObj) on the retail data set at minSupport=500: every itemset
 * found by one must be found by the other (via the file-local hasAll helper).
 * Results are gathered with the file-local MapCollector / DummyUpdater helpers.
 */
@Test
public void testVsWithRetailData() throws IOException {
    String inputFilename = "retail.dat";
    int minSupport = 500;
    // NOTE(review): returnableFeatures stays empty, so run #1 is unrestricted and
    // the filtering else-branch below is dead as written — presumably kept so the
    // test can be narrowed by adding features; confirm before removing.
    Collection<String> returnableFeatures = Sets.newHashSet();

    // Run #1: the original FPGrowth implementation.
    org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String> fp1 =
        new org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String>();
    Map<Set<String>, Long> results1 = Maps.newHashMap();
    fp1.generateTopKFrequentPatterns(
        new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
            "\\s+"),
        fp1.generateFList(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                "\\s+"),
            minSupport),
        minSupport, 100000, returnableFeatures, new MapCollector(results1), new DummyUpdater());

    // Run #2: the fpgrowth2 implementation, with no feature restriction.
    FPGrowthObj<String> fp2 = new FPGrowthObj<String>();
    Map<Set<String>, Long> initialResults2 = Maps.newHashMap();
    fp2.generateTopKFrequentPatterns(
        new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
            "\\s+"),
        fp2.generateFList(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                "\\s+"),
            minSupport),
        minSupport, 100000, Sets.<String>newHashSet(), new MapCollector(initialResults2),
        new DummyUpdater());

    // If features were restricted, keep only run-#2 itemsets touching at least one
    // returnable feature so the two result sets are comparable.
    Map<Set<String>, Long> results2;
    if (returnableFeatures.isEmpty()) {
        results2 = initialResults2;
    } else {
        Map<Set<String>, Long> tmpResult = new HashMap<Set<String>, Long>();
        for (Map.Entry<Set<String>, Long> result2 : initialResults2.entrySet()) {
            Set<String> r2feats = result2.getKey();
            boolean hasSome = false;
            for (String rf : returnableFeatures) {
                if (r2feats.contains(rf)) {
                    hasSome = true;
                    break;
                }
            }
            if (hasSome) {
                tmpResult.put(result2.getKey(), result2.getValue());
            }
        }
        results2 = tmpResult;
    }

    // Both directions must agree: every itemset of #1 in #2, and vice versa.
    boolean allMatch = hasAll(results1, results2);
    log.info("checked " + results1.size() + " itemsets iterating through #1");
    allMatch &= hasAll(results2, results1);
    log.info("checked " + results2.size() + " itemsets iterating through #2");
    assertTrue("Had mismatches!", allMatch);
}
From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthSyntheticDataTest.java
License:Apache License
/**
 * Verifies FPGrowthObj supports on the synthetic data set: the mined support of
 * {10, 13} and of {10, 13, 1669} (via the file-local highestSupport helper) must
 * match brute-force counts over the raw transactions at minSupport=50.
 */
@Test
public void testSpecificCasesFromSynthData() throws IOException {
    FPGrowthObj<String> fp = new FPGrowthObj<String>();
    String inputFilename = "FPGsynth.dat";

    // Brute-force counts of transactions containing {10,13} and {10,13,1669}.
    StringRecordIterator it = new StringRecordIterator(
        new FileLineIterable(Resources.getResource(inputFilename).openStream()), "\\s+");
    int patternCnt_10_13_1669 = 0;
    int patternCnt_10_13 = 0;
    while (it.hasNext()) {
        Pair<List<String>, Long> next = it.next();
        List<String> items = next.getFirst();
        if (items.contains("10") && items.contains("13")) {
            patternCnt_10_13++;
            if (items.contains("1669")) {
                patternCnt_10_13_1669++;
            }
        }
    }

    int minSupport = 50;
    // Sanity check on the fixture: the target pattern must itself be frequent,
    // otherwise the assertions below could not possibly hold.
    if (patternCnt_10_13_1669 < minSupport) {
        throw new IllegalStateException("the test is broken or data is missing (" + patternCnt_10_13_1669
            + ", " + patternCnt_10_13 + ')');
    }

    final Map<Set<String>, Long> results = Maps.newHashMap();
    Set<String> features_10_13 = Sets.newHashSet();
    features_10_13.add("10");
    features_10_13.add("13");
    Set<String> returnableFeatures = Sets.newHashSet();
    returnableFeatures.add("10");
    returnableFeatures.add("13");
    returnableFeatures.add("1669");

    // Mine patterns restricted to the returnable features, collecting each
    // itemset's support into the results map.
    fp.generateTopKFrequentPatterns(
        new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
            "\\s+"),
        fp.generateFList(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                "\\s+"),
            minSupport),
        minSupport, 100000, returnableFeatures,
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {
            @Override
            public void collect(String key, List<Pair<List<String>, Long>> value) {
                for (Pair<List<String>, Long> v : value) {
                    List<String> l = v.getFirst();
                    results.put(Sets.newHashSet(l), v.getSecond());
                    System.out.println("found pat [" + v.getSecond() + "]: " + v.getFirst());
                }
            }
        }, new StatusUpdater() {
            @Override
            public void update(String status) {
            }
        });

    assertEquals(patternCnt_10_13, highestSupport(results, features_10_13));
    assertEquals(patternCnt_10_13_1669, highestSupport(results, returnableFeatures));
}
From source file:org.gpfvic.mahout.fpm.pfpgrowth.FPGrowthSyntheticDataTest.java
License:Apache License
/**
 * Cross-checks the two FPGrowth implementations on the synthetic data set at
 * minSupport=100: for every itemset either implementation reports, the highest
 * support found by the other (file-local highestSupport helper) must match.
 */
@Test
public void testVsWithSynthData() throws IOException {
    Collection<String> returnableFeatures = Sets.newHashSet();
    // not limiting features (or including too many) can cause
    // the test to run a very long time
    returnableFeatures.add("10");
    returnableFeatures.add("13");
    // returnableFeatures.add("1669");

    // Run #1: original FPGrowth, restricted to the returnable features.
    FPGrowth<String> fp1 = new FPGrowth<String>();
    final Map<Set<String>, Long> results1 = Maps.newHashMap();
    String inputFilename = "FPGsynth.dat";
    int minSupport = 100;
    fp1.generateTopKFrequentPatterns(
        new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
            "\\s+"),
        fp1.generateFList(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                "\\s+"),
            minSupport),
        minSupport, 1000000, returnableFeatures,
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {
            @Override
            public void collect(String key, List<Pair<List<String>, Long>> value) {
                for (Pair<List<String>, Long> v : value) {
                    List<String> l = v.getFirst();
                    results1.put(Sets.newHashSet(l), v.getSecond());
                    System.out.println("found pat [" + v.getSecond() + "]: " + v.getFirst());
                }
            }
        }, new StatusUpdater() {
            @Override
            public void update(String status) {
            }
        });

    // Run #2: fpgrowth2 implementation with NO feature restriction; filtered below.
    FPGrowthObj<String> fp2 = new FPGrowthObj<String>();
    final Map<Set<String>, Long> initialResults2 = Maps.newHashMap();
    fp2.generateTopKFrequentPatterns(
        new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
            "\\s+"),
        fp2.generateFList(
            new StringRecordIterator(new FileLineIterable(Resources.getResource(inputFilename).openStream()),
                "\\s+"),
            minSupport),
        minSupport, 1000000, Sets.<String>newHashSet(),
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {
            @Override
            public void collect(String key, List<Pair<List<String>, Long>> value) {
                for (Pair<List<String>, Long> v : value) {
                    List<String> l = v.getFirst();
                    initialResults2.put(Sets.newHashSet(l), v.getSecond());
                    System.out.println("found pat [" + v.getSecond() + "]: " + v.getFirst());
                }
            }
        }, new StatusUpdater() {
            @Override
            public void update(String status) {
            }
        });

    // Keep only run-#2 itemsets that touch at least one returnable feature,
    // so the unrestricted results are comparable with run #1.
    Map<Set<String>, Long> results2;
    if (returnableFeatures.isEmpty()) {
        results2 = initialResults2;
    } else {
        Map<Set<String>, Long> tmpResult = new HashMap<Set<String>, Long>();
        for (Map.Entry<Set<String>, Long> result2 : initialResults2.entrySet()) {
            Set<String> r2feats = result2.getKey();
            boolean hasSome = false;
            for (String rf : returnableFeatures) {
                if (r2feats.contains(rf)) {
                    hasSome = true;
                    break;
                }
            }
            if (hasSome) {
                tmpResult.put(result2.getKey(), result2.getValue());
            }
        }
        results2 = tmpResult;
    }

    // Direction 1: every itemset of #1 must have the same (highest) support in #2.
    boolean allMatch = true;
    int itemsetsChecked = 0;
    for (Map.Entry<Set<String>, Long> result1 : results1.entrySet()) {
        itemsetsChecked++;
        Set<String> feats = result1.getKey();
        long supp1 = result1.getValue();
        long supp2 = highestSupport(results2, feats);
        if (supp1 != supp2) {
            allMatch = false;
            System.out.println("mismatch checking results1 [ " + supp1 + " vs " + supp2 + "]: " + feats);
        }
    }
    System.out.println("checked " + itemsetsChecked + " itemsets iterating through #1");

    // Direction 2: and vice versa.
    itemsetsChecked = 0;
    for (Map.Entry<Set<String>, Long> result2 : results2.entrySet()) {
        itemsetsChecked++;
        Set<String> feats = result2.getKey();
        long supp2 = result2.getValue();
        long supp1 = highestSupport(results1, feats);
        if (supp1 != supp2) {
            allMatch = false;
            System.out.println("mismatch checking results2 [ " + supp1 + " vs " + supp2 + "]: " + feats);
        }
    }
    System.out.println("checked " + itemsetsChecked + " itemsets iterating through #2");
    assertTrue("Had mismatches!", allMatch);
}
From source file:org.gpfvic.mahout.fpm.pfpgrowth.PFPGrowthRetailDataTest.java
License:Apache License
@Override public void setUp() throws Exception { super.setUp(); params.set(PFPGrowth.MIN_SUPPORT, "100"); params.set(PFPGrowth.MAX_HEAPSIZE, "10000"); params.set(PFPGrowth.NUM_GROUPS, "50"); params.set(PFPGrowth.ENCODING, "UTF-8"); File inputDir = getTestTempDir("transactions"); File outputDir = getTestTempDir("frequentpatterns"); File input = new File(inputDir, "test.txt"); params.set(PFPGrowth.INPUT, input.getAbsolutePath()); params.set(PFPGrowth.OUTPUT, outputDir.getAbsolutePath()); Writer writer = Files.newWriter(input, Charsets.UTF_8); try {//from ww w.j a v a2s .co m StringRecordIterator it = new StringRecordIterator( new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"); Collection<List<String>> transactions = Lists.newArrayList(); while (it.hasNext()) { Pair<List<String>, Long> next = it.next(); transactions.add(next.getFirst()); } for (List<String> transaction : transactions) { String sep = ""; for (String item : transaction) { writer.write(sep + item); sep = ","; } writer.write("\n"); } } finally { Closeables.close(writer, false); } }
From source file:org.gpfvic.mahout.fpm.pfpgrowth.PFPGrowthRetailDataTest.java
License:Apache License
/**
 * Test Parallel FPGrowth on retail data using top-level runPFPGrowth() method.
 * Golden results come from retail_results_with_min_sup_100.dat, whose last token
 * per line is the pattern's support wrapped in parentheses, e.g. "(123)".
 */
@Test
public void testRetailDataMinSup100() throws Exception {
    // Parse the golden file: items followed by a trailing "(support)" token.
    StringRecordIterator it = new StringRecordIterator(
        new FileLineIterable(Resources.getResource("retail_results_with_min_sup_100.dat").openStream()),
        "\\s+");
    Map<Set<String>, Long> expectedResults = Maps.newHashMap();
    while (it.hasNext()) {
        Pair<List<String>, Long> next = it.next();
        List<String> items = Lists.newArrayList(next.getFirst());
        String supportString = items.remove(items.size() - 1);
        // Strip the surrounding parentheses from the support token.
        Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
        expectedResults.put(Sets.newHashSet(items), support);
    }

    // Run the full parallel pipeline (params populated in setUp()).
    PFPGrowth.runPFPGrowth(params, getConfiguration());
    List<Pair<String, TopKStringPatterns>> frequentPatterns = PFPGrowth.readFrequentPattern(params);

    // Flatten the per-feature top-k lists into one itemset -> support map.
    Map<Set<String>, Long> results = Maps.newHashMap();
    for (Pair<String, TopKStringPatterns> topK : frequentPatterns) {
        Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
        while (topKIt.hasNext()) {
            Pair<List<String>, Long> entry = topKIt.next();
            results.put(Sets.newHashSet(entry.getFirst()), entry.getSecond());
        }
    }

    // Diagnostic dump: spurious / invalid / matched itemsets (the assertion below
    // only compares sizes, so these prints are the detail on any failure).
    for (Entry<Set<String>, Long> entry : results.entrySet()) {
        Set<String> key = entry.getKey();
        if (expectedResults.get(key) == null) {
            System.out.println("spurious (1): " + key + " with " + entry.getValue());
        } else {
            if (!expectedResults.get(key).equals(results.get(entry.getKey()))) {
                System.out.println("invalid (1): " + key + ", expected: " + expectedResults.get(key)
                    + ", got: " + results.get(entry.getKey()));
            } else {
                System.out.println("matched (1): " + key + ", with: " + expectedResults.get(key));
            }
        }
    }
    for (Entry<Set<String>, Long> entry : expectedResults.entrySet()) {
        Set<String> key = entry.getKey();
        if (results.get(key) == null) {
            System.out.println("missing (1): " + key + " with " + entry.getValue());
        }
    }
    assertEquals(expectedResults.size(), results.size());
}
From source file:org.gpfvic.mahout.fpm.pfpgrowth.PFPGrowthRetailDataTest.java
License:Apache License
/**
 * Test Parallel FPG on retail data, running the pipeline stages individually
 * (parallel counting, f-list save, parallel FPGrowth, aggregation) instead of
 * the top-level runPFPGrowth() entry point. Golden results are parsed from
 * retail_results_with_min_sup_100.dat, whose last token per line is the
 * pattern's support wrapped in parentheses.
 */
@Test
public void testRetailDataMinSup100InSteps() throws Exception {
    // Parse the golden file: items followed by a trailing "(support)" token.
    StringRecordIterator it = new StringRecordIterator(
        new FileLineIterable(Resources.getResource("retail_results_with_min_sup_100.dat").openStream()),
        "\\s+");
    Map<Set<String>, Long> expectedResults = Maps.newHashMap();
    while (it.hasNext()) {
        Pair<List<String>, Long> next = it.next();
        List<String> items = Lists.newArrayList(next.getFirst());
        String supportString = items.remove(items.size() - 1);
        // Strip the surrounding parentheses from the support token.
        Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
        expectedResults.put(Sets.newHashSet(items), support);
    }

    Configuration conf = getConfiguration();

    // Stage 1: parallel counting, then persist the frequency list.
    log.info("Starting Parallel Counting Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
    PFPGrowth.startParallelCounting(params, conf);
    List<Pair<String, Long>> fList = PFPGrowth.readFList(params);
    PFPGrowth.saveFList(fList, params, conf);

    // Stage 2: compute group sizing (ceiling division of fList over numGroups).
    int numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);
    int maxPerGroup = fList.size() / numGroups;
    if (fList.size() % numGroups != 0) {
        maxPerGroup++;
    }
    params.set(PFPGrowth.MAX_PER_GROUP, Integer.toString(maxPerGroup));

    // Stage 3: mine per-group patterns, then aggregate them.
    PFPGrowth.startParallelFPGrowth(params, conf);
    log.info("Starting Pattern Aggregation Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
    PFPGrowth.startAggregating(params, conf);
    List<Pair<String, TopKStringPatterns>> frequentPatterns = PFPGrowth.readFrequentPattern(params);

    // Flatten the per-feature top-k lists into one itemset -> support map.
    Map<Set<String>, Long> results = Maps.newHashMap();
    for (Pair<String, TopKStringPatterns> topK : frequentPatterns) {
        Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
        while (topKIt.hasNext()) {
            Pair<List<String>, Long> entry = topKIt.next();
            results.put(Sets.newHashSet(entry.getFirst()), entry.getSecond());
        }
    }

    // Diagnostic dump: spurious / invalid / matched / missing itemsets (the
    // assertion below only compares sizes; these prints explain any failure).
    for (Entry<Set<String>, Long> entry : results.entrySet()) {
        Set<String> key = entry.getKey();
        if (expectedResults.get(key) == null) {
            System.out.println("spurious (2): " + key);
        } else {
            if (!expectedResults.get(key).equals(results.get(entry.getKey()))) {
                System.out.println("invalid (2): " + key + ", expected: " + expectedResults.get(key)
                    + ", got: " + results.get(entry.getKey()));
            } else {
                System.out.println("matched (2): " + key + ", with: " + expectedResults.get(key));
            }
        }
    }
    for (Entry<Set<String>, Long> entry : expectedResults.entrySet()) {
        Set<String> key = entry.getKey();
        if (results.get(key) == null) {
            System.out.println("missing: " + key);
        }
    }
    assertEquals(expectedResults.size(), results.size());
}