List of usage examples for the FileLineIterable constructor of org.apache.mahout.common.iterator.FileLineIterable
public FileLineIterable(InputStream is, Charset encoding, boolean skipFirstLine)
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.FPGrowthDriver.java
License:Apache License
private static void runFPGrowth(Parameters params) throws IOException { log.info("Starting Sequential FPGrowth"); int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50")); int minSupport = Integer.valueOf(params.get("minSupport", "3")); Path output = new Path(params.get("output", "output.txt")); Path input = new Path(params.get("input")); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(output.toUri(), conf); Charset encoding = Charset.forName(params.get("encoding")); String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString()); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, TopKStringPatterns.class); FSDataInputStream inputStream = null; FSDataInputStream inputStreamAgain = null; Collection<String> features = Sets.newHashSet(); if ("true".equals(params.get(PFPGrowth.USE_FPG2))) { com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String> fp = new com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String>(); try {//w ww . 
j a v a2 s .c o m inputStream = fs.open(input); inputStreamAgain = fs.open(input); fp.generateTopKFrequentPatterns( new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern), fp.generateFList(new StringRecordIterator( new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport), minSupport, maxHeapSize, features, new StringOutputConverter( new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)), new ContextStatusUpdater(null)); } finally { Closeables.close(writer, false); Closeables.close(inputStream, true); Closeables.close(inputStreamAgain, true); } } else { FPGrowth<String> fp = new FPGrowth<String>(); inputStream = fs.open(input); inputStreamAgain = fs.open(input); try { fp.generateTopKFrequentPatterns( new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern), fp.generateFList(new StringRecordIterator( new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport), minSupport, maxHeapSize, features, new StringOutputConverter( new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)), new ContextStatusUpdater(null)); } finally { Closeables.close(writer, false); Closeables.close(inputStream, true); Closeables.close(inputStreamAgain, true); } } List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(conf, output); for (Pair<String, TopKStringPatterns> entry : frequentPatterns) { log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond()); } }
From source file:com.skp.experiment.fpm.pfpgrowth.FPGrowthDriver.java
License:Apache License
/**
 * Runs FP-Growth sequentially in the current JVM and logs the resulting patterns.
 *
 * <p>Parameters read from {@code params}:
 * <ul>
 *   <li>{@code maxHeapSize} - integer, defaults to {@code "50"}</li>
 *   <li>{@code minSupport} - integer, defaults to {@code "3"}</li>
 *   <li>{@code output} - path of the sequence file to write, defaults to {@code "output.txt"}</li>
 *   <li>{@code input} - name of the file to read, opened via {@code java.io.File} (no default)</li>
 *   <li>{@code encoding} - charset name used to decode the input (no default)</li>
 *   <li>{@code splitPattern} - record splitter, defaults to {@code PFPGrowth.SPLITTER}</li>
 *   <li>{@code useFPG2} - when {@code "true"}, the fpgrowth2 implementation is used</li>
 * </ul>
 *
 * <p>Two separate {@code FileLineIterable}s are created over the input because one is handed
 * to {@code generateFList} and another to {@code generateTopKFrequentPatterns}.
 *
 * @param params driver parameters as listed above
 * @throws IOException if reading the input, or writing/closing the output, fails
 */
private static void runFPGrowth(Parameters params) throws IOException {
  log.info("Starting Sequential FPGrowth");
  int maxHeapSize = Integer.parseInt(params.get("maxHeapSize", "50"));
  int minSupport = Integer.parseInt(params.get("minSupport", "3"));

  String output = params.get("output", "output.txt");
  Path path = new Path(output);

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(path.toUri(), conf);

  Charset encoding = Charset.forName(params.get("encoding"));
  String input = params.get("input");
  String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, conf, path, Text.class, TopKStringPatterns.class);
  Collection<String> features = new HashSet<String>();

  // Tracks whether the try body failed, so that a secondary failure while closing the
  // writer does not hide the original exception.
  boolean threw = true;
  try {
    if ("true".equals(params.get("useFPG2"))) {
      org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String> fp =
          new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String>();
      fp.generateTopKFrequentPatterns(
          new StringRecordIterator(
              new FileLineIterable(new File(input), encoding, false), pattern),
          fp.generateFList(
              new StringRecordIterator(
                  new FileLineIterable(new File(input), encoding, false), pattern),
              minSupport),
          minSupport,
          maxHeapSize,
          features,
          new StringOutputConverter(
              new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
          new ContextStatusUpdater(null));
    } else {
      org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String> fp =
          new org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String>();
      fp.generateTopKFrequentPatterns(
          new StringRecordIterator(
              new FileLineIterable(new File(input), encoding, false), pattern),
          fp.generateFList(
              new StringRecordIterator(
                  new FileLineIterable(new File(input), encoding, false), pattern),
              minSupport),
          minSupport,
          maxHeapSize,
          features,
          new StringOutputConverter(
              new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
          new ContextStatusUpdater(null));
    }
    threw = false;
  } finally {
    // The output file is read back below, so a failed close on the success path must be
    // reported rather than silently swallowed. It is swallowed only when the body already
    // failed, so the original exception reaches the caller.
    Closeables.close(writer, threw);
  }

  List<Pair<String, TopKStringPatterns>> frequentPatterns =
      org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth.readFrequentPattern(conf, path);
  for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
    log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
  }
}