List of usage examples for org.apache.mahout.common Parameters get
public String get(String key)
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Writes the given (String, Long) pairs to a sequence file named {@code F_LIST}
 * under the {@code OUTPUT} directory and registers that file with the
 * {@link DistributedCache}.
 *
 * <p>Any existing file at the target location is deleted first.
 *
 * @param flist  pairs to persist; the first element becomes the {@link Text} key,
 *               the second the {@link LongWritable} value
 * @param params job parameters; {@code OUTPUT} is read to locate the target directory
 * @param conf   Hadoop configuration used for file-system access and cache registration
 * @throws IOException if the file system cannot be accessed or the file cannot be written
 */
public static void saveFList(Iterable<Pair<String, Long>> flist, Parameters params, Configuration conf)
    throws IOException {
  Path unqualified = new Path(params.get(OUTPUT), F_LIST);
  FileSystem fileSystem = FileSystem.get(unqualified.toUri(), conf);
  Path target = fileSystem.makeQualified(unqualified);

  // Start from a clean slate: remove whatever a previous run left behind.
  HadoopUtil.delete(conf, target);

  SequenceFile.Writer out =
      new SequenceFile.Writer(fileSystem, conf, target, Text.class, LongWritable.class);
  try {
    for (Pair<String, Long> entry : flist) {
      Text key = new Text(entry.getFirst());
      LongWritable value = new LongWritable(entry.getSecond());
      out.append(key, value);
    }
  } finally {
    out.close();
  }

  DistributedCache.addCacheFile(target.toUri(), conf);
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Reads every frequent-pattern file matching {@code FILE_PATTERN} inside the
 * {@code FREQUENT_PATTERNS} directory under {@code OUTPUT} and concatenates
 * their contents.
 *
 * @param params job parameters; {@code OUTPUT} is read to locate the patterns directory
 * @return the patterns from all matching files, in the order the files were listed
 * @throws IOException if the file system cannot be accessed or a file cannot be read
 */
public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Parameters params)
    throws IOException {
  Configuration conf = new Configuration();
  Path patternsDir = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
  FileSystem fileSystem = FileSystem.get(patternsDir.toUri(), conf);

  Path glob = new Path(patternsDir, FILE_PATTERN);
  FileStatus[] matches = fileSystem.globStatus(glob);

  List<Pair<String, TopKStringPatterns>> patterns = Lists.newArrayList();
  for (int i = 0; i < matches.length; i++) {
    Path part = matches[i].getPath();
    patterns.addAll(FPGrowth.readFrequentPattern(conf, part));
  }
  return patterns;
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Configures and runs the aggregation job, which reads the {@code FPGROWTH}
 * directory under {@code OUTPUT} and writes its result to the
 * {@code FREQUENT_PATTERNS} directory under {@code OUTPUT}.
 *
 * <p>The output directory is deleted before the job is submitted.
 *
 * @param params job parameters; serialized into {@code conf} under {@code PFP_PARAMETERS}
 * @param conf   Hadoop configuration; modified in place with the compression settings
 * @throws IOException            if file-system access or job submission fails
 * @throws InterruptedException   if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws IllegalStateException  if the job completes unsuccessfully
 */
public static void startAggregating(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  // These settings must be in place before the Job is built from conf.
  conf.set(PFP_PARAMETERS, params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");

  Path source = new Path(params.get(OUTPUT), FPGROWTH);
  Job aggregation = new Job(conf, "PFP Aggregator Driver running over input: " + source);
  aggregation.setJarByClass(PFPGrowth.class);

  // Key/value types emitted by the job.
  aggregation.setOutputKeyClass(Text.class);
  aggregation.setOutputValueClass(TopKStringPatterns.class);

  // Input and output locations.
  FileInputFormat.addInputPath(aggregation, source);
  Path destination = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
  FileOutputFormat.setOutputPath(aggregation, destination);

  // Formats and processing classes.
  aggregation.setInputFormatClass(SequenceFileInputFormat.class);
  aggregation.setMapperClass(AggregatorMapper.class);
  aggregation.setCombinerClass(AggregatorReducer.class);
  aggregation.setReducerClass(AggregatorReducer.class);
  aggregation.setOutputFormatClass(SequenceFileOutputFormat.class);

  HadoopUtil.delete(conf, destination);

  if (!aggregation.waitForCompletion(true)) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Configures and runs the parallel counting job over the {@code INPUT} path,
 * writing (Text, LongWritable) records to the {@code PARALLEL_COUNTING}
 * directory under {@code OUTPUT}.
 *
 * <p>The output directory is deleted before the job is submitted.
 *
 * @param params job parameters; serialized into {@code conf} under {@code PFP_PARAMETERS}
 * @param conf   Hadoop configuration; modified in place with the compression settings
 * @throws IOException            if file-system access or job submission fails
 * @throws InterruptedException   if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws IllegalStateException  if the job completes unsuccessfully
 */
public static void startParallelCounting(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  // These settings must be in place before the Job is built from conf.
  conf.set(PFP_PARAMETERS, params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");

  String source = params.get(INPUT);
  Job counting = new Job(conf, "Parallel Counting Driver running over input: " + source);
  counting.setJarByClass(PFPGrowth.class);

  // Key/value types emitted by the job.
  counting.setOutputKeyClass(Text.class);
  counting.setOutputValueClass(LongWritable.class);

  // Input and output locations; stale output is removed up front.
  FileInputFormat.addInputPath(counting, new Path(source));
  Path destination = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
  FileOutputFormat.setOutputPath(counting, destination);
  HadoopUtil.delete(conf, destination);

  // Formats and processing classes.
  counting.setInputFormatClass(TextInputFormat.class);
  counting.setMapperClass(ParallelCountingMapper.class);
  counting.setCombinerClass(ParallelCountingReducer.class);
  counting.setReducerClass(ParallelCountingReducer.class);
  counting.setOutputFormatClass(SequenceFileOutputFormat.class);

  if (!counting.waitForCompletion(true)) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Configures and runs the parallel FP-Growth job over the {@code INPUT} path,
 * writing (Text, TopKStringPatterns) records to the {@code FPGROWTH} directory
 * under {@code OUTPUT}.
 *
 * <p>The output directory is deleted before the job is submitted.
 *
 * @param params job parameters; serialized into {@code conf} under {@code PFP_PARAMETERS}
 * @param conf   Hadoop configuration; modified in place with the compression settings
 * @throws IOException            if file-system access or job submission fails
 * @throws InterruptedException   if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws IllegalStateException  if the job completes unsuccessfully
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  // These settings must be in place before the Job is built from conf.
  conf.set(PFP_PARAMETERS, params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");

  Path source = new Path(params.get(INPUT));
  Job growth = new Job(conf, "PFP Growth Driver running over input" + source);
  growth.setJarByClass(PFPGrowth.class);

  // Intermediate (map-side) and final key/value types.
  growth.setMapOutputKeyClass(IntWritable.class);
  growth.setMapOutputValueClass(TransactionTree.class);
  growth.setOutputKeyClass(Text.class);
  growth.setOutputValueClass(TopKStringPatterns.class);

  // Input and output locations; stale output is removed up front.
  FileInputFormat.addInputPath(growth, source);
  Path destination = new Path(params.get(OUTPUT), FPGROWTH);
  FileOutputFormat.setOutputPath(growth, destination);
  HadoopUtil.delete(conf, destination);

  // Formats and processing classes.
  growth.setInputFormatClass(TextInputFormat.class);
  growth.setMapperClass(ParallelFPGrowthMapper.class);
  growth.setCombinerClass(ParallelFPGrowthCombiner.class);
  growth.setReducerClass(ParallelFPGrowthReducer.class);
  growth.setOutputFormatClass(SequenceFileOutputFormat.class);

  if (!growth.waitForCompletion(true)) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:com.cg.mapreduce.myfpgrowth.PFPGrowth.java
License:Apache License
/** * Serializes the fList and returns the string representation of the List *///from w ww.j a va 2 s . c o m public static void saveFList(List<Pair<String, Long>> fList, Parameters params, Configuration conf) throws IOException { Path flistPath = new Path(params.get(OUTPUT) + "/oldlist", F_LIST); FileSystem fs = FileSystem.get(flistPath.toUri(), conf); flistPath = fs.makeQualified(flistPath); HadoopUtil.delete(conf, flistPath); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, Pair.class); // set param to control group size in MR jobs int numGroups = params.getInt(NUM_GROUPS, NUM_GROUPS_DEFAULT); int maxPerGroup = fList.size() / numGroups; if (fList.size() % numGroups != 0) { maxPerGroup++; } params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup)); try { int group = 0; int count = 0; for (Pair<String, Long> pair : fList) { if (count == maxPerGroup) { group++; count = 0; } writer.append(new Text(pair.getFirst()), new Pair<Integer, Long>(group, pair.getSecond())); //writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond())); } } finally { writer.close(); } DistributedCache.addCacheFile(flistPath.toUri(), conf); }
From source file:com.cg.mapreduce.myfpgrowth.PFPGrowth.java
License:Apache License
/** * Count the frequencies of various features in parallel using Map/Reduce */// w ww. j a v a2s . co m public static void startParallelCounting(Parameters params, Configuration conf) throws IOException, InterruptedException, ClassNotFoundException { conf.set(PFP_PARAMETERS, params.toString()); conf.set("mapred.compress.map.output", "true"); conf.set("mapred.output.compression.type", "BLOCK"); String input = params.get(INPUT); Job job = new Job(conf, "Parallel Counting Driver running over input: " + input); job.setJarByClass(PFPGrowth.class); // Job job = initJob(conf); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); FileInputFormat.addInputPath(job, new Path(input)); Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING); FileOutputFormat.setOutputPath(job, outPath); HadoopUtil.delete(conf, outPath); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(ParallelCountingMapper.class); job.setCombinerClass(ParallelCountingReducer.class); job.setReducerClass(ParallelCountingReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } }
From source file:com.cg.mapreduce.myfpgrowth.PFPGrowth.java
License:Apache License
/** * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards *//*from w w w .j ava 2s . c o m*/ public static void startParallelFPGrowth(Parameters params, Configuration conf) throws IOException, InterruptedException, ClassNotFoundException { conf.set(PFP_PARAMETERS, params.toString()); conf.set("mapred.compress.map.output", "true"); conf.set("mapred.output.compression.type", "BLOCK"); Path input = new Path(params.get(INPUT)); Job job = new Job(conf, "PFP Growth Driver running over input" + input); job.setJarByClass(PFPGrowth.class); // Job job = initJob(conf); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(ArrayList.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); FileInputFormat.addInputPath(job, input); Path outPath = new Path(params.get(OUTPUT), FPGROWTH); FileOutputFormat.setOutputPath(job, outPath); HadoopUtil.delete(conf, outPath); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(ParallelFPGrowthMapper.class); //job.setCombinerClass(ParallelFPGrowthCombiner.class); job.setReducerClass(ParallelFPGrowthReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } }
From source file:com.skp.experiment.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/** * Read the Frequent Patterns generated from Text * /*from w ww. j av a 2 s. com*/ * @return List of TopK patterns for each string frequent feature */ public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Parameters params) throws IOException { Configuration conf = new Configuration(); Path frequentPatternsPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS); FileSystem fs = FileSystem.get(frequentPatternsPath.toUri(), conf); FileStatus[] outputFiles = fs.globStatus(new Path(frequentPatternsPath, FILE_PATTERN)); List<Pair<String, TopKStringPatterns>> ret = Lists.newArrayList(); for (FileStatus fileStatus : outputFiles) { ret.addAll(org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth.readFrequentPattern(conf, fileStatus.getPath())); } return ret; }
From source file:de.mpii.fsm.driver.FsmDriver.java
License:Apache License
/** * (non-Javadoc)/*from w w w. j av a 2 s . c o m*/ * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) * * Add the appropriate options here. Execute the MG-FSM algorithm * according to the parameters specified at run time. * * @param String[] args * @return int */ @Override public int run(String[] args) throws Exception { /* Here parameters that will be available to the user * during run time are specified and intialized. */ /* Hadooop-config options */ addOutputOption(); /*User-interesting options*/ addOption("input", "i", "(Optional) Specify the path from where the input is to be read" + "\n NOTE: This option can not be used with -(r)esume option.", null); addOption("support", "s", "(Optional) Minimum support (sigma) " + "\nDefault Value: 1\n", FsmConfig.SIGMA_DEFAULT_STRING); addOption("gamma", "g", "(Optional) Maximum allowed for mining frequent sequences (gamma)" + " by MG-FSM " + "\nDefault Value: 2\n", FsmConfig.GAMMA_DEFAULT_STRING); addOption("lambda", "l", "(Optional) Maximum length for mining frequent sequences (lambda)" + "\nDefault Value: 5\n", FsmConfig.LAMBDA_DEFAULT_STRING); addOption("execMode", "m", "Method of execution viz. s -(s)equential or d -(d)istributed" + "\nDefault Value: (s)-sequential\n", FsmConfig.DEFAULT_EXEC_MODE); addOption("type", "t", "(Optional) Specify the mining mode." + "\nExpected values for input:" + "\n1. a -(a)ll\n2. m -(m)aximal \n3. c -(c)losed" + "\nDefault Value : a -(a)ll\n", FsmConfig.DEFAULT_TYPE); /* keepFiles default value is null. * It will be set to a temporary location, in case * no path is specified.*/ addOption("keepFiles", "k", "(Optional) Keep the intermediary files " + "for later use or runs. The files stored are:" + "\n1. Dictionary \n2. 
Encoded Sequences \n " + "Specify the intermediate path where to keep these files :", null); /* resume points to the location where the * intermediary files are located*/ addOption("resume", "r", "(Optional) Resume running further " + "runs of the MG-FSM algorithm on" + " already encoded transaction file located in the folder specified in input.\n", null); /*Developer-interesting options*/ addOption("partitionSize", "p", "(Optional) Explicitly specify the partition size." + "\nDefault Value: 10000", FsmConfig.DEFAULT_PARTITION_SIZE); addOption("indexing", "id", "(Optional) Specify the indexing mode." + "\nExpected values for input:" + "\n1. none\n2. minmax \n3. full" + "\nDefault Value : full\n", FsmConfig.DEFAULT_INDEXING_METHOD); /* split flag is false by default*/ addFlag("split", "sp", "(Optional) Explicitly specify " + "whether or not to allow split by setting this flag."); addOption("numReducers", "N", "(Optional) Number of reducers to be used by MG-FSM. Default value: 90 ", "90"); /*------------------------------------------------------------ * ERROR CHECKS *------------------------------------------------------------*/ /* Parse the arguments received from * the user during run-time.*/ if (parseArguments(args) == null) { System.out.println("\n------------\n" + " E R R O R " + "\n------------\n"); System.out.println("One of the mandatory options is NOT specified"); System.out.println("e.g. the input option MUST be specified."); //Return a non-zero exit status to indicate failure return 1; } Parameters params = new Parameters(); if (hasOption("tempDir")) { String tempDirPath = getOption("tempDir"); params.set("tempDir", tempDirPath); } if (hasOption("input")) { String inputString = getOption("input"); params.set("input", inputString); } else { params.set("input", null); } if (hasOption("support")) { String supportString = getOption("support"); /* * Checks & constraints on the value that can * be assigned to support, gamma, & lambda. 
* * NOTE: refer [1] */ if (Integer.parseInt(supportString) < 1) { System.out.println("Value of support should be greater than or equal to 1"); //Return a non-zero exit status to indicate failure return (1); } params.set("support", supportString); } if (hasOption("gamma")) { String gammaString = getOption("gamma"); if (Integer.parseInt(gammaString) < 0) { System.out.println("Value of gap should be greater than or equal to 0"); //Return a non-zero exit status to indicate failure return (1); } params.set("gamma", gammaString); } if (hasOption("lambda")) { String lambdaString = getOption("lambda"); if (Integer.parseInt(lambdaString) < 2) { System.out.println("Value of length should be greater than or equal to 2"); //Return a non-zero exit status to indicate failure return (1); } params.set("lambda", lambdaString); } if (hasOption("execMode")) { String modeString = getOption("execMode"); params.set("execMode", modeString); } if (hasOption("type")) { String modeString = getOption("type"); params.set("type", modeString); } if (hasOption("indexing")) { String indexingString = getOption("indexing"); params.set("indexing", indexingString); } if (hasOption("partitionSize")) { String partitionString = getOption("partitionSize"); params.set("partitionSize", partitionString); } if (hasOption("split")) { params.set("split", "true"); } else { params.set("split", "false"); } if (hasOption("keepFiles")) { String keepFilesString = getOption("keepFiles"); params.set("keepFiles", keepFilesString); } else { params.set("keepFiles", null); } if (hasOption("resume")) { String resumeString = getOption("resume"); params.set("resume", resumeString); } else { params.set("resume", null); } if (hasOption("numReducers")) { String numReducersString = getOption("numReducers"); params.set("numReducers", numReducersString); } else { params.set("numReducers", null); } Path inputDir = null; Path outputDir = getOutputPath(); /* --------------------------------------------------------------------- * 
ERROR CHECKS ON COMBINATION OF OPTIONS SUPPLIED TO THE DRIVER * --------------------------------------------------------------------*/ //Complain if the '-(t)ype' is equal to '-(m)aximal' or '-(c)losed' and //the 'tempDir' is not specified /*if((params.get("tempDir")==null||params.get("tempDir").contentEquals("temp"))&& ((params.get("type").toCharArray()[0]=='m')||(params.get("type").toCharArray()[0]=='c'))){ System.out .println("If -(t)ype is -(m)aximal or -(c)losed then a -tempDir path must be specified"); }*/ if ((params.get("resume") != null) && (params.get("keepFiles") != null)) { System.out.println("-(r)esume & -(k)eepFiles are mutually exclusive options"); System.out.println("Exiting..."); //Return a non-zero exit status to indicate failure return (1); } if ((params.get("input") != null) && (params.get("resume") != null)) { System.out.println("-(r)esume & -(i)nput are mutually exclusive options"); System.out.println("Exiting..."); //Return a non-zero exit status to indicate failure return (1); } if ((params.get("input") == null) && (params.get("resume") == null)) { System.out.println("At least one option from -(i)nput or -(r)esume must be specified"); System.out.println("Exiting..."); //Return a non-zero exit status to indicate failure return (1); } else { if (params.get("input") != null) { inputDir = new Path(params.get("input")); } else { inputDir = new Path(params.get("resume")); } } /* --------------------------------------------------------------------- * Checks to make sure the i/o paths * exist and are consistent. 
* -------------------------------------------------------------------- */ Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); //If the output paths exist clean them up if (fs.exists(outputDir)) { System.out.println("Deleting existing output path"); fs.delete(outputDir, true); } //Create the necessary output paths afresh now fs.mkdirs(outputDir); //Complain if the input path doesn't exist if (!fs.exists(inputDir)) { System.out.println("\n------------\n" + " E R R O R " + "\n------------\n"); System.out.println("Input path does not exist OR input option not specified. Exiting..."); //Return a non-zero exit status to indicate failure return (1); } if (inputDir.toString().compareTo(outputDir.toString()) == 0) { System.out.println("\n------------\n" + " E R R O R " + "\n------------\n"); System.out.println("The input and output path can NOT be same." + "\nThe output path is deleted prior to running the Hadoop jobs." + "\nHence, the input would be also deleted if paths are same." + "\nExiting..."); //Return a non-zero exit status to indicate failure return (1); } params.set("input", inputDir.toString()); params.set("output", outputDir.toString()); /*--------------------------------------------------------------------- * END OF ERROR CHECKS * --------------------------------------------------------------------*/ /* Execute the FSM Job depending upon the parameters specified. */ String executionMethod = getOption("execMode"); //Set the resume and keepFiles flags in the commonConfig. //Also, set the intermediateOutput path accordingly. 
if (params.get("resume") != null) commonConfig.setResumeOption(true); else commonConfig.setResumeOption(false); if (params.get("keepFiles") != null) { commonConfig.setKeepFilesOption(true); Path intermediateDir = new Path(params.get("keepFiles")); if (fs.exists(intermediateDir)) { fs.delete(intermediateDir, true); } commonConfig.setIntermediatePath(params.get("keepFiles")); } else { File intermediateOutputPath = File.createTempFile("MG_FSM_INTRM_OP_", ""); //Below JDK 7 we are only allowed to create temporary files. //Hence, turn the file into a directory in temporary folder. intermediateOutputPath.delete(); intermediateOutputPath.mkdir(); commonConfig.setIntermediatePath(intermediateOutputPath.getAbsolutePath().toString()); System.out.println("The intermediate output will be written \n" + "to this temporary path :" + intermediateOutputPath); commonConfig.setKeepFilesOption(false); } //Set the 'tempDir' if its null if (params.get("tempDir") == null || params.get("tempDir").contentEquals("temp")) { File tempOutputPath = File.createTempFile("MG_FSM_TEMP_OP_", ""); tempOutputPath.delete(); //tempOutputPath.mkdir(); commonConfig.setTmpPath(tempOutputPath.getAbsolutePath().toString()); System.out.println("The temporary output associated with the internal map -reduce\n" + "jobs will be written to this temporary path :" + commonConfig.getTmpPath()); } else { commonConfig.setTmpPath(params.get("tempDir")); } //Set the input and output paths of the commonConfig commonConfig.setInputPath(params.get("input")); commonConfig.setOutputPath(params.get("output")); commonConfig.setDictionaryPath( commonConfig.getIntermediatePath().concat("/" + Constants.OUTPUT_DICTIONARY_FILE_PATH)); //Supply the rest of the algorithm specific options to commonConfig commonConfig.setSigma(Integer.parseInt(params.get("support"))); commonConfig.setGamma(Integer.parseInt(params.get("gamma"))); commonConfig.setLambda(Integer.parseInt(params.get("lambda"))); 
commonConfig.setPartitionSize(Long.parseLong(params.get("partitionSize"))); commonConfig.setAllowSplits(Boolean.parseBoolean(params.get("splits"))); if (params.get("numReducers") != null) { commonConfig.setNumberOfReducers(Integer.parseInt(params.get("numReducers"))); } switch (params.get("type").toCharArray()[0]) { case 'a': { commonConfig.setType(FsmConfig.Type.ALL); break; } case 'm': { commonConfig.setType(FsmConfig.Type.MAXIMAL); break; } case 'c': { commonConfig.setType(FsmConfig.Type.CLOSED); break; } default: { commonConfig.setType(FsmConfig.Type.ALL); break; } } switch (params.get("indexing").toCharArray()[0]) { case 'n': { commonConfig.setIndexingMethod(FsmConfig.IndexingMethod.NONE); break; } case 'm': { commonConfig.setIndexingMethod(FsmConfig.IndexingMethod.MINMAX); break; } case 'f': { commonConfig.setIndexingMethod(FsmConfig.IndexingMethod.FULL); break; } default: { commonConfig.setIndexingMethod(FsmConfig.IndexingMethod.FULL); break; } } //SEQUENTIAL EXECUTION MODE if ("s".equalsIgnoreCase(executionMethod)) { SequentialMode mySequentialMiner; mySequentialMiner = new SequentialMode(commonConfig); // If we are dealing with a fresh set of transactions // we need to do encode & then mine. if (!commonConfig.isResumeOption()) { mySequentialMiner.createDictionary(commonConfig.getInputPath()); mySequentialMiner.createIdToItemMap(); //If the input path is a corpus //runSeqJob will recursively call encodeAndMine() //on all the files to bring together a encoded sequences file //and consequently call the sequences miner on each of these //encoded sequences mySequentialMiner.runSeqJob(new File(commonConfig.getInputPath())); } /* * If the transactions are encoded from previous runs, then run * the following set of functions for reading the encoded transactions * and then directly mine them for frequent sequences. 
*/ else { mySequentialMiner.setIdToItemMap(new Dictionary().readDictionary( commonConfig.getInputPath().concat("/" + Constants.OUTPUT_DICTIONARY_FILE_PATH))); mySequentialMiner.encodeAndMine(mySequentialMiner.getCommonConfig().getInputPath()); } } //DISTRIBUTED EXECUTION MODE else if ("d".equalsIgnoreCase(executionMethod)) { DistributedMode myDistributedMiner = new DistributedMode(commonConfig); /*Execute the appropriate job based on whether we need to * encode the input sequences or not. */ if (!commonConfig.isResumeOption()) myDistributedMiner.runJobs(); else myDistributedMiner.resumeJobs(); } //END OF EXECUTING FSM JOB //Return a zero exit status to indicate successful completion return 0; }