List of usage examples for org.apache.hadoop.mapreduce.Job.setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
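Before the examples, a minimal sketch of the typical pattern (not taken from any of the sources below; TokenizerMapper and the argument paths are placeholders): the combiner runs on map output before the shuffle, so the class passed to setCombinerClass must consume and emit the mapper's output key/value types, which is why many of the examples below simply reuse their reducer class as the combiner.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountDriver.class);
    // TokenizerMapper is a hypothetical mapper emitting <Text, IntWritable> pairs
    job.setMapperClass(TokenizerMapper.class);
    // local map-side aggregation; IntSumReducer consumes and emits <Text, IntWritable>,
    // so the same class can serve as both combiner and reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}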
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards
 *
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startParallelFPGrowth(Parameters params)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();
  conf.set("pfp.parameters", params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");
  String input = params.get("output") + "/sortedoutput";
  Job job = new Job(conf, "PFP Growth Driver running over input" + input);
  job.setJarByClass(PFPGrowth.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(TransactionTree.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(TopKStringPatterns.class);
  FileInputFormat.addInputPath(job, new Path(input));
  Path outPath = new Path(new Path(params.get("output")), "fpgrowth");
  FileOutputFormat.setOutputPath(job, outPath);
  HadoopUtil.overwriteOutput(outPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(ParallelFPGrowthMapper.class);
  job.setCombinerClass(ParallelFPGrowthCombiner.class);
  job.setReducerClass(ParallelFPGrowthReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.waitForCompletion(true);
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * Run the aggregation Job to aggregate the different TopK patterns and group each Pattern by the
 * features present in it and thus calculate the final Top K frequent Patterns for each feature
 */
public static void startAggregating(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {

  conf.set(PFP_PARAMETERS, params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");

  // YA
  // if(Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))){
  //   conf.set("mapred.tasktracker.map.tasks.maximum", "6");
  //   conf.set("mapred.map.child.java.opts", "-Xmx1000M");
  //   conf.set("mapred.tasktracker.reduce.tasks.maximum", "6");
  //   conf.set("mapred.reduce.child.java.opts", "-Xmx1000M");
  // }
  conf.setInt("mapred.max.map.failures.percent", 10);
  conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");
  // END YA

  String gfisIn = params.get(PFPGrowth.GROUP_FIS_IN, params.get(OUTPUT));
  Path input = new Path(gfisIn, FPGROWTH);

  Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
  job.setJarByClass(PFPGrowth.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(TopKStringPatterns.class);

  FileInputFormat.addInputPath(job, input);
  Path outPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
  FileOutputFormat.setOutputPath(job, outPath);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(AggregatorMapper.class);
  job.setCombinerClass(AggregatorReducer.class);
  job.setReducerClass(AggregatorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  // HadoopUtil.delete(conf, outPath);
  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * Count the frequencies of various features in parallel using Map/Reduce
 */
public static void startParallelCounting(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf.set(PFP_PARAMETERS, params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");
  // if(Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))){
  //   conf.set("mapred.tasktracker.map.tasks.maximum", "3");
  //   conf.set("mapred.tasktracker.reduce.tasks.maximum", "3");
  //   conf.set("mapred.map.child.java.opts", "-Xmx777M");
  //   conf.set("mapred.reduce.child.java.opts", "-Xmx777M");
  //   conf.setInt("mapred.max.map.failures.percent", 0);
  // }
  conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");

  // String input = params.get(INPUT);
  // Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);

  long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
  // Long.toString(PFPGrowth.TREC2011_MIN_TIMESTAMP)); // GMT23JAN2011));
  long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
  // Long.toString(Long.MAX_VALUE));
  long windowSize = Long.parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
  long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
  endTime = Math.min(endTime, startTime + windowSize);

  FileSystem fs = FileSystem.get(conf); // TODONE: do I need? getLocal(conf);

  // cast to double before dividing so the ceiling is taken on the real quotient
  Job[] jobArr = new Job[(int) Math.ceil((double) windowSize / stepSize)];
  for (int j = 0; startTime < endTime; startTime += stepSize, ++j) {
    long jobEnd = startTime + stepSize;
    Job job = new Job(conf, "Parallel counting running over interval " + startTime + "-" + jobEnd); // endTime);

    // Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
    Path outRoot = new Path(params.get(OUTROOT));
    Path stepOutput = new Path(outRoot, startTime + "");
    stepOutput = new Path(stepOutput, jobEnd + "");
    if (fs.exists(stepOutput)) {
      continue;
    }
    jobArr[j] = job;
    Path outPath = new Path(stepOutput, PARALLEL_COUNTING);
    FileOutputFormat.setOutputPath(job, outPath);
    // HadoopUtil.delete(conf, outPath);

    job.setJarByClass(PFPGrowth.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    PartitionByTimestamp.setInputPaths(job, params, conf);
    // FileInputFormat.addInputPath(job, new Path(input));

    // job.setInputFormatClass(HtmlTweetInputFormat.class);
    job.setInputFormatClass(CSVTweetInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.submit();

    // boolean succeeded = job.waitForCompletion(true);
    // if (!succeeded) {
    //   throw new IllegalStateException("Job failed!");
    // }
  }

  boolean allCompleted;
  do {
    Thread.sleep(1000);
    allCompleted = true;
    for (int j = 0; j < jobArr.length; ++j) {
      if (jobArr[j] == null) {
        continue;
      }
      boolean complete = jobArr[j].isComplete();
      allCompleted &= complete;
      if (!complete) {
        String report = (j + " (" + jobArr[j].getJobName() + "): map "
            + StringUtils.formatPercent(jobArr[j].mapProgress(), 0) + " reduce "
            + StringUtils.formatPercent(jobArr[j].reduceProgress(), 0) + " - Tracking: "
            + jobArr[j].getTrackingURL());
        LOG.info(report);
      }
    }
  } while (!allCompleted);

  boolean allSuccess = true;
  for (int j = 0; j < jobArr.length; ++j) {
    if (jobArr[j] == null) {
      continue;
    }
    boolean success = jobArr[j].isSuccessful();
    allSuccess &= success;
    if (!success) {
      String report = (j + " (" + jobArr[j].getJobName() + "): FAILED - Tracking: "
          + jobArr[j].getTrackingURL());
      LOG.info(report);
    }
  }
  if (!allSuccess) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf.set(PFP_PARAMETERS, params.toString());
  conf.set("mapred.compress.map.output", "true");
  conf.set("mapred.output.compression.type", "BLOCK");

  // YA
  // if(Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))){
  //   conf.set("mapred.tasktracker.map.tasks.maximum", "6");
  //   conf.set("mapred.map.child.java.opts", "-Xmx1000M");
  //   conf.set("mapred.tasktracker.reduce.tasks.maximum", "6");
  //   conf.set("mapred.reduce.child.java.opts", "-Xmx1000M");
  // }
  conf.setInt("mapred.max.map.failures.percent", 10);
  conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");
  // END YA

  long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
  long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
  long windowSize = Long.parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
  long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
  endTime = Math.min(endTime, startTime + windowSize);

  Job job = new Job(conf, "PFPGrowth running over interval " + startTime + "-" + endTime);
  job.setJarByClass(PFPGrowth.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(TransactionTree.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(TopKStringPatterns.class);

  // FileSystem fs = FileSystem.get(conf); // TODONE: do I need? getLocal(conf);
  PartitionByTimestamp.setInputPaths(job, params, conf);
  // FileInputFormat.addInputPath(job, input);
  Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
  FileOutputFormat.setOutputPath(job, outPath);
  // HadoopUtil.delete(conf, outPath);

  // job.setInputFormatClass(HtmlTweetInputFormat.class);
  job.setInputFormatClass(CSVTweetInputFormat.class);
  if (runMode.equals(RunningMode.BlockUpdate)) {
    job.setMapperClass(ParallelFPStreamMapper.class);
    // job.setCombinerClass(ParallelFPStreamCombiner.class);
    job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPStreamReducer.class);
  } else {
    job.setMapperClass(ParallelFPGrowthMapper.class);
    job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
  }
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:org.apache.mahout.ga.watchmaker.cd.hadoop.CDMahoutEvaluator.java
License:Apache License
/**
 * Configure the job
 *
 * @param job Job to configure
 * @param rules classification rules to evaluate
 * @param target label value to evaluate the rules for
 * @param inpath input path (the dataset)
 * @param outpath output <code>Path</code>
 * @param split DatasetSplit used to separate training and testing input
 * @throws IOException
 */
private static void configureJob(Job job, List<? extends Rule> rules, int target, Path inpath, Path outpath,
    DatasetSplit split) throws IOException {
  split.storeJobParameters(job.getConfiguration());

  FileInputFormat.setInputPaths(job, inpath);
  FileOutputFormat.setOutputPath(job, outpath);

  job.setJarByClass(CDMahoutEvaluator.class);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(CDFitness.class);

  job.setMapperClass(CDMapper.class);
  job.setCombinerClass(CDReducer.class);
  job.setReducerClass(CDReducer.class);

  job.setInputFormatClass(DatasetTextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  // store the parameters
  Configuration conf = job.getConfiguration();
  conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
  conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
  conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}
From source file:org.apache.mahout.ga.watchmaker.cd.tool.CDInfosTool.java
License:Apache License
/**
 * Configure the job
 *
 * @param job Job to configure
 * @param descriptors attributes' descriptors
 * @param inpath input <code>Path</code>
 * @param outpath output <code>Path</code>
 * @throws IOException
 */
private static void configureJob(Job job, Descriptors descriptors, Path inpath, Path outpath)
    throws IOException {
  FileInputFormat.setInputPaths(job, inpath);
  FileOutputFormat.setOutputPath(job, outpath);

  job.setJarByClass(CDInfosTool.class);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);

  job.setMapperClass(ToolMapper.class);
  job.setCombinerClass(ToolCombiner.class);
  job.setReducerClass(ToolReducer.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  // store the stringified descriptors
  job.getConfiguration().set(ToolMapper.ATTRIBUTES, StringUtils.toString(descriptors.getChars()));
}
From source file:org.apache.mahout.graph.AdjacencyMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

  addOption("vertices", null, "a text file containing all vertices of the graph (one per line)", true);
  addOption("edges", null, "text files containing the edges of the graph (vertexA,vertexB per line)", true);
  addOption("symmetric", null, "produce a symmetric adjacency matrix (corresponds to an undirected graph)",
      String.valueOf(false));
  addOutputOption();

  Map<String, List<String>> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  Path vertices = new Path(getOption("vertices"));
  Path edges = new Path(getOption("edges"));
  boolean symmetric = Boolean.parseBoolean(getOption("symmetric"));

  log.info("Indexing vertices sequentially, this might take a while...");
  int numVertices = indexVertices(vertices, getOutputPath(VERTEX_INDEX));
  HadoopUtil.writeInt(numVertices, getOutputPath(NUM_VERTICES), getConf());
  Preconditions.checkArgument(numVertices > 0);

  log.info("Found " + numVertices + " vertices, creating adjacency matrix...");
  Job createAdjacencyMatrix = prepareJob(edges, getOutputPath(ADJACENCY_MATRIX), TextInputFormat.class,
      VectorizeEdgesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
      IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
  createAdjacencyMatrix.setCombinerClass(VectorSumReducer.class);
  Configuration createAdjacencyMatrixConf = createAdjacencyMatrix.getConfiguration();
  createAdjacencyMatrixConf.set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
  createAdjacencyMatrixConf.set(VERTEX_INDEX_PARAM, getOutputPath(VERTEX_INDEX).toString());
  createAdjacencyMatrixConf.setBoolean(SYMMETRIC_PARAM, symmetric);

  boolean succeeded = createAdjacencyMatrix.waitForCompletion(true);
  return succeeded ? 0 : -1;
}
From source file:org.apache.mahout.graph.common.DegreeDistributionJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

  addInputOption();
  addOutputOption();

  Map<String, String> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  Job degreesPerVertex = prepareJob(getInputPath(), getTempPath(TMP_DEGREES_PER_VERTEX),
      SequenceFileInputFormat.class, DegreeOfVertexMapper.class, Vertex.class, IntWritable.class,
      IntSumReducer.class, Vertex.class, IntWritable.class, SequenceFileOutputFormat.class);
  degreesPerVertex.setCombinerClass(IntSumReducer.class);
  degreesPerVertex.waitForCompletion(true);

  Job degreeDistribution = prepareJob(getTempPath(TMP_DEGREES_PER_VERTEX), getOutputPath(),
      SequenceFileInputFormat.class, DegreesMapper.class, IntWritable.class, IntWritable.class,
      IntSumReducer.class, IntWritable.class, IntWritable.class, TextOutputFormat.class);
  degreeDistribution.setCombinerClass(IntSumReducer.class);
  degreeDistribution.waitForCompletion(true);

  return 0;
}
From source file:org.apache.mahout.graph.common.LocalClusteringCoefficientJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

  addOption("edges", "e", "path to the edges of the input graph", true);
  addOption("triangles", "t", "path to the triangles of the input graph", true);
  addOutputOption();

  Map<String, String> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  Path edgesPath = new Path(parsedArgs.get("--edges"));
  Path trianglesPath = new Path(parsedArgs.get("--triangles"));

  // unfortunately we don't have access to an undeprecated MultipleInputs,
  // so we need several M/R steps instead of one...
  Job countEdgesPerVertex = prepareJob(edgesPath, getTempPath(TMP_EDGES_PER_VERTEX),
      SequenceFileInputFormat.class, EdgeCountMapper.class, Vertex.class, TriangleOrEdgeCount.class,
      Reducer.class, Vertex.class, TriangleOrEdgeCount.class, SequenceFileOutputFormat.class);
  countEdgesPerVertex.setCombinerClass(TriangleOrEdgeCountCombiner.class);
  countEdgesPerVertex.waitForCompletion(true);

  Job countTrianglesPerVertex = prepareJob(trianglesPath, getTempPath(TMP_TRIANGLES_PER_VERTEX),
      SequenceFileInputFormat.class, TriangleCountMapper.class, Vertex.class, TriangleOrEdgeCount.class,
      Reducer.class, Vertex.class, TriangleOrEdgeCount.class, SequenceFileOutputFormat.class);
  countTrianglesPerVertex.setCombinerClass(TriangleOrEdgeCountCombiner.class);
  countTrianglesPerVertex.waitForCompletion(true);

  Job computeLocalClusteringCoefficient = prepareJob(
      getCombinedTempPath(TMP_EDGES_PER_VERTEX, TMP_TRIANGLES_PER_VERTEX), getOutputPath(),
      SequenceFileInputFormat.class, Mapper.class, Vertex.class, TriangleOrEdgeCount.class,
      LocalClusteringCoefficientReducer.class, LongWritable.class, DoubleWritable.class,
      TextOutputFormat.class);
  computeLocalClusteringCoefficient.setCombinerClass(TriangleOrEdgeCountCombiner.class);
  computeLocalClusteringCoefficient.waitForCompletion(true);

  return 0;
}
From source file:org.apache.mahout.graph.components.FindKTrussesJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

  addInputOption();
  addOutputOption();
  addOption("k", "k", "The k parameter of the k-trusses to find.");

  Map<String, String> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  Path inputPath = getInputPath();
  Path outputPath = getOutputPath();
  Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
  int k = Integer.parseInt(parsedArgs.get("--k")); // extract parameter

  AtomicInteger currentPhase = new AtomicInteger();
  Configuration conf = new Configuration();

  Path simplifyInputPath = inputPath;
  Path simplifyOutputPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis()));

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    /*
     * Simplify the graph first
     */
    SimplifyGraphJob simplifyGraphJob = new SimplifyGraphJob();
    simplifyGraphJob.setConf(conf);
    simplifyGraphJob.run(new String[] { "--input", simplifyInputPath.toString(), "--output",
        simplifyOutputPath.toString(), "--tempDir", tempDirPath.toString() });
  }

  Path currentTrussesDirPath = simplifyOutputPath;

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    while (true) {
      /*
       * Augment the simplified graph with degrees
       */
      // scatter the edges to each of the vertices and count degree
      Path augmentInputPath = currentTrussesDirPath;
      Path augmentOutputPath = new Path(tempDirPath, "augment" + String.valueOf(System.currentTimeMillis()));

      AugmentGraphWithDegreesJob augmentGraphWithDegreesJob = new AugmentGraphWithDegreesJob();
      augmentGraphWithDegreesJob.setConf(conf);
      augmentGraphWithDegreesJob.run(new String[] { "--input", augmentInputPath.toString(), "--output",
          augmentOutputPath.toString(), "--tempDir",
          new Path(tempDirPath, String.valueOf(System.currentTimeMillis())).toString(), });

      /*
       * Enumerate triangles in the graph
       */
      Path enumerateInputPath = augmentOutputPath;
      // scatter the edges to lower degree vertex and build open triads
      Path enumerateOutputPath = new Path(tempDirPath,
          "enumerate" + String.valueOf(System.currentTimeMillis()));

      EnumerateTrianglesJob enumerateTrianglesJob = new EnumerateTrianglesJob();
      enumerateTrianglesJob.setConf(conf);
      enumerateTrianglesJob.run(new String[] { "--input", enumerateInputPath.toString(), "--output",
          enumerateOutputPath.toString(), "--tempDir",
          new Path(tempDirPath, String.valueOf(System.currentTimeMillis())).toString(), });

      /*
       * Drop edges with insufficient support
       */
      Path checkSupportInputPath = enumerateOutputPath;
      Path checkSupportOutputPath = new Path(tempDirPath,
          "support" + String.valueOf(System.currentTimeMillis()));

      Job checkTrianglesForSupport = prepareJob(checkSupportInputPath, checkSupportOutputPath,
          SequenceFileInputFormat.class, SplitTrianglesToEdgesMapper.class, UndirectedEdge.class,
          IntWritable.class, DropUnsupportedEdgesReducer.class, UndirectedEdge.class, NullWritable.class,
          SequenceFileOutputFormat.class);
      checkTrianglesForSupport.setCombinerClass(IntSumReducer.class);
      checkTrianglesForSupport.getConfiguration().setInt(K, k);
      checkTrianglesForSupport.waitForCompletion(true);

      currentTrussesDirPath = checkSupportOutputPath;

      long droppedEdges = checkTrianglesForSupport.getCounters().findCounter(Counter.DROPPED_EDGES).getValue();
      log.info("{} edges were dropped", droppedEdges);

      if (droppedEdges == 0L) {
        break;
      }
    }
  }

  Path componentsInputPath = new Path(tempDirPath, "converted" + String.valueOf(System.currentTimeMillis()));

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    /*
     * Prepare the input for FindComponents
     */
    Job convertFormat = prepareJob(currentTrussesDirPath, componentsInputPath, SequenceFileInputFormat.class,
        PrepareInputMapper.class, Vertex.class, FlaggedVertex.class, Reducer.class, Vertex.class,
        FlaggedVertex.class, SequenceFileOutputFormat.class);
    convertFormat.waitForCompletion(true);
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    /*
     * Find the components of the remaining graph
     */
    FindComponentsJob componentsJob = new FindComponentsJob();
    componentsJob.setConf(conf);
    componentsJob.run(new String[] { "--input", componentsInputPath.toString(), "--output",
        outputPath.toString(), "--tempDir", tempDirPath.toString(), });
  }

  return 0;
}