List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
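Before the collected examples below, here is a minimal sketch of a typical call site: a classic org.apache.hadoop.mapred driver that wires a reducer into a JobConf via setReducerClass. The class name SetReducerClassExample and the command-line input/output paths are illustrative assumptions; the mapper and reducer are Hadoop's built-in IdentityMapper and IdentityReducer.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetReducerClassExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetReducerClassExample.class);
        conf.setJobName("identity-copy");

        // Declare the final (reduce-side) output types.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(IdentityMapper.class);
        // setReducerClass wires the reduce implementation into the job.
        conf.setReducerClass(IdentityReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Hypothetical paths supplied on the command line.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}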
From source file:cascading.flow.FlowStep.java
License:Open Source License
protected JobConf getJobConf(JobConf parentConf) throws IOException {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf);

    // set values first so they can't break things downstream
    if (hasProperties()) {
        for (Map.Entry entry : getProperties().entrySet())
            conf.set(entry.getKey().toString(), entry.getValue().toString());
    }

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepName());

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapperClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(conf);
    initFromSink(conf);
    initFromTraps(conf);

    if (sink.getScheme().getNumSinkParts() != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(sink.getScheme().getNumSinkParts());
        else
            conf.setNumMapTasks(sink.getScheme().getNumSinkParts());
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    conf.setInt("cascading.flow.step.id", id);
    conf.set("cascading.flow.step", Util.serializeBase64(this));

    return conf;
}
From source file:cascading.flow.hadoop.HadoopFlowStep.java
License:Open Source License
public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);
    initFromSink(flowProcess, conf);
    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this, getGroup());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    if (isHadoopLocalMode(conf) || length < maxSize) { // seems safe
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
            HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf conf = new JobConf(defaultConf);

    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));

    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)), 10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}
From source file:cascading.flow.MapReduceFlowTest.java
License:Open Source License
public void testFlow() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf conf = new JobConf(defaultConf);

    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath1));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(flow.openSource(), 10);

    flow.complete();

    validateLength(flow.openSink(), 10);
}
From source file:cascading.flow.MapReduceFlowTest.java
License:Open Source License
public void testCascade() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true);
    Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true);
    Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(outputPath5, true)));
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(outputPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(outputPath1, true)));
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(outputPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    JobConf thirdConf = new JobConf(defaultConf);
    thirdConf.setJobName("third-mr");

    thirdConf.setOutputKeyClass(LongWritable.class);
    thirdConf.setOutputValueClass(Text.class);

    thirdConf.setMapperClass(IdentityMapper.class);
    thirdConf.setReducerClass(IdentityReducer.class);

    thirdConf.setInputFormat(TextInputFormat.class);
    thirdConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(thirdConf, new Path(remove(outputPath2, true)));
    FileOutputFormat.setOutputPath(thirdConf, new Path(remove(outputPath3, true)));

    Flow thirdMR = new MapReduceFlow(thirdConf, true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    // cascade.writeDOT( "mrcascade.dot" );

    cascade.complete();

    validateLength(thirdMR.openSink(), 10);
}
From source file:clusteringblocks.ClusteringBlocks.java
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, ClusteringBlocks.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("ClusteringBlocks");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);

    return 0;
}
From source file:cn.edu.xmu.dm.mapreduce.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(LongWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
}
From source file:cn.edu.xmu.dm.mapreduce.Sort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.acme.extensions.mr.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    String[] otherArgs = new GenericOptionsParser(job, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }

    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    JobClient.runJob(job);
}