List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
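A combiner must extend Reducer, and because the framework may apply it zero, one, or several times on the map side, its input and output key/value types must both match the map output types and its logic should be commutative and associative. Below is a minimal word-count sketch, assuming the standard TokenizerMapper and IntSumReducer classes from the Hadoop WordCount tutorial are available on the classpath; like most of the examples that follow, it simply reuses the reducer as the combiner.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(TokenizerMapper.class);   // assumed mapper emitting (Text word, IntWritable 1)
        job.setCombinerClass(IntSumReducer.class);   // combiner types match the map output types
        job.setReducerClass(IntSumReducer.class);    // assumed reducer summing IntWritable counts
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}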
From source file:csc555.ebratt.depaul.edu.GildPercentDriverPass1.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument.
 *
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the GildPercentReducerPass1.class as
 *            the combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String groupBy = getConf().get("groupBy");
    StringBuffer sb = new StringBuffer();
    sb.append("gild percent of: ");
    sb.append(groupBy);
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // testing -- ensure each node gets 2 reducers
    JobConf jobConf = new JobConf(getConf(), GildPercentDriverPass1.class);
    JobClient jobClient = new JobClient(jobConf);
    ClusterStatus cluster = jobClient.getClusterStatus();
    job.setNumReduceTasks(cluster.getTaskTrackers() * 2);

    // Mapper and Reducer Classes to use
    job.setMapperClass(GildPercentMapperPass1.class);
    job.setReducerClass(GildPercentReducerPass1.class);

    // Mapper output classes
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(GildPercentReducerPass1.class);
    }

    // The Jar file to run
    job.setJarByClass(GildPercentDriverPass1.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:csc555.ebratt.depaul.edu.GildPercentDriverPass2.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument.
 *
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the GildPercentReducerPass2.class as
 *            the combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    StringBuffer sb = new StringBuffer();
    sb.append("sorted gild percent");
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(GildPercentMapperPass2.class);
    job.setReducerClass(GildPercentReducerPass2.class);

    // Mapper output classes
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(GildPercentReducerPass2.class);
    }

    // sort in descending order
    job.setSortComparatorClass(DoubleWritableDescendingComparator.class);

    // The Jar file to run
    job.setJarByClass(GildPercentDriverPass2.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:csc555.ebratt.depaul.edu.RCWordCountAcronymsDriver.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the 4th argument.
 *
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [3] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the RCWordCountReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String aggregate = getConf().get("aggregate");
    String groupBy = getConf().get("groupBy");
    StringBuffer sb = new StringBuffer();
    sb.append("count of acronyms in: ");
    sb.append(aggregate);
    sb.append("; grouped by: ");
    sb.append(groupBy);
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // testing -- ensure each node gets 2 reducers
    JobConf jobConf = new JobConf(getConf(), RCWordCountAcronymsDriver.class);
    JobClient jobClient = new JobClient(jobConf);
    ClusterStatus cluster = jobClient.getClusterStatus();
    job.setNumReduceTasks(cluster.getTaskTrackers() * 2);

    // Mapper and Reducer Classes to use
    job.setMapperClass(RCWordCountMapper.class);
    job.setReducerClass(RCWordCountReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[3].equals("yes")) {
        job.setCombinerClass(RCWordCountReducer.class);
    }

    // The Jar file to run
    job.setJarByClass(RCWordCountAcronymsDriver.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:csc555.ebratt.depaul.edu.RCWordCountDriver.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the 4th argument.
 *
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [3] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the RCWordCountReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String aggregate = getConf().get("aggregate");
    String groupBy = getConf().get("groupBy");
    StringBuffer sb = new StringBuffer();
    sb.append("count of: ");
    sb.append(aggregate);
    sb.append("; grouped by: ");
    sb.append(groupBy);
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // testing -- ensure each node gets 5 reducers
    JobConf jobConf = new JobConf(getConf(), RCWordCountDriver.class);
    JobClient jobClient = new JobClient(jobConf);
    ClusterStatus cluster = jobClient.getClusterStatus();
    job.setNumReduceTasks(cluster.getTaskTrackers() * 5);

    // Mapper and Reducer Classes to use
    job.setMapperClass(RCWordCountMapper.class);
    job.setReducerClass(RCWordCountReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[3].equals("yes")) {
        job.setCombinerClass(RCWordCountReducer.class);
    }

    // The Jar file to run
    job.setJarByClass(RCWordCountDriver.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:csc555.ebratt.depaul.edu.VoteCountDriver.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument.
 *
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the LongSumReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String groupBy = getConf().get("groupBy");
    StringBuffer sb = new StringBuffer();
    sb.append("count of votes grouped by: ");
    sb.append(groupBy);
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // testing -- ensure each node gets 2 reducers
    JobConf jobConf = new JobConf(getConf(), VoteCountDriver.class);
    JobClient jobClient = new JobClient(jobConf);
    ClusterStatus cluster = jobClient.getClusterStatus();
    job.setNumReduceTasks(cluster.getTaskTrackers() * 2);

    // Mapper and Reducer Classes to use
    job.setMapperClass(VoteCountMapper.class);
    job.setReducerClass(LongSumReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(LongSumReducer.class);
    }

    // The Jar file to run
    job.setJarByClass(VoteCountDriver.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:csc555.ebratt.depaul.edu.VoteSorterDriver.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument.
 *
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the VoteSorterReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    StringBuffer sb = new StringBuffer();
    sb.append("sorted vote counts");
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(VoteSorterMapper.class);
    job.setReducerClass(VoteSorterReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(VoteSorterReducer.class);
    }

    // sort in descending order
    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);

    // The Jar file to run
    job.setJarByClass(VoteSorterDriver.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
    return 0;
}
From source file:DataCubeRefresh.Grep.java
License:Apache License
/**
 * Run function.
 * @param args arguments
 * @return error code
 * @throws Exception if an exception occurs
 */
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inUrl> <outUrl> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Job grepJob = new Job(getConf());
    Job sortJob = new Job(getConf());

    String tempStreamTag = UUID.randomUUID().toString();

    try {
        grepJob.setJobName("grep-search");
        TextHStreamingInputFormat.addInputStream(grepJob, 1000, 600, -1, "", false, args[0]);
        HStreamingJobConf.setIsStreamingJob(grepJob, true);
        grepJob.setMapperClass(RegexMapper.class);
        grepJob.getConfiguration().set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.getConfiguration().set("mapred.mapper.regex.group", args[3]);
        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);
        grepJob.setInputFormatClass(TextHStreamingInputFormat.class);
        grepJob.setOutputFormatClass(TextHStreamingOutputFormat.class);
        HStreamingOutputFormat.setOutputStreamTag(grepJob, tempStreamTag);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);
        grepJob.setJobName("grep-search");
        grepJob.setJarByClass(this.getClass());
        grepJob.submit();

        sortJob.setJobName("grep-sort");
        sortJob.setInputFormatClass(TextHStreamingInputFormat.class);
        HStreamingJobConf.setIsStreamingJob(sortJob, true);

        // add previous stream partition/reducer 0 as input.
        HStreamingInputFormat.addInputStreamTag(sortJob, tempStreamTag, 0);

        sortJob.setMapperClass(InverseTextMapper.class);
        sortJob.setNumReduceTasks(1); // single output stream
        sortJob.setOutputFormatClass(TextHStreamingOutputFormat.class);
        TextHStreamingOutputFormat.setOutputPath(sortJob, args[1]);
        sortJob.setSortComparatorClass( // sort by decreasing frequency
                LongWritable.DecreasingComparator.class);
        sortJob.setJarByClass(this.getClass());
        sortJob.submit();

        return sortJob.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
        try {
            grepJob.killJob();
        } catch (Exception e1) {
            // ignore
        }
        try {
            sortJob.killJob();
        } catch (Exception e2) {
            // ignore
        }
    }
    return 0;
}
From source file:de.hpi.fgis.hdrs.mapreduce.examples.PredicateCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(PredicateCount.class);
    job.setJobName("PredicateCount");

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    int argc = 0;

    TripleInputFormat.setStoreAddress(job, args[argc++]);
    TripleInputFormat.setIndex(job, args[argc++]);

    if ("-p".equals(args[argc])) {
        argc++;
        String s = args[argc++];
        String p = args[argc++];
        String o = args[argc++];
        if ("*".equals(s))
            s = null;
        if ("*".equals(p))
            p = null;
        if ("*".equals(o))
            o = null;
        TripleInputFormat.setPattern(job, Triple.newPattern(s, p, o));
    } else {
        FileOutputFormat.setOutputPath(job, new Path(args[argc]));
    }

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:de.hpi.fgis.hdrs.mapreduce.examples.TripleSize.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(TripleSize.class);
    job.setJobName("TripleSize");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    int argc = 0;

    TripleInputFormat.setStoreAddress(job, args[argc++]);
    TripleInputFormat.setIndex(job, args[argc++]);

    if ("-p".equals(args[argc])) {
        argc++;
        String s = args[argc++];
        String p = args[argc++];
        String o = args[argc++];
        if ("*".equals(s))
            s = null;
        if ("*".equals(p))
            p = null;
        if ("*".equals(o))
            o = null;
        TripleInputFormat.setPattern(job, Triple.newPattern(s, p, o));
    } else {
        TextOutputFormat.setOutputPath(job, new Path(args[argc]));
    }

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);
        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");
        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);
        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");
        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}