Usage examples for org.apache.hadoop.mapreduce.Job#setReducerClass
public void setReducerClass(Class<? extends Reducer> cls) throws IllegalStateException
From source file:com.citic.zxyjs.zwlscx.mapreduce.lib.input.HFileOutputFormatBase.java
License:Apache License
/** * Configure a MapReduce Job to perform an incremental load into the given * table. This/*from w ww .jav a2 s. com*/ * <ul> * <li>Inspects the table to configure a total order partitioner</li> * <li>Uploads the partitions file to the cluster and adds it to the * DistributedCache</li> * <li>Sets the number of reduce tasks to match the current number of * regions</li> * <li>Sets the output key/value class to match HFileOutputFormat's * requirements</li> * <li>Sets the reducer up to perform the appropriate sorting (either * KeyValueSortReducer or PutSortReducer)</li> * </ul> * The user should be sure to set the map output value class to either * KeyValue or Put before running this function. */ public static void configureIncrementalLoad(Job job, HTable table, Class<? extends HFileOutputFormatBase> hfileOutputFormatBase) throws IOException { Configuration conf = job.getConfiguration(); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(KeyValue.class); job.setOutputFormatClass(hfileOutputFormatBase); // Based on the configured map output class, set the correct reducer to // properly // sort the incoming values. // TODO it would be nice to pick one or the other of these formats. if (KeyValue.class.equals(job.getMapOutputValueClass())) { job.setReducerClass(KeyValueSortReducer.class); } else if (Put.class.equals(job.getMapOutputValueClass())) { job.setReducerClass(PutSortReducer.class); } else if (Text.class.equals(job.getMapOutputValueClass())) { job.setReducerClass(TextSortReducer.class); } else { LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass()); } conf.setStrings("io.serializations", conf.get("io.serializations"), MutationSerialization.class.getName(), ResultSerialization.class.getName(), KeyValueSerialization.class.getName()); // Use table's region boundaries for TOP split points. 
LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName())); List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table); LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count"); job.setNumReduceTasks(startKeys.size()); configurePartitioner(job, startKeys); // Set compression algorithms based on column families configureCompression(table, conf); configureBloomType(table, conf); configureBlockSize(table, conf); // TableMapReduceUtil.addDependencyJars(job); TableMapReduceUtil.initCredentials(job); LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured."); }
From source file:com.ckelsel.hadoop.mapreduce.WordCount.WordCount.java
License:Open Source License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: EventCount <in> <out>"); System.exit(2);//from ww w .j av a 2s . co m } Job job = Job.getInstance(conf, "event count"); job.setJarByClass(WordCount.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); // delete output if exists Path outPath = new Path(otherArgs[1]); outPath.getFileSystem(conf).delete(outPath, true); FileOutputFormat.setOutputPath(job, outPath); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.accumulo.upgrade.compatibility.DataCompatibilityVerify.java
License:Open Source License
/**
 * Submits one verification job per table, waits for all of them and then
 * checks each job's counters.
 *
 * <p>All jobs are submitted before any of them is awaited. For every job
 * the counter group named after {@code DataVerifyMapper} is inspected: the
 * bad-checksum counter must be absent or zero, every row counter must equal
 * {@code qualifiers * FAMILIES.length}, every family counter must equal
 * {@code qualifiers * numRows}, and the number of row and family counters
 * must match the expected row and family counts.
 *
 * <p>If no tables are found the method logs an error and returns 1; the
 * previous code failed with an {@code ArithmeticException} (division by
 * zero) in that case.
 *
 * @param args command-line arguments, handed to {@code options.parseArgs}
 * @return 0 if every check passed, 1 otherwise
 * @throws Exception if job submission or counter retrieval fails
 */
@Override
public int run(String[] args) throws Exception {
    final String jobName = this.getClass().getName();
    options.parseArgs(jobName, args);
    try {
        final int totalMapSlots = getConf().getInt("mapred.map.tasks",
                DataCompatibilityTestCli.DEFAULT_NUM_ROWS);
        if (-1 == options.test.numRows) {
            options.test.numRows = totalMapSlots;
        }
        final TableOperations ops = options.connection.getConnector().tableOperations();
        final List<String> names = options.test.getTableNames(ops);
        if (names.isEmpty()) {
            // Guard: the reduce-slot split below divides by names.size().
            log.error("No tables to verify.");
            return 1;
        }

        int totalReduceSlots = getConf().getInt("mapred.reduce.tasks", 0);
        if (-1 != options.test.numReduceSlots) {
            totalReduceSlots = options.test.numReduceSlots;
        }
        if (0 == totalReduceSlots) {
            totalReduceSlots = names.size();
        }
        // Spread the reduce slots evenly over the jobs, but use at least one per job.
        final int reducesPerJob = Math.max(1, totalReduceSlots / names.size());

        final List<Job> jobs = new ArrayList<Job>(names.size());
        for (String name : names) {
            final Job job = new Job(getConf(), jobName + " " + name);
            job.setJarByClass(this.getClass());
            options.input.useAccumuloInputFormat(job, name);
            job.setMapperClass(DataVerifyMapper.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            job.setReducerClass(LongSumReducer.class);
            job.setCombinerClass(LongSumReducer.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job, new Path(options.test.output, name));
            job.setNumReduceTasks(reducesPerJob);
            // Submit without waiting so that the jobs for all tables run concurrently.
            job.submit();
            jobs.add(job);
        }

        boolean success = true;
        final long numCellsPerRow = options.test.qualifiers * DataCompatibilityLoad.FAMILIES.length;
        final long numCellsPerFamily = options.test.qualifiers * options.test.numRows;
        for (Job job : jobs) {
            success &= job.waitForCompletion(true);
            final CounterGroup group = job.getCounters().getGroup(DataVerifyMapper.class.getName());
            if (null == group) {
                log.error("Job '" + job.getJobName()
                        + "' doesn't have counters for the verification mapper.");
                success = false;
            } else {
                final Counter badCounter = group.findCounter(BAD_COUNTER);
                if (null != badCounter && 0 < badCounter.getValue()) {
                    log.error("Job '" + job.getJobName() + "' has " + badCounter.getValue()
                            + " entries with bad checksums.");
                    success = false;
                }
                int numRows = 0;
                int numFamilies = 0;
                for (Counter counter : group) {
                    if (counter.getName().startsWith(ROW_COUNTER_PREFIX)) {
                        numRows++;
                        if (numCellsPerRow != counter.getValue()) {
                            log.error("Job '" + job.getJobName() + "', counter '" + counter.getName()
                                    + "' should have " + numCellsPerRow + " cells, but instead has "
                                    + counter.getValue());
                            success = false;
                        }
                    } else if (counter.getName().startsWith(FAMILY_COUNTER_PREFIX)) {
                        numFamilies++;
                        if (numCellsPerFamily != counter.getValue()) {
                            log.error("Job '" + job.getJobName() + "', counter '" + counter.getName()
                                    + "' should have " + numCellsPerFamily + " cells, but instead has "
                                    + counter.getValue());
                            success = false;
                        }
                    }
                }
                if (options.test.numRows != numRows) {
                    log.error("Job '" + job.getJobName() + "' is supposed to have "
                            + options.test.numRows + " rows, but has " + numRows);
                    success = false;
                }
                if (DataCompatibilityLoad.FAMILIES.length != numFamilies) {
                    log.error("Job '" + job.getJobName() + "' is supposed to have "
                            + DataCompatibilityLoad.FAMILIES.length + " families, but has " + numFamilies);
                    success = false;
                }
            }
        }
        if (success) {
            log.info("All internal checks passed.");
        } else {
            log.info("Some checks failed. see log.");
        }
        return success ? 0 : 1;
    } finally {
        options.input.close();
    }
}
From source file:com.cloudera.avro.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: AvroWordCount <input path> <output path>"); return -1; }//from www.j a v a2 s.c o m Job job = new Job(getConf()); job.setJarByClass(MapReduceAvroWordCount.class); job.setJobName("wordcount"); // We call setOutputSchema first so we can override the configuration // parameters it sets AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT))); job.setOutputValueClass(NullWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setSortComparatorClass(Text.Comparator.class); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); return 0; }
From source file:com.cloudera.avro.MapReduceColorCount.java
License:Apache License
/**
 * Sets up and executes the "Color Count" job.
 *
 * @param args the input path followed by the output path
 * @return -1 when the argument count is wrong, 0 when the job succeeds,
 *         1 when the job fails
 * @throws Exception if job setup or execution fails
 */
public int run(String[] args) throws Exception {
    final boolean wrongArgCount = args.length != 2;
    if (wrongArgCount) {
        System.err.println("Usage: MapReduceColorCount <input path> <output path>");
        return -1;
    }

    final Job colorJob = new Job(getConf());
    colorJob.setJarByClass(MapReduceColorCount.class);
    colorJob.setJobName("Color Count");

    final Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(colorJob, inputPath);
    final Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(colorJob, outputPath);

    // Map side. The call order is kept as in the original, because the plain
    // map output value class is set after the Avro value schema.
    colorJob.setInputFormatClass(AvroKeyInputFormat.class);
    colorJob.setMapperClass(ColorCountMapper.class);
    AvroJob.setInputKeySchema(colorJob, User.getClassSchema());
    AvroJob.setMapOutputValueSchema(colorJob, User.getClassSchema());
    colorJob.setMapOutputKeyClass(Text.class);
    colorJob.setMapOutputValueClass(IntWritable.class);

    // Reduce side.
    colorJob.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    colorJob.setReducerClass(ColorCountReducer.class);
    AvroJob.setOutputKeySchema(colorJob, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(colorJob, Schema.create(Schema.Type.INT));

    if (colorJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
From source file:com.cloudera.ByteCount.java
License:Apache License
/**
 * Command-line driver for the ByteCount job.
 *
 * <p>Generic Hadoop arguments are stripped first. The remaining arguments
 * may contain {@code -D property=value} pairs (copied into the job
 * configuration), {@code -skipChecksums} and {@code -profile}, followed by
 * exactly two positional arguments: the input base and the output base.
 * After the job finishes, four read counters are printed and the JVM exits
 * with 0 on success or 1 on failure.
 *
 * @param args the raw command-line arguments
 * @throws Exception if option parsing or the job itself fails
 */
public static void main(String[] args) throws Exception {
    final JobConf jobConf = new JobConf(new Configuration());

    // Trim off the hadoop-specific args
    String[] remaining = new GenericOptionsParser(jobConf, args).getRemainingArgs();

    // Describe the options this tool understands.
    final Options cliOptions = new Options();
    final Option propertyOption = OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
            .withDescription("use value for given property").create("D");
    cliOptions.addOption(propertyOption);
    final Option skipChecksumsOption = new Option("skipChecksums", "skip checksums");
    cliOptions.addOption(skipChecksumsOption);
    final Option profileOption = new Option("profile", "profile tasks");
    cliOptions.addOption(profileOption);

    final CommandLineParser cliParser = new BasicParser();
    final CommandLine parsed = cliParser.parse(cliOptions, remaining);

    // Copy every -D property into the job configuration.
    final Properties overrides = parsed.getOptionProperties("D");
    for (Entry<Object, Object> override : overrides.entrySet()) {
        jobConf.set(override.getKey().toString(), override.getValue().toString());
        System.out.println("Set config key " + override.getKey() + " to " + override.getValue());
    }

    if (parsed.hasOption("skipChecksums")) {
        jobConf.setBoolean("bytecount.skipChecksums", true);
        System.out.println("Skipping checksums");
    }

    if (parsed.hasOption("profile")) {
        jobConf.setBoolean("mapred.task.profile", true);
        jobConf.set("mapred.task.profile.params",
                "-agentlib:hprof=cpu=samples,depth=100,interval=1ms,lineno=y,thread=y,file=%s");
        jobConf.set(MRJobConfig.NUM_MAP_PROFILES, "0");
        jobConf.set("mapred.task.profile.maps", "1");
        System.out.println("Profiling map tasks");
    }

    // Get the positional arguments out
    remaining = parsed.getArgs();
    if (remaining.length != 2) {
        System.err.println("Usage: ByteCount <inputBase> <outputBase>");
        System.exit(1);
    }
    final String inputBase = remaining[0];
    final String outputBase = remaining[1];

    final Job byteCountJob = Job.getInstance(jobConf);

    byteCountJob.setInputFormatClass(ByteBufferInputFormat.class);

    byteCountJob.setMapOutputKeyClass(ByteWritable.class);
    byteCountJob.setMapOutputValueClass(LongWritable.class);

    byteCountJob.setMapperClass(ByteCountMapper.class);
    byteCountJob.setReducerClass(ByteCountReducer.class);
    byteCountJob.setCombinerClass(ByteCountReducer.class);

    byteCountJob.setOutputKeyClass(ByteWritable.class);
    byteCountJob.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(byteCountJob, new Path(inputBase));
    FileOutputFormat.setOutputPath(byteCountJob, new Path(outputBase));

    byteCountJob.setJarByClass(ByteCount.class);

    final boolean succeeded = byteCountJob.waitForCompletion(true);

    final Counters jobCounters = byteCountJob.getCounters();
    System.out.println("\tRead counters");
    printCounter(jobCounters, READ_COUNTER.BYTES_READ);
    printCounter(jobCounters, READ_COUNTER.LOCAL_BYTES_READ);
    printCounter(jobCounters, READ_COUNTER.SCR_BYTES_READ);
    printCounter(jobCounters, READ_COUNTER.ZCR_BYTES_READ);

    final int exitCode = succeeded ? 0 : 1;
    System.exit(exitCode);
}
From source file:com.cloudera.castagna.logparser.mr.StatusCodesStats.java
License:Apache License
/**
 * Runs the status-codes statistics job.
 *
 * <p>Two configuration switches are read: one enables compression of the
 * map output, the other deletes an existing output directory before the
 * job starts.
 *
 * @param args the input path followed by the output path
 * @return -1 on a usage error, 0 if the job succeeds, 1 if it fails
 * @throws Exception if file system access or the job fails
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    final Configuration jobSettings = getConf();

    final boolean compress = jobSettings.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (compress) {
        jobSettings.setBoolean("mapred.compress.map.output", true);
        jobSettings.set("mapred.output.compression.type", "BLOCK");
        jobSettings.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    final boolean overwrite = jobSettings.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    // The file system handle is obtained whether or not the output is overwritten,
    // exactly as before.
    final FileSystem outputFs = FileSystem.get(new Path(args[1]).toUri(), jobSettings);
    if (overwrite) {
        outputFs.delete(new Path(args[1]), true);
    }

    final Job statsJob = Job.getInstance(jobSettings);
    statsJob.setJobName(Constants.STATUS_CODES_STATS);
    statsJob.setJarByClass(getClass());

    FileInputFormat.addInputPath(statsJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(statsJob, new Path(args[1]));

    statsJob.setInputFormatClass(TextInputFormat.class);

    statsJob.setMapperClass(StatusCodesStatsMapper.class);
    statsJob.setMapOutputKeyClass(Text.class);
    statsJob.setMapOutputValueClass(Text.class);

    statsJob.setCombinerClass(StatusCodesStatsCombiner.class);

    statsJob.setReducerClass(StatusCodesStatsReducer.class);
    statsJob.setOutputKeyClass(Text.class);
    statsJob.setOutputValueClass(Text.class);

    Utils.setReducers(statsJob, jobSettings, log);

    statsJob.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled()) {
        Utils.log(statsJob, log);
    }

    final boolean completed = statsJob.waitForCompletion(true);
    return completed ? 0 : 1;
}
From source file:com.cloudera.crunch.impl.mr.plan.JobPrototype.java
License:Open Source License
/**
 * Assembles the Hadoop {@link Job} for this prototype and wraps it in a
 * {@link CrunchJob}.
 *
 * <p>The method builds the output nodes from {@code targetsToNodePaths},
 * then configures either a map-reduce job (when {@code group} is non-null)
 * or a map-only job (zero reduce tasks), serializes the resulting node
 * lists into the job configuration, configures the input source(s) and
 * finally sets the job name.
 *
 * @param jarClass class passed to {@code Job.setJarByClass}
 * @param conf configuration used to create the job; the parameter is
 *        rebound to the job's own configuration right after construction
 * @return a CrunchJob built from the job, {@code workingPath} and the
 *         output handler
 * @throws IOException declared by this method; which of the calls below
 *         raise it is not visible from this block
 */
private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    // From here on use the job's configuration object, so the serialize(...) calls
    // below write into it rather than into the caller's instance.
    // NOTE(review): presumably Job copies the Configuration it is given - confirm.
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    // The third constructor argument is true when there is no grouping.
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        // One output node per target: it is created from the first node path seen
        // for that target and shared by the remaining paths.
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        // Grouping present: the output nodes collected above become the reduce-side nodes.
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        // The first reduce node is later passed to createJobName.
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            // The combine node takes the grouping node as its child and replaces it
            // as the node the map paths are walked into.
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }

        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else {
        // No grouping
        // Map-only job: the output nodes are serialized as the map-side nodes.
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }

    if (inputNodes.size() == 1) {
        // A single input is configured with index -1.
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        // Several inputs: each source is configured with its list index, and the
        // job's input format is set to CrunchInputFormat.
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));

    return new CrunchJob(job, workingPath, outputHandler);
}
From source file:com.cloudera.hbase.WordCount.java
License:Open Source License
/**
 * Builds and runs the "word count" job.
 *
 * @param args the input path followed by the output path
 * @return 2 when the argument count is wrong, 0 when the job succeeds,
 *         1 when the job fails
 * @throws Exception if the job cannot be set up or executed
 */
public int run(String[] args) throws Exception {
    final boolean haveBothPaths = args.length == 2;
    if (!haveBothPaths) {
        System.err.println("Usage: wordcount <in> <out>");
        return 2;
    }

    final Path inputPath = new Path(args[0]);
    final Path outputPath = new Path(args[1]);

    final Job wordCountJob = new Job(getConf(), "word count");
    wordCountJob.setJarByClass(WordCount.class);

    // Processing classes: the reducer class is also used as the combiner.
    wordCountJob.setMapperClass(Map.class);
    wordCountJob.setCombinerClass(Reduce.class);
    wordCountJob.setReducerClass(Reduce.class);

    // Output record types.
    wordCountJob.setOutputKeyClass(Text.class);
    wordCountJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(wordCountJob, inputPath);
    FileOutputFormat.setOutputPath(wordCountJob, outputPath);

    if (wordCountJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
From source file:com.cloudera.recordservice.avro.mapreduce.ColorCount.java
License:Apache License
/** * Run the MR2 color count with generic records, and return a map of favorite colors to * the number of users.//from ww w . j ava 2s.c o m */ public static java.util.Map<String, Integer> countColors() throws IOException, ClassNotFoundException, InterruptedException { String output = TestUtil.getTempDirectory(); Path outputPath = new Path(output); JobConf conf = new JobConf(ColorCount.class); conf.setInt("mapreduce.job.reduces", 1); Job job = Job.getInstance(conf); job.setJarByClass(ColorCount.class); job.setJobName("MR2 Color Count With Generic Records"); RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users"); job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); job.setMapperClass(Map.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(Reduce.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); job.waitForCompletion(false); // Read the result and return it. Since we set the number of reducers to 1, // there is always just one file containing the value. SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader); java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>(); for (GenericRecord datum : fileReader) { colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString())); } return colorMap; }