List of usage examples for org.apache.hadoop.mapreduce.Job#setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
Sets the number of reduce tasks for the job; throws IllegalStateException if the job has already been submitted. Passing 0 turns the job into a map-only job, as several of the examples below do.
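Before the project-specific examples, here is a minimal, self-contained sketch of the two common uses: pinning an explicit reducer count, and disabling the reduce phase entirely. TokenMapper, SumReducer, the job name, and the input/output paths are hypothetical placeholders written for this sketch, not taken from any example below.

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetNumReduceTasksSketch {

    // Hypothetical word-count mapper, defined here only so the sketch compiles.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer it = new StringTokenizer(value.toString());
            while (it.hasMoreTokens()) {
                word.set(it.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Hypothetical summing reducer.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setNumReduceTasks-sketch");
        job.setJarByClass(SetNumReduceTasksSketch.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Run the reduce phase with exactly four reduce tasks. The call must
        // happen before the job is submitted; afterwards it throws
        // IllegalStateException.
        job.setNumReduceTasks(4);

        // Alternatively, setNumReduceTasks(0) makes the job map-only: the
        // shuffle/sort phase is skipped and mapper output is written directly
        // by the output format.
        // job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}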
From source file:com.cloudera.castagna.logparser.mr.TranscodeLogs.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();
    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(TranscodeLogsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Map-only job: with zero reducers the shuffle is skipped and
    // mapper output is written directly by the output format.
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.castagna.logparser.Utils.java
License:Apache License
public static void setReducers(Job job, Configuration configuration, Logger log) {
    boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT);
    int num_reducers = configuration.getInt(Constants.OPTION_NUM_REDUCERS, Constants.OPTION_NUM_REDUCERS_DEFAULT);
    // TODO: should we comment this out and let Hadoop decide the number of reducers?
    if (runLocal) {
        if (log != null)
            log.debug("Setting number of reducers to {}", 1);
        job.setNumReduceTasks(1);
    } else {
        job.setNumReduceTasks(num_reducers);
        if (log != null)
            log.debug("Setting number of reducers to {}", num_reducers);
    }
}
From source file:com.cloudera.crunch.GroupingOptions.java
License:Open Source License
public void configure(Job job) {
    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
    if (groupingComparatorClass != null) {
        job.setGroupingComparatorClass(groupingComparatorClass);
    }
    if (sortComparatorClass != null) {
        job.setSortComparatorClass(sortComparatorClass);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
        LOG.info(String.format("Using %d reduce tasks", numReducers));
    }
}
From source file:com.cloudera.crunch.impl.mr.collect.PGroupedTableImpl.java
License:Open Source License
public void configureShuffle(Job job) {
    ptype.configureShuffle(job, groupingOptions);
    if (groupingOptions != null && groupingOptions.getNumReducers() <= 0) {
        // Size-based heuristic: one reducer per "crunch.bytes.per.reduce.task"
        // (default 1 GB) of data, plus one. E.g., 2.5 GB of data yields
        // 1 + (int) (2_500_000_000 / 1_000_000_000) = 3 reduce tasks.
        int bytesPerTask = job.getConfiguration().getInt("crunch.bytes.per.reduce.task", (1000 * 1000 * 1000));
        int numReduceTasks = 1 + (int) (getSize() / bytesPerTask);
        job.setNumReduceTasks(numReduceTasks);
        LOG.info(String.format("Setting num reduce tasks to %d", numReduceTasks));
    }
}
From source file:com.cloudera.crunch.impl.mr.plan.JobPrototype.java
License:Open Source License
private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);
        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }

        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else {
        // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));

    return new CrunchJob(job, workingPath, outputHandler);
}
From source file:com.cloudera.recordservice.examples.mapreduce.RecordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: RecordCount <input_query> <output_path>");
        System.exit(1);
    }
    String inputQuery = args[0];
    String output = args[1];

    Job job = Job.getInstance(getConf());
    job.setJobName("recordcount");
    job.setJarByClass(RecordCount.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(LongWritable.class);

    RecordServiceConfig.setInputQuery(job.getConfiguration(), inputQuery);
    job.setInputFormatClass(RecordServiceInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path outputPath = new Path(output);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraChecksum.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    Job job = Job.getInstance(getConf());
    if (args.length != 2 && args.length != 3) {
        usage();
        return 2;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSum");
    job.setJarByClass(TeraChecksum.class);
    job.setMapperClass(ChecksumMapper.class);
    job.setReducerClass(ChecksumReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Unsigned16.class);
    // force a single reducer
    job.setNumReduceTasks(1);

    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java
License:Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 2;
    }
    setNumberOfRows(job, parseHumanLong(args[0]));
    Path outputDir = new Path(args[1]);
    if (outputDir.getFileSystem(getConf()).exists(outputDir)) {
        throw new IOException("Output directory " + outputDir + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraValidate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    Job job = Job.getInstance(getConf());
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);

    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.sqoop.mapreduce.db.TestDataDrivenDBInputFormat.java
License:Apache License
public void testDateSplits() throws Exception {
    Statement s = connection.createStatement();
    final String DATE_TABLE = "datetable";
    final String COL = "foo";
    try {
        try {
            // delete the table if it already exists.
            s.executeUpdate("DROP TABLE " + DATE_TABLE);
        } catch (SQLException e) {
            // Ignored; proceed regardless of whether we deleted the table;
            // it may have simply not existed.
        }

        // Create the table.
        s.executeUpdate("CREATE TABLE " + DATE_TABLE + "(" + COL + " TIMESTAMP)");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-04-01')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-04-02')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-05-01')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2011-04-01')");

        // commit this tx.
        connection.commit();

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.getLocal(conf);
        fs.delete(new Path(OUT_DIR), true);

        // now do a dd import
        Job job = new Job(conf);
        job.setMapperClass(ValMapper.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(DateCol.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(DateCol.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(1);
        job.getConfiguration().setInt("mapreduce.map.tasks", 2);
        FileOutputFormat.setOutputPath(job, new Path(OUT_DIR));
        DBConfiguration.configureDB(job.getConfiguration(), DRIVER_CLASS, DB_URL, (String) null, (String) null);
        DataDrivenDBInputFormat.setInput(job, DateCol.class, DATE_TABLE, null, COL, COL);

        boolean ret = job.waitForCompletion(true);
        assertTrue("job failed", ret);

        // Check to see that we imported as much as we thought we did.
        assertEquals("Did not get all the records", 4, job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue());
    } finally {
        s.close();
    }
}