Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException 

Document

Set the number of reduce tasks for the job.
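For orientation, here is a minimal sketch of the call in a typical Tool-style driver; the mapper, reducer, and output classes (MyMapper, MyReducer) are hypothetical placeholders rather than code taken from the sources listed below.

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("example");
    job.setJarByClass(getClass());

    job.setMapperClass(MyMapper.class); // hypothetical mapper
    job.setReducerClass(MyReducer.class); // hypothetical reducer
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Must be called before the job is submitted; afterwards it throws
    // IllegalStateException. A value of 0 makes the job map-only.
    job.setNumReduceTasks(4);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}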

Usage

From source file: com.cloudera.castagna.logparser.mr.TranscodeLogs.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TranscodeLogsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
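
Note that setNumReduceTasks(0), as used above, makes this a map-only job: the mappers' output is written directly by TextOutputFormat and no shuffle or reduce phase runs.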

From source file: com.cloudera.castagna.logparser.Utils.java

License: Apache License

public static void setReducers(Job job, Configuration configuration, Logger log) {
    boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT);
    int num_reducers = configuration.getInt(Constants.OPTION_NUM_REDUCERS,
            Constants.OPTION_NUM_REDUCERS_DEFAULT);

    // TODO: should we comment this out and let Hadoop decide the number of reducers?
    if (runLocal) {
        if (log != null)
            log.debug("Setting number of reducers to {}", 1);
        job.setNumReduceTasks(1);
    } else {
        job.setNumReduceTasks(num_reducers);
        if (log != null)
            log.debug("Setting number of reducers to {}", num_reducers);
    }
}

From source file: com.cloudera.crunch.GroupingOptions.java

License: Open Source License

public void configure(Job job) {
    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
    if (groupingComparatorClass != null) {
        job.setGroupingComparatorClass(groupingComparatorClass);
    }
    if (sortComparatorClass != null) {
        job.setSortComparatorClass(sortComparatorClass);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
        LOG.info(String.format("Using %d reduce tasks", numReducers));
    }
}

From source file: com.cloudera.crunch.impl.mr.collect.PGroupedTableImpl.java

License: Open Source License

public void configureShuffle(Job job) {
    ptype.configureShuffle(job, groupingOptions);
    if (groupingOptions != null && groupingOptions.getNumReducers() <= 0) {
        int bytesPerTask = job.getConfiguration().getInt("crunch.bytes.per.reduce.task", (1000 * 1000 * 1000));
        int numReduceTasks = 1 + (int) (getSize() / bytesPerTask);
        job.setNumReduceTasks(numReduceTasks);
        LOG.info(String.format("Setting num reduce tasks to %d", numReduceTasks));
    }
}
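
When grouping options are present but do not specify a reducer count, the count is derived from the estimated size of the grouped data: one reducer per crunch.bytes.per.reduce.task bytes (1,000,000,000 by default), plus one. For instance, an estimated 3.5 GB of data would yield 1 + 3 = 4 reduce tasks.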

From source file: com.cloudera.crunch.impl.mr.plan.JobPrototype.java

License: Open Source License

private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }

        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else { // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));

    return new CrunchJob(job, workingPath, outputHandler);
}

From source file: com.cloudera.recordservice.examples.mapreduce.RecordCount.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: RecordCount <input_query> <output_path>");
        System.exit(1);
    }
    String inputQuery = args[0];
    String output = args[1];

    Job job = Job.getInstance(getConf());
    job.setJobName("recordcount");
    job.setJarByClass(RecordCount.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(LongWritable.class);

    RecordServiceConfig.setInputQuery(job.getConfiguration(), inputQuery);
    job.setInputFormatClass(RecordServiceInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path outputPath = new Path(output);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraChecksum.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    Job job = Job.getInstance(getConf());
    if (args.length != 2 && args.length != 3) {
        usage();
        return 2;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSum");
    job.setJarByClass(TeraChecksum.class);
    job.setMapperClass(ChecksumMapper.class);
    job.setReducerClass(ChecksumReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Unsigned16.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraGen.java

License: Apache License

/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 2;
    }
    setNumberOfRows(job, parseHumanLong(args[0]));
    Path outputDir = new Path(args[1]);
    if (outputDir.getFileSystem(getConf()).exists(outputDir)) {
        throw new IOException("Output directory " + outputDir + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraValidate.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    Job job = Job.getInstance(getConf());
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.sqoop.mapreduce.db.TestDataDrivenDBInputFormat.java

License: Apache License

public void testDateSplits() throws Exception {
    Statement s = connection.createStatement();
    final String DATE_TABLE = "datetable";
    final String COL = "foo";
    try {
        try {
            // delete the table if it already exists.
            s.executeUpdate("DROP TABLE " + DATE_TABLE);
        } catch (SQLException e) {
            // Ignored; proceed regardless of whether we deleted the table;
            // it may have simply not existed.
        }

        // Create the table.
        s.executeUpdate("CREATE TABLE " + DATE_TABLE + "(" + COL + " TIMESTAMP)");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-04-01')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-04-02')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-05-01')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2011-04-01')");

        // commit this tx.
        connection.commit();

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.getLocal(conf);
        fs.delete(new Path(OUT_DIR), true);

        // now do a dd import
        Job job = new Job(conf);
        job.setMapperClass(ValMapper.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(DateCol.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(DateCol.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(1);
        job.getConfiguration().setInt("mapreduce.map.tasks", 2);
        FileOutputFormat.setOutputPath(job, new Path(OUT_DIR));
        DBConfiguration.configureDB(job.getConfiguration(), DRIVER_CLASS, DB_URL, (String) null, (String) null);
        DataDrivenDBInputFormat.setInput(job, DateCol.class, DATE_TABLE, null, COL, COL);

        boolean ret = job.waitForCompletion(true);
        assertTrue("job failed", ret);

        // Check to see that we imported as much as we thought we did.
        assertEquals("Did not get all the records", 4, job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue());
    } finally {
        s.close();
    }
}