Usage examples for org.apache.hadoop.mapred.JobConf#setNumReduceTasks
public void setNumReduceTasks(int n)
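The argument sets how many reduce tasks the framework runs for the job: 0 makes the job map-only (mapper output is written straight to the output path), 1 funnels everything through a single reducer (one output file), and larger values spread the reduce work across the cluster. Before the real-world examples below, here is a minimal self-contained sketch of the call in a map-only job; NumReduceTasksDemo and its identity mapping are illustrative placeholders, not taken from any of the source files that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical demo class, not from the examples below.
public class NumReduceTasksDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(NumReduceTasksDemo.class);
        conf.setJobName("num-reduce-tasks-demo");

        // IdentityMapper passes the (offset, line) pairs produced by the
        // default TextInputFormat straight through.
        conf.setMapperClass(IdentityMapper.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // args[0] = input dir, args[1] = output dir.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // 0 reducers makes the job map-only; 1 would force a single
        // output file; higher values parallelize the reduce phase.
        conf.setNumReduceTasks(0);

        JobClient.runJob(conf);
    }
}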
From source file:com.trace.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass(LongWritable.DecreasingComparator.class); // sort by decreasing freq

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}
From source file:com.unstruct.demo.WordCount.java
License:Apache License
/**
 * The main driver for the word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting redirects (phase 0)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper0.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from Wikipedia article to (srcID, (targetID, anchor)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfStrings.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Random r = new Random();
    //String tmpOutput = "tmp-" + this.getClass().getCanonicalName() + "-" + r.nextInt(10000);
    //LOG.info("intermediate folder for merge " + tmpOutput);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    try {
        DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    //FileOutputFormat.setOutputPath(conf, new Path(tmpOutput));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);

    /*
    //merge
    String finalO = outputPath + "/part-00000/data";
    FileSystem.get(conf).mkdirs(new Path(outputPath + "part-00000"));
    getMergeInHdfs(tmpOutput, finalO, conf);
    FileSystem.get(conf).delete(new Path(tmpOutput), true);
    */
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Extracts CF for each found anchor.
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 3)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    LOG.info(" - mapping: " + mapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    String location = "map.dat";
    try {
        DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
        //DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper3.class);
    conf.setCombinerClass(MyReducer3.class);
    conf.setReducerClass(MyReducer3.class);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 4)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    //FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
    FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(HMapSIW.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper4.class);
    conf.setReducerClass(MyReducer4.class);

    JobClient.runJob(conf);
}
From source file:com.yolodata.tbana.hadoop.mapred.csv.CSVInputFormatTest.java
License:Open Source License
public int run(String[] args) throws Exception {
    getConf().set(CSVLineRecordReader.FORMAT_DELIMITER, "\"");
    getConf().set(CSVLineRecordReader.FORMAT_SEPARATOR, ",");
    getConf().setInt(CSVNLineInputFormat.LINES_PER_MAP, 40000);
    getConf().setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);

    JobConf jobConf = new JobConf(getConf());
    jobConf.setJarByClass(CSVTestRunner.class);
    jobConf.setNumReduceTasks(0); // zero reducers: a map-only job
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setInputFormat(CSVNLineInputFormat.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(Text.class);

    CSVNLineInputFormat.setInputPaths(jobConf, new Path(args[0]));
    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
    return 0;
}
From source file:com.yolodata.tbana.hadoop.mapred.shuttl.TestMapper.java
License:Open Source License
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(TestConfigurations.getConfigurationWithShuttlSearch());
    jobConf.setJarByClass(ShuttlTestJob.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setReducerClass(TestReducer.class);
    jobConf.setInputFormat(ShuttlCSVInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);

    ShuttlCSVInputFormat.addInputPath(jobConf, new Path(args[0]));
    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
    return 0;
}
From source file:com.yolodata.tbana.hadoop.mapred.splunk.inputformat.TestMapper.java
License:Open Source License
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf());
    jobConf.set(SplunkInputFormat.INPUTFORMAT_MODE, args[0]);
    jobConf.setJarByClass(SplunkTestRunner.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setReducerClass(TestReducer.class);
    jobConf.setInputFormat(SplunkInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);

    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
    return 0;
}