Example usage for org.apache.hadoop.mapred JobConf setNumReduceTasks

Introduction

This page lists example usages of org.apache.hadoop.mapred.JobConf.setNumReduceTasks.

Prototype

public void setNumReduceTasks(int n) 

Document

Set the requisite number of reduce tasks for this job.
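
For orientation, here is a minimal, self-contained sketch of the call in context, using the old org.apache.hadoop.mapred API and only stock Hadoop classes. The class name, the identity mapper/reducer, and the command-line paths are choices made for this sketch, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class ReducerCountDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(ReducerCountDemo.class);
        conf.setJobName("ReducerCountDemo");

        // The reduce-task count determines how many output partitions
        // (part-00000, part-00001, ...) the job produces.
        conf.setNumReduceTasks(4);

        // Setting it to 0 instead makes a map-only job: mapper output
        // then goes straight to the OutputFormat, with no sort/shuffle.
        // conf.setNumReduceTasks(0);

        // The default TextInputFormat yields (LongWritable, Text) pairs,
        // which the identity mapper and reducer pass through unchanged.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}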

Usage

From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaLinkGraph.java

License: Apache License

private void task2(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Building adjacency lists...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), BuildWikipediaLinkGraph.class);
    conf.setJobName(
            String.format("BuildWikipediaLinkGraph:AdjacencyList[input: %s, output: %s, num_partitions: %d]",
                    inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
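
Note how the reducer count is driven directly by the partitions argument (which is also baked into the job name): the adjacency list comes out split across exactly that many part files, presumably so downstream consumers of the link graph can rely on a fixed partition count.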

From source file: edu.umd.cloud9.collection.wikipedia.DemoCountWikipediaPages.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);

    JobConf conf = new JobConf(getConf(), DemoCountWikipediaPages.class);
    conf.setJobName(String.format("DemoCountWikipediaPages[%s: %s]", INPUT_OPTION, inputPath));

    conf.setNumMapTasks(10);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));

    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);

    JobClient.runJob(conf);

    return 0;
}
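
Two details are worth noting. setNumReduceTasks(0) makes this a map-only job: the sort and shuffle phases are skipped and mapper output goes straight to the OutputFormat (here NullOutputFormat, since the job only tallies pages through counters). By contrast, setNumMapTasks(10) is merely a hint to the framework; the actual number of map tasks is determined by the input splits.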

From source file: edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaAnchorText.java

License: Apache License

private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Exracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PairOfIntString.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
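
The shape of this job suggests the classic value-to-key (secondary sort) pattern: a composite PairOfStringInt map-output key combined with a custom partitioner typically means the partitioner routes on only part of the key, so related records meet at the same reducer in a controlled order. The fixed count of 10 reducers simply bounds the number of intermediate files handed to the next phase.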

From source file: edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaAnchorText.java

License: Apache License

private void task2(String inputPath, String outputPath) throws IOException {
    LOG.info("Exracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);
}
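
The single reducer pays off downstream: with one reduce task, MapFileOutputFormat produces exactly one MapFile, which can then be opened for random access by document ID. A hedged sketch of such a lookup (assuming fs, conf, outputPath, and a docno of interest are in scope, matching the key/value classes set above):

    // One reducer means getReaders() finds exactly one MapFile.
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outputPath), conf);
    HMapSIW anchors = new HMapSIW();
    readers[0].get(new IntWritable(docno), anchors); // anchor-text counts for docno
    readers[0].close();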

From source file: edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaLinkGraph.java

License: Apache License

private void task1(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Exracting edges...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaLinkGraph.class);
    conf.setJobName(String.format("ExtractWikipediaLinkGraph:Edges[input: %s, output: %s, num_partitions: %d]",
            inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}

From source file: edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaLinkGraph.java

License: Apache License

private void task2(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Building adjacency lists...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaLinkGraph.class);
    conf.setJobName(
            String.format("ExtractWikipediaLinkGraph:AdjacencyList[input: %s, output: %s, num_partitions: %d]",
                    inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}

From source file: edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndexBuilder.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder
            .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
            .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (!(language.length() == 2 || language.length() == 6)) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    // A single reducer guarantees a single sorted output file (part-00000),
    // which the index-writing code below reads back.
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;
}
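
Everything after runJob is sequential driver code: the job (with its single reducer) leaves one part-00000 file of (docno, offset, fileno) triples, and the loop rewrites it as a compact binary index headed by the index class name, the collection path, and the block count. The final cnt != blocks check guards against a truncated or malformed listing.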

From source file: edu.umd.cloud9.demo.DemoWordCondProbJSON.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: DemoWordCondProbJSON");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(DemoWordCondProbJSON.class);
    conf.setJobName("DemoWordCondProbJSON");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setOutputKeyClass(MyTuple.class);
    conf.setOutputValueClass(FloatWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    // This is a potential gotcha! MyReducer can't be used as the combiner
    // because the combiner runs before all the counts have been collected,
    // so we can't yet divide through to compute the conditional probabilities.
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
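
The IdentityReducer combiner is a deliberate no-op stand-in: a combiner runs on partial map output, and the conditional probabilities can only be computed once all counts for a key have been collected, so reusing MyReducer as the combiner would divide through too early and skew the results.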

From source file: edu.umd.cloud9.demo.DemoWordCondProbTuple.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: DemoWordCondProbTuple");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(DemoWordCondProbTuple.class);
    conf.setJobName("DemoWordCondProbTuple");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(FloatWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    // This is a potential gotcha! MyReducer can't be used as the combiner
    // because the combiner runs before all the counts have been collected,
    // so we can't yet divide through to compute the conditional probabilities.
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: edu.umd.cloud9.demo.DemoWordCount.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: DemoWordCount");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(DemoWordCount.class);
    conf.setJobName("DemoWordCount");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyReducer.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
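
Unlike the conditional-probability demos above, this job can safely reuse MyReducer as its combiner: summing counts is associative and commutative, so partial sums computed on the map side leave the final result unchanged while shrinking the shuffle.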