Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException

Source Link

Document

Set the number of reduce tasks for the job.

Usage

From source file:edu.umd.cloud9.collection.aquaint2.Aquaint2DocnoMappingBuilder.java

License:Apache License

/**
 * Runs this tool./*ww w .  ja  va  2 s  .c  om*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];

    LOG.info("Tool: " + Aquaint2DocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    Job job = new Job(getConf(), Aquaint2DocnoMappingBuilder.class.getSimpleName());
    job.setJarByClass(Aquaint2DocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(Aquaint2DocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "/part-r-00000";
    Aquaint2DocnoMapping.writeDocnoData(new Path(input), new Path(outputFile), FileSystem.get(getConf()));

    return 0;
}

From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents2.java

License:Apache License

/**
 * Runs this tool./*ww  w.  j a  v  a2 s.  c  o  m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    Path inputDirPath = new Path(args[0]);
    String outputDirPathname = args[1];
    Path outputDirPath = new Path(outputDirPathname);
    Path outputFilePath = new Path(args[2]);

    LOG.info("Tool: " + NumberAquaint2Documents2.class.getCanonicalName());
    LOG.info(" - Input dir path: " + inputDirPath);
    LOG.info(" - Output dir path: " + outputDirPath);
    LOG.info(" - Output file path: " + outputFilePath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, NumberAquaint2Documents2.class.getSimpleName());
    job.setJarByClass(NumberAquaint2Documents2.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, inputDirPath);
    FileOutputFormat.setOutputPath(job, outputDirPath);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(Aquaint2DocumentInputFormat2.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(outputDirPath, true);

    job.waitForCompletion(true);

    Path inputFilePath = new Path(
            outputDirPathname + (outputDirPathname.endsWith("/") ? "" : "/") + "/part-r-00000");
    Aquaint2DocnoMapping.writeDocnoData(inputFilePath, outputFilePath, FileSystem.get(getConf()));

    return 0;
}

From source file:edu.umd.cloud9.collection.ExtractHTMLFieldCollection.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
@Override/*from  w  ww  . j a  v  a2 s . c  o m*/
public int runTool() throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);

    String inputPath = conf.get("Cloud9.InputPath");
    String inputFormat = conf.get("Cloud9.InputFormat");
    String outputPath = conf.get("Cloud9.OutputPath");
    String tag = conf.get("Cloud9.TargetTag");

    job.setJobName("ExtractFieldCollection");

    job.setJarByClass(ExtractHTMLFieldCollection.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(200);

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    recursivelyAddInputPaths(job, inputPath);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(TextDocument.class);

    LOG.info("ExtractFieldCollection - " + tag);
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Input format: " + inputFormat);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Target tag: " + tag);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.umd.cloud9.collection.medline.CountMedlineCitations.java

License:Apache License

@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path")
            .create(OUTPUT_OPTION));//from w w w.  j a v  a  2  s . c o m
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    LOG.info("Tool: " + CountMedlineCitations.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    Job job = new Job(getConf(), CountMedlineCitations.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(CountMedlineCitations.class);

    job.setNumReduceTasks(0);

    // Pass in the class name as a String; this is makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");

    return numDocs;
}

From source file:edu.umd.cloud9.collection.medline.DemoCountMedlineCitations2.java

License:Apache License

/**
 * Runs this tool./* w w  w .j a v  a  2s  .co  m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    LOG.info("Tool: " + DemoCountMedlineCitations2.class.getCanonicalName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    Job job = new Job(getConf(), DemoCountMedlineCitations2.class.getSimpleName());
    job.setJarByClass(DemoCountMedlineCitations.class);

    job.setNumReduceTasks(0);

    // Pass in the class name as a String; this is makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat2.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.MedlineDocnoMappingBuilder.java

License:Apache License

/**
 * Runs this tool.// w w w  .ja  v  a  2 s .  c o  m
 */
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + MedlineDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool: " + MedlineDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = new Job(getConf(), MedlineDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(MedlineDocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(options.collection));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    String input = tmpDir + (tmpDir.endsWith("/") ? "" : "/") + "/part-r-00000";
    MedlineDocnoMapping.writeMappingData(new Path(input), new Path(options.docnoMapping),
            FileSystem.get(getConf()));

    fs.delete(new Path(tmpDir), true);

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.NumberMedlineCitations2.java

License:Apache License

/**
 * Runs this tool./*ww w. ja v  a  2s. c o  m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];

    LOG.info("Tool: " + NumberMedlineCitations2.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    Job job = new Job(getConf(), NumberMedlineCitations2.class.getSimpleName());
    job.setJarByClass(NumberMedlineCitations.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "/part-r-00000";
    MedlineDocnoMapping.writeMappingData(new Path(input), new Path(outputFile), FileSystem.get(getConf()));

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex2.java

License:Apache License

/**
 * Runs this tool.//from  w  ww  .j  ava  2s  . c  o m
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    Job job = new Job(getConf(), BuildTrecForwardIndex2.class.getCanonicalName());
    job.setJarByClass(BuildTrecForwardIndex2.class);
    FileSystem fs = FileSystem.get(getConf());

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    String mappingFile = args[3];

    LOG.info("Tool name: " + BuildTrecForwardIndex2.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - mapping file: " + mappingFile);

    job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m");
    job.setNumReduceTasks(1);

    if (job.getConfiguration().get("mapred.job.tracker").equals("local")) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }

    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();

    String inputFile = outputPath + "/" + "part-r-00000";

    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));

    FSDataOutputStream writer = fs.create(new Path(indexFile), true);

    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.CountTrecDocuments.java

License:Apache License

/**
 * Runs this tool./*w w  w .  j  a  va2 s.co m*/
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(optional) output file to write the number of records").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    LOG.info("Tool: " + CountTrecDocuments.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    Job job = new Job(getConf(), CountTrecDocuments.class.getSimpleName());
    job.setJarByClass(CountTrecDocuments.class);

    job.setNumReduceTasks(0);

    // Pass in the class name as a String; this is makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    job.getConfiguration().set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");

    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FileSystem fs = FileSystem.get(getConf());
        FSDataOutputStream out = fs.create(new Path(f));
        out.write(new Integer(numDocs).toString().getBytes());
        out.close();
    }

    return numDocs;
}