List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
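Before the per-project examples below, here is a minimal, hypothetical driver sketch (the class name NumReduceTasksExample and the argument handling are illustrative, not taken from any of the source files listed) showing the two typical ways setNumReduceTasks is used with the old org.apache.hadoop.mapred API: pass 0 for a map-only job, or an explicit count to control the number of reduce tasks.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(NumReduceTasksExample.class);
        job.setJobName("setNumReduceTasks example");

        // TextInputFormat produces LongWritable offsets and Text lines; with the
        // default identity mapper/reducer those types pass straight through.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // A map-only job skips the shuffle and reduce phase entirely:
        //   job.setNumReduceTasks(0);
        // Here we instead request four reduce tasks, so the map output is
        // partitioned four ways and four output files are produced.
        job.setNumReduceTasks(4);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}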
From source file:org.archive.hadoop.jobs.ArchiveFileExtractor.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("Archive File Extractor");

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // turn off speculative execution
    job.setBoolean("mapred.map.tasks.speculative.execution", false);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // tolerate task exceptions
    job.setBoolean("soft", false);

    int arg = 0;
    int numMaps = 10;

    String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
            + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
            + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

    String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
            DateUtils.getLog17Date(System.currentTimeMillis()));

    while (arg < args.length - 1) {
        if (args[arg].equals("-soft")) {
            job.setBoolean("soft", true);
            arg++;
        } else if (args[arg].equals("-mappers")) {
            arg++;
            numMaps = Integer.parseInt(args[arg]);
            job.setNumMapTasks(numMaps);
            arg++;
        } else if (args[arg].equals("-timestamp14")) {
            arg++;
            String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
            job.set("timestamp14", timestamp14);
            arg++;
        } else if (args[arg].equals("-warc-header-local-file")) {
            arg++;
            File f = new File(args[arg]);
            FileInputStream fis = new FileInputStream(f);
            warcHeaderString = IOUtils.toString(fis, "UTF-8");
            arg++;
        } else if (args[arg].equals("-hmacname")) {
            arg++;
            String hmacName = args[arg];
            job.set("hmacName", hmacName);
            arg++;
        } else if (args[arg].equals("-hmacsignature")) {
            arg++;
            String hmacSignature = args[arg];
            job.set("hmacSignature", hmacSignature);
            arg++;
        } else if (args[arg].equals("-timeout")) {
            arg++;
            int taskTimeout = Integer.parseInt(args[arg]);
            job.setInt("mapred.task.timeout", taskTimeout);
            arg++;
        } else if (args[arg].equals("-failpct")) {
            arg++;
            int failPct = Integer.parseInt(args[arg]);
            job.setInt("mapred.max.map.failures.percent", failPct);
            arg++;
        } else {
            break;
        }
    }

    job.set("warcHeaderString", warcHeaderString);

    if (args.length - 2 != arg) {
        printUsage();
        return 1;
    }

    Path inputPath = new Path(args[arg]);
    arg++;

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    Path outputPath = new Path(outputDir);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(ArchiveFileExtractorMapper.class);
    job.setJarByClass(ArchiveFileExtractor.class);

    TextInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.CDXGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    // extract outlinks by default
    job.set("outputType", "outlinks");

    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.jbs.Parse.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    FileSystem fs = FileSystem.get(getConf());

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("jbs.Parse " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // Use the Parse-specific output format.
    job.setOutputFormat(PerMapOutputFormat.class);

    // Use our ParseMapper, with output keys and values of type Text.
    job.setMapperClass(ParseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Configure the input and output paths, from the command-line.
    Path outputDir = new Path(args[0]);
    FileOutputFormat.setOutputPath(job, outputDir);

    boolean atLeastOneInput = false;
    for (int i = 1; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            Path outputPath = new Path(outputDir, inputPath.getName());
            if (fs.exists(outputPath)) {
                LOG.debug("Output path already exists: " + outputPath);
            } else {
                atLeastOneInput = true;
                LOG.info("Add input path: " + inputPath);
                FileInputFormat.addInputPath(job, inputPath);
            }
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to parse.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.asayler.WikiTitleCount.java
License:Apache License
/**
 * The main driver for the wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    int num_reducers = 1;

    conf.setJobName("wikititlecount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set default mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set default reducers */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set mappers and reducers */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.asayler.WikiTitleSort.java
License:Apache License
/**
 * The main driver for the wikititlesort map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleSort.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    final int num_reducers = 1;

    conf.setJobName("wikititlesort");

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set default mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set mappers and the single reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.cloudata.core.PerformanceTest.java
License:Apache License
private void runNIsMoreThanOne(final String cmd) throws IOException {
    checkTable();

    // Run a mapreduce job. Run as many maps as asked-for clients.
    // Before we start up the job, write out an input file with instructions
    // per client regarding which row they are to start on.
    Path inputDir = writeInputFile(this.conf);
    this.conf.set(EvaluationMapTask.CMD_KEY, cmd);
    JobConf job = new JobConf(this.conf, this.getClass());
    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormat(TextInputFormat.class);
    job.setJobName("Cloudata Performance Evaluation");
    job.setMapperClass(EvaluationMapTask.class);
    job.setMaxMapAttempts(1);
    job.setMaxReduceAttempts(1);
    job.setNumMapTasks(this.N * 10); // Ten maps per client.
    job.setNumReduceTasks(1);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(inputDir, "outputs"));
    JobClient.runJob(job);
}
From source file:org.cloudata.core.tablet.backup.BackupBinaryJob.java
License:Apache License
public void runBackUp(String tableName, String outputPath) throws IOException {
    CloudataConf nconf = new CloudataConf();

    CloudataFileSystem fs = CloudataFileSystem.get(nconf);
    if (fs.exists(new GPath(outputPath))) {
        throw new IOException("Output path already exists:" + outputPath);
    }

    if (!CTable.existsTable(nconf, tableName)) {
        throw new IOException("No Table:" + tableName);
    }

    CTable ctable = CTable.openTable(nconf, tableName);

    String columns = "";
    for (String eachColumn : ctable.getTableSchema().getColumnsArray()) {
        columns += eachColumn + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    JobConf jobConf = new JobConf(BackupBinaryJob.class);
    jobConf.setMapperClass(BackupBinaryMap.class);
    jobConf.setInputFormat(BackupTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);

    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
    jobConf.setMapOutputKeyClass(BytesWritable.class);
    jobConf.setMapOutputValueClass(BytesWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);
}
From source file:org.cloudata.core.tablet.backup.BackupJob.java
License:Apache License
/**
 * mapreduce job
 * @param tableName
 * @param outputPath
 * @throws IOException
 */
public void runBackUp(String tableName, String outputPath) throws IOException {
    CloudataConf nconf = new CloudataConf();

    CloudataFileSystem fs = CloudataFileSystem.get(nconf);
    if (fs.exists(new GPath(outputPath))) {
        throw new IOException("Output path already exists:" + outputPath);
    }

    if (!CTable.existsTable(nconf, tableName)) {
        throw new IOException("No Table:" + tableName);
    }

    CTable ctable = CTable.openTable(nconf, tableName);

    String columns = "";
    for (String eachColumn : ctable.getTableSchema().getColumnsArray()) {
        columns += eachColumn + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    String jobName = tableName + " backup";

    JobConf jobConf = new JobConf(BackupJob.class);
    jobConf.setJobName(jobName);
    jobConf.setMapperClass(BackupMap.class);
    jobConf.setInputFormat(BackupTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);

    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
    jobConf.set("mapred.textoutputformat.separator", ",");
    jobConf.setOutputFormat(TextOutputFormat.class);

    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);
}