List of usage examples for org.apache.hadoop.mapreduce.Job.setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
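For orientation before the examples, a minimal hedged sketch of the call itself (the class and job names here are hypothetical, not taken from any example below). The reducer count must be set before the job is submitted; calling the setter on a job that is already running is what raises the declared IllegalStateException, and passing 0 turns the job into a map-only job, a pattern several of the examples below rely on.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ReduceCountSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "reduce-count-sketch"); // hypothetical job name
        job.setJarByClass(ReduceCountSketch.class);

        // Fixed reducer count: the framework runs exactly 8 reduce tasks,
        // producing at most 8 output partitions.
        job.setNumReduceTasks(8);

        // Alternatively, 0 reducers yields a map-only job: no shuffle/sort
        // phase, and each mapper writes directly through the output format.
        // job.setNumReduceTasks(0);
    }
}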
From source file:com.moz.fiji.mapreduce.output.FileMapReduceJobOutput.java
License:Apache License
/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    super.configure(job);
    FileOutputFormat.setOutputPath(job, mFilePath);
    job.setNumReduceTasks(mNumSplits);
}
From source file:com.moz.fiji.mapreduce.output.framework.HFileReducerMapReduceJobOutput.java
License:Apache License
/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    super.configure(job);

    // Sets the Hadoop output format.
    final Configuration conf = job.getConfiguration();
    conf.set(FijiConfKeys.FIJI_OUTPUT_TABLE_URI, mJobOutput.getOutputTableURI().toString());

    // Fiji table context:
    conf.setClass(FijiConfKeys.FIJI_TABLE_CONTEXT_CLASS, HFileWriterContext.class, FijiTableContext.class);

    // Set the output path.
    FileOutputFormat.setOutputPath(job, mJobOutput.getPath());

    job.setNumReduceTasks(mJobOutput.getNumReduceTasks());
}
From source file:com.moz.fiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
public static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    FijiMRPlatformBridge.get().setTotalOrderPartitionerClass(job);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);
    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
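The example above routes through a Fiji platform bridge and the older DistributedCache API. As a point of comparison, here is a hedged sketch of the same idea, the reducer count matching the partition count, using only stock Hadoop classes; it assumes the partition file has already been written with one fewer split key than partitions, which is the TotalOrderPartitioner contract as I understand it.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSketch {
    /**
     * Wires a pre-written partition file into a job. The file should hold
     * (numPartitions - 1) split keys in sorted order, and the job then needs
     * exactly numPartitions reducers so each reducer covers one key range.
     */
    static void configureTotalOrder(Job job, Path partitionFile, int numPartitions) {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setNumReduceTasks(numPartitions);
    }
}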
From source file:com.mozilla.hadoop.Backup.java
License:Apache License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }

    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);
    job.setMapperClass(BackupMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Map-only job: mapper output goes straight to the output format.
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);

    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }

    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}
From source file:com.mozilla.main.ReadHBaseWriteHdfs.java
License:LGPL
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.job.queue.name", "prod");
    Job job = new Job(conf, "ReadHBaseWriteHDFS");
    job.setJarByClass(ReadHBaseWriteHdfs.class);

    Scan scan = new Scan();
    scan.addFamily("data".getBytes());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, ReadHBaseWriteHdfsMapper.class, Text.class,
            Text.class, job);

    job.setReducerClass(ReadHBaseWriteHdfsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1000);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[0]));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        System.out.println("DONE");
    }
    return 0;
}
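Worth noting alongside this example: the reducer count does not have to be hard-coded. A hedged sketch of the configuration-based equivalent follows (property names as commonly documented for Hadoop 2.x; verify against your version).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ReducerCountFromConf {
    public static void main(String[] args) throws Exception {
        // setNumReduceTasks(n) is backed by the "mapreduce.job.reduces"
        // property ("mapred.reduce.tasks" on Hadoop 1.x), so the same count
        // can be supplied externally, e.g. -Dmapreduce.job.reduces=1000.
        Configuration conf = new Configuration();
        conf.setInt("mapreduce.job.reduces", 1000);
        Job job = Job.getInstance(conf, "ReadHBaseWriteHDFS");
        System.out.println(job.getNumReduceTasks()); // 1000 unless reset later
    }
}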
From source file:com.mozilla.socorro.hadoop.CrashReportJob.java
License:LGPL
/**
 * @return
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut,
        Path outputPath) throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }

    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        // No reducer supplied: run as a map-only job.
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}
From source file:com.mozilla.socorro.hadoop.HardwareAccel.java
License:LGPL
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    Map<byte[], byte[]> columns = new HashMap<byte[], byte[]>();
    columns.put(PROCESSED_DATA_BYTES, JSON_BYTES);
    Job job = CrashReportJob.initJob(NAME, getConf(), HardwareAccel.class, HardwareAccelMapper.class,
            IntSumReducer.class, IntSumReducer.class, columns, Text.class, IntWritable.class,
            new Path(args[0]));
    job.setNumReduceTasks(1);
    return job;
}
From source file:com.msd.gin.halyard.tools.HalyardParallelExport.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(newOption("h", null, "Prints this help"));
    options.addOption(newOption("v", null, "Prints version"));
    options.addOption(newOption("s", "source_htable", "Source HBase table with Halyard RDF store"));
    options.addOption(newOption("q", "sparql_query",
            "SPARQL tuple or graph query with use of '" + PARALLEL_SPLIT_FUNCTION_URI + "' function"));
    options.addOption(newOption("t", "target_url",
            "file://<path>/<file_name>{0}.<ext> or hdfs://<path>/<file_name>{0}.<ext> or jdbc:<jdbc_connection>/<table_name>"));
    options.addOption(newOption("p", "property=value", "JDBC connection properties"));
    options.addOption(newOption("l", "driver_classpath", "JDBC driver classpath delimited by ':'"));
    options.addOption(newOption("c", "driver_class", "JDBC driver class name"));
    try {
        CommandLine cmd = new PosixParser().parse(options, args);
        if (args.length == 0 || cmd.hasOption('h')) {
            printHelp(options);
            return -1;
        }
        if (cmd.hasOption('v')) {
            Properties p = new Properties();
            try (InputStream in = HalyardExport.class
                    .getResourceAsStream("/META-INF/maven/com.msd.gin.halyard/hbasesail/pom.properties")) {
                if (in != null) p.load(in);
            }
            System.out.println("Halyard Parallel Export version " + p.getProperty("version", "unknown"));
            return 0;
        }
        if (!cmd.getArgList().isEmpty())
            throw new ExportException("Unknown arguments: " + cmd.getArgList().toString());
        for (char c : "sqt".toCharArray()) {
            if (!cmd.hasOption(c))
                throw new ExportException("Missing mandatory option: " + c);
        }
        for (char c : "sqtlc".toCharArray()) {
            String s[] = cmd.getOptionValues(c);
            if (s != null && s.length > 1)
                throw new ExportException("Multiple values for option: " + c);
        }
        String source = cmd.getOptionValue('s');
        String query = cmd.getOptionValue('q');
        if (!query.contains(PARALLEL_SPLIT_FUNCTION_NAME)) {
            throw new ExportException("Parallel export SPARQL query must contain '"
                    + PARALLEL_SPLIT_FUNCTION_URI + "' function.");
        }
        String target = cmd.getOptionValue('t');
        if ((target.startsWith("file:") || target.startsWith("hdfs:")) && !target.contains("{0}")) {
            throw new ExportException(
                    "Parallel export file target must contain '{0}' counter in the file path or name.");
        }
        getConf().set(SOURCE, source);
        getConf().set(QUERY, query);
        getConf().set(TARGET, target);
        String driver = cmd.getOptionValue('c');
        if (driver != null) {
            getConf().set(JDBC_DRIVER, driver);
        }
        String props[] = cmd.getOptionValues('p');
        if (props != null) {
            for (int i = 0; i < props.length; i++) {
                props[i] = Base64.encodeBase64String(props[i].getBytes(UTF8));
            }
            getConf().setStrings(JDBC_PROPERTIES, props);
        }
        TableMapReduceUtil.addDependencyJars(getConf(), HalyardExport.class, NTriplesUtil.class, Rio.class,
                AbstractRDFHandler.class, RDFFormat.class, RDFParser.class, HTable.class,
                HBaseConfiguration.class, AuthenticationProtos.class, Trace.class);
        HBaseConfiguration.addHbaseResources(getConf());
        Job job = Job.getInstance(getConf(), "HalyardParallelExport " + source + " -> " + target);
        String cp = cmd.getOptionValue('l');
        if (cp != null) {
            String jars[] = cp.split(":");
            for (int i = 0; i < jars.length; i++) {
                File f = new File(jars[i]);
                if (!f.isFile())
                    throw new ExportException("Invalid JDBC driver classpath element: " + jars[i]);
                job.addFileToClassPath(new Path(f.toURI()));
                jars[i] = f.getName();
            }
            job.getConfiguration().setStrings(JDBC_CLASSPATH, jars);
        }
        job.setJarByClass(HalyardParallelExport.class);
        job.setMaxMapAttempts(1);
        job.setMapperClass(ParallelExportMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Void.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(IndexedInputFormat.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            LOG.info("Parallel Export Completed..");
            return 0;
        }
        return -1;
    } catch (RuntimeException exp) {
        System.out.println(exp.getMessage());
        printHelp(options);
        throw exp;
    }
}
From source file:com.mycompany.hadooptrain.WordCount.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Path inputPath = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    // Create configuration
    Configuration conf = new Configuration(true);

    // Create job
    Job job = new Job(conf, "WordCount");
    job.setJarByClass(WordCountMapper.class);

    // Setup MapReduce
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    System.exit(code);
}
From source file:com.nearinfinity.blur.mapreduce.BlurTask.java
License:Apache License
public Job configureJob(Configuration configuration) throws IOException {
    if (getIndexingType() == INDEXING_TYPE.UPDATE) {
        checkTable();
    }
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    DataOutputStream output = new DataOutputStream(os);
    write(output);
    output.close();
    String blurTask = new String(Base64.encodeBase64(os.toByteArray()));
    configuration.set(BLUR_BLURTASK, blurTask);

    Job job = new Job(configuration, "Blur Indexer");
    job.setReducerClass(BlurReducer.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BlurMutate.class);
    job.setNumReduceTasks(getNumReducers(configuration));
    return job;
}