Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException

Source Link

Document

Set the number of reduce tasks for the job.

Usage

From source file:edu.isi.mavuno.app.mine.HarvestSentences.java

License:Apache License

/**
 * Configures and runs the map-only HarvestSentences job.
 *
 * <p>The pattern path, corpus path, corpus input-format class name and output path are read
 * from the tool configuration as required parameters. The pattern path is only logged by this
 * method. The job is run with zero reduce tasks, so mapper output is written directly through
 * {@code TextOutputFormat}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException propagated from waiting on the job
 * @throws IOException propagated from job setup or execution
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String patternPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.PatternPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.OutputPath", conf);

    sLogger.info("Tool name: HarvestSentences");
    sLogger.info(" - Pattern file: " + patternPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestSentences");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // The input format is chosen at run time from the configured class name.
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // Report job failure to the caller instead of unconditionally returning success.
    boolean succeeded = job.waitForCompletion(true);

    return succeeded ? 0 : 1;
}

From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java

License:Apache License

/**
 * Configures and runs the map-only ProcessStanfordNLP job, then logs its counters.
 *
 * <p>Required parameters (corpus path, corpus input-format class name, output path) and
 * optional flags (SUTime, text output) are read from the tool configuration. Output is written
 * as plain text when the text-output flag is true; otherwise as a block-compressed sequence
 * file.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException propagated from waiting on the job
 * @throws IOException propagated from job setup or execution
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // required parameters
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);

    // optional parameters
    String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);

    sLogger.info("Tool name: ProcessStanfordNLP");
    sLogger.info(" - Input path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    // Boolean.parseBoolean(null) is false, so no separate null check is needed.
    if (Boolean.parseBoolean(suTime)) {
        sLogger.info("- SUTime enabled");
    }

    boolean textOutputFormat = Boolean.parseBoolean(textOutput);
    if (textOutputFormat) {
        sLogger.info("- Text output format enabled");
    }

    Job job = new Job(conf);
    job.setJobName("ProcessStanfordNLP");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // The input format is chosen at run time from the configured class name.
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StanfordParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StanfordParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(ProcessStanfordNLP.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job, remembering whether it succeeded so the exit status reflects it
    boolean succeeded = job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());

    return succeeded ? 0 : 1;
}

From source file:edu.isi.mavuno.app.nlp.TratzParse.java

License:Apache License

/**
 * Configures and runs the map-only TratzParse job, then logs its counters.
 *
 * <p>The corpus path, corpus input-format class name and output path are read from the tool
 * configuration as required parameters. Output is written as plain text when the optional
 * text-output flag is true; otherwise as a block-compressed sequence file.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException propagated from waiting on the job
 * @throws IOException propagated from job setup or execution
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf);

    // optional parameter that allows the parsed documents to be output in text format;
    // Boolean.parseBoolean(null) is false, so an absent parameter means sequence-file output
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf);
    boolean textOutputFormat = Boolean.parseBoolean(textOutput);

    sLogger.info("Tool name: TratzParse");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("TratzParse");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // The input format is chosen at run time from the configured class name.
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TratzParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TratzParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(TratzParse.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job, remembering whether it succeeded so the exit status reflects it
    boolean succeeded = job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue());
    sLogger.info(" - Total dropped sentences: "
            + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue());
    sLogger.info(
            " - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue());
    sLogger.info(
            " - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue());
    sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue());
    sLogger.info(" - Total named entity tagging time (ms): "
            + counters.findCounter(StatCounters.NETAG_TIME).getValue());
    sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue());

    return succeeded ? 0 : 1;
}

From source file:edu.isi.mavuno.app.util.SequenceFileToText.java

License:Apache License

/**
 * Configures and runs the map-only SequenceFileToText job.
 *
 * <p>The input and output paths are read from the tool configuration as required parameters.
 * Input is read with {@code SequenceFileInputFormat} and written with {@code TextOutputFormat};
 * the job uses zero reduce tasks.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException propagated from job execution
 * @throws InterruptedException propagated from waiting on the job
 * @throws IOException propagated from job setup or execution
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.SequenceFileToText.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.SequenceFileToText.OutputPath", conf);

    sLogger.info("Tool name: SequenceFileToText");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("SequenceFileToText");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // Report job failure to the caller instead of unconditionally returning success.
    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}

From source file:edu.isi.mavuno.extract.Split.java

License:Apache License

/**
 * Configures and runs the Split job with a single reduce task.
 *
 * <p>The input path, output path and split key are read from the tool configuration as
 * required parameters. The split key is only logged by this method. Map output keys are
 * {@code ContextPatternWritable} instances, sorted with its {@code Comparator} and partitioned
 * with its {@code FullPartitioner}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException propagated from job execution
 * @throws InterruptedException propagated from waiting on the job
 * @throws IOException propagated from job setup or execution
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.Split.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Split.OutputPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.Split.SplitKey", conf);

    sLogger.info("Tool name: Split");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Split key: " + splitKey);

    Job job = new Job(conf);
    job.setJobName("Split");

    MavunoUtils.recursivelyAddInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setNumReduceTasks(1);

    // Report job failure to the caller instead of unconditionally returning success.
    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}

From source file:edu.iu.benchmark.JobLauncher.java

License:Apache License

/**
 * Builds (but does not submit) the map-only "benchmark_job".
 *
 * @param cmd value stored under {@code Constants.BENCHMARK_CMD}
 * @param bytesPerPartition value stored under {@code Constants.BYTES_PER_PARTITION}
 * @param numPartitions value stored under {@code Constants.NUM_PARTITIONS}
 * @param numMappers number of map tasks; also stored under {@code Constants.NUM_MAPPERS}
 * @param numIterations value stored under {@code Constants.NUM_ITERATIONS}
 * @param inputDirPath job input path
 * @param outputDirPath job output path
 * @return the configured job
 */
private Job configureBenchmarkJob(String cmd, int bytesPerPartition, int numPartitions, int numMappers,
        int numIterations, Path inputDirPath, Path outputDirPath) throws IOException, URISyntaxException {
    final Job benchmarkJob = Job.getInstance(getConf(), "benchmark_job");

    // Input/output wiring and job classes.
    FileInputFormat.setInputPaths(benchmarkJob, inputDirPath);
    FileOutputFormat.setOutputPath(benchmarkJob, outputDirPath);
    benchmarkJob.setInputFormatClass(SingleFileInputFormat.class);
    benchmarkJob.setJarByClass(JobLauncher.class);
    benchmarkJob.setMapperClass(BenchmarkMapper.class);

    // Framework selection and task counts; the job runs without reducers.
    final JobConf settings = (JobConf) benchmarkJob.getConfiguration();
    settings.set("mapreduce.framework.name", "map-collective");
    settings.setNumMapTasks(numMappers);
    benchmarkJob.setNumReduceTasks(0);

    // Benchmark parameters handed to the tasks through the job configuration.
    settings.set(Constants.BENCHMARK_CMD, cmd);
    settings.setInt(Constants.BYTES_PER_PARTITION, bytesPerPartition);
    settings.setInt(Constants.NUM_PARTITIONS, numPartitions);
    settings.setInt(Constants.NUM_MAPPERS, numMappers);
    settings.setInt(Constants.NUM_ITERATIONS, numIterations);

    return benchmarkJob;
}

From source file:edu.iu.ccd.CCDLauncher.java

License:Apache License

/**
 * Builds (but does not submit) the map-only job named {@code "ccd_job_" + jobID}.
 *
 * <p>The algorithm parameters are written into the supplied {@code configuration} before the
 * job is created from it, so the caller's configuration object is modified.
 *
 * @param inputDir job input path
 * @param r value stored under {@code Constants.R}
 * @param lambda value stored under {@code Constants.LAMBDA}
 * @param numIterations value stored under {@code Constants.NUM_ITERATIONS}
 * @param numMapTasks number of map tasks
 * @param numThreadsPerWorker value stored under {@code Constants.NUM_THREADS}
 * @param numModelSlices value stored under {@code Constants.NUM_MODEL_SLICES}
 * @param modelDir path whose string form is stored under {@code Constants.MODEL_DIR}
 * @param outputDir job output path
 * @param testFilePath value stored under {@code Constants.TEST_FILE_PATH}
 * @param configuration configuration the job is created from
 * @param jobID suffix of the job name
 * @return the configured job
 */
private Job configureCCDJob(Path inputDir, int r, double lambda, int numIterations, int numMapTasks,
        int numThreadsPerWorker, int numModelSlices, Path modelDir, Path outputDir, String testFilePath,
        Configuration configuration, int jobID) throws IOException, URISyntaxException {
    // Algorithm parameters must be in place before the job is created from the configuration.
    configuration.setInt(Constants.R, r);
    configuration.setDouble(Constants.LAMBDA, lambda);
    configuration.setInt(Constants.NUM_ITERATIONS, numIterations);
    configuration.setInt(Constants.NUM_THREADS, numThreadsPerWorker);

    final String modelDirName = modelDir.toString();
    System.out.println("Model Dir Path: " + modelDirName);
    configuration.set(Constants.MODEL_DIR, modelDirName);
    configuration.setInt(Constants.NUM_MODEL_SLICES, numModelSlices);
    configuration.set(Constants.TEST_FILE_PATH, testFilePath);

    final Job ccdJob = Job.getInstance(configuration, "ccd_job_" + jobID);

    // Framework selection and task-level settings.
    final JobConf settings = (JobConf) ccdJob.getConfiguration();
    settings.set("mapreduce.framework.name", "map-collective");
    settings.setNumMapTasks(numMapTasks);
    settings.setInt("mapreduce.job.max.split.locations", 10000);

    // Input/output wiring and job classes; the job runs without reducers.
    FileInputFormat.setInputPaths(ccdJob, inputDir);
    FileOutputFormat.setOutputPath(ccdJob, outputDir);
    ccdJob.setInputFormatClass(MultiFileInputFormat.class);
    ccdJob.setJarByClass(CCDLauncher.class);
    ccdJob.setMapperClass(CCDMPCollectiveMapper.class);
    ccdJob.setNumReduceTasks(0);

    return ccdJob;
}

From source file:edu.iu.daal_cov.COVDaalLauncher.java

License:Apache License

/**
 * Builds (but does not submit) the map-only "cov_job".
 *
 * <p>The mapper and thread counts are written into the supplied {@code configuration} before
 * the job is created from it. The JVM heap options for the map tasks are derived from
 * {@code mem}: half of what remains after subtracting 2000 MB becomes both {@code -Xmx} and
 * {@code -Xms}, and a quarter of that becomes {@code -Xmn}.
 *
 * @param inputDir job input path
 * @param mem value for {@code mapreduce.map.collective.memory.mb}; must be greater than 2000
 * @param numMapTasks number of map tasks; also stored under {@code Constants.NUM_MAPPERS}
 * @param numThreadsPerWorker value stored under {@code Constants.NUM_THREADS}
 * @param modelDir not used by this method
 * @param outputDir job output path
 * @param configuration configuration the job is created from
 * @return the configured job
 * @throws IllegalArgumentException if {@code mem} is too small to yield a positive heap size
 */
private Job configureCOVJob(Path inputDir, int mem, int numMapTasks, int numThreadsPerWorker, Path modelDir,
        Path outputDir, Configuration configuration) throws IOException, URISyntaxException {

    // A non-positive heap size would produce invalid JVM options such as "-Xmx0m" or "-Xmx-500m",
    // so reject it here rather than letting task JVMs fail to start.
    if (mem <= 2000) {
        throw new IllegalArgumentException(
                "mem must be greater than 2000 MB to derive a positive heap size, but was " + mem);
    }

    configuration.setInt(Constants.NUM_MAPPERS, numMapTasks);
    configuration.setInt(Constants.NUM_THREADS, numThreadsPerWorker);

    Job job = Job.getInstance(configuration, "cov_job");
    JobConf jobConf = (JobConf) job.getConfiguration();

    jobConf.set("mapreduce.framework.name", "map-collective");

    jobConf.setInt("mapreduce.job.max.split.locations", 10000);

    jobConf.setInt("mapreduce.map.collective.memory.mb", mem);

    // Heap is half of the memory left after reserving 2000 MB; young generation is a quarter of the heap.
    int xmx = (int) Math.ceil((mem - 2000) * 0.5);
    int xmn = (int) Math.ceil(0.25 * xmx);
    jobConf.set("mapreduce.map.collective.java.opts",
            "-Xmx" + xmx + "m -Xms" + xmx + "m" + " -Xmn" + xmn + "m");

    jobConf.setNumMapTasks(numMapTasks);

    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setInputFormatClass(MultiFileInputFormat.class);
    job.setJarByClass(COVDaalLauncher.class);
    job.setMapperClass(COVDaalCollectiveMapper.class);
    job.setNumReduceTasks(0);

    System.out.println("Launcher launched");
    return job;
}

From source file:edu.iu.daal_linreg.LinRegDaalLauncher.java

License:Apache License

/**
 * Builds (but does not submit) the map-only "linreg_job".
 *
 * <p>The test paths, mapper count, thread count and batch size are written into the supplied
 * {@code configuration} before the job is created from it. The JVM heap options for the map
 * tasks are derived from {@code mem}: half of what remains after subtracting 2000 MB becomes
 * both {@code -Xmx} and {@code -Xms}, and a quarter of that becomes {@code -Xmn}.
 *
 * @param inputDir job input path
 * @param testDirPath value stored under {@code Constants.TEST_FILE_PATH}
 * @param testGroundTruthDirPath value stored under {@code Constants.TEST_TRUTH_PATH}
 * @param mem value for {@code mapreduce.map.collective.memory.mb}; must be greater than 2000
 * @param batchSize value stored under {@code Constants.BATCH_SIZE}
 * @param numMapTasks number of map tasks; also stored under {@code Constants.NUM_MAPPERS}
 * @param numThreadsPerWorker value stored under {@code Constants.NUM_THREADS}
 * @param modelDir not used by this method
 * @param outputDir job output path
 * @param configuration configuration the job is created from
 * @return the configured job
 * @throws IllegalArgumentException if {@code mem} is too small to yield a positive heap size
 */
private Job configureLinRegJob(Path inputDir, String testDirPath, String testGroundTruthDirPath, int mem,
        int batchSize, int numMapTasks, int numThreadsPerWorker, Path modelDir, Path outputDir,
        Configuration configuration) throws IOException, URISyntaxException {

    // A non-positive heap size would produce invalid JVM options such as "-Xmx0m" or "-Xmx-500m",
    // so reject it here rather than letting task JVMs fail to start.
    if (mem <= 2000) {
        throw new IllegalArgumentException(
                "mem must be greater than 2000 MB to derive a positive heap size, but was " + mem);
    }

    configuration.set(Constants.TEST_FILE_PATH, testDirPath);
    configuration.set(Constants.TEST_TRUTH_PATH, testGroundTruthDirPath);
    configuration.setInt(Constants.NUM_MAPPERS, numMapTasks);
    configuration.setInt(Constants.NUM_THREADS, numThreadsPerWorker);
    configuration.setInt(Constants.BATCH_SIZE, batchSize);

    Job job = Job.getInstance(configuration, "linreg_job");
    JobConf jobConf = (JobConf) job.getConfiguration();

    jobConf.set("mapreduce.framework.name", "map-collective");

    jobConf.setInt("mapreduce.job.max.split.locations", 10000);

    jobConf.setInt("mapreduce.map.collective.memory.mb", mem);

    // Heap is half of the memory left after reserving 2000 MB; young generation is a quarter of the heap.
    int xmx = (int) Math.ceil((mem - 2000) * 0.5);
    int xmn = (int) Math.ceil(0.25 * xmx);
    jobConf.set("mapreduce.map.collective.java.opts",
            "-Xmx" + xmx + "m -Xms" + xmx + "m" + " -Xmn" + xmn + "m");

    jobConf.setNumMapTasks(numMapTasks);

    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setInputFormatClass(MultiFileInputFormat.class);
    job.setJarByClass(LinRegDaalLauncher.class);
    job.setMapperClass(LinRegDaalCollectiveMapper.class);
    job.setNumReduceTasks(0);

    System.out.println("Launcher launched");
    return job;
}

From source file:edu.iu.daal_mom.MOMDaalLauncher.java

License:Apache License

/**
 * Builds (but does not submit) the map-only "mom_job".
 *
 * <p>The mapper and thread counts are written into the supplied {@code configuration} before
 * the job is created from it. The JVM heap options for the map tasks are derived from
 * {@code mem}: half of what remains after subtracting 2000 MB becomes both {@code -Xmx} and
 * {@code -Xms}, and a quarter of that becomes {@code -Xmn}.
 *
 * @param inputDir job input path
 * @param mem value for {@code mapreduce.map.collective.memory.mb}; must be greater than 2000
 * @param numMapTasks number of map tasks; also stored under {@code Constants.NUM_MAPPERS}
 * @param numThreadsPerWorker value stored under {@code Constants.NUM_THREADS}
 * @param modelDir not used by this method
 * @param outputDir job output path
 * @param configuration configuration the job is created from
 * @return the configured job
 * @throws IllegalArgumentException if {@code mem} is too small to yield a positive heap size
 */
private Job configureMOMJob(Path inputDir, int mem, int numMapTasks, int numThreadsPerWorker, Path modelDir,
        Path outputDir, Configuration configuration) throws IOException, URISyntaxException {

    // A non-positive heap size would produce invalid JVM options such as "-Xmx0m" or "-Xmx-500m",
    // so reject it here rather than letting task JVMs fail to start.
    if (mem <= 2000) {
        throw new IllegalArgumentException(
                "mem must be greater than 2000 MB to derive a positive heap size, but was " + mem);
    }

    configuration.setInt(Constants.NUM_MAPPERS, numMapTasks);
    configuration.setInt(Constants.NUM_THREADS, numThreadsPerWorker);

    Job job = Job.getInstance(configuration, "mom_job");
    JobConf jobConf = (JobConf) job.getConfiguration();

    jobConf.set("mapreduce.framework.name", "map-collective");

    jobConf.setInt("mapreduce.job.max.split.locations", 10000);

    jobConf.setInt("mapreduce.map.collective.memory.mb", mem);

    // Heap is half of the memory left after reserving 2000 MB; young generation is a quarter of the heap.
    int xmx = (int) Math.ceil((mem - 2000) * 0.5);
    int xmn = (int) Math.ceil(0.25 * xmx);
    jobConf.set("mapreduce.map.collective.java.opts",
            "-Xmx" + xmx + "m -Xms" + xmx + "m" + " -Xmn" + xmn + "m");

    jobConf.setNumMapTasks(numMapTasks);

    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setInputFormatClass(MultiFileInputFormat.class);
    job.setJarByClass(MOMDaalLauncher.class);
    job.setMapperClass(MOMDaalCollectiveMapper.class);
    job.setNumReduceTasks(0);

    System.out.println("Launcher launched");
    return job;
}