List of usage examples for org.apache.hadoop.mapreduce.Job#setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
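Before the project-specific examples below, here is a minimal, self-contained sketch (not taken from any of these sources) of where setNumReduceTasks typically sits in a driver. It uses Hadoop's built-in identity Mapper and Reducer so it compiles without project classes; the input and output paths are assumed to come from the command line.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class NumReduceTasksSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "identity pass-through");
        job.setJarByClass(NumReduceTasksSketch.class);

        // Identity mapper/reducer: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class); // TextInputFormat keys are byte offsets
        job.setOutputValueClass(Text.class);

        // Must be called before the job is submitted (IllegalStateException otherwise);
        // 0 makes the job map-only, larger values control how many reducers (and output partitions) run.
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

In the examples that follow, setNumReduceTasks(1) is used to force a single reducer (one output file), and setNumReduceTasks(0) produces a map-only job.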
From source file:edu.american.student.foreman.HadoopForeman.java
License:Open Source License
/**
 * Creates a native Hadoop job from a HadoopJobConfiguration.
 * @param conf the job configuration to translate
 * @return the configured Job
 * @throws HadoopException if the job cannot be configured
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public Job getHadoopJob(HadoopJobConfiguration conf) throws HadoopException {
    Job job;
    try {
        job = new Job();
        job.setJobName(conf.getJobName());
        job.setMapperClass(conf.getMapperClass());
        job.setInputFormatClass((Class<? extends InputFormat>) conf.getInputFormatClass());
        if (conf.getOutputFormatClass() != null) {
            job.setOutputFormatClass((Class<? extends OutputFormat>) conf.getOutputFormatClass());
        }
        if (conf.getOutputKeyClass() != null) {
            job.setOutputKeyClass(conf.getOutputKeyClass());
        }
        if (conf.getOutputValueClass() != null) {
            job.setOutputValueClass(conf.getOutputValueClass());
        }
        if (conf.getReducerClass() != null) {
            job.setReducerClass(conf.getReducerClass());
        }
        job.setNumReduceTasks(conf.getNumReduceTasks());
        Configuration conf1 = job.getConfiguration();
        if (conf.getInputFormatClass() == AccumuloInputFormat.class) {
            AccumuloInputFormat.setInputInfo(conf1, Constants.getAccumuloUser(),
                    Constants.getAccumuloPassword().getBytes(), conf.getDefaultTable(), conf.getDefaultAuths());
            AccumuloInputFormat.setZooKeeperInstance(conf1, Constants.getZookeeperInstanceName(),
                    Constants.getZookeeperInstance());
        }
        if (conf.getFetchColumns() != null) {
            AccumuloInputFormat.fetchColumns(conf1, conf.getFetchColumns());
        } else if (conf.getInputFormatClass() == TextInputFormat.class) {
            if (conf.getPathToProcess() != null) {
                FileInputFormat.setInputPaths(job, conf.getPathToProcess());
            }
        }
        if (conf.getOutputFormatClass() == AccumuloOutputFormat.class) {
            AccumuloOutputFormat.setOutputInfo(conf1, Constants.getAccumuloUser(),
                    Constants.getAccumuloPassword().getBytes(), true, conf.getDefaultTable());
            AccumuloOutputFormat.setZooKeeperInstance(conf1, Constants.getZookeeperInstanceName(),
                    Constants.getZookeeperInstance());
        }
        return job;
    } catch (IOException e) {
        String gripe = "Could not configure a Hadoop job";
        log.log(Level.SEVERE, gripe, e);
        throw new HadoopException(gripe, e);
    }
}
From source file:edu.american.student.mnemosyne.core.util.foreman.HadoopForeman.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public Job getHadoopJob(HadoopJobConfiguration conf) throws HadoopException { Job job; try {// ww w .j a v a 2 s.c om job = new Job(); DistributedCache.setCacheArchives( new URI[] { new URI("/cache/accumulo-core-1.4.1.jar"), new URI("/cache/accumulo-server-1.4.1.jar"), new URI("/cache/accumulo-start-1.4.1.jar"), new URI("/cache/cloudtrace-1.4.1.jar"), new URI("/cache/commons-collections-3.2.jar"), new URI("/cache/commons-configuration-1.5.jar"), new URI("/cache/commons-io-1.4.jar"), new URI("/cache/commons-jci-core-1.0.jar"), new URI("/cache/commons-jci-fam-1.0.jar"), new URI("/cache/commons-lang-2.4.jar"), new URI("/cache/commons-logging-1.0.4.jar"), new URI("/cache/commons-logging-api-1.0.4.jar"), new URI("/cache/jline-0.9.94.jar"), new URI("/cache/libthrift-0.6.1.jar"), new URI("/cache/log4j-1.2.16.jar") }, job.getConfiguration()); job.setJobName(conf.getJobName()); System.out.println("Setting jar class " + conf.getJarClass()); ((JobConf) job.getConfiguration()).setJar("/opt/mnemosyne.jar"); job.setJarByClass(conf.getJarClass()); job.setMapperClass(conf.getMapperClass()); job.setInputFormatClass((Class<? extends InputFormat>) conf.getInputFormatClass()); if (conf.getOutputFormatClass() != null) { job.setOutputFormatClass((Class<? extends OutputFormat>) conf.getOutputFormatClass()); } if (conf.getOutputKeyClass() != null) { job.setOutputKeyClass(conf.getOutputKeyClass()); } if (conf.getOutputValueClass() != null) { job.setOutputValueClass(conf.getOutputValueClass()); } if (conf.getReducerClass() != null) { job.setReducerClass(conf.getReducerClass()); } job.setNumReduceTasks(conf.getNumReduceTasks()); Configuration conf1 = job.getConfiguration(); if (conf.getInputFormatClass() == AccumuloInputFormat.class) { AccumuloInputFormat.setInputInfo(conf1, MnemosyneConstants.getAccumuloUser(), MnemosyneConstants.getAccumuloPassword().getBytes(), conf.getDefaultTable(), conf.getDefaultAuths()); AccumuloInputFormat.setZooKeeperInstance(conf1, MnemosyneConstants.getZookeeperInstanceName(), MnemosyneConstants.getZookeeperInstance()); } if (conf.getFetchColumns() != null) { AccumuloInputFormat.fetchColumns(conf1, conf.getFetchColumns()); } else if (conf.getInputFormatClass() == TextInputFormat.class) { if (conf.getPathToProcess() != null) { FileInputFormat.setInputPaths(job, conf.getPathToProcess()); } } if (conf.getOutputFormatClass() == AccumuloOutputFormat.class) { AccumuloOutputFormat.setOutputInfo(conf1, MnemosyneConstants.getAccumuloUser(), MnemosyneConstants.getAccumuloPassword().getBytes(), true, conf.getDefaultTable()); AccumuloOutputFormat.setZooKeeperInstance(conf1, MnemosyneConstants.getZookeeperInstanceName(), MnemosyneConstants.getZookeeperInstance()); } return job; } catch (IOException e) { String gripe = "Could not configure a Hadoop job"; log.log(Level.SEVERE, gripe, e); throw new HadoopException(gripe, e); } catch (URISyntaxException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; }
From source file:edu.berkeley.amplab.adam.modules.ConvertFilesMR.java
License:Apache License
@Override
public int moduleRun() throws Exception {
    Configuration configuration = new Configuration();
    configuration.set(CONFIG_KEY_OUTPUT, outputPath);

    Job job = new Job(configuration);
    job.setNumReduceTasks(1);
    job.setMapperClass(ConvertFileMapper.class);
    job.setReducerClass(ConvertFileReducer.class);

    BAMInputFormat.setInputPaths(job, new Path(inputPath));
    job.setInputFormatClass(BAMInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath).getParent());
    AvroJob.setMapOutputValueSchema(job, ADAMRecord.SCHEMA$);

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.bigdata.training.serialization.UserHistory.java
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "postHistory");
    job.setJarByClass(UserHistory.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    MultipleInputs.addInputPath(job, new Path("input/posts/user_info.txt"), TextInputFormat.class,
            UserCityMapper.class);
    MultipleInputs.addInputPath(job, new Path("input/posts/user_posts.txt"), TextInputFormat.class,
            UserPostsMapper.class);

    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, UserPostSummary.getClassSchema());
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);

    Path outPath = new Path("output/user/posts");
    FileOutputFormat.setOutputPath(job, outPath);
    job.setReducerClass(UserPostHistory.class);
    //outPath.getFileSystem(job.getConfiguration()).delete(outPath, true);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:edu.cuhk.hccl.hadoop.HadoopApp.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args == null || args.length < 6) {
        System.out.println(
                "Please specify parameters: input, output, domain, num-reducers, similarity, range [, business-file]!");
        System.exit(-1);
    }
    String input = args[0];
    String output = args[1];
    String domain = args[2];
    int numReducers = Integer.parseInt(args[3]);
    float similarity = Float.parseFloat(args[4]);
    int range = Integer.parseInt(args[5]);

    Job job = new Job(new Configuration(), this.getClass().getSimpleName());
    // Must come after job creation, since the Job copies its Configuration
    Configuration conf = job.getConfiguration();

    // Reuse the JVM across tasks
    conf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
    conf.setFloat("SIM_THRESHOLD", similarity);
    conf.setInt("SEARCH_RANGE", range);

    if (domain.equalsIgnoreCase("restaurant")) {
        conf.setStrings("ASPECTS", Constant.RESTAURANT_ASPECTS);
        job.setMapperClass(YelpMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        // args[6] is the business file used to select matching business_ids for restaurants
        String busiFile = args[6];
        DistributedCache.addCacheFile(new URI(busiFile), conf);
    } else if (domain.equalsIgnoreCase("hotel")) {
        conf.setStrings("ASPECTS", Constant.TRIPADVISOR_ASPECTS);
        job.setMapperClass(TripAdvisorMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
        System.out.println("Wrong domain type!");
        System.exit(-1);
    }

    job.setJarByClass(HadoopApp.class);
    job.setReducerClass(ReviewReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(UserItemPair.class);
    job.setOutputValueClass(NounPhrase.class);

    // Delete the output directory if it already exists
    Path outputDir = new Path(output);
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.indiana.cs.b649.HadoopBlast.java
License:Open Source License
/**
 * Launch the MapReduce computation.
 * This method first removes any previous working directories and creates a new one,
 * then copies the data (file names) to this new directory and launches the
 * MapReduce (map-only, though) computation.
 * @param numReduceTasks - Number of reduce tasks = 0.
 * @param binAndDbArchive - the uploaded database archive filename on HDFS
 * @param execName - Name of the binary executable.
 * @param workingDir - the local disk working directory used when processing the *.fa files downloaded from HDFS
 * @param databaseArchiveDir - The directory where the Blast+/Cap3 program is after unzipping the distributed cached archive.
 * @param databaseName - the Blast+ database name, normally "nr"
 * @param inputDir - Directory where the input data set is located on HDFS.
 * @param outputDir - Output directory to place the output on HDFS.
 * @param cmdArgs - These are the command line arguments to the Blast+ program.
 * @throws Exception - Throws any exception that occurs in this program.
 *
 * You are free to change this launch function to support your own program.
 */
void launch(int numReduceTasks, String binAndDbArchive, String execName, String workingDir,
        String databaseArchiveDir, String databaseName, String inputDir, String outputDir, String cmdArgs)
        throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, execName);

    Path hdMainDir = new Path(outputDir);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);
    Path hdOutDir = new Path(hdMainDir, "out");

    Configuration jc = job.getConfiguration();
    jc.set(Bin_DB_Archive, binAndDbArchive); // the name of the executable archive
    jc.set(EXECUTABLE, execName);
    jc.set(WORKING_DIR, workingDir);
    jc.set(DB_ARCHIVE_DIR, databaseArchiveDir);
    jc.set(DB_NAME, databaseName);
    jc.set(OUTPUT_DIR, outputDir);
    jc.set(PARAMETERS, cmdArgs);

    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    DistributedCache.addCacheArchive(new URI(Bin_DB_Archive), jc);

    /*
     * Your code here
     */
    System.out.println("so far so good");
    job.setJarByClass(HadoopBlast.class);
    job.setMapperClass(RunnerMap.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(DataFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReduceTasks);

    int exitStatus = job.waitForCompletion(true) ? 0 : 1;

    // clean the cache
    System.exit(exitStatus);
}
From source file:edu.indiana.d2i.htrc.corpus.analysis.LDAAnalysisDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /*
     * The following generic arguments should be specified on the command line:
     *
     *   -D user.args.mapping.table.filename=<mappingtable_filename>
     *   -D user.args.topics.filename=<topics_filename>
     *   -D user.args.topdoctable.capacity.stepsize=<stepSize>
     *   -D user.args.lda.state.filepath=</hdfs/path/to/lda/state/file> (set automatically)
     *   -files </local/path/to/mapping/table/file>,</local/path/to/topics/file>
     *   -libjars <dependent jars> (if any)
     */
    if (args.length != 2) {
        System.err.printf(
                "Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory/prefix> <path/to/property/file>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC LDA Analysis");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(LDAAnalysisDriver.class);
    job.setMapperClass(LDAAnalysisMapper.class);
    job.setReducerClass(LDAAnalysisReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LDAState.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    /* set the number of reduce tasks to 1 */
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.corpus.wordset.ComposeWordsetDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf(
                "Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC Composing Wordset");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(ComposeWordsetDriver.class);
    job.setMapperClass(ComposeWordsetMapper.class);
    job.setReducerClass(ComposeWordsetReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    /* Only one reducer */
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();
    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();

    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    // FileSystem fs = FileSystem.get(getConf());
    // Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    // BufferedWriter writer = new BufferedWriter(
    //         new OutputStreamWriter(fs.create(dictionaryPath, true)));
    // BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    // String line = null;
    // while ((line = reader.readLine()) != null) {
    //     writer.write(line + "\n");
    // }
    // writer.close();

    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage parallely.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    // Path dictionaryPath = new Path(dictionaryFile);
    // DistributedCache.setCacheFiles(new URI[] { dictionaryPath.toUri() }, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}