Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException 

Document

Set the number of reduce tasks for the job.
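
As a quick orientation before the full examples, the sketch below (not taken from any of the source files on this page; the class name, mapper, and paths are placeholders) shows the two common uses of setNumReduceTasks: 0 for a map-only job, where mapper output goes straight to the output format, and a positive count to run that many reduce tasks. The call must be made before the job is submitted; afterwards it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetNumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setNumReduceTasks example");
        job.setJarByClass(SetNumReduceTasksExample.class);

        // Identity mapper: with the default TextInputFormat it emits
        // (LongWritable offset, Text line) pairs unchanged.
        job.setMapperClass(Mapper.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // 0 = map-only job: no shuffle or sort, mapper output is written
        // directly by the output format.
        job.setNumReduceTasks(0);
        // For a reduce phase, set a reducer class and a positive count,
        // e.g. job.setNumReduceTasks(4).

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

All but one of the examples below pass 0, turning each job into a parallel map-only transformation; the PairWiseDistance example at the end sets the count from its data partitioning instead.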

Usage

From source file: edu.indiana.d2i.htrc.io.ParallelDataCopyJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.valueOf(args[4]);

    logger.info("ParallelDataCopyJob ");
    logger.info(" - input: " + inputPath); // id list
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);

    Job job = new Job(getConf(), "Copy data from HTRC data storage parallely.");
    job.setJarByClass(ParallelDataCopyJob.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    // Map-only job: the identity mapper writes its input pairs straight to the SequenceFile output.
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("ParallelDataCopyJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromHDFSRawText.java

License: Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from plain text in HDFS.");
    job.setJarByClass(SparseVectorsFromHDFSRawText.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and lucene
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromLucene.java

License: Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Lucene.");
    job.setJarByClass(SparseVectorsFromLucene.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and lucene
    //       job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set("htrc.lucene.index.path", indexLoc);

    job.setInputFormatClass(LuceneIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromSolr.java

License: Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Solr.");
    job.setJarByClass(SparseVectorsFromSolr.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and solr
    //       job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set(HTRCConstants.SOLR_MAIN_URL, solrEPR);

    job.setInputFormatClass(SolrIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file: edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License: Apache License

private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SparseVectorsToMemcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(idListDir));

    job.waitForCompletion(true);
}

From source file: edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file: edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java

License: Apache License

private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SVFromHDFS2Memcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(vecDir));

    job.waitForCompletion(true);
}

From source file: edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License: Apache License

private static void clusterDataMR(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure, String convergenceDelta)
        throws IOException, InterruptedException, ClassNotFoundException {

    conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn.toString());
    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);

    Job job = new Job(conf, "KMeans Driver running clusterData over input: " + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WeightedPropertyVectorWritable.class);

    FileInputFormat.setInputPaths(job, input);
    HadoopUtil.delete(conf, output);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(KMeansClusterMapper.class);
    job.setNumReduceTasks(0);
    job.setJarByClass(MemCachedKMeansDriver.class);

    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("K-Means Clustering failed processing " + clustersIn);
    }
}

From source file: edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java

License: Open Source License

public int execJob(Configuration conf, String sequenceFileFullPath, String sequenceFile, String distDir)
        throws Exception {
    /* input parameters */
    LOG.info(sequenceFileFullPath);
    Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

    /* create the base dir for this job. Delete and recreates if it exists */
    Path hdMainDir = new Path(distDir + "/" + sequenceFile);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);
    Path hdInputDir = new Path(hdMainDir, "data");
    if (!fs.mkdirs(hdInputDir)) {
        throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
    }

    int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
    int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
    int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
    LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks + "\nBlock size :"
            + blockSize);

    // Retrieving the configuration from the job to set the properties.
    // Setting properties on the original conf does not work (possible
    // Hadoop bug).
    Configuration jobConf = job.getConfiguration();

    // Input dir in HDFS. Create this in newly created job base dir
    Path inputDir = new Path(hdMainDir, "input");
    if (!fs.mkdirs(inputDir)) {
        throw new IOException("Mkdirs failed to create " + inputDir.toString());
    }

    Long dataPartitionStartTime = System.nanoTime();
    partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);

    distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);

    long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
    LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

    // Output dir in HDFS
    Path hdOutDir = new Path(hdMainDir, "out");

    jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
    jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
    jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
    jobConf.set(Constants.DIST_FUNC, distFunc);

    job.setJarByClass(PairWiseDistance.class);
    job.setMapperClass(SWGMap.class);
    job.setReducerClass(SWGReduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SWGWritable.class);
    FileInputFormat.setInputPaths(job, hdInputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(noOfDivisions); // one reduce task per block division

    long startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
    LOG.info("Job Finished in " + executionTime + " seconds");
    LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput" + noOfSequences + "\t" + noOfBlocks + "\t"
            + executionTime + "\t" + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);

    return exitStatus;
}