Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException 

Document

Set the number of reduce tasks for the job.
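
A minimal sketch of the call (the job name below is illustrative, not taken from the examples on this page): passing 0 makes the job map-only, while a positive value requests a fixed number of reduce tasks. The method throws IllegalStateException if the job has already been submitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Minimal sketch: choose the reducer count before submitting the job.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "setNumReduceTasks-example"); // illustrative job name
job.setNumReduceTasks(0);    // 0 = map-only; mapper output goes straight to the OutputFormat
// job.setNumReduceTasks(4); // or request a fixed number of reduce tasks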

Usage

From source file:com.shopping.hbase.mapreduce.Import.java

License:Apache License

/**
 * Job configuration.
 */
protected Job configureJob(Configuration conf, String inputPathName, String tableName) throws IOException {
    Path inputPath = new Path(inputPathName);
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Importer.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Importer.class);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.shopping.hbase.sample.mapreduce.SampleUploader.java

License:Apache License

/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers.  Just write straight to table.  Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.shopzilla.hadoop.mapreduce.MiniMRClusterContextMRTest.java

License:Apache License

@Test
public void testWordCount() throws Exception {
    Path input = new Path("/user/test/keywords_data");
    Path output = new Path("/user/test/word_count");

    Job job = new Job(configuration);

    job.setJobName("Word Count Test");

    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(SumReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    assertTrue("All files from /data classpath directory should have been copied into HDFS",
            miniMRClusterContext.getFileSystem().exists(input));

    job.waitForCompletion(true);

    assertTrue("Output file should have been created", miniMRClusterContext.getFileSystem().exists(output));

    final LinkedList<String> expectedLines = new LinkedList<String>();
    expectedLines.add("goodbye\t1");
    expectedLines.add("hello\t1");
    expectedLines.add("world\t2");

    miniMRClusterContext.processData(output, new Function<String, Void>() {
        @Override
        public Void apply(String line) {
            assertEquals(expectedLines.pop(), line);
            return null;
        }
    });
    assertEquals(0, expectedLines.size());
}

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("indexSizes", null, "index sizes Path", true);
    addOption("startIteration", null, "start iteration number", String.valueOf(0));
    addOption("oldM", null, "old M matrix Path.", null);
    addOption("largeUserFeatures", null, "true if user x feature matrix is too large for memory",
            String.valueOf(true));
    addOption("rmseCurve", null, "true if want to extract rmse curve", String.valueOf(true));
    addOption("cleanUp", null, "true if want to clean up temporary matrix", String.valueOf(true));
    addOption("useTransform", null, "true if using logarithm as transform", String.valueOf(true));
    addOption("rateIndex", null, "0 based index for rate column in input file.", String.valueOf(2));
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    try {
        /** step 0: fetch dimensions of the training set matrix. */
        Map<String, String> indexSizesTmp = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")),
                DELIMETER, Arrays.asList(0), Arrays.asList(1));

        numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
        numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
        lambda = Double.parseDouble(parsedArgs.get("--lambda"));
        alpha = Double.parseDouble(parsedArgs.get("--alpha"));
        implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
        numUsers = Integer.parseInt(indexSizesTmp.get("0"));
        numItems = Integer.parseInt(indexSizesTmp.get("1"));

        numTaskTrackers = HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks;
        startIteration = Integer.parseInt(parsedArgs.get("--startIteration"));
        largeUserFeatures = Boolean.parseBoolean(getOption("largeUserFeatures"));
        useRMSECurve = Boolean.parseBoolean(getOption("rmseCurve"));
        cleanUp = Boolean.parseBoolean(getOption("cleanUp"));
        useTransform = Boolean.parseBoolean(getOption("useTransform"));
        rateIndex = Integer.parseInt(getOption("rateIndex"));
        FileSystem fs = FileSystem.get(getConf());
        if (!fs.exists(pathToTransformed())) {
            if (useTransform) {
                // transform price into rating
                Job transformJob = prepareJob(getInputPath(), pathToTransformed(), TextInputFormat.class,
                        TransformColumnValueMapper.class, NullWritable.class, Text.class,
                        TextOutputFormat.class);
                transformJob.waitForCompletion(true);
            } else {

                FileUtil.copy(FileSystem.get(getConf()), getInputPath(), FileSystem.get(getConf()),
                        pathToTransformed(), false, getConf());
            }
        }
        /*
        if (getOption("oldM") != null) {
          runOnetimeSolver(pathToTransformed(), getOutputPath("U"), new Path(getOption("oldM")));
          return 0;
        }
        */
        /*
         * compute the factorization A = U M'
         *
         * where A (users x items) is the matrix of known ratings
         *       U (users x features) is the representation of users in the feature space
         *       M (items x features) is the representation of items in the feature space
         */
        if (startIteration == 0) {
            if (!fs.exists(pathToItemRatings())) {
                // create A' 
                Job itemRatings = prepareJob(pathToTransformed(), pathToItemRatings(), TextInputFormat.class,
                        ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class,
                        VectorSumReducer.class, IntWritable.class, VectorWritable.class,
                        SequenceFileOutputFormat.class);
                itemRatings.setCombinerClass(VectorSumReducer.class);
                long matrixSizeExp = (long) (8L * numUsers * numFeatures * SAFE_MARGIN);
                long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT
                        / (long) HadoopClusterUtil.MAP_TASKS_PER_NODE;
                int numTaskPerDataNode = Math.max(1,
                        (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (double) matrixSizeExp));
                //log.info("matrix Size: " + matrixSizeExp + ", memorhThreshold: " + memoryThreshold + ", numTaskPerDataNode: " + numTaskPerDataNode);
                if (matrixSizeExp > memoryThreshold) {
                    //log.info("A: {}", numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
                    int numReducer = Math.min(
                            numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()),
                            HadoopClusterUtil.getMaxMapTasks(getConf()));
                    //log.info("Number Of Reducer: " + numReducer);
                    itemRatings.setNumReduceTasks(numReducer);
                }

                itemRatings.waitForCompletion(true);
            }

            if (!fs.exists(pathToUserRatings())) {
                Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
                        IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                        VectorWritable.class);
                userRatings.setNumReduceTasks(HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
                userRatings.setCombinerClass(MergeVectorsCombiner.class);
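                // note: the call below overrides the reducer count set two lines above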
                userRatings.setNumReduceTasks(HadoopClusterUtil.getMaxMapTasks(getConf()));
                userRatings.waitForCompletion(true);
            }
            if (!fs.exists(getOutputPath("userItemCnt"))) {
                // count item per user
                Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnt"),
                        SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class,
                        IntWritable.class, SequenceFileOutputFormat.class);
                userItemCntsJob.setJobName("user ratings count");
                userItemCntsJob.waitForCompletion(true);
            }

            if (!fs.exists(getTempPath("averageRatings"))) {
                //TODO this could be fiddled into one of the upper jobs
                Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
                        AverageRatingMapper.class, IntWritable.class, VectorWritable.class,
                        MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
                averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
                averageItemRatings.waitForCompletion(true);
            }
            if (!fs.exists(new Path(pathToM(-1), "part-m-00000"))) {
                Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());

                /** create an initial M */
                initializeM(averageRatings);
            }
        }

        for (int currentIteration = startIteration; currentIteration < numIterations; currentIteration++) {
            DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
                    getTempPath("Mtemp/tmp-" + String.valueOf(currentIteration - 1) + "/M"), numItems,
                    numFeatures);
            curM.setConf(getConf());
            DistributedRowMatrix YtransposeY = curM.times(curM);
            /** broadcast M, read A row-wise, recompute U row-wise */
            log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                    YtransposeY.getRowPath(), numItems, false);

            DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
                    getTempPath("Utmp/tmp-" + String.valueOf(currentIteration) + "/U"), numUsers, numFeatures);
            curU.setConf(getConf());
            DistributedRowMatrix XtransposeX = curU.times(curU);

            /** broadcast U, read A' row-wise, recompute M row-wise */
            log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                    XtransposeX.getRowPath(), numUsers, largeUserFeatures);

            /** calculate rmse on each updated matrix U, M and decide to further iteration */
            if (currentIteration > startIteration && useRMSECurve) {
                Pair<Integer, Double> UsquaredError = calculateMatrixDistanceSquared(
                        pathToU(currentIteration - 1), pathToU(currentIteration), currentIteration);
                Pair<Integer, Double> MsquaredError = calculateMatrixDistanceSquared(
                        pathToM(currentIteration - 1), pathToM(currentIteration), currentIteration);
                String currentRMSE = currentIteration + DELIMETER + UsquaredError.getFirst() + DELIMETER
                        + UsquaredError.getSecond() + DELIMETER + MsquaredError.getFirst() + DELIMETER
                        + MsquaredError.getSecond() + DefaultOptionCreator.NEWLINE;
                rmsePerIteration += currentRMSE;
                log.info("iteration {}: {}", currentIteration, currentRMSE);
            }
            if (currentIteration >= startIteration + 2 && cleanUp) {
                fs.deleteOnExit(pathToU(currentIteration - 2));
                fs.deleteOnExit(pathToM(currentIteration - 2));
            }
        }
        return 0;
    } catch (Exception e) {
        e.printStackTrace();
        return -1;
    } finally {
        if (useRMSECurve) {
            HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("RMSE"), rmsePerIteration);
        }
    }
}

From source file:com.skp.experiment.cf.als.hadoop.UploloadToHbaseTableJob.java

License:Apache License

/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers.  Just write straight to table.  Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output Splice table name; the format should be Schema.tableName.
 * @param reducer  The reducer class to use.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary configuration.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 * output to the cluster that is designated in <code>hbase-site.xml</code>.
 * Set this String to the zookeeper ensemble of an alternate remote cluster
 * when you would have the reduce write to a cluster other than the
 * default; e.g. copying tables between clusters, the source would be
 * designated by <code>hbase-site.xml</code> and this param would have the
 * ensemble address of the remote cluster.  The format to pass is particular.
 * Pass <code> &lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;
 * </code> such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.client
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 * @throws SQLException
 */
public static void initTableReducerJob(String table, Class<? extends Reducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars, Class<? extends OutputFormat> outputformatClass) throws IOException {

    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(outputformatClass);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(MRConstants.SPLICE_OUTPUT_TABLE_NAME, table);
    if (sqlUtil == null)
        sqlUtil = SMSQLUtil.getInstance(conf.get(MRConstants.SPLICE_JDBC_STR));
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    String hbaseTableID = null;
    try {
        hbaseTableID = sqlUtil.getConglomID(table);
    } catch (SQLException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        throw new IOException(e);
    }
    conf.set(MRConstants.HBASE_OUTPUT_TABLE_NAME, table);

    if (quorumAddress != null) {
        // Calling this will validate the format
        HBasePlatformUtils.validateClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);

    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Object.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        // TODO Where are the keys?
        int regions = getReduceNumberOfRegions(hbaseTableID);
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(regions);
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
        addDependencyJars(job);
    }

    //initCredentials(job);
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Ensures that the given number of reduce tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table  The Splice table to get the region count for.
 * @param job  The current job to adjust.
 * @throws IOException When retrieving the table details fails.
 * @throws SQLException When retrieving the Splice conglomerate ID fails.
 */
public static void limitNumReduceTasks(String table, Job job) throws IOException, SQLException {
    Configuration conf = job.getConfiguration();
    if (sqlUtil == null)
        sqlUtil = SMSQLUtil.getInstance(conf.get(MRConstants.SPLICE_JDBC_STR));
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    String hbaseTableID = sqlUtil.getConglomID(table);
    int regions = getReduceNumberOfRegions(hbaseTableID);
    if (job.getNumReduceTasks() > regions)
        job.setNumReduceTasks(regions);
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Sets the number of reduce tasks for the given job configuration to the
 * number of regions the given table has.
 *
 * @param table  The Splice table to get the region count for.
 * @param job  The current job to adjust.
 * @throws IOException When retrieving the table details fails.
 * @throws SQLException When retrieving the Splice conglomerate ID fails.
 */
public static void setNumReduceTasks(String table, Job job) throws IOException, SQLException {
    Configuration conf = job.getConfiguration();
    if (sqlUtil == null)
        sqlUtil = SMSQLUtil.getInstance(conf.get(MRConstants.SPLICE_JDBC_STR));
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    String hbaseTableID = sqlUtil.getConglomID(table);
    int regions = getReduceNumberOfRegions(hbaseTableID);
    job.setNumReduceTasks(regions);
}
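
A brief usage sketch for the two SpliceTableMapReduceUtil helpers above (the table name is an assumption): limitNumReduceTasks only lowers an oversized reducer count, while setNumReduceTasks pins the count to the table's region count.

// Hypothetical calls against an already-configured Job:
SpliceTableMapReduceUtil.limitNumReduceTasks("SPLICE.MY_TABLE", job); // cap at region count
SpliceTableMapReduceUtil.setNumReduceTasks("SPLICE.MY_TABLE", job);   // set equal to region count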

From source file:com.sreejith.loganalyzer.mapreduce.LogDriver.java

License:Apache License

public static void main(String[] args) throws Exception {
    Job job = new Job();
    job.setJarByClass(LogDriver.class);
    job.setJobName("Log Analyzer");

    job.setMapperClass(LogMapper.class);
    job.setPartitionerClass(LogPartitioner.class);
    job.setCombinerClass(LogReducer.class);
    job.setReducerClass(LogReducer.class);

    job.setNumReduceTasks(2);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);

}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java

License:Apache License

@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's a map-only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}