List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
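setNumReduceTasks sets the number of reduce tasks for the job. Passing 0 makes the job map-only, which is how most of the examples below use it: mapper output is written directly by the output format. Below is a minimal sketch of a map-only job with the old mapred API; the class name, paths, and argument handling are illustrative placeholders, not taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical driver class for illustration only.
public class MapOnlyExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(MapOnlyExample.class);
        job.setJobName("map-only-example");
        job.setMapperClass(IdentityMapper.class);   // pass input records through unchanged
        job.setNumReduceTasks(0);                   // 0 reducers: map output goes straight to the output format
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);  // key/value types produced by TextInputFormat
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);                      // submits the job and waits for completion
    }
}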
From source file: org.apache.hive.hcatalog.hbase.TestHiveHBaseTableOutputFormat.java
License: Apache License
@Test
public void directOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "directOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);
    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    // include hbase config in conf file
    Configuration conf = new Configuration(allConf);
    conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(allConf.getAllProperties()));

    // create table
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:ONE,spanish:UNO", "2,english:TWO,spanish:DOS", "3,english:THREE,spanish:TRES" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    getFileSystem().mkdirs(inputPath);
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();

    // create job
    JobConf job = new JobConf(conf);
    job.setJobName(testName);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());

    job.setMapperClass(MapWrite.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    // why we need to set all the 3 properties??
    job.setOutputFormat(HiveHBaseTableOutputFormat.class);
    job.set(HBaseSerDe.HBASE_TABLE_NAME, tableName);
    job.set(TableOutputFormat.OUTPUT_TABLE, tableName);
    job.set(HCatConstants.HCAT_DEFAULT_TOPIC_PREFIX + ".hbase.mapreduce.outputTableName", tableName);

    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } catch (Exception ex) {
        throw new IOException("Serialization error " + ex.getMessage(), ex);
    }

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);
    job.setNumReduceTasks(0);

    System.getProperty("java.classpath");
    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    // verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    assertEquals(data.length, index);
}
From source file: org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.java
License: Apache License
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(SequenceFileInputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    conf.setMapperClass(OutputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file: org.apache.mahout.df.mapred.inmem.InMemBuilder.java
License: Apache License
@Override
protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate) throws IOException {
    FileOutputFormat.setOutputPath(conf, getOutputPath(conf));

    // put the data in the DistributedCache
    DistributedCache.addCacheFile(getDataPath().toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(MapredOutput.class);

    conf.setMapperClass(InMemMapper.class);
    conf.setNumReduceTasks(0); // no reducers

    conf.setInputFormat(InMemInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
}
From source file: org.apache.mahout.df.mapred.partial.PartialBuilder.java
License: Apache License
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(job));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // if we are in 'local' mode, correct the number of maps
    // or the mappers won't be able to compute the right indexes
    String tracker = job.get("mapred.job.tracker", "local");
    if ("local".equals(tracker)) {
        log.warn("Hadoop running in 'local' mode, only one map task will be launched");
        job.setNumMapTasks(1);
    }
}
From source file: org.apache.mahout.df.mapred.partial.Step0Job.java
License: Apache License
/**
 * Computes the partitions' first ids in Hadoop's order
 *
 * @param conf configuration
 * @return first ids for all the partitions
 * @throws IOException
 */
public Step0Output[] run(Configuration conf) throws IOException {
    JobConf job = new JobConf(conf, Step0Job.class);

    // check the output
    if (outputPath.getFileSystem(job).exists(outputPath)) {
        throw new IOException("Output path already exists : " + outputPath);
    }

    // put the dataset into the DistributedCache
    // use setCacheFiles() to overwrite the first-step cache files
    URI[] files = { datasetPath.toUri() };
    DistributedCache.setCacheFiles(files, job);

    FileInputFormat.setInputPaths(job, dataPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Step0Output.class);

    job.setMapperClass(Step0Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // run the job
    JobClient.runJob(job);

    return parseOutput(job);
}
From source file: org.apache.mahout.df.mapred.partial.Step2Job.java
License: Apache License
/**
 * Run the second step.
 *
 * @param conf configuration
 * @param keys keys returned by the first step
 * @param trees trees returned by the first step
 * @param callback
 * @throws IOException
 */
public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback)
        throws IOException {
    if (callback == null) {
        // no need to launch the job
        return;
    }

    int numTrees = keys.length;

    JobConf job = new JobConf(conf, Step2Job.class);

    // check the output
    if (outputPath.getFileSystem(job).exists(outputPath)) {
        throw new IOException("Output path already exists : " + outputPath);
    }

    int[] sizes = Step0Output.extractSizes(partitions);

    InterResults.store(forestPath.getFileSystem(job), forestPath, keys, trees, sizes);

    // needed by the mapper
    Builder.setNbTrees(job, numTrees);

    // put the dataset and the forest into the DistributedCache
    // use setCacheFiles() to overwrite the first-step cache files
    URI[] files = { datasetPath.toUri(), forestPath.toUri() };
    DistributedCache.setCacheFiles(files, job);

    FileInputFormat.setInputPaths(job, dataPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step2Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // run the job
    JobClient.runJob(job);

    parseOutput(job, callback);
}
From source file: org.apache.nutch.crawl.CrawlDbReader.java
License: Apache License
public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
        throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: done");
    }
}
From source file: org.apache.nutch.scoring.webgraph.LinkRank.java
License: Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}
From source file: org.apache.nutch.scoring.webgraph.NodeDumper.java
License: Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
        NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.tools.FreeGenerator.java
License: Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
        return -1;
    }

    boolean filter = false;
    boolean normalize = false;
    if (args.length > 2) {
        for (int i = 2; i < args.length; i++) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: " + args[i] + ", exiting ...");
                return -1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        LOG.error("FAILED: " + StringUtils.stringifyException(e));
        return -1;
    }
    long end = System.currentTimeMillis();
    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    return 0;
}