Example usage for org.apache.hadoop.mapreduce Job getNumReduceTasks

Introduction

This page lists example usages of org.apache.hadoop.mapreduce.Job.getNumReduceTasks().

Prototype

public int getNumReduceTasks() 

Document

Gets the configured number of reduce tasks for this job; the value defaults to 1 when none has been set.
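
Before the collected examples below, here is a minimal, self-contained sketch of the usual set-then-read pattern around getNumReduceTasks(). The class name and the reducer count are illustrative only; they do not come from the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class NumReduceTasksDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "num-reduce-tasks-demo");

        // Nothing configured yet: getNumReduceTasks() falls back to the default of 1.
        System.out.println("default reducers: " + job.getNumReduceTasks());

        // Configure four reducers and read the value back.
        // A value of 0 would make this a map-only job, a condition several of
        // the examples below test for explicitly.
        job.setNumReduceTasks(4);
        System.out.println("configured reducers: " + job.getNumReduceTasks());
    }
}

The examples that follow use the same accessor for partition-file sizing, map-only detection, reducer-count tuning, and job logging.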

Usage

From source file: ComRoughSetApproInputSampler.java

License: Apache License

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}

From source file: ComRoughSetApproInputSampler.java

License: Apache License

/**
 * Driver for InputSampler from the command line.
 * Configures a JobConf instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else if ("-splitRandom".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("-splitInterval".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }

    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    ComRoughSetApproInputSampler.<K, V>writePartitionFile(job, sampler);

    return 0;
}

From source file: cmd.sampler.java

License: Apache License

/**
 * Driver for InputSampler from the command line. Configures a JobConf
 * instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new SplitSampler<K, V>(1000, 10);
    }

    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);

    return 0;
}

From source file: com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java

License: Apache License

private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
    assert job.getJobID() != null;
    TaskID taskId = newMapTaskId(job.getJobID(), 0);
    Configuration conf = job.getConfiguration();
    OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
    OutputCommitter committer = output
            .getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0)));
    boolean succeed = false;
    committer.setupJob(job);
    try {
        if (job.getNumReduceTasks() == 0) {
            runMap(job, null);
        } else {
            try (KeyValueSorter<?, ?> sorter = createSorter(job, job.getMapOutputKeyClass(),
                    job.getMapOutputValueClass())) {
                runMap(job, sorter);
                runReduce(job, sorter);
            }
        }
        committer.commitJob(job);
        succeed = true;
    } finally {
        if (succeed == false) {
            try {
                committer.abortJob(job, State.FAILED);
            } catch (IOException e) {
                LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})", job.getJobID(),
                        job.getJobName()), e);
            }
        }
    }
}

From source file: com.asakusafw.runtime.stage.optimizer.ReducerSimplifierConfigurator.java

License: Apache License

@Override
public void configure(Job job) throws IOException, InterruptedException {
    int count = job.getNumReduceTasks();
    if (count <= TASKS_TINY) {
        return;
    }
    Configuration conf = job.getConfiguration();
    long limit = conf.getLong(KEY_TINY_LIMIT, -1L);
    if (limit < 0L) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Reducer simplifier is disabled for tiny inputs: {0}", //$NON-NLS-1$
                    job.getJobName()));
        }
        return;
    }
    long estimated = StageInputDriver.estimateInputSize(job);
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format("Reducer simplifier: job={0}, tiny-limit={1}, estimated={2}", //$NON-NLS-1$
                job.getJobName(), limit, estimated));
    }
    if (estimated < 0L || estimated > limit) {
        return;
    }

    LOG.info(MessageFormat.format("The number of reduce task ({0}) is configured: {1}->{2}", job.getJobName(),
            job.getNumReduceTasks(), TASKS_TINY));

    job.setNumReduceTasks(TASKS_TINY);
}

From source file: com.baynote.kafka.hadoop.KafkaJobBuilderTest.java

License: Apache License

@Test
public void testConfigureWholeJob() throws Exception {
    // base configuration
    builder.setZkConnect("localhost:2181");
    builder.addQueueInput("queue_name", "group_name", MockMapper.class);
    builder.setTextFileOutputFormat("/a/hdfs/path");

    // extended configuration
    builder.setJobName("job_name");
    builder.setMapOutputKeyClass(Text.class);
    builder.setMapOutputValueClass(BytesWritable.class);
    builder.setReducerClass(MockReducer.class);
    builder.setTaskMemorySettings("-Xmx2048m");
    builder.setNumReduceTasks(100);
    builder.setParitioner(MockPartitioner.class);
    builder.setKafkaFetchSizeBytes(1024);

    Job job = builder.configureJob(conf);

    assertEquals("job_name", job.getJobName());
    assertEquals(Text.class, job.getMapOutputKeyClass());
    assertEquals(BytesWritable.class, job.getMapOutputValueClass());
    assertEquals(MockReducer.class, job.getReducerClass());
    assertEquals(MockMapper.class, job.getMapperClass());
    assertEquals("-Xmx2048m", job.getConfiguration().get("mapred.child.java.opts"));
    assertEquals(100, job.getNumReduceTasks());
    assertEquals(MockPartitioner.class, job.getPartitionerClass());
    assertEquals(1024, KafkaInputFormat.getKafkaFetchSizeBytes(job.getConfiguration()));
    assertEquals(TextOutputFormat.class, job.getOutputFormatClass());
    assertEquals(KafkaInputFormat.class, job.getInputFormatClass());
    assertEquals("file:/a/hdfs/path", TextOutputFormat.getOutputPath(job).toString());

    builder.setJobName(null);
    builder.setSequenceFileOutputFormat();
    builder.setUseLazyOutput();
    builder.addQueueInput("queue_name_2", "group_name_2", MockMapper.class);

    job = builder.configureJob(conf);
    assertEquals(LazyOutputFormat.class, job.getOutputFormatClass());
    assertEquals(MultipleKafkaInputFormat.class, job.getInputFormatClass());
    assertEquals(DelegatingMapper.class, job.getMapperClass());
    assertEquals(BytesWritable.class, job.getOutputKeyClass());
    assertEquals(BytesWritable.class, job.getOutputValueClass());
    assertNotNull(SequenceFileOutputFormat.getOutputPath(job));
    assertNotNull(job.getJobName());

    // use s3
    builder.useS3("my_aws_key", "s3cr3t", "my-bucket");
    builder.setTextFileOutputFormat("/a/hdfs/path");
    job = builder.configureJob(conf);

    assertEquals("my_aws_key", job.getConfiguration().get("fs.s3n.awsAccessKeyId"));
    assertEquals("s3cr3t", job.getConfiguration().get("fs.s3n.awsSecretAccessKey"));
    assertEquals("my_aws_key", job.getConfiguration().get("fs.s3.awsAccessKeyId"));
    assertEquals("s3cr3t", job.getConfiguration().get("fs.s3.awsSecretAccessKey"));
}

From source file: com.cloudera.castagna.logparser.Utils.java

License: Apache License

public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[] { job.getInputFormatClass().getSimpleName(), job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(), job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(), job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(), job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName() });
    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}

From source file: com.cloudera.spark.bulkload.TotalOrderPartitioner.java

License: Apache License

/**
   * Read in the partition file and build indexing data structures.
   * If the keytype is {@link BinaryComparable} and
   * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
   * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
   * will be built. Otherwise, keys will be located using a binary search of
   * the partition keyset using the {@link RawComparator}
   * defined for this job. The input file must be sorted with the same
   * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
   */
  @SuppressWarnings("unchecked") // keytype from conf not static
  public void setConf(Configuration conf) {
      try {
          this.conf = conf;
          String parts = getPartitionFile(conf);
          final Path partFile = new Path(parts);
          final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                  : partFile.getFileSystem(conf);

          Job job = new Job(conf);
          Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
          K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
          if (splitPoints.length != job.getNumReduceTasks() - 1) {
              throw new IOException("Wrong number of partitions in keyset");
          }
          RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
          for (int i = 0; i < splitPoints.length - 1; ++i) {
              if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                  throw new IOException("Split points are out of order");
              }
          }
          boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
          if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
              partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                      // Now that blocks of identical splitless trie nodes are 
                      // represented reentrantly, and we develop a leaf for any trie
                      // node with only one split point, the only reason for a depth
                      // limit is to refute stack overflow or bloat in the pathological
                      // case where the split points are long and mostly look like bytes 
                      // iii...iixii...iii   .  Therefore, we make the default depth
                      // limit large but not huge.
                      conf.getInt(MAX_TRIE_DEPTH, 200));
          } else {
              partitions = new BinarySearchNode(splitPoints, comparator);
          }
      } catch (IOException e) {
          throw new IllegalArgumentException("Can't read partitions file", e);
      }
  }

From source file: com.conversantmedia.mapreduce.tool.annotation.handler.MapperInfoHandler.java

License: Apache License

/**
 * Is this a map-only job?
 * 
 * @param job      the job
 * @param jobField   the field to reflect for annotations
 * @return         <code>true</code> if map only, <code>false</code> otherwise.
 */
protected boolean isMapOnlyJob(Job job, Field jobField) {
    if (job.getNumReduceTasks() > 0) {
        return false;
    }

    // See if we have a ReducerInfo annotation - otherwise 
    // we'll consider this a "map only" job
    return !jobField.isAnnotationPresent(ReducerInfo.class);
}

From source file: com.iflytek.spider.crawl.GeneratorSmart.java

License: Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 *          normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}