List of usage examples for org.apache.hadoop.mapred JobConf setSpeculativeExecution
public void setSpeculativeExecution(boolean speculativeExecution)
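For context, here is a minimal standalone sketch of the typical call pattern (the driver class name and the argument-supplied input/output paths are hypothetical, and the job relies on Hadoop's default identity mapper and reducer). Speculative execution launches duplicate attempts of slow tasks, so it is usually disabled when a task has external side effects that must not run twice in parallel — which is what every example below does except the last one.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical driver class; input and output paths come from the program arguments.
public class NoSpeculationJob {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(NoSpeculationJob.class);
        job.setJobName("identity job without speculative execution");

        // Turn off speculative attempts for both map and reduce tasks,
        // so no duplicate attempt of the same task ever runs in parallel.
        job.setSpeculativeExecution(false);
        // Map-only and reduce-only switches also exist on JobConf:
        // job.setMapSpeculativeExecution(false);
        // job.setReduceSpeculativeExecution(false);

        // Default TextInputFormat with the identity mapper and reducer:
        // keys are byte offsets (LongWritable), values are lines (Text).
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}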
From source file:org.apache.nutch.fetcher.Fetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.fetcher.Fetcher2.java
License:Apache License
public void fetch(Path segment, int threads, boolean parsing) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting");
        LOG.info("Fetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setBoolean("fetcher.parse", parsing);

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher2.class);

    job.setOutputPath(segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: done");
    }
}
From source file:org.apache.nutch.fetcher.NIOFetcher.java
License:Apache License
public void fetch(Path segment) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("NIOFetcher: starting at " + sdf.format(start));
        LOG.info("NIOFetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("NIOFetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("NIOFetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(FetcherInputFormat.class);

    job.setMapRunnerClass(NIOFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("NIOFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.fetcher.OldFetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("OldFetcher: starting at " + sdf.format(start));
        LOG.info("OldFetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(OldFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.selenium.fetcher.SeleniumFetcher.java
License:Apache License
public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    // push the zipped_webdriver binaries onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);
    job.set("webdriver.binaries.path", zippedDriverPath);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(SeleniumFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.oozie.action.hadoop.LauncherMapperHelper.java
License:Apache License
public static void setupLauncherInfo(JobConf launcherConf, String jobId, String actionId, Path actionDir,
        String recoveryId, Configuration actionConf, String prepareXML) throws IOException, HadoopAccessorException {

    launcherConf.setMapperClass(LauncherMapper.class);
    launcherConf.setSpeculativeExecution(false);
    launcherConf.setNumMapTasks(1);
    launcherConf.setNumReduceTasks(0);

    launcherConf.set(LauncherMapper.OOZIE_JOB_ID, jobId);
    launcherConf.set(LauncherMapper.OOZIE_ACTION_ID, actionId);
    launcherConf.set(LauncherMapper.OOZIE_ACTION_DIR_PATH, actionDir.toString());
    launcherConf.set(LauncherMapper.OOZIE_ACTION_RECOVERY_ID, recoveryId);
    launcherConf.set(LauncherMapper.ACTION_PREPARE_XML, prepareXML);

    actionConf.set(LauncherMapper.OOZIE_JOB_ID, jobId);
    actionConf.set(LauncherMapper.OOZIE_ACTION_ID, actionId);

    if (Services.get().getConf().getBoolean("oozie.hadoop-2.0.2-alpha.workaround.for.distributed.cache", false)) {
        List<String> purgedEntries = new ArrayList<String>();
        Collection<String> entries = actionConf.getStringCollection("mapreduce.job.cache.files");
        for (String entry : entries) {
            if (entry.contains("#")) {
                purgedEntries.add(entry);
            }
        }
        actionConf.setStrings("mapreduce.job.cache.files",
                purgedEntries.toArray(new String[purgedEntries.size()]));
        launcherConf.setBoolean("oozie.hadoop-2.0.2-alpha.workaround.for.distributed.cache", true);
    }

    FileSystem fs = Services.get().get(HadoopAccessorService.class)
            .createFileSystem(launcherConf.get("user.name"), actionDir.toUri(), launcherConf);
    fs.mkdirs(actionDir);

    OutputStream os = fs.create(new Path(actionDir, LauncherMapper.ACTION_CONF_XML));
    try {
        actionConf.writeXml(os);
    } finally {
        IOUtils.closeSafely(os);
    }

    launcherConf.setInputFormat(OozieLauncherInputFormat.class);
    launcherConf.set("mapred.output.dir", new Path(actionDir, "output").toString());
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:org.slc.sli.aggregation.mapreduce.map.ConfigurableMapReduceJob.java
License:Apache License
@SuppressWarnings("rawtypes")
protected static JobConf finalizeConfig(JobConf jobConf, ConfigSections s) throws IOException {

    Class<? extends Mapper> mapperClass = JobConfiguration.mapper.getMapClass(mapper);
    Class<? extends Reducer> reducerClass = JobConfiguration.function.getReduceClass(reduceFunction);
    Map<String, String> idFields = s.getMapper().getMapIdFields();

    // validate we have enough to continue
    boolean valid = true;
    if (mapperClass == null) {
        log.severe("Invalid map/reduce configuration detected : no mapper class specified.");
        valid = false;
    }
    if (idFields == null) {
        idFields = new HashMap<String, String>();
        log.severe("Invalid map/reduce configuration detected : no map id fields specified.");
        valid = false;
    }
    if (mapCollection == null) {
        log.severe("Invalid map/reduce configuration detected : no map collection specified.");
        valid = false;
    }
    if (mapQuery == null) {
        log.severe("Invalid map/reduce configuration detected : no map query specified.");
        valid = false;
    }
    if (mapFields == null) {
        log.severe("Invalid map/reduce configuration detected : no map input fields specified.");
        valid = false;
    }
    if (reducerClass == null) {
        log.severe("Invalid map/reduce configuration detected : no reducer class specified.");
        valid = false;
    }
    if (reduceCollection == null) {
        log.severe("Invalid map/reduce configuration detected : no reduce collection specified.");
        valid = false;
    }
    if (reduceField == null) {
        log.severe("Invalid map/reduce configuration detected : no reduce field specified.");
        valid = false;
    }
    if (!valid) {
        throw new IllegalArgumentException("Invalid mapper specified. Check log for details.");
    }

    jobConf.set("mapred.output.dir", String.format("%s-%s-%d", s.getMapper().getMapper(),
            s.getMetadata().getFunction(), System.currentTimeMillis()));
    jobConf.setJobName(s.getMetadata().getDescription() == null ? "M/R Job" : s.getMetadata().getDescription());

    // enable speculative execution. Multiple mapper tasks are created for the same split.
    // First one to finish wins; the remaining tasks are terminated.
    jobConf.setSpeculativeExecution(true);
    jobConf.setUseNewMapper(true);
    jobConf.setUseNewReducer(true);

    /**
     * TODO -- decide if this is required.
     *
     * String id = conf.get("@ID@");
     * String tenantId = conf.get("@TENANT_ID@");
     * for (Map.Entry<String, Object> entry : query.entrySet()) {
     *     Object value = entry.getValue();
     *     if (value instanceof String) {
     *         String s = (String) value;
     *         if (s.indexOf("@ID@") >= 0 && id != null) {
     *             s = s.replace("@ID@", id);
     *             query.put(entry.getKey(), s);
     *         }
     *         if (s.indexOf("@TENANT_ID@") >= 0 && tenantId != null) {
     *             s = s.replace("@TENANT_ID@", tenantId);
     *             query.put(entry.getKey(), s);
     *         }
     *     }
     * }
     * if (updateField.indexOf("@ID@") >= 0 && id != null) {
     *     updateField = updateField.replace("@ID@", id);
     * }
     * if (updateField.indexOf("@TENANT_ID@") >= 0 && tenantId != null) {
     *     updateField = updateField.replace("@TENANT_ID@", tenantId);
     * }
     */

    MongoConfigUtil.setQuery(jobConf, new BasicDBObject(mapQuery));

    Map<String, Object> fullFields = new HashMap<String, Object>();
    for (String f : idFields.values()) {
        fullFields.put(f, 1);
    }
    fullFields.putAll(mapFields);

    MongoConfigUtil.setFields(jobConf, new BasicDBObject(fullFields));
    MongoConfigUtil.setInputKey(jobConf, idFields.get("id"));
    MongoConfigUtil.setInputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + mapCollection);
    MongoConfigUtil.setMapperOutputKey(jobConf, TenantAndIdEmittableKey.class);
    MongoConfigUtil.setMapperOutputValue(jobConf, BSONWritable.class);
    MongoConfigUtil.setOutputKey(jobConf, TenantAndIdEmittableKey.class);
    MongoConfigUtil.setOutputValue(jobConf, BSONWritable.class);

    // TODO - this probably should be configurable
    MongoConfigUtil.setReadSplitsFromSecondary(jobConf, true);
    MongoConfigUtil.setSplitSize(jobConf, 32);

    jobConf.setClass("mapred.input.key.class", TenantAndIdEmittableKey.class, EmittableKey.class);
    jobConf.setClass("mapred.input.value.class", BSONWritable.class, Object.class);
    jobConf.setClass("mapred.output.key.class", TenantAndIdEmittableKey.class, EmittableKey.class);
    jobConf.setClass("mapred.output.value.class", BSONWritable.class, Object.class);
    jobConf.setClass("mapreduce.inputformat.class", MongoTenantAndIdInputFormat.class, MongoInputFormat.class);
    jobConf.setClass("mapreduce.outputformat.class", MongoAggFormatter.class, MongoOutputFormat.class);

    MongoConfigUtil.setInputFormat(jobConf, MongoTenantAndIdInputFormat.class);
    MongoConfigUtil.setOutputFormat(jobConf, MongoAggFormatter.class);

    /**
     * Configure how hadoop calculates splits.
     *
     * We enable input splits to avoid having the entire job executed on a single hadoop node.
     *
     * We enable shard chunk splitting to allow mongo to specify how to split the input.
     *
     * We disable read splits from shards because we want hadoop connecting to mongos, not
     * mongod directly. This avoids incorrect results in situations where data is in the process
     * of migration at the same time hadoop is trying to read it.
     *
     * TODO - determine if we also need to set the input split key pattern. This depends
     * on how well data is distributed by _id. Setting the key pattern gives finer grained
     * control over how splits are calculated.
     */
    MongoConfigUtil.setCreateInputSplits(jobConf, true);
    MongoConfigUtil.setShardChunkSplittingEnabled(jobConf, true);
    MongoConfigUtil.setReadSplitsFromShards(jobConf, false);

    MongoConfigUtil.setOutputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + reduceCollection);

    jobConf.setJarByClass(JobConfiguration.class);

    MongoConfigUtil.setMapper(jobConf, mapperClass);
    jobConf.setClass(JobContext.MAP_CLASS_ATTR, mapperClass, Mapper.class);

    MongoConfigUtil.setReducer(jobConf, reducerClass);
    jobConf.setClass(JobContext.REDUCE_CLASS_ATTR, reducerClass, Reducer.class);

    // Set this relatively high to keep the total map execution time low.
    // Formula: 1.75 * (# nodes * max tasks)
    // TODO : replace this hardcoded value with one calculated from configuration information.
    jobConf.setNumReduceTasks(52);

    // Add the configuration itself to the JobConf.
    JobConfiguration.toHadoopConfiguration(s, jobConf);

    return jobConf;
}