List of usage examples for org.apache.hadoop.mapred JobConf setReduceSpeculativeExecution
public void setReduceSpeculativeExecution(boolean speculativeExecution)
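Under the classic org.apache.hadoop.mapred API this setter writes the speculative-execution property for reduce tasks into the job configuration (mapred.reduce.tasks.speculative.execution on older releases, mapreduce.reduce.speculative on YARN-era ones). The examples below all call it with false, usually alongside setMapSpeculativeExecution, because their tasks have side effects that a duplicate speculative attempt would repeat. A minimal, hypothetical driver fragment (the class and job name are illustrative):

import org.apache.hadoop.mapred.JobConf;

public class SpeculationExample {
    public static JobConf configure() {
        JobConf conf = new JobConf(SpeculationExample.class);
        conf.setJobName("export-with-side-effects");
        // Disable speculative attempts so a duplicate task cannot repeat
        // non-idempotent work such as writes to an external database.
        conf.setReduceSpeculativeExecution(false);
        conf.setMapSpeculativeExecution(false);
        return conf;
    }
}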
From source file:cascading.jdbc.db.DBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job                 The job
 * @param dbOutputFormatClass
 * @param tableName           The table to insert data into
 * @param fieldNames          The field names in the table. If unknown, supply the appropriate
 *                            number of nulls.
 */
public static void setOutput(JobConf job, Class<? extends DBOutputFormat> dbOutputFormatClass, String tableName,
        String[] fieldNames, String[] updateFields, int batchSize) {
    if (dbOutputFormatClass == null)
        job.setOutputFormat(DBOutputFormat.class);
    else
        job.setOutputFormat(dbOutputFormatClass);

    // writing doesn't always happen in reduce
    job.setReduceSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(fieldNames);

    if (updateFields != null)
        dbConf.setOutputUpdateFieldNames(updateFields);

    if (batchSize != -1)
        dbConf.setBatchStatementsNum(batchSize);
}
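The comment "writing doesn't always happen in reduce" is the key detail here: in a Cascading flow the JDBC sink may run in the map phase, so this helper disables speculation on both phases. Database INSERT/UPDATE statements are not idempotent, and a speculative duplicate attempt would replay them, producing duplicate rows.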
From source file:co.nubetech.hiho.mapreduce.lib.db.apache.DBOutputFormat.java
License:Apache License
private static DBConfiguration setOutput(Job job, JobConf jobConf, String tableName) throws IOException {
    job.setOutputFormatClass(DBOutputFormat.class);
    jobConf.setReduceSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());

    dbConf.setOutputTableName(tableName);
    return dbConf;
}
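This helper is private and returns the DBConfiguration so that the class's public setOutput overloads can finish the wiring. A sketch of such a caller, assuming the bundled DBConfiguration keeps Hadoop's setOutputFieldNames(String...) signature (the overload itself is hypothetical):

public static void setOutput(Job job, JobConf jobConf, String tableName, String... fieldNames)
        throws IOException {
    DBConfiguration dbConf = setOutput(job, jobConf, tableName);
    // Name the target columns, e.g. setOutput(job, jobConf, "employees", "id", "name")
    dbConf.setOutputFieldNames(fieldNames);
}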
From source file:com.scaleunlimited.cascading.hadoop.HadoopUtils.java
License:Apache License
public static JobConf getDefaultJobConf() throws IOException, InterruptedException {
    JobConf conf = new JobConf();

    // We explicitly set task counts to 1 for local so that code which depends on
    // things like the reducer count runs properly.
    if (isJobLocal(conf)) {
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
    } else {
        conf.setNumReduceTasks(getNumReducers(conf));

        // TODO - By default we want to use 0.95 * the number of reduce slots, as per
        // Hadoop wiki. But we want to round, versus truncate, to avoid setting it to
        // 0 if we have one reducer. This way it only impacts you if you have more
        // than 10 reducers.
        // conf.setNumReduceTasks((getNumReducers(conf) * 95) / 100);
    }

    conf.setMapSpeculativeExecution(false);
    conf.setReduceSpeculativeExecution(false);

    return conf;
}
From source file:com.scaleunlimited.helpful.tools.AnalyzeEmail.java
License:Apache License
private static JobConf getDefaultJobConf() throws IOException {
    JobClient jobClient = new JobClient(new JobConf());
    ClusterStatus status = jobClient.getClusterStatus();
    int trackers = status.getTaskTrackers();

    JobConf conf = new JobConf();
    conf.setNumMapTasks(trackers * 10);
    conf.setNumReduceTasks(trackers * conf.getInt("mapred.tasktracker.reduce.tasks.maximum", 2));

    conf.setMapSpeculativeExecution(false);
    conf.setReduceSpeculativeExecution(false);

    conf.set("mapred.child.java.opts", "-server -Xmx512m -Xss128k");

    // Should match the value used for Xss above. Note no 'k' suffix for the ulimit command.
    // New support that one day will be in Hadoop.
    conf.set("mapred.child.ulimit.stack", "128");

    return conf;
}
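The task counts are sized from the live ClusterStatus snapshot: ten map tasks per task tracker, and one reduce task per configured reduce slot (mapred.tasktracker.reduce.tasks.maximum, read here with a default of 2 per tracker).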
From source file:com.twitter.maple.jdbc.db.DBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job                 The job
 * @param dbOutputFormatClass
 * @param tableName           The table to insert data into
 * @param fieldNames          The field names in the table. If unknown, supply the appropriate
 *                            number of nulls.
 */
public static void setOutput(JobConf job, Class<? extends DBOutputFormat> dbOutputFormatClass, String tableName,
        String[] fieldNames, String[] updateFields, int batchSize) {
    if (dbOutputFormatClass == null) {
        job.setOutputFormat(DBOutputFormat.class);
    } else {
        job.setOutputFormat(dbOutputFormatClass);
    }

    // writing doesn't always happen in reduce
    job.setReduceSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(fieldNames);

    if (updateFields != null) {
        dbConf.setOutputUpdateFieldNames(updateFields);
    }

    if (batchSize != -1) {
        dbConf.setBatchStatementsNum(batchSize);
    }
}
From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java
License:Mozilla Public License
/**
 * Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");

    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }
    /*else {
        logger.fatal(usage());
        return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");

    final JobConf conf = jf.newJob();
    conf.setJarByClass(TerrierIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");

    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }

    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true);

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient
    // jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        e.printStackTrace();
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }*/

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}

    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}
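Only reduce-side speculation is disabled in this job. Since the output format is NullOutputFormat, the reducers write the index structures themselves as a side effect rather than through a committed output path, so there is no OutputCommitter to discard the output of a losing speculative attempt; map output, by contrast, flows through the normal shuffle and stays safe to speculate.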
From source file:de.l3s.streamcorpus.StreamCorpusIndexing.java
License:Mozilla Public License
/**
 * Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");

    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }
    /*else {
        logger.fatal(usage());
        return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");

    final JobConf conf = jf.newJob();
    conf.setJarByClass(StreamCorpusIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");

    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }

    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true);

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient
    // jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        e.printStackTrace();
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }*/

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}

    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}
From source file:eu.larkc.iris.Main.java
License:Apache License
private JobConf setupJob(Configuration conf) {
    JobConf jobConf = new JobConf(conf, Main.class);

    // run the job here.

    /* REAL CLUSTER */
    jobConf.set("dfs.blocksize", "536870912");
    jobConf.set("dfs.namenode.handler.count", "40");
    //jobConf.set("dfs.replication", "1");
    jobConf.set("mapreduce.reduce.shuffle.parallelcopies", "10");
    jobConf.set("mapreduce.task.io.sort.factor", "100");
    jobConf.set("mapreduce.task.io.sort.mb", "1024");
    jobConf.set("io.file.buffer.size", "131072");
    jobConf.set("mapred.child.java.opts", "-Xmx2560m");
    jobConf.set("mapred.child.ulimit", "4194304");
    jobConf.set("mapred.min.split.size", "536870912");
    jobConf.set("mapreduce.input.fileinputformat.split.minsize", "536870912");
    jobConf.set("mapreduce.reduce.merge.inmem.threshold", "0");

    /* compression settings
    jobConf.set("mapreduce.map.output.compress", "false");
    jobConf.set("mapreduce.output.fileoutputformat.compress", "true");
    jobConf.set("mapreduce.output.fileoutputformat.compression.type", "BLOCK");
    */

    // !!!IMPORTANT, if not: Caused by: java.io.FileNotFoundException: File does not exist:
    // hdfs://ec2-50-19-191-200.compute-1.amazonaws.com:8020/user/root/lubm/facts/lubm50/data
    jobConf.setBoolean("mapred.input.dir.recursive", true);

    jobConf.set("cascading.serialization.tokens",
            "130=eu.larkc.iris.storage.IRIWritable,131=eu.larkc.iris.storage.StringTermWritable");
    defaultConfiguration.flowProperties.put("cascading.serialization.tokens",
            "130=eu.larkc.iris.storage.IRIWritable,131=eu.larkc.iris.storage.StringTermWritable");

    /*
    if (System.getProperty("log4j.logger") != null)
        defaultConfiguration.flowProperties.put("log4j.logger", System.getProperty("log4j.logger"));
    */

    //jobConf.set("mapred.min.split.size", "134217728");
    //jobConf.set("mapred.child.java.opts", "-Xms64m -Xmx512m");

    jobConf.setMapSpeculativeExecution(false);
    jobConf.setReduceSpeculativeExecution(false);

    //FIXME
    //jobConf.setNumMapTasks(8);
    jobConf.setNumReduceTasks(32);

    FlowConnector.setDebugLevel(defaultConfiguration.flowProperties, DebugLevel.VERBOSE);
    MultiMapReducePlanner.setJobConf(defaultConfiguration.flowProperties, jobConf);
    //Flow.setJobPollingInterval(defaultConfiguration.flowProperties, 500);

    return jobConf;
}
From source file:infinidb.hadoop.db.InfiniDBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job
 *            The job
 * @param tableName
 *            The table to insert data into
 * @param fieldNames
 *            The field names in the table. If unknown, supply the appropriate
 *            number of nulls.
 */
public static void setOutput(JobConf job, String schemaName, String... tableNames) {
    job.setOutputFormat(InfiniDBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
    dbConf.setOutputSchemaName(schemaName);
    dbConf.setOutputTableNames(tableNames);
}
From source file:infinidb.hadoop.db.InfiniDBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job
 *            The job
 * @param tableName
 *            The table to insert data into
 * @param fieldNames
 *            The field names in the table. If unknown, supply the appropriate
 *            number of nulls.
 */
public static void setOutput(JobConf job, String schemaName) {
    job.setOutputFormat(InfiniDBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
    dbConf.setOutputSchemaName(schemaName);
}