List of usage examples for org.apache.hadoop.mapred JobConf setReduceSpeculativeExecution
public void setReduceSpeculativeExecution(boolean speculativeExecution)
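Under the classic org.apache.hadoop.mapred API this setter writes the speculative-execution property for reduce tasks into the job configuration (mapred.reduce.tasks.speculative.execution on older releases, mapreduce.reduce.speculative on YARN-era ones). The examples below all call it with false, usually alongside setMapSpeculativeExecution, because their tasks have side effects that a duplicate speculative attempt would repeat. A minimal, hypothetical driver fragment (the class and job name are illustrative):

import org.apache.hadoop.mapred.JobConf;

public class SpeculationExample {
    public static JobConf configure() {
        JobConf conf = new JobConf(SpeculationExample.class);
        conf.setJobName("export-with-side-effects");
        // Disable speculative attempts so a duplicate task cannot repeat
        // non-idempotent work such as writes to an external database.
        conf.setReduceSpeculativeExecution(false);
        conf.setMapSpeculativeExecution(false);
        return conf;
    }
}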
From source file:cascading.jdbc.db.DBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job                 The job
 * @param dbOutputFormatClass
 * @param tableName           The table to insert data into
 * @param fieldNames          The field names in the table. If unknown, supply the appropriate
 *                            number of nulls.
 */
public static void setOutput(JobConf job, Class<? extends DBOutputFormat> dbOutputFormatClass, String tableName,
        String[] fieldNames, String[] updateFields, int batchSize) {
    if (dbOutputFormatClass == null)
        job.setOutputFormat(DBOutputFormat.class);
    else
        job.setOutputFormat(dbOutputFormatClass);

    // writing doesn't always happen in reduce
    job.setReduceSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(fieldNames);

    if (updateFields != null)
        dbConf.setOutputUpdateFieldNames(updateFields);

    if (batchSize != -1)
        dbConf.setBatchStatementsNum(batchSize);
}
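The comment "writing doesn't always happen in reduce" is the key detail here: in a Cascading flow the JDBC sink may run in the map phase, so this helper disables speculation on both phases. Database INSERT/UPDATE statements are not idempotent, and a speculative duplicate attempt would replay them, producing duplicate rows.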
From source file:co.nubetech.hiho.mapreduce.lib.db.apache.DBOutputFormat.java
License:Apache License
private static DBConfiguration setOutput(Job job, JobConf jobConf, String tableName) throws IOException {
    job.setOutputFormatClass(DBOutputFormat.class);
    jobConf.setReduceSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());

    dbConf.setOutputTableName(tableName);
    return dbConf;
}
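This helper is private and returns the DBConfiguration so that the class's public setOutput overloads can finish the wiring. A sketch of such a caller, assuming the bundled DBConfiguration keeps Hadoop's setOutputFieldNames(String...) signature (the overload itself is hypothetical):

public static void setOutput(Job job, JobConf jobConf, String tableName, String... fieldNames)
        throws IOException {
    DBConfiguration dbConf = setOutput(job, jobConf, tableName);
    // Name the target columns, e.g. setOutput(job, jobConf, "employees", "id", "name")
    dbConf.setOutputFieldNames(fieldNames);
}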
From source file:com.scaleunlimited.cascading.hadoop.HadoopUtils.java
License:Apache License
public static JobConf getDefaultJobConf() throws IOException, InterruptedException {
    JobConf conf = new JobConf();

    // We explicitly set task counts to 1 for local so that code which depends on
    // things like the reducer count runs properly.
    if (isJobLocal(conf)) {
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
    } else {
        conf.setNumReduceTasks(getNumReducers(conf));

        // TODO - By default we want to use 0.95 * the number of reduce slots, as per
        // Hadoop wiki. But we want to round, versus truncate, to avoid setting it to
        // 0 if we have one reducer. This way it only impacts you if you have more
        // than 10 reducers.
        // conf.setNumReduceTasks((getNumReducers(conf) * 95) / 100);
    }

    conf.setMapSpeculativeExecution(false);
    conf.setReduceSpeculativeExecution(false);

    return conf;
}
From source file:com.scaleunlimited.helpful.tools.AnalyzeEmail.java
License:Apache License
private static JobConf getDefaultJobConf() throws IOException {
    JobClient jobClient = new JobClient(new JobConf());
    ClusterStatus status = jobClient.getClusterStatus();
    int trackers = status.getTaskTrackers();

    JobConf conf = new JobConf();
    conf.setNumMapTasks(trackers * 10);
    conf.setNumReduceTasks(trackers * conf.getInt("mapred.tasktracker.reduce.tasks.maximum", 2));

    conf.setMapSpeculativeExecution(false);
    conf.setReduceSpeculativeExecution(false);

    conf.set("mapred.child.java.opts", "-server -Xmx512m -Xss128k");

    // Should match the value used for Xss above. Note no 'k' suffix for the ulimit command.
    // New support that one day will be in Hadoop.
    conf.set("mapred.child.ulimit.stack", "128");

    return conf;
}
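The task counts are sized from the live ClusterStatus snapshot: ten map tasks per task tracker, and one reduce task per configured reduce slot (mapred.tasktracker.reduce.tasks.maximum, read here with a default of 2 per tracker).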
From source file:com.twitter.maple.jdbc.db.DBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job                 The job
 * @param dbOutputFormatClass
 * @param tableName           The table to insert data into
 * @param fieldNames          The field names in the table. If unknown, supply the appropriate
 *                            number of nulls.
 */
public static void setOutput(JobConf job, Class<? extends DBOutputFormat> dbOutputFormatClass, String tableName,
        String[] fieldNames, String[] updateFields, int batchSize) {
    if (dbOutputFormatClass == null) {
        job.setOutputFormat(DBOutputFormat.class);
    } else {
        job.setOutputFormat(dbOutputFormatClass);
    }

    // writing doesn't always happen in reduce
    job.setReduceSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(fieldNames);

    if (updateFields != null) {
        dbConf.setOutputUpdateFieldNames(updateFields);
    }

    if (batchSize != -1) {
        dbConf.setBatchStatementsNum(batchSize);
    }
}
From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java
License:Mozilla Public License
/**
 * Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");

    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }
    /*else {
        logger.fatal(usage());
        return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");

    final JobConf conf = jf.newJob();
    conf.setJarByClass(TerrierIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");

    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }

    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true);

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient
    // jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        e.printStackTrace();
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }*/

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}

    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}
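Only reduce-side speculation is disabled in this job. Since the output format is NullOutputFormat, the reducers write the index structures themselves as a side effect rather than through a committed output path, so there is no OutputCommitter to discard the output of a losing speculative attempt; map output, by contrast, flows through the normal shuffle and stays safe to speculate.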
From source file:de.l3s.streamcorpus.StreamCorpusIndexing.java
License:Mozilla Public License
/**
 * Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");

    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }
    /*else {
        logger.fatal(usage());
        return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");

    final JobConf conf = jf.newJob();
    conf.setJarByClass(StreamCorpusIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");

    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }

    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true);

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient
    // jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        e.printStackTrace();
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }*/

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}

    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}
From source file:eu.larkc.iris.Main.java
License:Apache License
private JobConf setupJob(Configuration conf) {
    JobConf jobConf = new JobConf(conf, Main.class);

    // run the job here.

    /* REAL CLUSTER */
    jobConf.set("dfs.blocksize", "536870912");
    jobConf.set("dfs.namenode.handler.count", "40");
    //jobConf.set("dfs.replication", "1");
    jobConf.set("mapreduce.reduce.shuffle.parallelcopies", "10");
    jobConf.set("mapreduce.task.io.sort.factor", "100");
    jobConf.set("mapreduce.task.io.sort.mb", "1024");
    jobConf.set("io.file.buffer.size", "131072");
    jobConf.set("mapred.child.java.opts", "-Xmx2560m");
    jobConf.set("mapred.child.ulimit", "4194304");
    jobConf.set("mapred.min.split.size", "536870912");
    jobConf.set("mapreduce.input.fileinputformat.split.minsize", "536870912");
    jobConf.set("mapreduce.reduce.merge.inmem.threshold", "0");

    /* compression settings
    jobConf.set("mapreduce.map.output.compress", "false");
    jobConf.set("mapreduce.output.fileoutputformat.compress", "true");
    jobConf.set("mapreduce.output.fileoutputformat.compression.type", "BLOCK");
    */

    // !!!IMPORTANT, if not: Caused by: java.io.FileNotFoundException: File does not exist:
    // hdfs://ec2-50-19-191-200.compute-1.amazonaws.com:8020/user/root/lubm/facts/lubm50/data
    jobConf.setBoolean("mapred.input.dir.recursive", true);

    jobConf.set("cascading.serialization.tokens",
            "130=eu.larkc.iris.storage.IRIWritable,131=eu.larkc.iris.storage.StringTermWritable");
    defaultConfiguration.flowProperties.put("cascading.serialization.tokens",
            "130=eu.larkc.iris.storage.IRIWritable,131=eu.larkc.iris.storage.StringTermWritable");

    /*
    if (System.getProperty("log4j.logger") != null)
        defaultConfiguration.flowProperties.put("log4j.logger", System.getProperty("log4j.logger"));
    */

    //jobConf.set("mapred.min.split.size", "134217728");
    //jobConf.set("mapred.child.java.opts", "-Xms64m -Xmx512m");

    jobConf.setMapSpeculativeExecution(false);
    jobConf.setReduceSpeculativeExecution(false);

    //FIXME
    //jobConf.setNumMapTasks(8);
    jobConf.setNumReduceTasks(32);

    FlowConnector.setDebugLevel(defaultConfiguration.flowProperties, DebugLevel.VERBOSE);
    MultiMapReducePlanner.setJobConf(defaultConfiguration.flowProperties, jobConf);
    //Flow.setJobPollingInterval(defaultConfiguration.flowProperties, 500);

    return jobConf;
}
From source file:infinidb.hadoop.db.InfiniDBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job
 *            The job
 * @param tableName
 *            The table to insert data into
 * @param fieldNames
 *            The field names in the table. If unknown, supply the appropriate
 *            number of nulls.
 */
public static void setOutput(JobConf job, String schemaName, String... tableNames) {
    job.setOutputFormat(InfiniDBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
    dbConf.setOutputSchemaName(schemaName);
    dbConf.setOutputTableNames(tableNames);
}
From source file:infinidb.hadoop.db.InfiniDBOutputFormat.java
License:Apache License
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job
 *            The job
 * @param tableName
 *            The table to insert data into
 * @param fieldNames
 *            The field names in the table. If unknown, supply the appropriate
 *            number of nulls.
 */
public static void setOutput(JobConf job, String schemaName) {
    job.setOutputFormat(InfiniDBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
    dbConf.setOutputSchemaName(schemaName);
}