List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
From source file:com.hadoop.secondarysort.SecondarySort_MapRed.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysrot <in> <out>"); System.exit(2);/*w w w.j a va 2s . c o m*/ } JobConf jobConf = new JobConf(conf); jobConf.setMapperClass(MapClass.class); jobConf.setReducerClass(Reduce.class); jobConf.setPartitionerClass(FirstPartitioner.class); jobConf.setOutputValueGroupingComparator(FirstGroupingComparator.class); jobConf.setMapOutputKeyClass(IntPair.class); jobConf.setMapOutputValueClass(IntWritable.class); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(IntWritable.class); // // Job job = new Job(conf, "secondary sort"); // job.setJarByClass(SecondarySort_MapRed.class); // job.setMapperClass(MapClass.class); // job.setReducerClass(Reduce.class); // // // group and partition by the first int in the pair // job.setPartitionerClass(FirstPartitioner.class); // job.setGroupingComparatorClass(FirstGroupingComparator.class); // conf.setClass("mapred.output.key.comparator.class", // KeyComparator.class, RawComparator.class); // // job.setSortComparatorClass(SecondGroupingComparator.class); // // the map output is IntPair, IntWritable // job.setMapOutputKeyClass(IntPair.class); // job.setMapOutputValueClass(IntWritable.class); // // // the reduce output is Text, IntWritable // job.setOutputKeyClass(Text.class); // job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(jobConf, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs[1])); }
From source file:com.hadoopilluminated.examples.dancing.DistributedPentomino.java
License:Apache License
public int run(String[] args) throws Exception { JobConf conf; int depth = 5; int width = 9; int height = 10; Class<? extends Pentomino> pentClass; if (args.length == 0) { System.out.println("pentomino <output>"); ToolRunner.printGenericCommandUsage(System.out); return -1; }/* w ww. j a v a2 s . co m*/ conf = new JobConf(getConf()); width = conf.getInt("pent.width", width); height = conf.getInt("pent.height", height); depth = conf.getInt("pent.depth", depth); pentClass = conf.getClass("pent.class", OneSidedPentonimo.class, Pentomino.class); Path output = new Path(args[0]); Path input = new Path(output + "_input"); FileSystem fileSys = FileSystem.get(conf); try { FileInputFormat.setInputPaths(conf, input); FileOutputFormat.setOutputPath(conf, output); conf.setJarByClass(PentMap.class); conf.setJobName("dancingElephant"); Pentomino pent = ReflectionUtils.newInstance(pentClass, conf); pent.initialize(width, height); createInputDirectory(fileSys, input, pent, depth); // the keys are the prefix strings conf.setOutputKeyClass(Text.class); // the values are puzzle solutions conf.setOutputValueClass(Text.class); conf.setMapperClass(PentMap.class); conf.setReducerClass(IdentityReducer.class); conf.setNumMapTasks(2000); conf.setNumReduceTasks(1); JobClient.runJob(conf); } finally { fileSys.delete(input, true); } return 0; }
From source file:com.hadoopilluminated.examples.Grep.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); return -1; }/* w ww . j av a 2 s .c o m*/ Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf grepJob = new JobConf(getConf(), Grep.class); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.set("mapred.mapper.regex", args[2]); if (args.length == 4) { grepJob.set("mapred.mapper.regex.group", args[3]); } grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormat(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); JobClient.runJob(grepJob); JobConf sortJob = new JobConf(getConf(), Grep.class); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormat(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setOutputKeyComparatorClass // sort by decreasing freq (LongWritable.DecreasingComparator.class); JobClient.runJob(sortJob); } finally { FileSystem.get(grepJob).delete(tempDir, true); } return 0; }
From source file:com.hadoopilluminated.examples.Join.java
License:Apache License
/** * The main driver for sort program. Invoke this method to submit the * map/reduce job.//from w w w . j a va 2 s . co m * * @throws IOException When there is communication problems with the job * tracker. */ @Override public int run(String[] args) throws Exception { JobConf jobConf = new JobConf(getConf(), Sort.class); jobConf.setJobName("join"); jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(IdentityReducer.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); int num_maps = cluster.getTaskTrackers() * jobConf.getInt("test.sort.maps_per_host", 10); int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sort_reduces = jobConf.get("test.sort.reduces_per_host"); if (sort_reduces != null) { num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces); } Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class; Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class; Class<? extends WritableComparable> outputKeyClass = BytesWritable.class; Class<? extends Writable> outputValueClass = TupleWritable.class; String op = "inner"; List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { num_maps = Integer.parseInt(args[++i]); } else if ("-r".equals(args[i])) { num_reduces = Integer.parseInt(args[++i]); } else if ("-inFormat".equals(args[i])) { inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class); } else if ("-outFormat".equals(args[i])) { outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class); } else if ("-outKey".equals(args[i])) { outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class); } else if ("-outValue".equals(args[i])) { outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class); } else if ("-joinOp".equals(args[i])) { op = args[++i]; } else { otherArgs.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } // Set user-supplied (possibly default) job configs jobConf.setNumMapTasks(num_maps); jobConf.setNumReduceTasks(num_reduces); if (otherArgs.size() < 2) { System.out.println("ERROR: Wrong number of parameters: "); return printUsage(); } FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.remove(otherArgs.size() - 1))); List<Path> plist = new ArrayList<Path>(otherArgs.size()); for (String s : otherArgs) { plist.add(new Path(s)); } jobConf.setInputFormat(CompositeInputFormat.class); jobConf.set("mapred.join.expr", CompositeInputFormat.compose(op, inputFormatClass, plist.toArray(new Path[0]))); jobConf.setOutputFormat(outputFormatClass); jobConf.setOutputKeyClass(outputKeyClass); jobConf.setOutputValueClass(outputValueClass); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.hp.hplc.mr.driver.WordCount.java
License:Apache License
/** * The main driver for word count map/reduce program. * Invoke this method to submit the map/reduce job. * @throws IOException When there is communication problems with the * job tracker.//from w w w . j a v a2 s.c o m */ public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), WordCount.class); conf.setJobName("wordcount"); // the keys are words (strings) conf.setOutputKeyClass(Text.class); // the values are counts (ints) conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MapClass.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); System.out.println("# of reduces: " + conf.getNumReduceTasks()); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); return printUsage(); } FileInputFormat.setInputPaths(conf, other_args.get(0)); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; }
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java
License:Open Source License
@Override protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException { String jobname = "ParFor-DPMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(DataPartitionerRemoteMR.class); if (_pfid >= 0) //use in parfor job.setJobName(jobname + _pfid); else //use for partition instruction job.setJobName("Partition-MR"); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); try {/*from w w w.j av a 2 s. co m*/ //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size) in.exportData(); //written to disk iff dirty Path path = new Path(in.getFileName()); ///// //configure the MR job MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew, _keepIndexes); //set mappers, reducers, combiners job.setMapperClass(DataPartitionerRemoteMapper.class); job.setReducerClass(DataPartitionerRemoteReducer.class); if (oi == OutputInfo.TextCellOutputInfo) { //binary cell intermediates for reduced IO job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(PairWritableCell.class); } else if (oi == OutputInfo.BinaryCellOutputInfo) { job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(PairWritableCell.class); } else if (oi == OutputInfo.BinaryBlockOutputInfo) { job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(PairWritableBlock.class); //check Alignment if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0) || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) { throw new DMLRuntimeException( "Data partitioning format " + _format + " requires aligned blocks."); } } //set input format job.setInputFormat(ii.inputFormatClass); //set the input path and output path FileInputFormat.setInputPaths(job, path); //set output path MapReduceTool.deleteFileIfExistOnHDFS(fnameNew); //FileOutputFormat.setOutputPath(job, pathNew); job.setOutputFormat(NullOutputFormat.class); ////// //set optimization parameters //set the number of mappers and reducers //job.setNumMapTasks( _numMappers ); //use default num mappers long reducerGroups = -1; switch (_format) { case ROW_WISE: reducerGroups = rlen; break; case COLUMN_WISE: reducerGroups = clen; break; case ROW_BLOCK_WISE: reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1); break; case COLUMN_BLOCK_WISE: reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1); break; case ROW_BLOCK_WISE_N: reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1); break; case COLUMN_BLOCK_WISE_N: reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1); break; default: //do nothing } job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups)); //use FLEX scheduler configuration properties /*if( ParForProgramBlock.USE_FLEX_SCHEDULER_CONF ) { job.setInt("flex.map.min", 0); job.setInt("flex.map.max", _numMappers); job.setInt("flex.reduce.min", 0); job.setInt("flex.reduce.max", _numMappers); }*/ //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //enables the reuse of JVMs (multiple tasks per MR task) if (_jvmReuse) job.setNumTasksToExecutePerJvm(-1); //unlimited //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower) //job.set("mapred.compress.map.output", "true"); //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); //set the replication factor for the results job.setInt("dfs.replication", _replication); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //set the max number of retries per map task // disabled job-level configuration to respect cluster configuration // note: this refers to hadoop2, hence it never had effect on mr1 //job.setInt("mapreduce.map.maxattempts", _max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job JobClient.runJob(job); //maintain dml script counters Statistics.incrementNoOfExecutedMRJobs(); } catch (Exception ex) { throw new DMLRuntimeException(ex); } if (DMLScript.STATISTICS && _pfid >= 0) { long t1 = System.nanoTime(); //only for parfor Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } }
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License:Open Source License
/** * //from w w w .ja v a 2s .c o m * @param pfid * @param program * @param taskFile * @param resultFile * @param enableCPCaching * @param mode * @param numMappers * @param replication * @return * @throws DMLRuntimeException */ public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program, String resultFile, MatrixObject input, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params boolean enableCPCaching, int numReducers, int replication, int max_retry) //opt params throws DMLRuntimeException { RemoteParForJobReturn ret = null; String jobname = "ParFor-DPEMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(RemoteDPParForMR.class); job.setJobName(jobname + pfid); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); try { ///// //configure the MR job //set arbitrary CP program blocks that will perform in the reducers MRJobConfiguration.setProgramBlocks(job, program); //enable/disable caching MRJobConfiguration.setParforCachingConfig(job, enableCPCaching); //setup input matrix Path path = new Path(input.getFileName()); long rlen = input.getNumRows(); long clen = input.getNumColumns(); int brlen = (int) input.getNumRowsPerBlock(); int bclen = (int) input.getNumColumnsPerBlock(); MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo, oi, dpf, 1, input.getFileName(), itervar, matrixvar, tSparseCol); job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass); FileInputFormat.setInputPaths(job, path); //set mapper and reducers classes job.setMapperClass(DataPartitionerRemoteMapper.class); job.setReducerClass(RemoteDPParWorkerReducer.class); //set output format job.setOutputFormat(SequenceFileOutputFormat.class); //set output path MapReduceTool.deleteFileIfExistOnHDFS(resultFile); FileOutputFormat.setOutputPath(job, new Path(resultFile)); //set the output key, value schema //parfor partitioning outputs (intermediates) job.setMapOutputKeyClass(LongWritable.class); if (oi == OutputInfo.BinaryBlockOutputInfo) job.setMapOutputValueClass(PairWritableBlock.class); else if (oi == OutputInfo.BinaryCellOutputInfo) job.setMapOutputValueClass(PairWritableCell.class); else throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi); //parfor exec output job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); ////// //set optimization parameters //set the number of mappers and reducers job.setNumReduceTasks(numReducers); //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //disable JVM reuse job.setNumTasksToExecutePerJvm(1); //-1 for unlimited //set the replication factor for the results job.setInt("dfs.replication", replication); //set the max number of retries per map task //note: currently disabled to use cluster config //job.setInt("mapreduce.map.maxattempts", max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job RunningJob runjob = JobClient.runJob(job); // Process different counters Statistics.incrementNoOfExecutedMRJobs(); Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME); int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString()); int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString()); if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) { Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString())); Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString())); Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString())); Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString()); CacheStatistics .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString())); CacheStatistics.incrementFSBuffHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString())); CacheStatistics .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString())); CacheStatistics.incrementHDFSHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString())); CacheStatistics.incrementFSBuffWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString())); CacheStatistics.incrementFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString())); CacheStatistics.incrementHDFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString())); CacheStatistics .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString())); CacheStatistics .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString())); CacheStatistics .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString())); CacheStatistics .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString())); } // read all files of result variables and prepare for return LocalVariableMap[] results = readResultFile(job, resultFile); ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results); } catch (Exception ex) { throw new DMLRuntimeException(ex); } finally { // remove created files try { MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job); } catch (IOException ex) { throw new DMLRuntimeException(ex); } } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } return ret; }
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java
License:Open Source License
/** * /*from w w w .j a v a2 s .c o m*/ * @param fname null if no comparison required * @param fnameNew * @param srcFnames * @param ii * @param oi * @param rlen * @param clen * @param brlen * @param bclen * @throws DMLRuntimeException */ @SuppressWarnings({ "unused", "deprecation" }) protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException { String jobname = "ParFor-RMMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(ResultMergeRemoteMR.class); job.setJobName(jobname + _pfid); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); //warning for textcell/binarycell without compare boolean withCompare = (fname != null); if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare && ResultMergeLocalFile.ALLOW_COPY_CELLFILES) LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi) + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR."); try { Path pathCompare = null; Path pathNew = new Path(fnameNew); ///// //configure the MR job if (withCompare) { pathCompare = new Path(fname).makeQualified(FileSystem.get(job)); MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen); } else MRJobConfiguration.setResultMergeInfo(job, "null", ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen, bclen); //set mappers, reducers, combiners job.setMapperClass(ResultMergeRemoteMapper.class); job.setReducerClass(ResultMergeRemoteReducer.class); if (oi == OutputInfo.TextCellOutputInfo) { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixCell.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); } else if (oi == OutputInfo.BinaryCellOutputInfo) { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixCell.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixCell.class); } else if (oi == OutputInfo.BinaryBlockOutputInfo) { //setup partitioning, grouping, sorting for composite key (old API) job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixBlock.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } //set input format job.setInputFormat(ii.inputFormatClass); //set the input path Path[] paths = null; if (withCompare) { paths = new Path[srcFnames.length + 1]; paths[0] = pathCompare; for (int i = 1; i < paths.length; i++) paths[i] = new Path(srcFnames[i - 1]); } else { paths = new Path[srcFnames.length]; for (int i = 0; i < paths.length; i++) paths[i] = new Path(srcFnames[i]); } FileInputFormat.setInputPaths(job, paths); //set output format job.setOutputFormat(oi.outputFormatClass); //set output path MapReduceTool.deleteFileIfExistOnHDFS(fnameNew); FileOutputFormat.setOutputPath(job, pathNew); ////// //set optimization parameters //set the number of mappers and reducers //job.setNumMapTasks( _numMappers ); //use default num mappers long reducerGroups = _numReducers; if (oi == OutputInfo.BinaryBlockOutputInfo) reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1); else //textcell/binarycell reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1); job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups)); //use FLEX scheduler configuration properties if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) { job.setInt("flex.map.min", 0); job.setInt("flex.map.max", _numMappers); job.setInt("flex.reduce.min", 0); job.setInt("flex.reduce.max", _numMappers); } //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //enables the reuse of JVMs (multiple tasks per MR task) if (_jvmReuse) job.setNumTasksToExecutePerJvm(-1); //unlimited //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower) //job.set("mapred.compress.map.output", "true"); //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); //set the replication factor for the results job.setInt("dfs.replication", _replication); //set the max number of retries per map task // disabled job-level configuration to respect cluster configuration // note: this refers to hadoop2, hence it never had effect on mr1 //job.setInt("mapreduce.map.maxattempts", _max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job JobClient.runJob(job); //maintain dml script counters Statistics.incrementNoOfExecutedMRJobs(); } catch (Exception ex) { throw new DMLRuntimeException(ex); } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } }
From source file:com.ibm.bi.dml.runtime.matrix.CMCOVMR.java
License:Open Source License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String cmNcomInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job = new JobConf(CMCOVMR.class); job.setJobName("CM-COV-MR"); //whether use block representation or cell representation MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true); //added for handling recordreader instruction String[] realinputs = inputs; InputInfo[] realinputInfos = inputInfos; long[] realrlens = rlens; long[] realclens = clens; int[] realbrlens = brlens; int[] realbclens = bclens; byte[] realIndexes = new byte[inputs.length]; for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;/* w w w.jav a 2s. co m*/ //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, true, ConvertTarget.WEIGHTEDCELL); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens); //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens); //set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setCM_N_COMInstructions(job, cmNcomInstructions); //set up the replication factor for the results job.setInt("dfs.replication", replication); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, null, cmNcomInstructions, resultIndexes); //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, new byte[resultIndexes.length], outputs, outputInfos, false); // configure mapper and the mapper output key value pairs job.setMapperClass(CMCOVMRMapper.class); job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class); job.setMapOutputValueClass(CM_N_COVCell.class); job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class); job.setPartitionerClass(TaggedFirstSecondIndexes.TagPartitioner.class); //configure reducer job.setReducerClass(CMCOVMRReducer.class); //job.setReducerClass(PassThroughReducer.class); MatrixCharacteristics[] stats = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, null, null, cmNcomInstructions, resultIndexes, mapoutputIndexes, false).stats; //set up the number of reducers MRJobConfiguration.setNumReducers(job, mapoutputIndexes.size(), numReducers);//each output tag is a group // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); RunningJob runjob = JobClient.runJob(job); return new JobReturn(stats, outputInfos, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.matrix.CombineMR.java
License:Open Source License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job; job = new JobConf(CombineMR.class); job.setJobName("Standalone-MR"); boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos); //whether use block representation or cell representation MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation); byte[] inputIndexes = new byte[inputs.length]; for (byte b = 0; b < inputs.length; b++) inputIndexes[b] = b;/* w w w . j ava 2 s. com*/ //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens); //set up the block size MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens); //set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, ""); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setAggregateInstructions(job, ""); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, ""); MRJobConfiguration.setCombineInstructions(job, combineInstructions); //set up the replication factor for the results job.setInt("dfs.replication", replication); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null, null, combineInstructions, resultIndexes); //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, null, outputs, outputInfos, inBlockRepresentation); // configure mapper and the mapper output key value pairs job.setMapperClass(GMRMapper.class); job.setMapOutputKeyClass(MatrixIndexes.class); if (inBlockRepresentation) job.setMapOutputValueClass(TaggedMatrixBlock.class); else job.setMapOutputValueClass(TaggedMatrixCell.class); //configure reducer job.setReducerClass(InnerReducer.class); //job.setReducerClass(PassThroughReducer.class); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, inputIndexes, null, null, null, combineInstructions, resultIndexes, mapoutputIndexes, false); MatrixCharacteristics[] stats = ret.stats; //set up the number of reducers MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); RunningJob runjob = JobClient.runJob(job); return new JobReturn(stats, runjob.isSuccessful()); }