List of usage examples for org.apache.hadoop.mapred.RunningJob#isSuccessful()
/**
 * Returns the job's success flag.
 *
 * <p>The usage examples that follow treat {@code true} as "the job succeeded"
 * (for instance by returning an error code or failing a test when it is {@code false}).
 *
 * @return the success flag of this job
 * @throws IOException declared by the signature; the triggering condition is not visible here
 */
public boolean isSuccessful() throws IOException;
From source file:com.cloudera.recordservice.tests.TestMiniClusterController.java
License:Apache License
/** * This method gets a JobConf object from the static MiniClusterController * method, fills it with a sample MR job and then executes the job. *///from w w w . j a v a2 s .co m @Test public void testGetConfigForMiniCluster() throws IOException { JobConf sampleJob = miniCluster_.getJobConf(WordCount.class); fillInWordCountMRJobConf(sampleJob); RunningJob runningJob = JobClient.runJob(sampleJob); runningJob.waitForCompletion(); assertTrue(runningJob.isSuccessful()); }
From source file:com.digitalpebble.behemoth.mahout.util.ClusterDocIDDumper.java
License:Apache License
public int extract(Path input, Path output) throws IOException { JobConf job = new JobConf(getConf()); // job.setJobName(this.getClass().getName()); job.setJarByClass(this.getClass()); FileInputFormat.addInputPath(job, input); job.setInputFormat(SequenceFileInputFormat.class); job.setNumReduceTasks(0);/*from w w w . j a v a 2s.co m*/ job.setMapperClass(ClusterDocIDDumper.class); FileOutputFormat.setOutputPath(job, output); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); RunningJob rj = JobClient.runJob(job); if (rj.isSuccessful() == false) return -1; return 0; }
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int vectorToString(Path vectorPath, Path output) throws IOException { JobConf job = new JobConf(getConf()); // job.setJobName(this.getClass().getName()); job.setJarByClass(this.getClass()); FileInputFormat.addInputPath(job, vectorPath); job.setInputFormat(SequenceFileInputFormat.class); job.setNumReduceTasks(0);/*from w ww. j a v a2 s.c o m*/ job.setMapperClass(Mahout2LibSVM.class); FileOutputFormat.setOutputPath(job, output); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); RunningJob rj = JobClient.runJob(job); if (rj.isSuccessful() == false) return -1; return 0; }
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int convert(Path vectorPath, Path labelPath, Path output) throws IOException { JobConf job = new JobConf(getConf()); // job.setJobName(this.getClass().getName()); job.setJarByClass(this.getClass()); FileInputFormat.addInputPath(job, vectorPath); FileInputFormat.addInputPath(job, labelPath); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(IdentityMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // 1 reducers job.setNumReduceTasks(1);//from ww w . j av a 2s .c o m job.setReducerClass(Mahout2LibSVM.class); FileOutputFormat.setOutputPath(job, output); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); RunningJob rj = JobClient.runJob(job); boolean success = rj.isSuccessful(); if (!success) return -1; if (log.isInfoEnabled()) { log.info("Conversion: done"); } return 0; }
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License:Open Source License
/** * /*from w w w. j a v a 2 s . c o m*/ * @param pfid * @param program * @param taskFile * @param resultFile * @param enableCPCaching * @param mode * @param numMappers * @param replication * @return * @throws DMLRuntimeException */ public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program, String resultFile, MatrixObject input, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params boolean enableCPCaching, int numReducers, int replication, int max_retry) //opt params throws DMLRuntimeException { RemoteParForJobReturn ret = null; String jobname = "ParFor-DPEMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(RemoteDPParForMR.class); job.setJobName(jobname + pfid); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); try { ///// //configure the MR job //set arbitrary CP program blocks that will perform in the reducers MRJobConfiguration.setProgramBlocks(job, program); //enable/disable caching MRJobConfiguration.setParforCachingConfig(job, enableCPCaching); //setup input matrix Path path = new Path(input.getFileName()); long rlen = input.getNumRows(); long clen = input.getNumColumns(); int brlen = (int) input.getNumRowsPerBlock(); int bclen = (int) input.getNumColumnsPerBlock(); MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo, oi, dpf, 1, input.getFileName(), itervar, matrixvar, tSparseCol); job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass); FileInputFormat.setInputPaths(job, path); //set mapper and reducers classes job.setMapperClass(DataPartitionerRemoteMapper.class); job.setReducerClass(RemoteDPParWorkerReducer.class); //set output format job.setOutputFormat(SequenceFileOutputFormat.class); //set output path MapReduceTool.deleteFileIfExistOnHDFS(resultFile); FileOutputFormat.setOutputPath(job, new Path(resultFile)); //set the output key, value schema //parfor 
partitioning outputs (intermediates) job.setMapOutputKeyClass(LongWritable.class); if (oi == OutputInfo.BinaryBlockOutputInfo) job.setMapOutputValueClass(PairWritableBlock.class); else if (oi == OutputInfo.BinaryCellOutputInfo) job.setMapOutputValueClass(PairWritableCell.class); else throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi); //parfor exec output job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); ////// //set optimization parameters //set the number of mappers and reducers job.setNumReduceTasks(numReducers); //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //disable JVM reuse job.setNumTasksToExecutePerJvm(1); //-1 for unlimited //set the replication factor for the results job.setInt("dfs.replication", replication); //set the max number of retries per map task //note: currently disabled to use cluster config //job.setInt("mapreduce.map.maxattempts", max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job RunningJob runjob = JobClient.runJob(job); // Process different counters Statistics.incrementNoOfExecutedMRJobs(); Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME); int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString()); int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString()); if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) { Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString())); 
Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString())); Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString())); Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString()); CacheStatistics .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString())); CacheStatistics.incrementFSBuffHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString())); CacheStatistics .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString())); CacheStatistics.incrementHDFSHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString())); CacheStatistics.incrementFSBuffWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString())); CacheStatistics.incrementFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString())); CacheStatistics.incrementHDFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString())); CacheStatistics .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString())); CacheStatistics .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString())); CacheStatistics .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString())); CacheStatistics .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString())); } // read all files of result variables and prepare for return LocalVariableMap[] results = readResultFile(job, resultFile); ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results); } catch (Exception ex) { throw new DMLRuntimeException(ex); } finally { // remove created files try { MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job); } catch (IOException ex) { throw new DMLRuntimeException(ex); } } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); 
Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } return ret; }
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java
License:Open Source License
/** * //w ww.j av a 2s . c om * @param pfid * @param program * @param taskFile * @param resultFile * @param _enableCPCaching * @param mode * @param numMappers * @param replication * @return * @throws DMLRuntimeException */ public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile, MatrixObject colocatedDPMatrixObj, //inputs boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params throws DMLRuntimeException { RemoteParForJobReturn ret = null; String jobname = "ParFor-EMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(RemoteParForMR.class); job.setJobName(jobname + pfid); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); try { ///// //configure the MR job //set arbitrary CP program blocks that will perform in the mapper MRJobConfiguration.setProgramBlocks(job, program); //enable/disable caching MRJobConfiguration.setParforCachingConfig(job, enableCPCaching); //set mappers, reducers, combiners job.setMapperClass(RemoteParWorkerMapper.class); //map-only //set input format (one split per row, NLineInputFormat default N=1) if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) { job.setInputFormat(RemoteParForColocatedNLineInputFormat.class); MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat()); MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics(); MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock()); MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock()); MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName()); } else //default case { job.setInputFormat(NLineInputFormat.class); } //set the input path and output path FileInputFormat.setInputPaths(job, new Path(taskFile)); //set output format job.setOutputFormat(SequenceFileOutputFormat.class); //set output 
path MapReduceTool.deleteFileIfExistOnHDFS(resultFile); FileOutputFormat.setOutputPath(job, new Path(resultFile)); //set the output key, value schema job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); ////// //set optimization parameters //set the number of mappers and reducers job.setNumMapTasks(numMappers); //numMappers job.setNumReduceTasks(0); //job.setInt("mapred.map.tasks.maximum", 1); //system property //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property //use FLEX scheduler configuration properties if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) { job.setInt("flex.priority", 0); //highest job.setInt("flex.map.min", 0); job.setInt("flex.map.max", numMappers); job.setInt("flex.reduce.min", 0); job.setInt("flex.reduce.max", numMappers); } //set jvm memory size (if require) String memKey = "mapred.child.java.opts"; if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) { InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem); LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M."); } //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //enables the reuse of JVMs (multiple tasks per MR task) if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); //unlimited //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption) job.setInt("io.sort.mb", 8); //8MB //set the replication factor for the results job.setInt("dfs.replication", replication); //set the max number of retries per map task // disabled job-level configuration to respect 
cluster configuration // note: this refers to hadoop2, hence it never had effect on mr1 //job.setInt("mapreduce.map.maxattempts", max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job RunningJob runjob = JobClient.runJob(job); // Process different counters Statistics.incrementNoOfExecutedMRJobs(); Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME); int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString()); int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString()); if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) { Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString())); Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString())); Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString())); Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString()); CacheStatistics .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString())); CacheStatistics.incrementFSBuffHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString())); CacheStatistics .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString())); CacheStatistics.incrementHDFSHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString())); CacheStatistics.incrementFSBuffWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString())); CacheStatistics.incrementFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString())); CacheStatistics.incrementHDFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString())); CacheStatistics .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString())); CacheStatistics 
.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString())); CacheStatistics .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString())); CacheStatistics .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString())); } // read all files of result variables and prepare for return LocalVariableMap[] results = readResultFile(job, resultFile); ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results); } catch (Exception ex) { throw new DMLRuntimeException(ex); } finally { // remove created files try { MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job); MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job); } catch (IOException ex) { throw new DMLRuntimeException(ex); } } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } return ret; }
From source file:com.ibm.bi.dml.runtime.matrix.CleanupMR.java
License:Open Source License
public static boolean runJob(DMLConfig conf) throws Exception { boolean ret = false; try {// www . j ava 2 s .c om JobConf job; job = new JobConf(CleanupMR.class); job.setJobName("Cleanup-MR"); //set up SystemML local tmp dir String dir = conf.getTextValue(DMLConfig.LOCAL_TMP_DIR); MRJobConfiguration.setSystemMLLocalTmpDir(job, dir); //set mappers, reducers int numNodes = InfrastructureAnalyzer.getRemoteParallelNodes(); job.setMapperClass(CleanupMapper.class); //map-only job.setNumMapTasks(numNodes); //numMappers job.setNumReduceTasks(0); //set input/output format, input path String inFileName = conf.getTextValue(DMLConfig.SCRATCH_SPACE) + "/cleanup_tasks"; job.setInputFormat(NLineInputFormat.class); job.setOutputFormat(NullOutputFormat.class); Path path = new Path(inFileName); FileInputFormat.setInputPaths(job, path); writeCleanupTasksToFile(path, numNodes); //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); ///// // execute the MR job RunningJob runjob = JobClient.runJob(job); ret = runjob.isSuccessful(); } catch (Exception ex) { //don't raise an exception, just gracefully an error message. LOG.error("Failed to run cleanup MR job. ", ex); } return ret; }
From source file:com.ibm.bi.dml.runtime.matrix.CMCOVMR.java
License:Open Source License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String cmNcomInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job = new JobConf(CMCOVMR.class); job.setJobName("CM-COV-MR"); //whether use block representation or cell representation MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true); //added for handling recordreader instruction String[] realinputs = inputs; InputInfo[] realinputInfos = inputInfos; long[] realrlens = rlens; long[] realclens = clens; int[] realbrlens = brlens; int[] realbclens = bclens; byte[] realIndexes = new byte[inputs.length]; for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;/*from ww w . j a va 2 s.c om*/ //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, true, ConvertTarget.WEIGHTEDCELL); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens); //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens); //set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setCM_N_COMInstructions(job, cmNcomInstructions); //set up the replication factor for the results job.setInt("dfs.replication", replication); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, null, cmNcomInstructions, resultIndexes); //set up the multiple output files, and their format information 
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, new byte[resultIndexes.length], outputs, outputInfos, false); // configure mapper and the mapper output key value pairs job.setMapperClass(CMCOVMRMapper.class); job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class); job.setMapOutputValueClass(CM_N_COVCell.class); job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class); job.setPartitionerClass(TaggedFirstSecondIndexes.TagPartitioner.class); //configure reducer job.setReducerClass(CMCOVMRReducer.class); //job.setReducerClass(PassThroughReducer.class); MatrixCharacteristics[] stats = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, null, null, cmNcomInstructions, resultIndexes, mapoutputIndexes, false).stats; //set up the number of reducers MRJobConfiguration.setNumReducers(job, mapoutputIndexes.size(), numReducers);//each output tag is a group // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); RunningJob runjob = JobClient.runJob(job); return new JobReturn(stats, outputInfos, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.matrix.CombineMR.java
License:Open Source License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job;/*from w w w .ja v a 2 s . com*/ job = new JobConf(CombineMR.class); job.setJobName("Standalone-MR"); boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos); //whether use block representation or cell representation MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation); byte[] inputIndexes = new byte[inputs.length]; for (byte b = 0; b < inputs.length; b++) inputIndexes[b] = b; //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens); //set up the block size MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens); //set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, ""); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setAggregateInstructions(job, ""); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, ""); MRJobConfiguration.setCombineInstructions(job, combineInstructions); //set up the replication factor for the results job.setInt("dfs.replication", replication); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null, null, combineInstructions, resultIndexes); //set up the multiple output files, and their format 
information MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, null, outputs, outputInfos, inBlockRepresentation); // configure mapper and the mapper output key value pairs job.setMapperClass(GMRMapper.class); job.setMapOutputKeyClass(MatrixIndexes.class); if (inBlockRepresentation) job.setMapOutputValueClass(TaggedMatrixBlock.class); else job.setMapOutputValueClass(TaggedMatrixCell.class); //configure reducer job.setReducerClass(InnerReducer.class); //job.setReducerClass(PassThroughReducer.class); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, inputIndexes, null, null, null, combineInstructions, resultIndexes, mapoutputIndexes, false); MatrixCharacteristics[] stats = ret.stats; //set up the number of reducers MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); RunningJob runjob = JobClient.runJob(job); return new JobReturn(stats, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception { JobConf job;/* w w w .j a v a2s.c om*/ job = new JobConf(ReblockMR.class); job.setJobName("CSV-Reblock-MR"); byte[] realIndexes = new byte[inputs.length]; for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b; //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL); job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens); //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer); //set up the replication factor for the results job.setInt("dfs.replication", replication); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, reblockInstructions, null, otherInstructionsInReducer, resultIndexes); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, 
realIndexes, null, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false); MatrixCharacteristics[] stats = ret.stats; //set up the number of reducers int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups); job.setNumReduceTasks(numRed); // Print the complete instruction //if (LOG.isTraceEnabled()) // inst.printCompelteMRJobInstruction(stats); // Update resultDimsUnknown based on computed "stats" byte[] resultDimsUnknown = new byte[resultIndexes.length]; for (int i = 0; i < resultIndexes.length; i++) { if (stats[i].getRows() == -1 || stats[i].getCols() == -1) { resultDimsUnknown[i] = (byte) 1; } else { resultDimsUnknown[i] = (byte) 0; } } //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true); // configure mapper and the mapper output key value pairs job.setMapperClass(CSVReblockMapper.class); job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class); job.setMapOutputValueClass(BlockRow.class); //configure reducer job.setReducerClass(CSVReblockReducer.class); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); Path cachefile = new Path(counterFile, "part-00000"); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); job.set(ROWID_FILE_NAME, cachefile.toString()); RunningJob runjob = JobClient.runJob(job); MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job); /* Process different counters */ Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS); for (int i = 0; i < resultIndexes.length; i++) { // number of non-zeros stats[i].setNonZeros(group.getCounter(Integer.toString(i))); // System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]); } return 
new JobReturn(stats, outputInfos, runjob.isSuccessful()); }