List of usage examples for org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
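All of the examples below follow the same old-API (org.apache.hadoop.mapred) pattern: build a JobConf, register the mapper, and declare the intermediate key/value classes via setMapOutputKeyClass/setMapOutputValueClass whenever they differ from the job's final output classes. A minimal sketch of that pattern is shown here for orientation; the WordCountMapper/WordCountReducer classes and the input/output paths are illustrative placeholders, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(WordCountDriver.class);
        job.setJobName("wordcount");

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(WordCountMapper.class);   // placeholder Mapper<LongWritable,Text,Text,IntWritable>
        job.setReducerClass(WordCountReducer.class); // placeholder Reducer<Text,IntWritable,Text,LongWritable>

        // Intermediate (map output) types must be declared explicitly whenever they
        // differ from the final output types below; here the map output value is
        // IntWritable while the reducer emits LongWritable.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final output types written by the reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        JobClient.runJob(job);
    }
}

In the SystemML jobs below, these same two calls are what switch the shuffle key between MatrixIndexes, TaggedFirstSecondIndexes, TripleIndexes, and so on, depending on how each reducer needs the matrix blocks grouped.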
From source file:org.apache.sysml.runtime.matrix.CombineMR.java
License:Apache License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job;
    job = new JobConf(CombineMR.class);
    job.setJobName("Standalone-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos); //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    byte[] inputIndexes = new byte[inputs.length];
    for (byte b = 0; b < inputs.length; b++)
        inputIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, "");
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, "");
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, "");
    MRJobConfiguration.setCombineInstructions(job, combineInstructions);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null, null, combineInstructions, resultIndexes);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, null, outputs, outputInfos, inBlockRepresentation);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    job.setMapOutputKeyClass(MatrixIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
        job.setMapOutputValueClass(TaggedMatrixCell.class);
    //configure reducer
    job.setReducerClass(InnerReducer.class);
    //job.setReducerClass(PassThroughReducer.class);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, inputIndexes, null, null, null, combineInstructions, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    return new JobReturn(stats, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.CSVReblockMR.java
License:Apache License
public static AssignRowIDMRReturn runAssignRowIDMRJob(String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, String reblockInstructions, int replication, String[] smallestFiles) throws Exception {
    AssignRowIDMRReturn ret = new AssignRowIDMRReturn();
    JobConf job;
    job = new JobConf(CSVReblockMR.class);
    job.setJobName("Assign-RowID-MR");
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up the number of reducers
    job.setNumReduceTasks(1);
    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //inst.printCompelteMRJobInstruction();
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVAssignRowIDMapper.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(OffsetCount.class);
    //configure reducer
    job.setReducerClass(CSVAssignRowIDReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //set up the output file
    ret.counterFile = new Path(MRJobConfiguration.constructTempOutputFilename());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, ret.counterFile);
    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(OffsetCount.class);
    RunningJob runjob = JobClient.runJob(job);
    /* Process different counters */
    Group rgroup = runjob.getCounters().getGroup(NUM_ROWS_IN_MATRIX);
    Group cgroup = runjob.getCounters().getGroup(NUM_COLS_IN_MATRIX);
    ret.rlens = new long[inputs.length];
    ret.clens = new long[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        // number of non-zeros
        ret.rlens[i] = rgroup.getCounter(Integer.toString(i));
        ret.clens[i] = cgroup.getCounter(Integer.toString(i));
    }
    return ret;
}
From source file:org.apache.sysml.runtime.matrix.CSVReblockMR.java
License:Apache License
private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);
    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //   inst.printCompelteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);
    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());
    RunningJob runjob = JobClient.runJob(job);
    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);
    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        // System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.DataGenMR.java
License:Apache License
/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");
    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);
    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];
    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;
    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();
        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);
        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();
                //seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
                PrimitiveIterator.OfLong nnzIter = nnz.iterator();
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < clens[i]; c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                        sb.append((r / brlens[i]) + 1); sb.append(',');
                        sb.append((c / bclens[i]) + 1); sb.append(',');
                        sb.append(curBlockRowSize); sb.append(',');
                        sb.append(curBlockColSize); sb.append(',');
                        sb.append(nnzIter.nextLong()); sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense
            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;
            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
            // Compute the number of rows in the sequence
            long numrows = UtilFunctions.getSeqLength(from, to, incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();
                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    temp = block_to + incr; // next block starts from here
                    sb.append(bid_i); sb.append(',');
                    sb.append(bid_j); sb.append(',');
                    sb.append(block_from); sb.append(',');
                    sb.append(block_to); sb.append(',');
                    sb.append(incr);
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1); //remove the first ","
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmapers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmapers);
        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;
        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);
        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }
        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }
        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);
        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        runjob = JobClient.runJob(job);
        /* Process different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }
        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.GMR.java
License:Apache License
/**
 * Execute job.
 *
 * @param inst MR job instruction
 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param partitioned boolean array of partitioned status
 * @param pformats array of data partition formats
 * @param psizes does nothing
 * @param recordReaderInstruction record reader instruction
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param jvmReuse if true, reuse JVM
 * @param resultIndexes the indexes of the result matrices that needs to be outputted
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object
 * @throws Exception if Exception occurs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos); //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass, inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata);
        if (ins.isValuePick) {
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);
            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }
        } else {
            //PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile((MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }
    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
    //set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    //set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }
    //set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }
    //configure reducer
    job.setReducerClass(GMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++)
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    //cleanups
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.GroupedAggMR.java
License:Apache License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String grpAggInstructions, String simpleReduceInstructions/*only scalar or reorg instructions allowed*/, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GroupedAggMR.class);
    job.setJobName("GroupedAgg-MR");
    //whether use block representation or cell representation
    //MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);
    MRJobConfiguration.setMatrixValueClass(job, false);
    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, true, ConvertTarget.WEIGHTEDCELL);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
    //set up the grouped aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setGroupedAggInstructions(job, grpAggInstructions);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, simpleReduceInstructions);
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, null, grpAggInstructions, resultIndexes);
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++)
        stats[i] = new MatrixCharacteristics();
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    // Update resultDimsUnknown based on computed "stats"
    for (int i = 0; i < resultIndexes.length; i++)
        resultDimsUnknown[i] = (byte) 2;
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, false);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GroupedAggMRMapper.class);
    job.setCombinerClass(GroupedAggMRCombiner.class);
    job.setMapOutputKeyClass(TaggedMatrixIndexes.class);
    job.setMapOutputValueClass(WeightedCell.class);
    //configure reducer
    job.setReducerClass(GroupedAggMRReducer.class);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //execute job
    RunningJob runjob = JobClient.runJob(job);
    //get important output statistics
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i] = new MatrixCharacteristics();
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.MMCJMR.java
License:Apache License
private static MatrixCharacteristics[] commonSetup(JobConf job, boolean inBlockRepresentation, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstrction, int numReducers, int replication, byte resultDimsUnknown, String output, OutputInfo outputinfo) throws Exception {
    job.setJobName("MMCJ-MR");
    if (numReducers <= 0)
        throw new Exception("MMCJ-MR has to have at least one reduce task!");
    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    //set up the aggregate binary operation for the mmcj job
    MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrction);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    //set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    byte[] resultIndexes = new byte[] { MRInstructionParser.parseSingleInstruction(aggBinInstrction).output };
    byte[] resultDimsUnknown_Array = new byte[] { resultDimsUnknown };
    // byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, resultIndexes);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown_Array, new String[] { output }, new OutputInfo[] { outputinfo }, inBlockRepresentation);
    // configure mapper
    job.setMapperClass(MMCJMRMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(MatrixBlock.class);
    else
        job.setMapOutputValueClass(MatrixCell.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexPartitioner.class);
    //configure combiner
    //TODO: cannot set up combiner, because it will destroy the stable numerical algorithms
    // for sum or for central moments
    //if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
    //   job.setCombinerClass(MMCJMRCombiner.class);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, null, resultIndexes, mapoutputIndexes, true);
    //set up the number of reducers
    if (AUTOMATIC_CONFIG_NUM_REDUCERS) {
        int numRed = determineNumReducers(rlens, clens, numReducers, ret.numReducerGroups);
        job.setNumReduceTasks(numRed);
    } else
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    //configure reducer
    // note: the alternative MMCJMRReducer is not maintained
    job.setReducerClass(MMCJMRReducerWithAggregator.class);
    return ret.stats;
}
From source file:org.apache.sysml.runtime.matrix.MMRJMR.java
License:Apache License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstrctions, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(MMRJMR.class);
    job.setJobName("MMRJ-MR");
    if (numReducers <= 0)
        throw new Exception("MMRJ-MR has to have at least one reduce task!");
    // TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos); //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
    //set up the aggregate binary operation for the mmcj job
    MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrctions);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
    //set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    // byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation);
    // configure mapper
    job.setMapperClass(MMRJMRMapper.class);
    job.setMapOutputKeyClass(TripleIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
        job.setMapOutputValueClass(TaggedMatrixCell.class);
    job.setOutputKeyComparatorClass(TripleIndexes.Comparator.class);
    job.setPartitionerClass(TripleIndexes.FirstTwoIndexesPartitioner.class);
    //configure combiner
    //TODO: cannot set up combiner, because it will destroy the stable numerical algorithms
    // for sum or for central moments
    // if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
    //   job.setCombinerClass(MMCJMRCombiner.class);
    //configure reducer
    job.setReducerClass(MMRJMRReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.ReblockMR.java
License:Apache License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, long[] nnz, String instructionsInMapper, String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(ReblockMR.class);
    job.setJobName("Reblock-MR");
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;
    //set up the input files and their format information
    //(internally used input converters: text2bc for text, identity for binary inputs)
    MRJobConfiguration.setUpMultipleInputsReblock(job, realIndexes, inputs, inputInfos, brlens, bclens);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens, nnz);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setReblockInstructions(job, reblockInstructions);
    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //disable automatic tasks timeouts and speculative task exec
    job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
    job.setMapSpeculativeExecution(false);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //enable jvm reuse (based on SystemML configuration)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);
    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);
    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;
    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens, nnz, config.getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);
    //setup in-memory reduce buffers budget (reblock reducer doesn't need much memory)
    //job.set(MRConfigurationNames.MR_REDUCE_INPUT_BUFFER_PERCENT, "0.70");
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);
    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ReblockMapper.class);
    job.setMapOutputKeyClass(MatrixIndexes.class); //represent key offsets for block
    job.setMapOutputValueClass(TaggedAdaptivePartialBlock.class); //binary cell/block
    //configure reducer
    job.setReducerClass(ReblockReducer.class);
    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    // at this point, both reblock_binary and reblock_text are similar
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    RunningJob runjob = JobClient.runJob(job);
    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        // System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.SortMR.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception {
    boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes;
    String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output;
    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortMR");
    //setup partition file
    String pfname = MRJobConfiguration.setUpSortPartitionFilename(job);
    Path partitionFile = new Path(pfname);
    URI partitionUri = new URI(partitionFile.toString());
    //setup input/output paths
    Path inputDir = new Path(input);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    FileInputFormat.setInputPaths(job, inputDir);
    Path outpath = new Path(tmpOutput);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);
    //set number of reducers (1 if local mode)
    if (!InfrastructureAnalyzer.isLocalMode(job)) {
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
        //ensure partition size <= 10M records to avoid scalability bottlenecks
        //on cp-side qpick instructions for quantile/iqm/median (~128MB)
        if (!(getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes))
            job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000));
    } else //in case of local mode
        job.setNumReduceTasks(1);
    //setup input/output format
    job.setInputFormat(SamplingSortMRInputFormat.class);
    SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass);
    //setup instructions and meta information
    if (combineInst != null && !combineInst.trim().isEmpty())
        job.set(COMBINE_INSTRUCTION, combineInst);
    job.set(SORT_INSTRUCTION, sortInst);
    job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
    boolean desc = getSortInstructionDescending(sortInst);
    job.setBoolean(SORT_DECREASING, desc);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
    int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);
    //setup mapper/reducer/partitioner/output classes
    if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) {
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        job.setMapperClass(IndexSortMapper.class);
        job.setReducerClass(IndexSortReducer.class);
        job.setMapOutputKeyClass(!desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(MatrixIndexes.class);
        job.setOutputValueClass(MatrixBlock.class);
    } else { //default case: SORT w/wo weights
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(CompactOutputFormat.class);
        job.setMapperClass(ValueSortMapper.class);
        job.setReducerClass(ValueSortReducer.class);
        job.setOutputKeyClass(outputInfo.outputKeyClass); //double
        job.setOutputValueClass(outputInfo.outputValueClass); //int
    }
    job.setPartitionerClass(TotalOrderPartitioner.class);
    //setup distributed cache
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    //setup replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    MatrixCharacteristics[] s = new MatrixCharacteristics[1];
    s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);
    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(s);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //run mr job
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
    numReducers = job.getNumReduceTasks();
    //process final meta data
    long[] counts = new long[numReducers];
    long total = 0;
    for (int i = 0; i < numReducers; i++) {
        counts[i] = group.getCounter(Integer.toString(i));
        total += counts[i];
    }
    //add missing 0s back to the results
    long missing0s = 0;
    if (total < rlen * clen) {
        if (partitionWith0 < 0)
            throw new RuntimeException("no partition contains 0, which is wrong!");
        missing0s = rlen * clen - total;
        counts[partitionWith0] += missing0s;
    } else
        partitionWith0 = -1;
    if (sortIndexes) {
        //run builtin job for shifting partially sorted blocks according to global offsets
        //we do this in this custom form since it would not fit into the current structure
        //of systemml to output two intermediates (partially sorted data, offsets) out of a
        //single SortKeys lop
        boolean success = runjob.isSuccessful();
        if (success) {
            success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output);
        }
        MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput);
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success);
    } else {
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful());
    }
}