List of usage examples for org.apache.hadoop.mapred.RunningJob.getID()
public JobID getID();
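getID() returns the cluster-assigned JobID of a submitted job. The common pattern in the examples below is to call it on the RunningJob handle returned by JobClient.runJob or JobClient.submitJob and to log or persist the id as a string. A minimal, self-contained sketch of that pattern (the class name, job settings, and paths are illustrative placeholders, not taken from the examples below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class GetIdExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical map-only pass-through job; adjust the paths for your cluster.
    JobConf job = new JobConf(GetIdExample.class);
    job.setJobName("getID-example");
    job.setMapperClass(IdentityMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path("/tmp/getid-in"));
    FileOutputFormat.setOutputPath(job, new Path("/tmp/getid-out"));

    // runJob blocks until the job finishes; getID() returns the cluster-assigned JobID.
    RunningJob rj = JobClient.runJob(job);
    System.out.println("Job " + rj.getID() + " successful=" + rj.isSuccessful());
  }
}

The same JobID string can later be parsed back with JobID.forName(...) and used to re-attach to the job via JobClient.getJob(...), as the Oozie example below does.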
From source file:org.apache.oozie.action.hadoop.TestMapReduceActionExecutor.java
License:Apache License
private String _testSubmit(String name, String actionXml) throws Exception {
  Context context = createContext(name, actionXml);
  final RunningJob launcherJob = submitAction(context);
  String launcherId = context.getAction().getExternalId();
  waitFor(120 * 2000, new Predicate() {
    public boolean evaluate() throws Exception {
      return launcherJob.isComplete();
    }
  });
  assertTrue(launcherJob.isSuccessful());
  Map<String, String> actionData = LauncherMapperHelper.getActionData(getFileSystem(), context.getActionDir(),
      context.getProtoActionConf());
  assertTrue(LauncherMapperHelper.hasIdSwap(actionData));
  MapReduceActionExecutor ae = new MapReduceActionExecutor();
  ae.check(context, context.getAction());
  assertTrue(launcherId.equals(context.getAction().getExternalId()));

  JobConf conf = ae.createBaseHadoopConf(context, XmlUtils.parseXml(actionXml));
  String user = conf.get("user.name");
  String group = conf.get("group.name");
  JobClient jobClient = Services.get().get(HadoopAccessorService.class).createJobClient(user, conf);
  final RunningJob mrJob = jobClient.getJob(JobID.forName(context.getAction().getExternalChildIDs()));
  waitFor(120 * 1000, new Predicate() {
    public boolean evaluate() throws Exception {
      return mrJob.isComplete();
    }
  });
  assertTrue(mrJob.isSuccessful());

  ae.check(context, context.getAction());
  assertEquals("SUCCEEDED", context.getAction().getExternalStatus());
  assertNull(context.getAction().getData());

  ae.end(context, context.getAction());
  assertEquals(WorkflowAction.Status.OK, context.getAction().getStatus());

  // hadoop.counters is always set for an MR action.
  assertNotNull(context.getVar("hadoop.counters"));
  String counters = context.getVar("hadoop.counters");
  assertTrue(counters.contains("Counter"));
  assertTrue(counters.contains("\"MAP_OUTPUT_RECORDS\":2"));

  // External child IDs used to be null; since 4.0 they are non-null for MR actions.
  assertNotNull(context.getExternalChildIDs());

  return mrJob.getID().toString();
}
From source file:org.apache.oozie.command.wf.TestWorkflowActionKillXCommand.java
License:Apache License
private String launchSleepJob(int sleep) throws Exception {
  JobConf jobConf = Services.get().get(HadoopAccessorService.class)
      .createJobConf(new URI(getNameNodeUri()).getAuthority());
  JobClient jobClient = createJobClient();
  SleepJob sleepjob = new SleepJob();
  sleepjob.setConf(jobConf);
  jobConf = sleepjob.setupJobConf(1, 1, sleep, 1, sleep, 1);
  final RunningJob runningJob = jobClient.submitJob(jobConf);
  return runningJob.getID().toString();
}
From source file:org.apache.sysml.runtime.matrix.DataGenMR.java
License:Apache License
/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions are unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if an Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
    String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
    byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
    throws Exception {
  JobConf job = new JobConf(DataGenMR.class);
  job.setJobName("DataGen-MR");

  // whether to use block representation or cell representation
  MRJobConfiguration.setMatrixValueClass(job, true);

  byte[] realIndexes = new byte[dataGenInstructions.length];
  for (byte b = 0; b < realIndexes.length; b++)
    realIndexes[b] = b;

  String[] inputs = new String[dataGenInstructions.length];
  InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
  long[] rlens = new long[dataGenInstructions.length];
  long[] clens = new long[dataGenInstructions.length];
  int[] brlens = new int[dataGenInstructions.length];
  int[] bclens = new int[dataGenInstructions.length];

  FileSystem fs = FileSystem.get(job);
  String dataGenInsStr = "";
  int numblocks = 0;
  int maxbrlen = -1, maxbclen = -1;
  double maxsparsity = -1;

  for (int i = 0; i < dataGenInstructions.length; i++) {
    dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

    MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
    MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
    DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

    rlens[i] = genInst.getRows();
    clens[i] = genInst.getCols();
    brlens[i] = genInst.getRowsInBlock();
    bclens[i] = genInst.getColsInBlock();

    maxbrlen = Math.max(maxbrlen, brlens[i]);
    maxbclen = Math.max(maxbclen, bclens[i]);

    if (mrtype == MRINSTRUCTION_TYPE.Rand) {
      RandInstruction randInst = (RandInstruction) mrins;
      inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
      maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

      PrintWriter pw = null;
      try {
        pw = new PrintWriter(fs.create(new Path(inputs[i])));

        // for object reuse and to prevent repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        // seed generation
        Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
        LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
            randInst.getSparsity());
        PrimitiveIterator.OfLong nnzIter = nnz.iterator();
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
          for (long c = 0; c < clens[i]; c += bclens[i]) {
            long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
            sb.append((r / brlens[i]) + 1);
            sb.append(',');
            sb.append((c / bclens[i]) + 1);
            sb.append(',');
            sb.append(curBlockRowSize);
            sb.append(',');
            sb.append(curBlockColSize);
            sb.append(',');
            sb.append(nnzIter.nextLong());
            sb.append(',');
            sb.append(bigrand.nextLong());
            pw.println(sb.toString());
            sb.setLength(0);
            numblocks++;
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(pw);
      }
      inputInfos[i] = InputInfo.TextCellInputInfo;
    } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
      SeqInstruction seqInst = (SeqInstruction) mrins;
      inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
      maxsparsity = 1.0; // always dense

      double from = seqInst.fromValue;
      double to = seqInst.toValue;
      double incr = seqInst.incrValue;

      // handle the default increment of 1 vs. -1 for the special case from > to
      incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

      // correctness checks on (from, to, incr)
      boolean neg = (from > to);
      if (incr == 0)
        throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
      if (neg != (incr < 0))
        throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

      // compute the number of rows in the sequence
      long numrows = UtilFunctions.getSeqLength(from, to, incr);
      if (rlens[i] > 0) {
        if (numrows != rlens[i])
          throw new DMLRuntimeException(
              "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                  + rlens[i] + " != " + numrows);
      } else {
        rlens[i] = numrows;
      }
      if (clens[i] > 0 && clens[i] != 1)
        throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns ("
            + clens[i] + ") must be equal to 1.");
      else
        clens[i] = 1;

      PrintWriter pw = null;
      try {
        pw = new PrintWriter(fs.create(new Path(inputs[i])));
        StringBuilder sb = new StringBuilder();
        double temp = from;
        double block_from, block_to;
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

          // block (bid_i,bid_j) generates a sequence from the interval
          // [block_from, block_to] (inclusive of both end points)
          long bid_i = ((r / brlens[i]) + 1);
          long bid_j = 1;
          block_from = temp;
          block_to = temp + (curBlockRowSize - 1) * incr;
          temp = block_to + incr; // next block starts from here

          sb.append(bid_i);
          sb.append(',');
          sb.append(bid_j);
          sb.append(',');
          sb.append(block_from);
          sb.append(',');
          sb.append(block_to);
          sb.append(',');
          sb.append(incr);
          pw.println(sb.toString());
          sb.setLength(0);
          numblocks++;
        }
      } finally {
        IOUtilFunctions.closeSilently(pw);
      }
      inputInfos[i] = InputInfo.TextCellInputInfo;
    } else {
      throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
    }
  }
  dataGenInsStr = dataGenInsStr.substring(1); // remove the first ","

  RunningJob runjob;
  MatrixCharacteristics[] stats;
  try {
    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    // set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
        ConvertTarget.BLOCK);

    // set up the dimensions of the input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    // set up the rand instructions
    MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

    // set up unary instructions that will be performed in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    // set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    // set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    // set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    // determine degree of parallelism (nmappers: 1 <= n <= capacity)
    // TODO use maxsparsity whenever we have a way of generating sparse rand data
    int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
    long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();

    // correct the max number of mappers on YARN clusters
    if (InfrastructureAnalyzer.isYarnEnabled())
      capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
    int nmapers = Math
        .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
    job.setNumMapTasks(nmapers);

    // set up what matrices are needed to pass from the mapper to the reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
        dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
        resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
        dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer,
        resultIndexes, mapoutputIndexes, false);
    stats = ret.stats;

    // set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // print the complete MRJob instruction
    if (LOG.isTraceEnabled())
      inst.printCompleteMRJobInstruction(stats);

    // update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
      if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
        resultDimsUnknown[i] = (byte) 1;
      } else {
        resultDimsUnknown[i] = (byte) 0;
      }
    }

    boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
        || instructionsInMapper.contains("groupedagg");

    // set up the multiple output files and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
        mayContainCtable);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(DataGenMapper.class);
    if (numReducers == 0) {
      job.setMapOutputKeyClass(Writable.class);
      job.setMapOutputValueClass(Writable.class);
    } else {
      job.setMapOutputKeyClass(MatrixIndexes.class);
      job.setMapOutputValueClass(TaggedMatrixBlock.class);
    }

    // set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
      job.setCombinerClass(GMRCombiner.class);

    // configure reducer
    job.setReducerClass(GMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
      inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    runjob = JobClient.runJob(job);

    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
      // number of non-zeros
      stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
  } finally {
    for (String input : inputs)
      MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
  }

  return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.GMR.java
License:Apache License
/**
 * Execute job.
 *
 * @param inst MR job instruction
 * @param inputs input matrices, indexed by 0, 1, 2, ... based on their position in this string
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param partitioned boolean array of partitioned status
 * @param pformats array of data partition formats
 * @param psizes does nothing
 * @param recordReaderInstruction record reader instruction
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param jvmReuse if true, reuse JVM
 * @param resultIndexes the indexes of the result matrices that need to be output
 * @param dimsUnknownFilePrefix file path prefix when dimensions are unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object
 * @throws Exception if an Exception occurs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
    long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats,
    int[] psizes, String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer,
    String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes,
    String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
  JobConf job = new JobConf(GMR.class);
  job.setJobName("G-MR");

  boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

  // whether to use block representation or cell representation
  MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

  // added for handling recordreader instruction
  String[] realinputs = inputs;
  InputInfo[] realinputInfos = inputInfos;
  long[] realrlens = rlens;
  long[] realclens = clens;
  int[] realbrlens = brlens;
  int[] realbclens = bclens;
  byte[] realIndexes = new byte[inputs.length];
  for (byte b = 0; b < realIndexes.length; b++)
    realIndexes[b] = b;

  if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
    assert (inputs.length <= 2);
    PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction
        .parseInstruction(recordReaderInstruction);
    PickFromCompactInputFormat.setKeyValueClasses(job,
        (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass,
        inputInfos[ins.input1].inputValueClass);
    job.setInputFormat(PickFromCompactInputFormat.class);
    PickFromCompactInputFormat.setZeroValues(job,
        (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata);

    if (ins.isValuePick) {
      double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2],
          rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
      PickFromCompactInputFormat.setPickRecordsInEachPartFile(job,
          (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);

      realinputs = new String[inputs.length - 1];
      realinputInfos = new InputInfo[inputs.length - 1];
      realrlens = new long[inputs.length - 1];
      realclens = new long[inputs.length - 1];
      realbrlens = new int[inputs.length - 1];
      realbclens = new int[inputs.length - 1];
      realIndexes = new byte[inputs.length - 1];
      byte realIndex = 0;
      for (byte i = 0; i < inputs.length; i++) {
        if (i == ins.input2)
          continue;
        realinputs[realIndex] = inputs[i];
        realinputInfos[realIndex] = inputInfos[i];
        if (i == ins.input1) {
          realrlens[realIndex] = rlens[ins.input2];
          realclens[realIndex] = clens[ins.input2];
          realbrlens[realIndex] = 1;
          realbclens[realIndex] = 1;
          realIndexes[realIndex] = ins.output;
        } else {
          realrlens[realIndex] = rlens[i];
          realclens[realIndex] = clens[i];
          realbrlens[realIndex] = brlens[i];
          realbclens[realIndex] = bclens[i];
          realIndexes[realIndex] = i;
        }
        realIndex++;
      }
    } else {
      //PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
      PickFromCompactInputFormat.setRangePickPartFiles(job,
          (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
      realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile(
          (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst);
      realclens[ins.input1] = clens[ins.input1];
      realbrlens[ins.input1] = 1;
      realbclens[ins.input1] = 1;
      realIndexes[ins.input1] = ins.output;
    }
  }

  boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer,
      realinputs, realrlens, realclens);

  // set up the input files and their format information
  boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper,
      aggInstructionsInReducer, otherInstructionsInReducer);
  MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
      distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
  MRJobConfiguration.setInputPartitioningInfo(job, pformats);

  // set up the dimensions of input matrices
  MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
  MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

  // set up the block size
  MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

  // set up unary instructions that will be performed in the mapper
  MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

  // set up the aggregate instructions that will happen in the combiner and reducer
  MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

  // set up the instructions that will happen in the reducer, after the aggregation instructions
  MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

  // set up the replication factor for the results
  job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

  // set up preferred custom serialization framework for binary block format
  if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
    MRJobConfiguration.addBinaryBlockSerializationFramework(job);

  // set up map/reduce memory configurations (if in AM context)
  DMLConfig config = ConfigurationManager.getDMLConfig();
  DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

  // set up custom map/reduce configurations
  MRJobConfiguration.setupCustomMRConfigurations(job, config);

  // set up jvm reuse (incl. reuse of loaded dist cache matrices)
  if (jvmReuse)
    job.setNumTasksToExecutePerJvm(-1);

  // set up what matrices are needed to pass from the mapper to the reducer
  HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
      instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);

  MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
      instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes,
      mapoutputIndexes, false);
  MatrixCharacteristics[] stats = ret.stats;

  // set up the number of reducers
  MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

  // print the complete instruction
  if (LOG.isTraceEnabled())
    inst.printCompleteMRJobInstruction(stats);

  // update resultDimsUnknown based on computed "stats"
  byte[] dimsUnknown = new byte[resultIndexes.length];
  for (int i = 0; i < resultIndexes.length; i++) {
    if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
      dimsUnknown[i] = (byte) 1;
    } else {
      dimsUnknown[i] = (byte) 0;
    }
  }
  //MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);

  // set up the multiple output files and their format information
  MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos,
      inBlockRepresentation, true);

  // configure mapper and the mapper output key value pairs
  job.setMapperClass(GMRMapper.class);
  if (numReducers == 0) {
    job.setMapOutputKeyClass(Writable.class);
    job.setMapOutputValueClass(Writable.class);
  } else {
    job.setMapOutputKeyClass(MatrixIndexes.class);
    if (inBlockRepresentation)
      job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
      job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
  }

  // set up combiner
  if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
    job.setCombinerClass(GMRCombiner.class);
  }

  // configure reducer
  job.setReducerClass(GMRReducer.class);
  //job.setReducerClass(PassThroughReducer.class);

  // By default, the job executes in "cluster" mode.
  // Determine if we can optimize and run it in "local" mode.
  MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
  for (int i = 0; i < inputs.length; i++) {
    inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
  }

  // set unique working dir
  MRJobConfiguration.setUniqueWorkingDir(job);

  RunningJob runjob = JobClient.runJob(job);

  Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
  for (int i = 0; i < resultIndexes.length; i++)
    stats[i].setNonZeros(group.getCounter(Integer.toString(i)));

  // cleanups
  String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
  stats = MapReduceTool.processDimsFiles(dir, stats);
  MapReduceTool.deleteFileIfExistOnHDFS(dir);

  if (resetDistCache)
    MRBaseForCommonInstructions.resetDistCache();

  return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.matrix.GroupedAggMR.java
License:Apache License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
    long[] clens, int[] brlens, int[] bclens, String grpAggInstructions,
    String simpleReduceInstructions /*only scalar or reorg instructions allowed*/, int numReducers,
    int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs,
    OutputInfo[] outputInfos) throws Exception {
  JobConf job = new JobConf(GroupedAggMR.class);
  job.setJobName("GroupedAgg-MR");

  // whether to use block representation or cell representation
  //MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);
  MRJobConfiguration.setMatrixValueClass(job, false);

  // added for handling recordreader instruction
  String[] realinputs = inputs;
  InputInfo[] realinputInfos = inputInfos;
  long[] realrlens = rlens;
  long[] realclens = clens;
  int[] realbrlens = brlens;
  int[] realbclens = bclens;
  byte[] realIndexes = new byte[inputs.length];
  for (byte b = 0; b < realIndexes.length; b++)
    realIndexes[b] = b;

  // set up the input files and their format information
  MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
      true, ConvertTarget.WEIGHTEDCELL);

  // set up the dimensions of input matrices
  MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
  MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

  // set up the block size
  MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

  // set up the grouped aggregate instructions that will happen in the combiner and reducer
  MRJobConfiguration.setGroupedAggInstructions(job, grpAggInstructions);

  // set up the instructions that will happen in the reducer, after the aggregation instructions
  MRJobConfiguration.setInstructionsInReducer(job, simpleReduceInstructions);

  // set up the number of reducers
  MRJobConfiguration.setNumReducers(job, numReducers, numReducers);

  // set up the replication factor for the results
  job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

  // set up custom map/reduce configurations
  DMLConfig config = ConfigurationManager.getDMLConfig();
  MRJobConfiguration.setupCustomMRConfigurations(job, config);

  // set up what matrices are needed to pass from the mapper to the reducer
  MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, null, grpAggInstructions,
      resultIndexes);

  MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
  for (int i = 0; i < resultIndexes.length; i++)
    stats[i] = new MatrixCharacteristics();

  // print the complete instruction
  if (LOG.isTraceEnabled())
    inst.printCompleteMRJobInstruction(stats);

  // update resultDimsUnknown based on computed "stats"
  byte[] resultDimsUnknown = new byte[resultIndexes.length];
  for (int i = 0; i < resultIndexes.length; i++)
    resultDimsUnknown[i] = (byte) 2;

  // set up the multiple output files and their format information
  MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, false);

  // configure mapper and the mapper output key value pairs
  job.setMapperClass(GroupedAggMRMapper.class);
  job.setCombinerClass(GroupedAggMRCombiner.class);
  job.setMapOutputKeyClass(TaggedMatrixIndexes.class);
  job.setMapOutputValueClass(WeightedCell.class);

  // configure reducer
  job.setReducerClass(GroupedAggMRReducer.class);

  // set unique working dir
  MRJobConfiguration.setUniqueWorkingDir(job);

  // execute job
  RunningJob runjob = JobClient.runJob(job);

  // get important output statistics
  Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
  for (int i = 0; i < resultIndexes.length; i++) {
    // number of non-zeros
    stats[i] = new MatrixCharacteristics();
    stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
  }

  String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
  stats = MapReduceTool.processDimsFiles(dir, stats);
  MapReduceTool.deleteFileIfExistOnHDFS(dir);

  return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
From source file:org.archive.hadoop.jobs.ArchiveFileExtractor.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    printUsage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("Archive File Extractor");

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // turn off speculative execution
  job.setBoolean("mapred.map.tasks.speculative.execution", false);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  // tolerate task exceptions
  job.setBoolean("soft", false);

  int arg = 0;
  int numMaps = 10;

  String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
      + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
      + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

  String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
      DateUtils.getLog17Date(System.currentTimeMillis()));

  while (arg < args.length - 1) {
    if (args[arg].equals("-soft")) {
      job.setBoolean("soft", true);
      arg++;
    } else if (args[arg].equals("-mappers")) {
      arg++;
      numMaps = Integer.parseInt(args[arg]);
      job.setNumMapTasks(numMaps);
      arg++;
    } else if (args[arg].equals("-timestamp14")) {
      arg++;
      String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
      job.set("timestamp14", timestamp14);
      arg++;
    } else if (args[arg].equals("-warc-header-local-file")) {
      arg++;
      File f = new File(args[arg]);
      FileInputStream fis = new FileInputStream(f);
      warcHeaderString = IOUtils.toString(fis, "UTF-8");
      arg++;
    } else if (args[arg].equals("-hmacname")) {
      arg++;
      String hmacName = args[arg];
      job.set("hmacName", hmacName);
      arg++;
    } else if (args[arg].equals("-hmacsignature")) {
      arg++;
      String hmacSignature = args[arg];
      job.set("hmacSignature", hmacSignature);
      arg++;
    } else if (args[arg].equals("-timeout")) {
      arg++;
      int taskTimeout = Integer.parseInt(args[arg]);
      job.setInt("mapred.task.timeout", taskTimeout);
      arg++;
    } else if (args[arg].equals("-failpct")) {
      arg++;
      int failPct = Integer.parseInt(args[arg]);
      job.setInt("mapred.max.map.failures.percent", failPct);
      arg++;
    } else {
      break;
    }
  }

  job.set("warcHeaderString", warcHeaderString);

  if (args.length - 2 != arg) {
    printUsage();
    return 1;
  }

  Path inputPath = new Path(args[arg]);
  arg++;

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  Path outputPath = new Path(outputDir);

  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(ArchiveFileExtractorMapper.class);
  job.setJarByClass(ArchiveFileExtractor.class);

  TextInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.hadoop.jobs.CDXGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("CDX Generator " + args[0]);

  // The inputs are a list of filenames, use the
  // FilenameInputFormat to pass them to the mappers.
  job.setInputFormat(FilenameInputFormat.class);

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  // keep job running despite some failures in generating CDXs
  job.setBoolean("strictMode", false);

  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(CDXGeneratorMapper.class);
  job.setJarByClass(CDXGenerator.class);

  int arg = 0;
  if (args[arg].equals("-strictMode")) {
    job.setBoolean("strictMode", true);
    arg++;
  }

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));

  boolean atLeastOneInput = false;
  for (int i = arg; i < args.length; i++) {
    FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
    for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
      Path inputPath = status.getPath();
      atLeastOneInput = true;
      LOG.info("Add input path: " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }
  }
  if (!atLeastOneInput) {
    LOG.info("No input files to CDXGenerator.");
    return 0;
  }

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("WARCMetadataRecord Generator " + args[0]);

  // The inputs are a list of filenames, use the
  // FilenameInputFormat to pass them to the mappers.
  job.setInputFormat(FilenameInputFormat.class);

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
  job.setJarByClass(WARCMetadataRecordGenerator.class);

  // extract outlinks by default
  job.set("outputType", "outlinks");
  int arg = 0;
  if (args[arg].equals("-hopinfo")) {
    job.set("outputType", "hopinfo");
    arg++;
  }

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));

  boolean atLeastOneInput = false;
  for (int i = arg; i < args.length; i++) {
    FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
    for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
      Path inputPath = status.getPath();
      atLeastOneInput = true;
      LOG.info("Add input path: " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }
  }
  if (!atLeastOneInput) {
    LOG.info("No input files to WARCMetadataRecordGenerator.");
    return 0;
  }

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("WAT Generator " + args[0]);

  // The inputs are a list of filenames, use the
  // FilenameInputFormat to pass them to the mappers.
  job.setInputFormat(FilenameInputFormat.class);

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  // keep job running despite some failures in generating WATs
  job.setBoolean("strictMode", false);

  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(WATGeneratorMapper.class);
  job.setJarByClass(WATGenerator.class);

  int arg = 0;
  if (args[arg].equals("-strictMode")) {
    job.setBoolean("strictMode", true);
    arg++;
  }

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));

  boolean atLeastOneInput = false;
  for (int i = arg; i < args.length; i++) {
    FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
    for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
      Path inputPath = status.getPath();
      atLeastOneInput = true;
      LOG.info("Add input path: " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }
  }
  if (!atLeastOneInput) {
    LOG.info("No input files to WATGenerator.");
    return 0;
  }

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.jbs.Parse.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return 1;
  }

  FileSystem fs = FileSystem.get(getConf());

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("jbs.Parse " + args[0]);

  // The inputs are a list of filenames, use the
  // FilenameInputFormat to pass them to the mappers.
  job.setInputFormat(FilenameInputFormat.class);

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // Use the Parse-specific output format.
  job.setOutputFormat(PerMapOutputFormat.class);

  // Use our ParseMapper, with output keys and values of type Text.
  job.setMapperClass(ParseMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Configure the input and output paths, from the command-line.
  Path outputDir = new Path(args[0]);
  FileOutputFormat.setOutputPath(job, outputDir);

  boolean atLeastOneInput = false;
  for (int i = 1; i < args.length; i++) {
    FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
    for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
      Path inputPath = status.getPath();
      Path outputPath = new Path(outputDir, inputPath.getName());
      if (fs.exists(outputPath)) {
        LOG.debug("Output path already exists: " + outputPath);
      } else {
        atLeastOneInput = true;
        LOG.info("Add input path: " + inputPath);
        FileInputFormat.addInputPath(job, inputPath);
      }
    }
  }
  if (!atLeastOneInput) {
    LOG.info("No input files to parse.");
    return 0;
  }

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}