List of usage examples for org.apache.hadoop.mapred JobConf getNumReduceTasks
public int getNumReduceTasks()
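JobConf.getNumReduceTasks() returns the number of reduce tasks configured for the job (1 by default; 0 indicates a map-only job). The excerpts below show how real projects read this value inside configure(), checkOutputSpecs(), and getRecordWriter() implementations. As a minimal, self-contained sketch (the class and job name are hypothetical, not taken from any of the projects below), the getter is typically used like this:

import org.apache.hadoop.mapred.JobConf;

// Minimal sketch (hypothetical class/job name): read the configured reduce count
// to distinguish a map-only job from a full map/reduce job.
public class NumReduceTasksExample {
    public static void main(String[] args) {
        JobConf job = new JobConf(NumReduceTasksExample.class);
        job.setJobName("num-reduce-tasks-example");
        job.setNumReduceTasks(4); // explicitly configure 4 reducers

        int numReduces = job.getNumReduceTasks(); // returns 4 here; defaults to 1 if unset
        boolean mapOnly = (numReduces == 0);      // 0 means a map-only job

        if (!mapOnly) {
            // e.g. size per-reducer work, pick a partition count, or validate output settings
            System.out.println("Job will run with " + numReduces + " reduce tasks");
        } else {
            System.out.println("Map-only job: output is written directly by the mappers");
        }
    }
}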
From source file:org.apache.ignite.internal.processors.hadoop.HadoopUtils.java
License:Apache License
/**
 * Creates JobInfo from hadoop configuration.
 *
 * @param cfg Hadoop configuration.
 * @return Job info.
 * @throws IgniteCheckedException If failed.
 */
public static HadoopDefaultJobInfo createJobInfo(Configuration cfg) throws IgniteCheckedException {
    JobConf jobConf = new JobConf(cfg);

    boolean hasCombiner = jobConf.get("mapred.combiner.class") != null
            || jobConf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null;

    int numReduces = jobConf.getNumReduceTasks();

    jobConf.setBooleanIfUnset("mapred.mapper.new-api", jobConf.get(OLD_MAP_CLASS_ATTR) == null);

    if (jobConf.getUseNewMapper()) {
        String mode = "new map API";

        ensureNotSet(jobConf, "mapred.input.format.class", mode);
        ensureNotSet(jobConf, OLD_MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, "mapred.partitioner.class", mode);
        else
            ensureNotSet(jobConf, "mapred.output.format.class", mode);
    } else {
        String mode = "map compatibility";

        ensureNotSet(jobConf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(jobConf, MRJobConfig.MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
        else
            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
    }

    if (numReduces != 0) {
        jobConf.setBooleanIfUnset("mapred.reducer.new-api", jobConf.get(OLD_REDUCE_CLASS_ATTR) == null);

        if (jobConf.getUseNewReducer()) {
            String mode = "new reduce API";

            ensureNotSet(jobConf, "mapred.output.format.class", mode);
            ensureNotSet(jobConf, OLD_REDUCE_CLASS_ATTR, mode);
        } else {
            String mode = "reduce compatibility";

            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
            ensureNotSet(jobConf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
        }
    }

    Map<String, String> props = new HashMap<>();

    for (Map.Entry<String, String> entry : jobConf)
        props.put(entry.getKey(), entry.getValue());

    return new HadoopDefaultJobInfo(jobConf.getJobName(), jobConf.getUser(), hasCombiner, numReduces, props);
}
From source file:org.apache.nutch.fetcher.FetcherOutputFormat.java
License:Apache License
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
    Path out = FileOutputFormat.getOutputPath(job);
    if ((out == null) && (job.getNumReduceTasks() != 0)) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    if (fs == null) {
        fs = out.getFileSystem(job);
    }
    if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
        throw new IOException("Segment already fetched!");
}
From source file:org.apache.nutch.parse.ParseOutputFormat.java
License:Apache License
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
    Path out = FileOutputFormat.getOutputPath(job);
    if ((out == null) && (job.getNumReduceTasks() != 0)) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    if (fs == null) {
        fs = out.getFileSystem(job);
    }
    if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
        throw new IOException("Segment already parsed!");
}
From source file:org.apache.nutch.segment.SegmentMerger.java
License:Apache License
public void configure(JobConf conf) {
    setConf(conf);
    if (sliceSize > 0) {
        sliceSize = sliceSize / conf.getNumReduceTasks();
    }
}
From source file:org.apache.sysml.runtime.matrix.mapred.CSVWriteReducer.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    byte maxIndex = 0;
    HashMap<Byte, CSVWriteInstruction> out2Ins = new HashMap<>();
    try {
        CSVWriteInstruction[] ins = MRJobConfiguration.getCSVWriteInstructions(job);
        for (CSVWriteInstruction in : ins) {
            out2Ins.put(in.output, in);
            if (in.output > maxIndex)
                maxIndex = in.output;
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    int numParitions = job.getNumReduceTasks();
    int taskID = MapReduceTool.getUniqueTaskId(job);
    //LOG.info("## taks id: "+taskID);

    //for efficiency only, the arrays may have missing values
    rowIndexes = new long[maxIndex + 1];
    colIndexes = new long[maxIndex + 1];
    maxRowIndexes = new long[maxIndex + 1];
    minRowIndexes = new long[maxIndex + 1];
    numColBlocks = new long[maxIndex + 1];
    lastBlockNCols = new int[maxIndex + 1];
    colsPerBlock = new int[maxIndex + 1];
    delims = new String[maxIndex + 1];
    sparses = new boolean[maxIndex + 1];
    tagToResultIndex = new int[maxIndex + 1];

    for (int i = 0; i < resultIndexes.length; i++) {
        byte ri = resultIndexes[i];
        tagToResultIndex[ri] = i;
        CSVWriteInstruction in = out2Ins.get(ri);
        MatrixCharacteristics dim = MRJobConfiguration.getMatrixCharacteristicsForInput(job, in.input);
        delims[ri] = in.delim;
        sparses[ri] = in.sparse;
        numColBlocks[ri] = (long) Math.ceil((double) dim.getCols() / (double) dim.getColsPerBlock());
        lastBlockNCols[ri] = (int) (dim.getCols() % dim.getColsPerBlock());
        colsPerBlock[ri] = dim.getColsPerBlock();
        long rstep = (long) Math.ceil((double) dim.getRows() / (double) numParitions);
        minRowIndexes[ri] = rowIndexes[ri] = rstep * taskID;
        maxRowIndexes[ri] = Math.min(rstep * (taskID + 1), dim.getRows());
        colIndexes[ri] = 0;
    }

    zeroBlock.setData(new MatrixBlock());
}
From source file:org.apache.sysml.runtime.matrix.mapred.GMRMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    mapperID = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
    dimsUnknownFilePrefix = job.get("dims.unknown.file.prefix");

    _filterEmptyInputBlocks = allowsFilterEmptyInputBlocks();

    //assign the temporary variables
    try {
        // System.out.println(valueClass.getName());
        // System.out.println(MatrixCell.class.getName());
        if (job.getMapOutputValueClass().equals(TaggedMatrixPackedCell.class))
            taggedValueBuffer = TaggedMatrixValue.createObject(MatrixPackedCell.class);
        else
            taggedValueBuffer = TaggedMatrixValue.createObject(valueClass);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    //decide whether it is a map-only job
    mapOnlyJob = (job.getNumReduceTasks() <= 0);
    if (!mapOnlyJob)
        return;

    //get the indexes of the final output matrices
    resultIndexes = MRJobConfiguration.getResultIndexes(job);
    resultDimsUnknown = MRJobConfiguration.getResultDimsUnknown(job);

    //initialize SystemML Counters (defined in MRJobConfiguration)
    resultsNonZeros = new long[resultIndexes.length];
    resultsMaxRowDims = new long[resultIndexes.length];
    resultsMaxColDims = new long[resultIndexes.length];

    tagMapping = new HashMap<>();
    for (int i = 0; i < resultIndexes.length; i++) {
        byte output = resultIndexes[i];
        ArrayList<Integer> vec = tagMapping.get(output);
        if (vec == null) {
            vec = new ArrayList<>();
            tagMapping.put(output, vec);
        }
        vec.add(i);
    }

    //for map-only job, get the map output converters
    collectFinalMultipleOutputs = MRJobConfiguration.getMultipleConvertedOutputs(job);
}
From source file:org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @return index value
 * @throws IOException if something goes wrong
 * @throws InstantiationException if InstantiationException occurs
 * @throws IllegalAccessException if IllegalAccessException occurs
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();

    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
            WritableComparable.class);

    //get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

    //indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);

    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;

    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
                .getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
                            + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }

    if (totalcount == 0) //empty input files
        sampler.addValue(new DoubleWritable(0));

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    //note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }

    return index0;
}
From source file:org.apache.sysml.runtime.matrix.SortMR.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception { boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes; String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output; JobConf job = new JobConf(SortMR.class); job.setJobName("SortMR"); //setup partition file String pfname = MRJobConfiguration.setUpSortPartitionFilename(job); Path partitionFile = new Path(pfname); URI partitionUri = new URI(partitionFile.toString()); //setup input/output paths Path inputDir = new Path(input); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); FileInputFormat.setInputPaths(job, inputDir); Path outpath = new Path(tmpOutput); FileOutputFormat.setOutputPath(job, outpath); MapReduceTool.deleteFileIfExistOnHDFS(outpath, job); //set number of reducers (1 if local mode) if (!InfrastructureAnalyzer.isLocalMode(job)) { MRJobConfiguration.setNumReducers(job, numReducers, numReducers); //ensure partition size <= 10M records to avoid scalability bottlenecks //on cp-side qpick instructions for quantile/iqm/median (~128MB) if (!(getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes)) job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000)); } else //in case of local mode job.setNumReduceTasks(1);/* w w w. j a v a2s . com*/ //setup input/output format job.setInputFormat(SamplingSortMRInputFormat.class); SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass); //setup instructions and meta information if (combineInst != null && !combineInst.trim().isEmpty()) job.set(COMBINE_INSTRUCTION, combineInst); job.set(SORT_INSTRUCTION, sortInst); job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight); boolean desc = getSortInstructionDescending(sortInst); job.setBoolean(SORT_DECREASING, desc); MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen); MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile); //setup mapper/reducer/partitioner/output classes if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) { MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass); job.setMapperClass(IndexSortMapper.class); job.setReducerClass(IndexSortReducer.class); job.setMapOutputKeyClass(!desc ? 
IndexSortComparable.class : IndexSortComparableDesc.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } else { //default case: SORT w/wo weights MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(CompactOutputFormat.class); job.setMapperClass(ValueSortMapper.class); job.setReducerClass(ValueSortReducer.class); job.setOutputKeyClass(outputInfo.outputKeyClass); //double job.setOutputValueClass(outputInfo.outputValueClass); //int } job.setPartitionerClass(TotalOrderPartitioner.class); //setup distributed cache DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); //setup replication factor job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); //set up custom map/reduce configurations DMLConfig config = ConfigurationManager.getDMLConfig(); MRJobConfiguration.setupCustomMRConfigurations(job, config); MatrixCharacteristics[] s = new MatrixCharacteristics[1]; s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(s); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); //run mr job RunningJob runjob = JobClient.runJob(job); Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX); numReducers = job.getNumReduceTasks(); //process final meta data long[] counts = new long[numReducers]; long total = 0; for (int i = 0; i < numReducers; i++) { counts[i] = group.getCounter(Integer.toString(i)); total += counts[i]; } //add missing 0s back to the results long missing0s = 0; if (total < rlen * clen) { if (partitionWith0 < 0) throw new RuntimeException("no partition contains 0, which is wrong!"); missing0s = rlen * clen - total; counts[partitionWith0] += missing0s; } else partitionWith0 = -1; if (sortIndexes) { //run builtin job for shifting partially sorted blocks according to global offsets //we do this in this custom form since it would not fit into the current structure //of systemml to output two intermediates (partially sorted data, offsets) out of a //single SortKeys lop boolean success = runjob.isSuccessful(); if (success) { success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output); } MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput); MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success); } else { MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful()); } }
From source file:org.apache.trevni.avro.AvroTrevniOutputFormat.java
License:Apache License
@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(FileSystem ignore, final JobConf job,
        final String name, Progressable prog) throws IOException {
    boolean isMapOnly = job.getNumReduceTasks() == 0;
    final Schema schema = isMapOnly ? AvroJob.getMapOutputSchema(job) : AvroJob.getOutputSchema(job);

    final ColumnFileMetaData meta = filterMetadata(job);

    final Path dir = FileOutputFormat.getTaskOutputPath(job, name);
    final FileSystem fs = dir.getFileSystem(job);
    if (!fs.mkdirs(dir))
        throw new IOException("Failed to create directory: " + dir);
    final long blockSize = fs.getDefaultBlockSize();

    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        private int part = 0;

        private AvroColumnWriter<T> writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());

        private void flush() throws IOException {
            OutputStream out = fs.create(new Path(dir, "part-" + (part++) + EXT));
            try {
                writer.writeTo(out);
            } finally {
                out.close();
            }
            writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());
        }

        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            writer.write(wrapper.datum());
            if (writer.sizeEstimate() >= blockSize) // block full
                flush();
        }

        public void close(Reporter reporter) throws IOException {
            flush();
        }
    };
}
From source file:org.cloudata.core.parallel.hadoop.KeyRangePartitioner.java
License:Apache License
@Override
public void configure(JobConf jobConf) {
    String tableName = jobConf.get(AbstractTabletInputFormat.OUTPUT_TABLE);
    int numReduce = jobConf.getNumReduceTasks();

    try {
        CTable ctable = CTable.openTable(new CloudataConf(jobConf), tableName);
        TabletInfo[] tabletInfos = ctable.listTabletInfos();
        if (tabletInfos == null) {
            confException = new Exception("No Tablets in table [" + tableName + "]");
            LOG.error("No Tablets in table [" + tableName + "]");
            return;
        }
        if (numReduce > tabletInfos.length) {
            for (int i = 0; i < tabletInfos.length; i++) {
                //LOG.info("Add endRowKey: " + new String(tabletInfos[i].getEndRowKey().getBytes(), "EUC-KR"));
                tabletInfoSet.add(new RowKeyItem(tabletInfos[i].getEndRowKey(), i));
            }
        } else {
            int modValue = tabletInfos.length / numReduce;
            int partition = 0;
            for (int i = 0; i < tabletInfos.length; i++) {
                if (i % modValue == 0) {
                    //LOG.info("Add endRowKey: " + new String(tabletInfos[i].getEndRowKey().getBytes(), "EUC-KR"));
                    tabletInfoSet.add(new RowKeyItem(tabletInfos[i].getEndRowKey(), partition));
                    partition++;
                }
            }
        }
        if (tabletInfoSet.isEmpty()) {
            confException = new Exception("No Key raneg in table [" + tableName + "]");
            LOG.error("No Key raneg in table [" + tableName + "]");
        }
    } catch (Exception e) {
        LOG.error("KeyRangePartitioner config error:" + tableName + "," + e.getMessage());
        e.printStackTrace(System.out);
        confException = e;
    }
}