Example usage for org.apache.hadoop.mapred JobConf getNumReduceTasks


Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.getNumReduceTasks().

Prototype

public int getNumReduceTasks() 

Document

Get the configured number of reduce tasks for this job. Defaults to 1 if not explicitly set.
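
As a quick orientation before the usage listing, here is a minimal stand-alone sketch (the class name and the reduce count of 4 are illustrative, not taken from any of the source files below) showing the value being set on and read back from a JobConf. getNumReduceTasks() returns 1 when no reduce count has been configured, and a count of 0 conventionally marks a map-only job, a pattern several of the examples below rely on.

import org.apache.hadoop.mapred.JobConf;

public class NumReduceTasksExample {
    public static void main(String[] args) {
        // Illustrative configuration; a real job would also set input/output
        // paths, mapper/reducer classes, and so on.
        JobConf job = new JobConf(NumReduceTasksExample.class);
        job.setNumReduceTasks(4);

        int numReduces = job.getNumReduceTasks(); // 4 here; 1 if never set

        // Convention seen in the examples below: 0 reducers means map-only.
        if (numReduces == 0)
            System.out.println("Map-only job: mappers write the final output.");
        else
            System.out.println("Configured with " + numReduces + " reduce tasks.");
    }
}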

Usage

From source file:org.apache.ignite.internal.processors.hadoop.HadoopUtils.java

License:Apache License

/**
 * Creates JobInfo from hadoop configuration.
 *
 * @param cfg Hadoop configuration.
 * @return Job info.
 * @throws IgniteCheckedException If failed.
 */
public static HadoopDefaultJobInfo createJobInfo(Configuration cfg) throws IgniteCheckedException {
    JobConf jobConf = new JobConf(cfg);

    boolean hasCombiner = jobConf.get("mapred.combiner.class") != null
            || jobConf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null;

    int numReduces = jobConf.getNumReduceTasks();

    jobConf.setBooleanIfUnset("mapred.mapper.new-api", jobConf.get(OLD_MAP_CLASS_ATTR) == null);

    if (jobConf.getUseNewMapper()) {
        String mode = "new map API";

        ensureNotSet(jobConf, "mapred.input.format.class", mode);
        ensureNotSet(jobConf, OLD_MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, "mapred.partitioner.class", mode);
        else
            ensureNotSet(jobConf, "mapred.output.format.class", mode);
    } else {
        String mode = "map compatibility";

        ensureNotSet(jobConf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(jobConf, MRJobConfig.MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
        else
            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
    }

    if (numReduces != 0) {
        jobConf.setBooleanIfUnset("mapred.reducer.new-api", jobConf.get(OLD_REDUCE_CLASS_ATTR) == null);

        if (jobConf.getUseNewReducer()) {
            String mode = "new reduce API";

            ensureNotSet(jobConf, "mapred.output.format.class", mode);
            ensureNotSet(jobConf, OLD_REDUCE_CLASS_ATTR, mode);
        } else {
            String mode = "reduce compatibility";

            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
            ensureNotSet(jobConf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
        }
    }

    Map<String, String> props = new HashMap<>();

    for (Map.Entry<String, String> entry : jobConf)
        props.put(entry.getKey(), entry.getValue());

    return new HadoopDefaultJobInfo(jobConf.getJobName(), jobConf.getUser(), hasCombiner, numReduces, props);
}
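
In this example the reduce count returned by getNumReduceTasks() drives the validation: with at least one reducer the partitioner setting is checked against the chosen API, while with zero reducers the output format is checked instead, since a map-only job writes its output directly from the map phase. The same count is then carried into the resulting HadoopDefaultJobInfo.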

From source file:org.apache.nutch.fetcher.FetcherOutputFormat.java

License:Apache License

public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
    Path out = FileOutputFormat.getOutputPath(job);
    if ((out == null) && (job.getNumReduceTasks() != 0)) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    if (fs == null) {
        fs = out.getFileSystem(job);
    }
    if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
        throw new IOException("Segment already fetched!");
}

From source file:org.apache.nutch.parse.ParseOutputFormat.java

License:Apache License

public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
    Path out = FileOutputFormat.getOutputPath(job);
    if ((out == null) && (job.getNumReduceTasks() != 0)) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    if (fs == null) {
        fs = out.getFileSystem(job);
    }
    if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
        throw new IOException("Segment already parsed!");
}

From source file:org.apache.nutch.segment.SegmentMerger.java

License:Apache License

public void configure(JobConf conf) {
    setConf(conf);
    if (sliceSize > 0) {
        sliceSize = sliceSize / conf.getNumReduceTasks();
    }
}

From source file:org.apache.sysml.runtime.matrix.mapred.CSVWriteReducer.java

License:Apache License

@Override
public void configure(JobConf job) {
    super.configure(job);
    byte maxIndex = 0;
    HashMap<Byte, CSVWriteInstruction> out2Ins = new HashMap<>();
    try {
        CSVWriteInstruction[] ins = MRJobConfiguration.getCSVWriteInstructions(job);
        for (CSVWriteInstruction in : ins) {
            out2Ins.put(in.output, in);
            if (in.output > maxIndex)
                maxIndex = in.output;
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    int numPartitions = job.getNumReduceTasks();
    int taskID = MapReduceTool.getUniqueTaskId(job);
    //LOG.info("## task id: " + taskID);
    //for efficiency only, the arrays may have missing values
    rowIndexes = new long[maxIndex + 1];
    colIndexes = new long[maxIndex + 1];
    maxRowIndexes = new long[maxIndex + 1];
    minRowIndexes = new long[maxIndex + 1];
    numColBlocks = new long[maxIndex + 1];
    lastBlockNCols = new int[maxIndex + 1];
    colsPerBlock = new int[maxIndex + 1];
    delims = new String[maxIndex + 1];
    sparses = new boolean[maxIndex + 1];
    tagToResultIndex = new int[maxIndex + 1];

    for (int i = 0; i < resultIndexes.length; i++) {
        byte ri = resultIndexes[i];
        tagToResultIndex[ri] = i;
        CSVWriteInstruction in = out2Ins.get(ri);
        MatrixCharacteristics dim = MRJobConfiguration.getMatrixCharacteristicsForInput(job, in.input);
        delims[ri] = in.delim;
        sparses[ri] = in.sparse;
        numColBlocks[ri] = (long) Math.ceil((double) dim.getCols() / (double) dim.getColsPerBlock());
        lastBlockNCols[ri] = (int) (dim.getCols() % dim.getColsPerBlock());
        colsPerBlock[ri] = dim.getColsPerBlock();
        long rstep = (long) Math.ceil((double) dim.getRows() / (double) numPartitions);
        minRowIndexes[ri] = rowIndexes[ri] = rstep * taskID;
        maxRowIndexes[ri] = Math.min(rstep * (taskID + 1), dim.getRows());
        colIndexes[ri] = 0;
    }

    zeroBlock.setData(new MatrixBlock());
}

From source file:org.apache.sysml.runtime.matrix.mapred.GMRMapper.java

License:Apache License

@Override
public void configure(JobConf job) {
    super.configure(job);

    mapperID = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
    dimsUnknownFilePrefix = job.get("dims.unknown.file.prefix");

    _filterEmptyInputBlocks = allowsFilterEmptyInputBlocks();

    //assign the temporary variables
    try {
        //   System.out.println(valueClass.getName());
        //   System.out.println(MatrixCell.class.getName());
        if (job.getMapOutputValueClass().equals(TaggedMatrixPackedCell.class))
            taggedValueBuffer = TaggedMatrixValue.createObject(MatrixPackedCell.class);
        else
            taggedValueBuffer = TaggedMatrixValue.createObject(valueClass);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    //decide whether it is a maponly job
    mapOnlyJob = (job.getNumReduceTasks() <= 0);
    if (!mapOnlyJob)
        return;

    //get the indexes of the final output matrices
    resultIndexes = MRJobConfiguration.getResultIndexes(job);
    resultDimsUnknown = MRJobConfiguration.getResultDimsUnknown(job);

    //initialize SystemML Counters (defined in MRJobConfiguration)
    resultsNonZeros = new long[resultIndexes.length];
    resultsMaxRowDims = new long[resultIndexes.length];
    resultsMaxColDims = new long[resultIndexes.length];

    tagMapping = new HashMap<>();
    for (int i = 0; i < resultIndexes.length; i++) {
        byte output = resultIndexes[i];
        ArrayList<Integer> vec = tagMapping.get(output);
        if (vec == null) {
            vec = new ArrayList<>();
            tagMapping.put(output, vec);
        }
        vec.add(i);
    }
    //for map only job, get the map output converters 
    collectFinalMultipleOutputs = MRJobConfiguration.getMultipleConvertedOutputs(job);
}

From source file:org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @return index value
 * @throws IOException if something goes wrong
 * @throws InstantiationException if InstantiationException occurs
 * @throws IllegalAccessException if IllegalAccessException occurs
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();

    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
            WritableComparable.class);
    //get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

    //indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();

    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input

    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
                .getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
                            + pair.getKey().getClass() + ":" + pair.getValue().getClass());

                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }

    if (totalcount == 0) //empty input files
        sampler.addValue(new DoubleWritable(0));

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    //note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }

    return index0;
}

From source file:org.apache.sysml.runtime.matrix.SortMR.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen,
        int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication,
        String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception {
    boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes;
    String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output;

    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortMR");

    //setup partition file
    String pfname = MRJobConfiguration.setUpSortPartitionFilename(job);
    Path partitionFile = new Path(pfname);
    URI partitionUri = new URI(partitionFile.toString());

    //setup input/output paths
    Path inputDir = new Path(input);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    FileInputFormat.setInputPaths(job, inputDir);
    Path outpath = new Path(tmpOutput);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);

    //set number of reducers (1 if local mode)
    if (!InfrastructureAnalyzer.isLocalMode(job)) {
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
        //ensure partition size <= 10M records to avoid scalability bottlenecks
        //on cp-side qpick instructions for quantile/iqm/median (~128MB)
        if (!(getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes))
            job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000));
    } else //in case of local mode
        job.setNumReduceTasks(1);

    //setup input/output format
    job.setInputFormat(SamplingSortMRInputFormat.class);
    SamplingSortMRInputFormat.setTargetKeyValueClasses(job,
            (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass);

    //setup instructions and meta information
    if (combineInst != null && !combineInst.trim().isEmpty())
        job.set(COMBINE_INSTRUCTION, combineInst);
    job.set(SORT_INSTRUCTION, sortInst);
    job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
    boolean desc = getSortInstructionDescending(sortInst);
    job.setBoolean(SORT_DECREASING, desc);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
    int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);

    //setup mapper/reducer/partitioner/output classes
    if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) {
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        job.setMapperClass(IndexSortMapper.class);
        job.setReducerClass(IndexSortReducer.class);
        job.setMapOutputKeyClass(!desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(MatrixIndexes.class);
        job.setOutputValueClass(MatrixBlock.class);
    } else { //default case: SORT w/wo weights
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(CompactOutputFormat.class);
        job.setMapperClass(ValueSortMapper.class);
        job.setReducerClass(ValueSortReducer.class);
        job.setOutputKeyClass(outputInfo.outputKeyClass); //double
        job.setOutputValueClass(outputInfo.outputValueClass); //int
    }
    job.setPartitionerClass(TotalOrderPartitioner.class);

    //setup distributed cache
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);

    //setup replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    MatrixCharacteristics[] s = new MatrixCharacteristics[1];
    s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(s);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //run mr job
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
    numReducers = job.getNumReduceTasks();

    //process final meta data
    long[] counts = new long[numReducers];
    long total = 0;
    for (int i = 0; i < numReducers; i++) {
        counts[i] = group.getCounter(Integer.toString(i));
        total += counts[i];
    }

    //add missing 0s back to the results
    long missing0s = 0;
    if (total < rlen * clen) {
        if (partitionWith0 < 0)
            throw new RuntimeException("no partition contains 0, which is wrong!");
        missing0s = rlen * clen - total;
        counts[partitionWith0] += missing0s;
    } else
        partitionWith0 = -1;

    if (sortIndexes) {
        //run builtin job for shifting partially sorted blocks according to global offsets
        //we do this in this custom form since it would not fit into the current structure
        //of systemml to output two intermediates (partially sorted data, offsets) out of a 
        //single SortKeys lop
        boolean success = runjob.isSuccessful();
        if (success) {
            success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication,
                    output);
        }
        MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput);
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success);
    } else {
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful());
    }
}

From source file:org.apache.trevni.avro.AvroTrevniOutputFormat.java

License:Apache License

@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(FileSystem ignore, final JobConf job,
        final String name, Progressable prog) throws IOException {

    boolean isMapOnly = job.getNumReduceTasks() == 0;
    final Schema schema = isMapOnly ? AvroJob.getMapOutputSchema(job) : AvroJob.getOutputSchema(job);

    final ColumnFileMetaData meta = filterMetadata(job);

    final Path dir = FileOutputFormat.getTaskOutputPath(job, name);
    final FileSystem fs = dir.getFileSystem(job);
    if (!fs.mkdirs(dir))
        throw new IOException("Failed to create directory: " + dir);
    final long blockSize = fs.getDefaultBlockSize();

    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        private int part = 0;

        private AvroColumnWriter<T> writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());

        private void flush() throws IOException {
            OutputStream out = fs.create(new Path(dir, "part-" + (part++) + EXT));
            try {
                writer.writeTo(out);
            } finally {
                out.close();
            }
            writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());
        }

        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            writer.write(wrapper.datum());
            if (writer.sizeEstimate() >= blockSize) // block full
                flush();
        }

        public void close(Reporter reporter) throws IOException {
            flush();
        }
    };
}
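
Here a reduce count of zero identifies a map-only job, so the record writer is built against the map output schema rather than the job's final output schema.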

From source file:org.cloudata.core.parallel.hadoop.KeyRangePartitioner.java

License:Apache License

@Override
public void configure(JobConf jobConf) {
    String tableName = jobConf.get(AbstractTabletInputFormat.OUTPUT_TABLE);
    int numReduce = jobConf.getNumReduceTasks();
    try {
        CTable ctable = CTable.openTable(new CloudataConf(jobConf), tableName);
        TabletInfo[] tabletInfos = ctable.listTabletInfos();
        if (tabletInfos == null) {
            confException = new Exception("No Tablets in table [" + tableName + "]");
            LOG.error("No Tablets in table [" + tableName + "]");
            return;
        }
        if (numReduce > tabletInfos.length) {
            for (int i = 0; i < tabletInfos.length; i++) {
                //LOG.info("Add endRowKey: " + new String(tabletInfos[i].getEndRowKey().getBytes(), "EUC-KR"));
                tabletInfoSet.add(new RowKeyItem(tabletInfos[i].getEndRowKey(), i));
            }
        } else {
            int modValue = tabletInfos.length / numReduce;
            int partition = 0;
            for (int i = 0; i < tabletInfos.length; i++) {
                if (i % modValue == 0) {
                    //LOG.info("Add endRowKey: " + new String(tabletInfos[i].getEndRowKey().getBytes(), "EUC-KR"));
                    tabletInfoSet.add(new RowKeyItem(tabletInfos[i].getEndRowKey(), partition));
                    partition++;
                }
            }
        }
        if (tabletInfoSet.isEmpty()) {
            confException = new Exception("No key range in table [" + tableName + "]");
            LOG.error("No key range in table [" + tableName + "]");
        }
    } catch (Exception e) {
        LOG.error("KeyRangePartitioner config error:" + tableName + "," + e.getMessage());
        e.printStackTrace(System.out);
        confException = e;
    }
}