Example usage for org.apache.hadoop.mapred JobConf setNumReduceTasks

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setNumReduceTasks.

Prototype

public void setNumReduceTasks(int n)

Source Link

Document

Set the requisite number of reduce tasks for this job.

Usage

From source file:com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration.java

License:Open Source License

public static void setNumReducers(JobConf job, long numReducerGroups, int numFromCompiler) throws IOException {
    JobClient client = new JobClient(job);
    int n = client.getClusterStatus().getMaxReduceTasks();
    //correction max number of reducers on yarn clusters
    if (InfrastructureAnalyzer.isYarnEnabled())
        n = (int) Math.max(n, YarnClusterAnalyzer.getNumCores() / 2);
    n = Math.min(n, ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS));
    n = Math.min(n, numFromCompiler);
    if (numReducerGroups > 0)
        n = (int) Math.min(n, numReducerGroups);
    job.setNumReduceTasks(n);
}

From source file:com.ibm.bi.dml.runtime.matrix.MMCJMR.java

License:Open Source License

private static MatrixCharacteristics[] commonSetup(JobConf job, boolean inBlockRepresentation, String[] inputs,
        InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens,
        String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstrction, int numReducers,
        int replication, byte resultDimsUnknown, String output, OutputInfo outputinfo) throws Exception {
    job.setJobName("MMCJ-MR");

    if (numReducers <= 0)
        throw new Exception("MMCJ-MR has to have at least one reduce task!");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;/*from   w w  w.  j a  v a 2  s . co m*/

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true,
            inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

    //set up the aggregate binary operation for the mmcj job
    MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrction);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    byte[] resultIndexes = new byte[] { MRInstructionParser.parseSingleInstruction(aggBinInstrction).output };
    byte[] resultDimsUnknown_Array = new byte[] { resultDimsUnknown };
    // byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown_Array,
            new String[] { output }, new OutputInfo[] { outputinfo }, inBlockRepresentation);

    // configure mapper
    job.setMapperClass(MMCJMRMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(MatrixBlock.class);
    else
        job.setMapOutputValueClass(MatrixCell.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexPartitioner.class);

    //configure combiner
    //TODO: cannot set up combiner, because it will destroy the stable numerical algorithms 
    // for sum or for central moments 

    //if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
    //   job.setCombinerClass(MMCJMRCombiner.class);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, null, resultIndexes,
            mapoutputIndexes, true);

    //set up the number of reducers
    if (AUTOMATIC_CONFIG_NUM_REDUCERS) {
        int numRed = determineNumReducers(rlens, clens, numReducers, ret.numReducerGroups);
        job.setNumReduceTasks(numRed);
    } else
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    //configure reducer
    // note: the alternative MMCJMRReducer is not maintained
    job.setReducerClass(MMCJMRReducerWithAggregator.class);

    return ret.stats;
}

From source file:com.ibm.bi.dml.runtime.matrix.ReblockMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, long[] nnz, String instructionsInMapper,
        String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication,
        boolean jvmReuse, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(ReblockMR.class);
    job.setJobName("Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;/*from  w w  w .j  a v  a2 s  .com*/

    //set up the input files and their format information
    //(internally used input converters: text2bc for text, identity for binary inputs)
    MRJobConfiguration.setUpMultipleInputsReblock(job, realIndexes, inputs, inputInfos, brlens, bclens);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens, nnz);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instrucions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //disable automatic tasks timeouts and speculative task exec
    job.setInt("mapred.task.timeout", 0);
    job.setMapSpeculativeExecution(false);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //enable jvm reuse (based on SystemML configuration)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes,
            mapoutputIndexes, false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens, nnz,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    //setup in-memory reduce buffers budget (reblock reducer dont need much memory)
    //job.set("mapred.job.reduce.input.buffer.percent", "0.70");

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ReblockMapper.class);
    job.setMapOutputKeyClass(MatrixIndexes.class); //represent key offsets for block
    job.setMapOutputValueClass(TaggedAdaptivePartialBlock.class); //binary cell/block

    //configure reducer
    job.setReducerClass(ReblockReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.

    // at this point, both reblock_binary and reblock_text are similar
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        //   System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.matrix.SortMR.java

License:Open Source License

@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen,
        int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication,
        String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception {
    boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes;
    String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output;

    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortMR");

    //setup partition file
    String pfname = MRJobConfiguration.setUpSortPartitionFilename(job);
    Path partitionFile = new Path(pfname);
    URI partitionUri = new URI(partitionFile.toString());

    //setup input/output paths
    Path inputDir = new Path(input);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    SamplingSortMRInputFormat.setInputPaths(job, inputDir);
    Path outpath = new Path(tmpOutput);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);

    //set number of reducers (1 if local mode)
    if (InfrastructureAnalyzer.isLocalMode(job))
        job.setNumReduceTasks(1);
    else//from w  w  w  .ja v a 2s . c  o m
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);

    //setup input/output format
    job.setInputFormat(SamplingSortMRInputFormat.class);
    SamplingSortMRInputFormat.setTargetKeyValueClasses(job,
            (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass);

    //setup instructions and meta information
    if (combineInst != null && !combineInst.trim().isEmpty())
        job.set(COMBINE_INSTRUCTION, combineInst);
    job.set(SORT_INSTRUCTION, sortInst);
    job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
    boolean desc = getSortInstructionDescending(sortInst);
    job.setBoolean(SORT_DECREASING, desc);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
    int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);

    //setup mapper/reducer/partitioner/output classes
    if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) {
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        job.setMapperClass(IndexSortMapper.class);
        job.setReducerClass(IndexSortReducer.class);
        job.setMapOutputKeyClass(!desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(MatrixIndexes.class);
        job.setOutputValueClass(MatrixBlock.class);
    } else { //default case: SORT w/wo weights
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(CompactOutputFormat.class);
        job.setMapperClass(ValueSortMapper.class);
        job.setReducerClass(ValueSortReducer.class);
        job.setOutputKeyClass(outputInfo.outputKeyClass); //double
        job.setOutputValueClass(outputInfo.outputValueClass); //int
    }
    job.setPartitionerClass(TotalOrderPartitioner.class);

    //setup distributed cache
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);

    //setup replication factor
    job.setInt("dfs.replication", replication);

    MatrixCharacteristics[] s = new MatrixCharacteristics[1];
    s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(s);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //run mr job
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
    numReducers = job.getNumReduceTasks();

    //process final meta data
    long[] counts = new long[numReducers];
    long total = 0;
    for (int i = 0; i < numReducers; i++) {
        counts[i] = group.getCounter(Integer.toString(i));
        total += counts[i];
    }

    //add missing 0s back to the results
    long missing0s = 0;
    if (total < rlen * clen) {
        if (partitionWith0 < 0)
            throw new RuntimeException("no partition contains 0, which is wrong!");
        missing0s = rlen * clen - total;
        counts[partitionWith0] += missing0s;
    } else
        partitionWith0 = -1;

    if (sortIndexes) {
        //run builtin job for shifting partially sorted blocks according to global offsets
        //we do this in this custom form since it would not fit into the current structure
        //of systemml to output two intermediates (partially sorted data, offsets) out of a 
        //single SortKeys lop
        boolean success = runjob.isSuccessful();
        if (success) {
            success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication,
                    output);
        }
        MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput);
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success);
    } else {
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful());
    }
}

From source file:com.ibm.bi.dml.runtime.matrix.SortMR.java

License:Open Source License

/**
 * /*from  w  w w . j av  a 2  s . c  om*/
 * @param input
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param counts
 * @param numReducers
 * @param replication
 * @param output
 * @throws Exception
 */
private static boolean runStitchupJob(String input, long rlen, long clen, int brlen, int bclen, long[] counts,
        int numReducers, int replication, String output) throws Exception {
    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortIndexesMR");

    //setup input/output paths
    Path inpath = new Path(input);
    Path outpath = new Path(output);
    FileInputFormat.setInputPaths(job, inpath);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);

    //set number of reducers (1 if local mode)
    if (InfrastructureAnalyzer.isLocalMode(job))
        job.setNumReduceTasks(1);
    else
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);

    //setup input/output format
    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
    job.setInputFormat(iinfo.inputFormatClass);
    job.setOutputFormat(oinfo.outputFormatClass);
    CompactInputFormat.setKeyValueClasses(job, MatrixIndexes.class, MatrixBlock.class);

    //setup mapper/reducer/output classes
    MRJobConfiguration.setInputInfo(job, (byte) 0, InputInfo.BinaryBlockInputInfo, brlen, bclen,
            ConvertTarget.BLOCK);
    job.setMapperClass(IndexSortStitchupMapper.class);
    job.setReducerClass(IndexSortStitchupReducer.class);
    job.setOutputKeyClass(oinfo.outputKeyClass);
    job.setOutputValueClass(oinfo.outputValueClass);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setMatricesDimensions(job, new byte[] { 0 }, new long[] { rlen }, new long[] { clen });

    //compute shifted prefix sum of offsets and put into configuration
    long[] cumsumCounts = new long[counts.length];
    long sum = 0;
    for (int i = 0; i < counts.length; i++) {
        cumsumCounts[i] = sum;
        sum += counts[i];
    }
    job.set(SORT_INDEXES_OFFSETS, Arrays.toString(cumsumCounts));

    //setup replication factor
    job.setInt("dfs.replication", replication);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //run mr job
    RunningJob runJob = JobClient.runJob(job);

    return runJob.isSuccessful();
}

From source file:com.ibm.bi.dml.runtime.matrix.WriteCSVMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication,
        byte[] resultIndexes, String[] outputs) throws Exception {
    JobConf job = new JobConf(WriteCSVMR.class);
    job.setJobName("WriteCSV-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;// ww w  .j  av a  2 s.c om

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true,
            ConvertTarget.CSVWRITE);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    long maxRlen = 0;
    for (long rlen : rlens)
        if (rlen > maxRlen)
            maxRlen = rlen;

    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), (int) maxRlen);
    job.setNumReduceTasks(numRed);

    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    OutputInfo[] outputInfos = new OutputInfo[outputs.length];
    HashMap<Byte, Integer> indexmap = new HashMap<Byte, Integer>();
    for (int i = 0; i < stats.length; i++) {
        indexmap.put(resultIndexes[i], i);
        resultDimsUnknown[i] = (byte) 0;
        stats[i] = new MatrixCharacteristics();
        outputInfos[i] = OutputInfo.CSVOutputInfo;
    }
    CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
    for (CSVWriteInstruction in : ins)
        stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    //set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions,
            resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVWriteMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(MatrixBlock.class);

    //configure reducer
    job.setReducerClass(CSVWriteReducer.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
    //job.setOutputFormat(UnPaddedOutputFormat.class);

    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java

License:Open Source License

public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath,
        String mapsPath, String tmpPath, String outputPath, String partOffsetsFile,
        CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter,
        int replication, String headerLine) throws Exception {

    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);

    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    //set up the instructions that will happen in the reducer, after the aggregation instrucions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }/*  w w  w.  j a va  2 s .  c  o m*/
    return new JobReturn(ret.stats, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java

License:Open Source License

public static JobReturn runJob(String inputPath, String specPath, String mapsPath, String tmpPath,
        String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols,
        int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInt("dfs.replication", replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(outputPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);//from  w  w w . jav  a 2  s. co m
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);

    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}

From source file:com.jyz.study.hadoop.mapreduce.datajoin.DataJoinJob.java

License:Apache License

public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];/*from  w ww .  j  ava 2  s.  c o  m*/
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir), true);
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}

From source file:com.kadwa.hadoop.DistExec.java

License:Open Source License

private static JobConf createJobConf(Configuration conf) {
    JobConf jobconf = new JobConf(conf, DistExec.class);
    jobconf.setJobName(NAME);/*from  ww w  .  j  a  v  a2 s  .  c om*/

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobconf.setMapSpeculativeExecution(false);

    jobconf.setInputFormat(ExecInputFormat.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(Text.class);

    jobconf.setMapperClass(ExecFilesMapper.class);
    jobconf.setNumReduceTasks(0);
    // TODO implement singleOut by setting single reducer and prepending file name to output
    return jobconf;
}