Example usage for org.apache.hadoop.mapred JobConf setReducerClass

List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setReducerClass.

Prototype

public void setReducerClass(Class<? extends Reducer> theClass) 

Source Link

Document

Set the Reducer class for the job.

Usage

From source file:com.ibm.bi.dml.runtime.matrix.WriteCSVMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication,
        byte[] resultIndexes, String[] outputs) throws Exception {
    JobConf job = new JobConf(WriteCSVMR.class);
    job.setJobName("WriteCSV-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;//from   w w w . j  ava 2s.com

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true,
            ConvertTarget.CSVWRITE);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    long maxRlen = 0;
    for (long rlen : rlens)
        if (rlen > maxRlen)
            maxRlen = rlen;

    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), (int) maxRlen);
    job.setNumReduceTasks(numRed);

    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    OutputInfo[] outputInfos = new OutputInfo[outputs.length];
    HashMap<Byte, Integer> indexmap = new HashMap<Byte, Integer>();
    for (int i = 0; i < stats.length; i++) {
        indexmap.put(resultIndexes[i], i);
        resultDimsUnknown[i] = (byte) 0;
        stats[i] = new MatrixCharacteristics();
        outputInfos[i] = OutputInfo.CSVOutputInfo;
    }
    CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
    for (CSVWriteInstruction in : ins)
        stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    //set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions,
            resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVWriteMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(MatrixBlock.class);

    //configure reducer
    job.setReducerClass(CSVWriteReducer.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
    //job.setOutputFormat(UnPaddedOutputFormat.class);

    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java

License:Open Source License

public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath,
        String mapsPath, String tmpPath, String outputPath, String partOffsetsFile,
        CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter,
        int replication, String headerLine) throws Exception {

    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);

    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    //set up the instructions that will happen in the reducer, after the aggregation instrucions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);//from   w w w  .  jav  a2s  .  c  o  m

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.transform.GenTfMtdMR.java

License:Open Source License

public static long runJob(String inputPath, String txMtdPath, String specFileWithIDs, String smallestFile,
        String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication,
        String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(GenTfMtdMR.class);
    job.setJobName("GenTfMTD");

    /* Setup MapReduce Job */
    job.setJarByClass(GenTfMtdMR.class);

    // set relevant classes
    job.setMapperClass(GTFMTDMapper.class);
    job.setReducerClass(GTFMTDReducer.class);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DistinctValue.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setInt("dfs.replication", replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(txMtdPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);/*from www .ja  v  a 2s .  com*/
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specFileWithIDs);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);

    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath);

    // offsets file to store part-file names and offsets for each input split
    job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    Counters c = runjob.getCounters();
    long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter();

    return tx_numRows;
}

From source file:com.impetus.code.examples.hadoop.mapred.wordcount.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);//from www .ja  v a2s  . c om

}

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.HashIdMR.java

License:Open Source License

/**
 * @param inputpath//from  w w w  .ja  v  a2s  .  co  m
 *          the path to a unique vertex list. Each line is parsed into (vid,
 *          data) using {@code vidparser} and {@code vdataparser}.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(HashIdMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(HashIdMapper.class);
    conf.setReducerClass(HashIdReducer.class);

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    conf.setInt("mapred.line.input.format.linespermap", linespermap);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("VdataParser", vdataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("====== Job: Create integer Id maps for vertices ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("Lines per map = 6000000");
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("VdataParser = " + vdataparser.getClass().getName());
    LOG.info("==========================================================");
    JobClient.runJob(conf);
    LOG.info("=======================Done =====================\n");
}

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortDictMR.java

License:Open Source License

/**
 * @param inputpath/*from  w w  w.ja v  a2s .  c  om*/
 *          the path to a rawId to newId dictionary.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(SortDictMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortDictMapper.class);
    conf.setReducerClass(SortDictReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setBoolean("hashRawVid", hashRawVid);
    conf.setInt("numChunks", numChunks);
    conf.set("VidParser", vidparser.getClass().getName());

    String outprefix = "vidhashmap";
    for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(conf, outprefix + i, TextOutputFormat.class, Text.class, Text.class);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("========== Job: Partition the map of rawid -> id ===========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("======================================================");
    if (hashRawVid)
        LOG.info("Partition on rawId.");
    else
        LOG.info("Partition on newId");
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    JobClient.runJob(conf);
    LOG.info("======================= Done ==========================\n");
}

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortEdgeMR.java

License:Open Source License

public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(SortEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(SortEdgeMapper.class);
    conf.setReducerClass(SortEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);/*  w  w w.  j a v  a 2s.  com*/
    LOG.info("=================== Done ====================================\n");
}

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.TransEdgeMR.java

License:Open Source License

/**
 * @param inputpath/*from  ww  w .  jav  a  2s . c  om*/
 *          path of the partitioned edge list
 * @param outputpath
 *          path of the output directory
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(TransEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(TransEdgeMapper.class);
    conf.setReducerClass(TransEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    conf.set("dictionaryPath", dictionaryPath);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("============= Job: Normalize Ids in Edges ====================");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("Dictionary = " + dictionaryPath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);

    LOG.info("========================= Done ===============================");
}

From source file:com.intel.hadoop.graphbuilder.partition.mapreduce.vrecord.VrecordIngressMR.java

License:Open Source License

public void run(int numProcs, String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(VrecordIngressMR.class);
    conf.setJobName("Vrecord Mapreduce");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(VrecordIngressMapper.class);
    conf.setReducerClass(VrecordIngressReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    if (gzip) {/*from w w w.  j  a  v a2 s. c om*/
        TextOutputFormat.setCompressOutput(conf, true);
        TextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    }

    LOG.info("====== Job: Distributed Vertex Records to partitions =========");
    LOG.info("input: " + inputpath);
    LOG.info("output: " + outputpath);
    LOG.info("numProc = " + numProcs);
    LOG.info("gzip = " + Boolean.toString(gzip));
    LOG.info("==============================================================");

    JobClient.runJob(conf);
    LOG.info("==========================Done===============================");
}

From source file:com.jyz.study.hadoop.mapreduce.datajoin.DataJoinJob.java

License:Apache License

public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];//from w  ww. j a  v a2s .  c  o  m
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir), true);
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}