Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

This page lists example usages of org.apache.hadoop.mapreduce Job setPartitionerClass, drawn from open-source projects.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
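
For reference, a minimal sketch of the call in context. WordLengthPartitioner below is a hypothetical partitioner (not part of Hadoop or of the projects listed here); it routes map output keys to reducers by key length, and the driver registers it before the job is submitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: keys of the same length always reach the same reducer.
public class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        return key.getLength() % numPartitions;
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner demo");
        job.setJarByClass(WordLengthPartitioner.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(4);
        // Must be set before submission; setPartitionerClass throws
        // IllegalStateException once the job has been submitted.
        job.setPartitionerClass(WordLengthPartitioner.class);
        // Mapper, reducer and input/output paths would be configured here,
        // as in the examples below.
    }
}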

Usage

From source file:org.beymani.proximity.AverageDistance.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Nearest neighbour stat calculation  MR";
    job.setJobName(jobName);

    job.setJarByClass(AverageDistance.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AverageDistance.TopMatchesMapper.class);
    job.setReducerClass(AverageDistance.TopMatchesReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(IdRankGroupComprator.class);
    job.setPartitionerClass(IdRankPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.beymani.proximity.NeighborDensity.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Nearest neighbour density";
    job.setJobName(jobName);

    job.setJarByClass(NeighborDensity.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(NeighborDensity.GroupingMapper.class);
    job.setReducerClass(NeighborDensity.GroupingReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java

License:Open Source License

public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);

    // Check for an empty input dir before touching files[0].
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }

    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());
    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);

    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));

    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();
    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}
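
The SortVcf example above pairs job.setPartitionerClass(TotalOrderPartitioner.class) with InputSampler.writePartitionFile so that the reducer outputs are globally ordered and can simply be concatenated afterwards. Below is a minimal sketch of just that wiring, stripped of the VCF-specific setup; it assumes sequence-file input keyed by LongWritable, and the paths and sampling parameters are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "total order sort");
        job.setJarByClass(TotalOrderSortSketch.class);

        // Identity mapper and reducer; the shuffle does the sorting.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // The partitioner reads its split points from a partition file.
        Path partitionFile = new Path(args[1] + "_partitions.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Sample the input keys to choose split points that balance the reducers;
        // run this after the reducer count and input paths are set.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, Text>(0.01, 1000, 10));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}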

From source file:org.bigdata.ComplexInvertIndex.java

License:Open Source License

public static void main(String[] args) throws Exception {
    Configuration config = HadoopConfig.getConfig();
    Job job = Job.getInstance(config, "??");
    job.setJarByClass(ComplexInvertIndex.class);
    job.setInputFormatClass(FileNameInputFormat.class);
    job.setMapperClass(InvertIndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(InvertIndexCombiner.class);
    job.setReducerClass(InvertIndexReducer.class);
    job.setPartitionerClass(InvertIndexPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path("/input"));
    FileOutputFormat.setOutputPath(job, new Path("/output/"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java

License:Open Source License

@Override
public int run(String[] argv) throws Exception {
    try {
        Configuration conf;
        FileSystem srcFs, outFs, fs;
        Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath;
        int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100;
        FileStatus[] content;
        ClusterStatus status;
        int numNodes, mapSlotsPerNode;
        long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime,
                splitSize;
        float inputBufpcnt;
        FSDataOutputStream out;
        FSDataInputStream in;
        SAMFileReader fileReader;
        InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler;
        double sampling_frequency = 0.01;

        // Job object can be used for Aligner job if enabled
        conf = getConf();
        Job job = new Job(conf);

        parseCommandLineArgs(argv, conf);

        maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks();

        maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks();
        if (!noalign) {
            System.out.println("Starting Alignment Job");
            startTime = System.currentTimeMillis();

            status = new JobClient(new JobConf(conf)).getClusterStatus();
            numNodes = status.getTaskTrackers();
            // Job specific setting of number of Reducers..
            if (nReducers == 0)
                nReducers = numNodes;
            conf.setInt("mapred.reduce.tasks", nReducers);

            Path refPath = new Path(refFileLoc);
            fs = refPath.getFileSystem(conf);
            blockSize = fs.getFileStatus(refPath).getBlockSize();
            splitSize = Math.round(fs.getFileStatus(refPath).getBlockSize());

            if (reads_per_split == 0) {
                inputPath = new Path(readFile1);
                long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen();
                long numSplits = Math.round(readSize / splitSize);

                if (numSplits < maxMapTasks)
                    numSplits = maxMapTasks;

                if (numSplits < nReducers)
                    numSplits = nReducers;

                long numReads = Math.round(readSize / (long) fq_read_size);
                reads_per_split = numReads / numSplits;

                // Total Order Partitioner
                if ((double) reads_per_split <= (1 / sampling_frequency)) {
                    sampling_frequency = 1;
                    granularity = 1;
                } else if (((double) reads_per_split > (1 / sampling_frequency))
                        && ((double) reads_per_split <= (1 / sampling_frequency * 100))) {
                    sampling_frequency = 0.1;
                    granularity = 10;
                }
            }

            job.setJarByClass(GATKJobClient.class);
            job.setInputFormatClass(NLineXInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(fqInput));
            FileOutputFormat.setOutputPath(job, new Path(BWAOutPath));

            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration());
            if (!is_azure) {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"),
                        job.getConfiguration());
            } else {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"),
                        job.getConfiguration());
            }
            DistributedCache.createSymlink(job.getConfiguration());

            // Setting local.cache.size - Add up the size of the files
            // distributed through the cache

            cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen();
            if (!is_azure) {
                cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen();
            }

            // Use long arithmetic so the 8 GB threshold does not overflow int.
            if (cacheSize > 8L * 1024 * 1024 * 1024) {
                conf.setLong("local.cache.size", cacheSize + (1L * 1024 * 1024 * 1024));
            }

            conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs..
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setPartitionerClass(BWAPartitioner.class);
            job.setReducerClass(BWAReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            if (job.waitForCompletion(true)) {
                System.out.println("BWA Alignment done");
            }

            content = fs.listStatus(new Path(BWAOutPath));

            for (int i = 0; i < content.length; i++) {
                if (!((content[i].getPath().getName()).endsWith(".bam"))
                        && !((content[i].getPath().getName()).startsWith("_"))) {
                    fs.delete(content[i].getPath(), false);
                }
            }
            endTime = System.currentTimeMillis();
            System.out.println("BWA Alignment took: " + (endTime - startTime));
            startTime = System.currentTimeMillis();
            System.out.println("Starting Splitting BAM Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1");
            FileOutputFormat.setOutputPath(job, output);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setInt("gatk.hadoop.granularity", granularity);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.isindex", false);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("SplittingBAM Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Splitting BAM Indexing took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Sort Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            if (norealign && nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath));
            job.setInputFormatClass(ContigInputFormat.class);
            job.setPartitionerClass(ContigPartitioner.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            for (int i = 0; i < content.length; i++) {
                if (content[i].getPath().getName().endsWith(".bam")) {
                    in = fs.open(content[i].getPath());
                    List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader())
                            .getSequenceDictionary().getSequences();
                    conf.setInt("mapred.reduce.tasks", sequences.size());

                    break;
                }
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            //conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);

            if (job.waitForCompletion(true)) {
                System.out.println("Sort completed successfully");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Sort job took: " + (endTime - startTime));
        }

        if (!norealign) {
            if (!noalign)
                BAMInputPath = SortBWAOutPath;

            startTime = System.currentTimeMillis();
            System.out.println("Starting Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2");
            FileOutputFormat.setOutputPath(job, output);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setBoolean("gatk.hadoop.isindex", true);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Indexing job took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Realigner Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);

            job.setInputFormatClass(BAMInputFormat.class);

            srcFs = new Path(outputDir).getFileSystem(conf);
            if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition")))
                System.out.println("mkdir failed");
            inputDir = new Path(outputDir + Path.SEPARATOR + "Partition");
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
            partition = new Path(inputDir, "_partition");
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(conf, partition);

            try {
                URI partitionURI = new URI(partition.toString() + "#_partition");
                DistributedCache.addCacheFile(partitionURI, conf);
            } catch (URISyntaxException e) {
                assert false;
            }

            if (nReducers == 0) {
                if (!nomarkdup || !noqrecab || !novariant) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10));
                }
            } else {
                conf.setInt("mapred.reduce.tasks", nReducers);
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            if (nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(IndelMapper.class);
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(IndelOutPath));

            sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                    max_splits);
            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
            job.setInputFormatClass(LociInputFormat.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indel realignment done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Indel Realigner took: " + (endTime - startTime));
        }

        if (!nomarkdup || !noqrecab || !novariant) {
            /* 
             * MarkDuplicate and Indexing Job 
             * FixMateInformation is not required as it is handled
             * automatically by GATK after IndelRealignment.
             */
            System.out.println("Starting MarkDup/Indexing job");
            startTime = System.currentTimeMillis();
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            if (!nomarkdup) {
                System.out.println("Starting MarkDuplicates job");
                conf.setBoolean("gatk.hadoop.ismarkdup", true);
                FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath));
            }
            if (!noqrecab || !novariant) {
                conf.setBoolean("gatk.hadoop.issindex", true);
                conf.setBoolean("gatk.hadoop.isindex", true);
                if (nomarkdup) {
                    System.out.println("Starting Indexing job");
                    FileOutputFormat.setOutputPath(job,
                            new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"));
                }
            }
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Markdup/Indexing job done !!!");
            }
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }

            if (!nomarkdup) {
                Path rmdupOutPath = new Path(RmdupOutPath);
                fs = rmdupOutPath.getFileSystem(conf);
                content = fs.listStatus(rmdupOutPath);

                for (int i = 0; i < content.length; i++) {
                    if ((content[i].getPath().getName()).startsWith("part")) {
                        fs.delete(content[i].getPath(), false);
                    }
                }
                endTime = System.currentTimeMillis();
                System.out.println("MarkDuplicates took: " + (endTime - startTime));
            } else {
                endTime = System.currentTimeMillis();
                System.out.println("Indexing took: " + (endTime - startTime));
            }
        }

        if (!noqrecab) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Recal - Count Covariates Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(LociInputFormat.class);

            conf.setLong("local.cache.size", 20106127360L);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.set("gatk.hadoop.outputpath", outputDir);
            // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
            // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ration 3.5:1

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(RecalCovMapper.class);
            job.setCombinerClass(RecalCovCombiner.class);
            job.setReducerClass(RecalCovReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "CovariateOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"),
                    job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("CountCovariates done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("CountCovariates took: " + (endTime - startTime));
        }

        if (!noqrecab || !novariant) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Table Recalibration / Unified Genotyper Job");
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            FileInputFormat.addInputPath(job, inputPath);

            if (!noqrecab) {
                conf.setBoolean("gatk.hadoop.recab", true);
                if (norealign) {
                    job.setInputFormatClass(BAMInputFormat.class);
                    srcFs = new Path(outputDir).getFileSystem(conf);
                    if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition")))
                        System.out.println("mkdir failed");
                } else {
                    job.setInputFormatClass(LociInputFormat.class);
                }
                inputDir = new Path(outputDir + "/" + "Partition");
                inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
                partition = new Path(inputDir, "_partition");
                job.setPartitionerClass(TotalOrderPartitioner.class);
                TotalOrderPartitioner.setPartitionFile(conf, partition);
                try {
                    URI partitionURI = new URI(partition.toString() + "#_partition");
                    DistributedCache.addCacheFile(partitionURI, conf);
                } catch (URISyntaxException e) {
                    assert false;
                }

                if (nReducers == 0) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", nReducers);
                }
                conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                if (!nomresults)
                    conf.setBoolean("gatk.hadoop.ismerge", true);
                job.setReducerClass(SortReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(SAMRecordWritable.class);
                job.setOutputFormatClass(SortOutputFormat.class);
                FileOutputFormat.setOutputPath(job, new Path(RecalOutPath));
            } else {
                job.setInputFormatClass(LociInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 0);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"));
            }

            job.setMapperClass(RecalMapper.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);

            conf.set("gatk.hadoop.outputpath", outputDir);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            if (!novariant) {
                conf.setBoolean("gatk.hadoop.variant", true);
                if (!nofvariant)
                    conf.setBoolean("gatk.hadoop.fvariant", true);
                conf.setInt("gatk.hadoop.nthreads", nThreads);
                conf.setBoolean("gatk.hadoop.xvariant", xVariantCall);
            }

            if (!noqrecab && norealign) {
                sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                        max_splits);
                InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
                job.setInputFormatClass(LociInputFormat.class);
            }

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("TableRecalibration Job done !!");
            }
            endTime = System.currentTimeMillis();
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime));
        }
        if (!novariant && !nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Merge Variant Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut");
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setReducerClass(VariantReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Merge Variants done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("MergeVariant job took: " + (endTime - startTime));

            if (xVariantCall && !novariant && !nomresults) {
                startTime = System.currentTimeMillis();

                System.out.println("Merge INDEL Variant Job");
                job = new Job();
                job.setJarByClass(GATKJobClient.class);
                conf = job.getConfiguration();
                inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut");
                FileInputFormat.addInputPath(job, inputPath);
                job.setInputFormatClass(WholeFileInputFormat.class);

                conf.setInt("mapred.reduce.tasks", 1);
                conf.setLong("mapred.task.timeout", 86400000L);
                conf.setBoolean("mapred.map.tasks.speculative.execution", false);
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

                conf.setBoolean("gatk.hadoop", true);
                conf.setBoolean("gatk.hadoop.isazure", is_azure);
                job.setReducerClass(VariantReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(NullWritable.class);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut"));

                DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
                // Standard inputs
                DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"),
                        job.getConfiguration());

                DistributedCache.createSymlink(job.getConfiguration());

                if (job.waitForCompletion(true)) {
                    System.out.println("Merge INDEL Variants done");
                }
                endTime = System.currentTimeMillis();
                System.out.println("MergeINDELVariant job took: " + (endTime - startTime));
            }
        }

        if (!nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Merge BAM Job");

            outputPath = new Path(FinalBAMPath);
            outFs = outputPath.getFileSystem(conf);

            if (!outFs.mkdirs(outputPath))
                System.out.println("mkdir failed");
            // Currently there is no support for merging MarkDuplicates output
            // from the Job Client; a separate MR job is needed for that.
            if (!noqrecab)
                inputPath = new Path(RecalOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else if (!nomarkdup)
                throw new Exception("Merge not implemented for MarkDuplicates output.");
            else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant)
                inputPath = new Path(BAMInputPath);

            fs = inputPath.getFileSystem(conf);

            content = fs.listStatus(inputPath);
            mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam");

            Path p = null;
            int nfiles = 0;
            for (int i = 0; i < content.length; i++) {
                p = content[i].getPath();
                ++nfiles;
            }

            if (nfiles == 1) {
                boolean rename = fs.rename(p, mergeOutFile);
            } else {
                out = outFs.create(mergeOutFile, true);

                for (int i = 0; i < content.length; i++) {
                    p = content[i].getPath();
                    if ((p.getName()).endsWith(".bam")) {
                        in = fs.open(p);
                        IOUtils.copyBytes(in, out, conf, false);
                        in.close();
                    }
                }

                out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
                out.close();
            }

            endTime = System.currentTimeMillis();
            System.out.println("Final Merge took: " + (endTime - startTime));
        }
        System.out.println("JobCompleted");
    } catch (IOException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (InterruptedException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (ClassNotFoundException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (Exception e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    }
    return 0;
}

From source file:org.chombo.mr.Joiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Joiner  MR";
    job.setJobName(jobName);

    job.setJarByClass(Joiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());

    job.setMapperClass(Joiner.JoinerMapper.class);
    job.setReducerClass(Joiner.JoinerReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

    int numReducer = job.getConfiguration().getInt("joi.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.MultiJoiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "MultiJoiner  MR";
    job.setJobName(jobName);

    job.setJarByClass(MultiJoiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());

    job.setMapperClass(MultiJoiner.JoinerMapper.class);
    job.setReducerClass(MultiJoiner.JoinerReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("muj.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.OutlierBasedDataValidation.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Detecting invalid data as outliers";
    job.setJobName(jobName);

    job.setJarByClass(OutlierBasedDataValidation.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo");
    job.setMapperClass(OutlierBasedDataValidation.DataValidatorMapper.class);
    job.setReducerClass(OutlierBasedDataValidation.DataValidatorReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("obdv.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.Projection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Projection  and grouping  MR";
    job.setJobName(jobName);

    job.setJarByClass(Projection.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());
    String operation = job.getConfiguration().get("projection.operation", "project");

    if (operation.startsWith("grouping")) {
        //group by
        job.setMapperClass(Projection.ProjectionMapper.class);
        job.setReducerClass(Projection.ProjectionReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);

        int numReducer = job.getConfiguration().getInt("pro.num.reducer", -1);
        numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
        job.setNumReduceTasks(numReducer);

        //order by
        boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0;
        if (doOrderBy) {
            job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
            job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        }

    } else {
        //simple projection
        job.setMapperClass(Projection.SimpleProjectionMapper.class);
    }
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.RecordSetBulkMutator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "record set mutator  MR";
    job.setJobName(jobName);

    job.setJarByClass(RecordSetBulkMutator.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RecordSetBulkMutator.BulkMutatorMapper.class);
    job.setReducerClass(RecordSetBulkMutator.BulkMutatorReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("rsb.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}