Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

This page lists example usages of org.apache.hadoop.mapreduce Job setPartitionerClass, drawn from open-source projects.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
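
For reference, a minimal sketch of the call in context. WordLengthPartitioner below is a hypothetical partitioner (not part of Hadoop or of the projects listed here); it routes map output keys to reducers by key length, and the driver registers it before the job is submitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: keys of the same length always reach the same reducer.
public class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        return key.getLength() % numPartitions;
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner demo");
        job.setJarByClass(WordLengthPartitioner.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(4);
        // Must be set before submission; setPartitionerClass throws
        // IllegalStateException once the job has been submitted.
        job.setPartitionerClass(WordLengthPartitioner.class);
        // Mapper, reducer and input/output paths would be configured here,
        // as in the examples below.
    }
}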

Usage

From source file:org.beymani.proximity.AverageDistance.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Nearest neighbour stat calculation  MR";
    job.setJobName(jobName);

    job.setJarByClass(AverageDistance.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AverageDistance.TopMatchesMapper.class);
    job.setReducerClass(AverageDistance.TopMatchesReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(IdRankGroupComprator.class);
    job.setPartitionerClass(IdRankPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.beymani.proximity.NeighborDensity.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Nearest neighbour density";
    job.setJobName(jobName);

    job.setJarByClass(NeighborDensity.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(NeighborDensity.GroupingMapper.class);
    job.setReducerClass(NeighborDensity.GroupingReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java

License:Open Source License

public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);

    // Check for an empty input dir before touching files[0].
    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }

    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());
    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);

    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));

    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();
    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}
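
The SortVcf example above pairs job.setPartitionerClass(TotalOrderPartitioner.class) with InputSampler.writePartitionFile so that the reducer outputs are globally ordered and can simply be concatenated afterwards. Below is a minimal sketch of just that wiring, stripped of the VCF-specific setup; it assumes sequence-file input keyed by LongWritable, and the paths and sampling parameters are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "total order sort");
        job.setJarByClass(TotalOrderSortSketch.class);

        // Identity mapper and reducer; the shuffle does the sorting.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // The partitioner reads its split points from a partition file.
        Path partitionFile = new Path(args[1] + "_partitions.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Sample the input keys to choose split points that balance the reducers;
        // run this after the reducer count and input paths are set.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, Text>(0.01, 1000, 10));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}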

From source file:org.bigdata.ComplexInvertIndex.java

License:Open Source License

public static void main(String[] args) throws Exception {
    Configuration config = HadoopConfig.getConfig();
    Job job = Job.getInstance(config, "??");
    job.setJarByClass(ComplexInvertIndex.class);
    job.setInputFormatClass(FileNameInputFormat.class);
    job.setMapperClass(InvertIndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(InvertIndexCombiner.class);
    job.setReducerClass(InvertIndexReducer.class);
    job.setPartitionerClass(InvertIndexPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path("/input"));
    FileOutputFormat.setOutputPath(job, new Path("/output/"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java

License:Open Source License

@Override
public int run(String[] argv) throws Exception {
    try {
        Configuration conf;
        FileSystem srcFs, outFs, fs;
        Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath;
        int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100;
        FileStatus[] content;
        ClusterStatus status;
        int numNodes, mapSlotsPerNode;
        long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime,
                splitSize;
        float inputBufpcnt;
        FSDataOutputStream out;
        FSDataInputStream in;
        SAMFileReader fileReader;
        InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler;
        double sampling_frequency = 0.01;

        // Job object can be used for Aligner job if enabled
        conf = getConf();
        Job job = new Job(conf);

        parseCommandLineArgs(argv, conf);

        maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks();

        maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks();
        if (!noalign) {
            System.out.println("Starting Alignment Job");
            startTime = System.currentTimeMillis();

            status = new JobClient(new JobConf(conf)).getClusterStatus();
            numNodes = status.getTaskTrackers();
            // Job specific setting of number of Reducers..
            if (nReducers == 0)
                nReducers = numNodes;
            conf.setInt("mapred.reduce.tasks", nReducers);

            Path refPath = new Path(refFileLoc);
            fs = refPath.getFileSystem(conf);
            blockSize = fs.getFileStatus(refPath).getBlockSize();
            splitSize = Math.round(fs.getFileStatus(refPath).getBlockSize());

            if (reads_per_split == 0) {
                inputPath = new Path(readFile1);
                long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen();
                long numSplits = Math.round(readSize / splitSize);

                if (numSplits < maxMapTasks)
                    numSplits = maxMapTasks;

                if (numSplits < nReducers)
                    numSplits = nReducers;

                long numReads = Math.round(readSize / (long) fq_read_size);
                reads_per_split = numReads / numSplits;

                // Total Order Partitioner
                if ((double) reads_per_split <= (1 / sampling_frequency)) {
                    sampling_frequency = 1;
                    granularity = 1;
                } else if (((double) reads_per_split > (1 / sampling_frequency))
                        && ((double) reads_per_split <= (1 / sampling_frequency * 100))) {
                    sampling_frequency = 0.1;
                    granularity = 10;
                }
            }

            job.setJarByClass(GATKJobClient.class);
            job.setInputFormatClass(NLineXInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(fqInput));
            FileOutputFormat.setOutputPath(job, new Path(BWAOutPath));

            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration());
            if (!is_azure) {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"),
                        job.getConfiguration());
            } else {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"),
                        job.getConfiguration());
            }
            DistributedCache.createSymlink(job.getConfiguration());

            // Setting local.cache.size - Add up the size of the files
            // distributed through the cache

            cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen();
            if (!is_azure) {
                cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen();
            }

            // Use long arithmetic so the 8 GB threshold does not overflow int.
            if (cacheSize > 8L * 1024 * 1024 * 1024) {
                conf.setLong("local.cache.size", cacheSize + (1L * 1024 * 1024 * 1024));
            }

            conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs..
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setPartitionerClass(BWAPartitioner.class);
            job.setReducerClass(BWAReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            if (job.waitForCompletion(true)) {
                System.out.println("BWA Alignment done");
            }

            content = fs.listStatus(new Path(BWAOutPath));

            for (int i = 0; i < content.length; i++) {
                if (!((content[i].getPath().getName()).endsWith(".bam"))
                        && !((content[i].getPath().getName()).startsWith("_"))) {
                    fs.delete(content[i].getPath(), false);
                }
            }
            endTime = System.currentTimeMillis();
            System.out.println("BWA Alignment took: " + (endTime - startTime));
            startTime = System.currentTimeMillis();
            System.out.println("Starting Splitting BAM Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1");
            FileOutputFormat.setOutputPath(job, output);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setInt("gatk.hadoop.granularity", granularity);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.isindex", false);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("SplittingBAM Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Splitting BAM Indexing took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Sort Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            if (norealign && nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath));
            job.setInputFormatClass(ContigInputFormat.class);
            job.setPartitionerClass(ContigPartitioner.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            for (int i = 0; i < content.length; i++) {
                if (content[i].getPath().getName().endsWith(".bam")) {
                    in = fs.open(content[i].getPath());
                    List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader())
                            .getSequenceDictionary().getSequences();
                    conf.setInt("mapred.reduce.tasks", sequences.size());

                    break;
                }
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            //conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);

            if (job.waitForCompletion(true)) {
                System.out.println("Sort completed successfully");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Sort job took: " + (endTime - startTime));
        }

        if (!norealign) {
            if (!noalign)
                BAMInputPath = SortBWAOutPath;

            startTime = System.currentTimeMillis();
            System.out.println("Starting Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2");
            FileOutputFormat.setOutputPath(job, output);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setBoolean("gatk.hadoop.isindex", true);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Indexing job took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Realigner Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);

            job.setInputFormatClass(BAMInputFormat.class);

            srcFs = new Path(outputDir).getFileSystem(conf);
            if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition")))
                System.out.println("mkdir failed");
            inputDir = new Path(outputDir + Path.SEPARATOR + "Partition");
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
            partition = new Path(inputDir, "_partition");
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(conf, partition);

            try {
                URI partitionURI = new URI(partition.toString() + "#_partition");
                DistributedCache.addCacheFile(partitionURI, conf);
            } catch (URISyntaxException e) {
                assert false;
            }

            if (nReducers == 0) {
                if (!nomarkdup || !noqrecab || !novariant) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10));
                }
            } else {
                conf.setInt("mapred.reduce.tasks", nReducers);
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            if (nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(IndelMapper.class);
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(IndelOutPath));

            sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                    max_splits);
            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
            job.setInputFormatClass(LociInputFormat.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indel realignment done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Indel Realigner took: " + (endTime - startTime));
        }

        if (!nomarkdup || !noqrecab || !novariant) {
            /* 
             * MarkDuplicate and Indexing Job 
             * FixMateInformation is not required as it is handled
             * automatically by GATK after IndelRealignment.
             */
            System.out.println("Starting MarkDup/Indexing job");
            startTime = System.currentTimeMillis();
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            if (!nomarkdup) {
                System.out.println("Starting MarkDuplicates job");
                conf.setBoolean("gatk.hadoop.ismarkdup", true);
                FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath));
            }
            if (!noqrecab || !novariant) {
                conf.setBoolean("gatk.hadoop.issindex", true);
                conf.setBoolean("gatk.hadoop.isindex", true);
                if (nomarkdup) {
                    System.out.println("Starting Indexing job");
                    FileOutputFormat.setOutputPath(job,
                            new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"));
                }
            }
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Markdup/Indexing job done !!!");
            }
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }

            if (!nomarkdup) {
                Path rmdupOutPath = new Path(RmdupOutPath);
                fs = rmdupOutPath.getFileSystem(conf);
                content = fs.listStatus(rmdupOutPath);

                for (int i = 0; i < content.length; i++) {
                    if ((content[i].getPath().getName()).startsWith("part")) {
                        fs.delete(content[i].getPath(), false);
                    }
                }
                endTime = System.currentTimeMillis();
                System.out.println("MarkDuplicates took: " + (endTime - startTime));
            } else {
                endTime = System.currentTimeMillis();
                System.out.println("Indexing took: " + (endTime - startTime));
            }
        }

        if (!noqrecab) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Recal - Count Covariates Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(LociInputFormat.class);

            conf.setLong("local.cache.size", 20106127360L);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.set("gatk.hadoop.outputpath", outputDir);
            // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
            // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ration 3.5:1

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(RecalCovMapper.class);
            job.setCombinerClass(RecalCovCombiner.class);
            job.setReducerClass(RecalCovReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "CovariateOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"),
                    job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("CountCovariates done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("CountCovariates took: " + (endTime - startTime));
        }

        if (!noqrecab || !novariant) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Table Recalibration / Unified Genotyper Job");
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            FileInputFormat.addInputPath(job, inputPath);

            if (!noqrecab) {
                conf.setBoolean("gatk.hadoop.recab", true);
                if (norealign) {
                    job.setInputFormatClass(BAMInputFormat.class);
                    srcFs = new Path(outputDir).getFileSystem(conf);
                    if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition")))
                        System.out.println("mkdir failed");
                } else {
                    job.setInputFormatClass(LociInputFormat.class);
                }
                inputDir = new Path(outputDir + "/" + "Partition");
                inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
                partition = new Path(inputDir, "_partition");
                job.setPartitionerClass(TotalOrderPartitioner.class);
                TotalOrderPartitioner.setPartitionFile(conf, partition);
                try {
                    URI partitionURI = new URI(partition.toString() + "#_partition");
                    DistributedCache.addCacheFile(partitionURI, conf);
                } catch (URISyntaxException e) {
                    assert false;
                }

                if (nReducers == 0) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", nReducers);
                }
                conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                if (!nomresults)
                    conf.setBoolean("gatk.hadoop.ismerge", true);
                job.setReducerClass(SortReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(SAMRecordWritable.class);
                job.setOutputFormatClass(SortOutputFormat.class);
                FileOutputFormat.setOutputPath(job, new Path(RecalOutPath));
            } else {
                job.setInputFormatClass(LociInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 0);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"));
            }

            job.setMapperClass(RecalMapper.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);

            conf.set("gatk.hadoop.outputpath", outputDir);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            if (!novariant) {
                conf.setBoolean("gatk.hadoop.variant", true);
                if (!nofvariant)
                    conf.setBoolean("gatk.hadoop.fvariant", true);
                conf.setInt("gatk.hadoop.nthreads", nThreads);
                conf.setBoolean("gatk.hadoop.xvariant", xVariantCall);
            }

            if (!noqrecab && norealign) {
                sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                        max_splits);
                InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
                job.setInputFormatClass(LociInputFormat.class);
            }

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("TableRecalibration Job done !!");
            }
            endTime = System.currentTimeMillis();
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime));
        }
        if (!novariant && !nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Merge Variant Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut");
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setReducerClass(VariantReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Merge Variants done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("MergeVariant job took: " + (endTime - startTime));

            if (xVariantCall && !novariant && !nomresults) {
                startTime = System.currentTimeMillis();

                System.out.println("Merge INDEL Variant Job");
                job = new Job();
                job.setJarByClass(GATKJobClient.class);
                conf = job.getConfiguration();
                inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut");
                FileInputFormat.addInputPath(job, inputPath);
                job.setInputFormatClass(WholeFileInputFormat.class);

                conf.setInt("mapred.reduce.tasks", 1);
                conf.setLong("mapred.task.timeout", 86400000L);
                conf.setBoolean("mapred.map.tasks.speculative.execution", false);
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

                conf.setBoolean("gatk.hadoop", true);
                conf.setBoolean("gatk.hadoop.isazure", is_azure);
                job.setReducerClass(VariantReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(NullWritable.class);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut"));

                DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
                // Standard inputs
                DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"),
                        job.getConfiguration());

                DistributedCache.createSymlink(job.getConfiguration());

                if (job.waitForCompletion(true)) {
                    System.out.println("Merge INDEL Variants done");
                }
                endTime = System.currentTimeMillis();
                System.out.println("MergeINDELVariant job took: " + (endTime - startTime));
            }
        }

        if (!nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Merge BAM Job");

            outputPath = new Path(FinalBAMPath);
            outFs = outputPath.getFileSystem(conf);

            if (!outFs.mkdirs(outputPath))
                System.out.println("mkdir failed");
            // Currently there is no support for merging MarkDuplicates output
            // from the Job Client; a separate MR job is needed for that.
            if (!noqrecab)
                inputPath = new Path(RecalOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else if (!nomarkdup)
                throw new Exception("Merge not implemented for MarkDuplicates output.");
            else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant)
                inputPath = new Path(BAMInputPath);

            fs = inputPath.getFileSystem(conf);

            content = fs.listStatus(inputPath);
            mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam");

            Path p = null;
            int nfiles = 0;
            for (int i = 0; i < content.length; i++) {
                p = content[i].getPath();
                ++nfiles;
            }

            if (nfiles == 1) {
                boolean rename = fs.rename(p, mergeOutFile);
            } else {
                out = outFs.create(mergeOutFile, true);

                for (int i = 0; i < content.length; i++) {
                    p = content[i].getPath();
                    if ((p.getName()).endsWith(".bam")) {
                        in = fs.open(p);
                        IOUtils.copyBytes(in, out, conf, false);
                        in.close();
                    }
                }

                out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
                out.close();
            }

            endTime = System.currentTimeMillis();
            System.out.println("Final Merge took: " + (endTime - startTime));
        }
        System.out.println("JobCompleted");
    } catch (IOException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (InterruptedException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (ClassNotFoundException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (Exception e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    }
    return 0;
}

From source file:org.chombo.mr.Joiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Joiner  MR";
    job.setJobName(jobName);

    job.setJarByClass(Joiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());

    job.setMapperClass(Joiner.JoinerMapper.class);
    job.setReducerClass(Joiner.JoinerReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

    int numReducer = job.getConfiguration().getInt("joi.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.MultiJoiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "MultiJoiner  MR";
    job.setJobName(jobName);

    job.setJarByClass(MultiJoiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());

    job.setMapperClass(MultiJoiner.JoinerMapper.class);
    job.setReducerClass(MultiJoiner.JoinerReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("muj.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.OutlierBasedDataValidation.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Detecting invalid data as outliers";
    job.setJobName(jobName);

    job.setJarByClass(OutlierBasedDataValidation.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo");
    job.setMapperClass(OutlierBasedDataValidation.DataValidatorMapper.class);
    job.setReducerClass(OutlierBasedDataValidation.DataValidatorReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("obdv.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.Projection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Projection  and grouping  MR";
    job.setJobName(jobName);

    job.setJarByClass(Projection.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());
    String operation = job.getConfiguration().get("projection.operation", "project");

    if (operation.startsWith("grouping")) {
        //group by
        job.setMapperClass(Projection.ProjectionMapper.class);
        job.setReducerClass(Projection.ProjectionReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);

        int numReducer = job.getConfiguration().getInt("pro.num.reducer", -1);
        numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
        job.setNumReduceTasks(numReducer);

        //order by
        boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0;
        if (doOrderBy) {
            job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
            job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        }

    } else {
        //simple projection
        job.setMapperClass(Projection.SimpleProjectionMapper.class);
    }
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.RecordSetBulkMutator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "record set mutator  MR";
    job.setJobName(jobName);

    job.setJarByClass(RecordSetBulkMutator.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RecordSetBulkMutator.BulkMutatorMapper.class);
    job.setReducerClass(RecordSetBulkMutator.BulkMutatorReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("rsb.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}