Example usage for org.apache.hadoop.io IOUtils copyBytes

List of usage examples for org.apache.hadoop.io IOUtils copyBytes

Introduction

On this page you can find example usage for org.apache.hadoop.io IOUtils copyBytes.

Prototype

public static void copyBytes(InputStream in, OutputStream out, long count, boolean close) throws IOException 

Document

Copies count bytes from one stream to another.
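
A minimal, self-contained sketch of this overload follows; the paths and the 1024-byte count are placeholder values, not taken from the examples below. Passing false as the last argument leaves both streams open, so they are closed explicitly afterwards.

import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CopyBytesExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder paths; adjust to your own file system layout.
        Path src = new Path("/tmp/input.txt");
        Path dst = new Path("/tmp/first-1k.txt");

        InputStream in = null;
        OutputStream out = null;
        try {
            in = fs.open(src);
            out = fs.create(dst, true);
            // Copy exactly 1024 bytes; close=false keeps both streams open,
            // so they are released in the finally block instead.
            IOUtils.copyBytes(in, out, 1024L, false);
        } finally {
            IOUtils.closeStream(out);
            IOUtils.closeStream(in);
        }
    }
}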

Usage

From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java

License:Open Source License

public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    SortVcfOptions options = new SortVcfOptions(args);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat());
    conf.setBoolean("hadoopbam.vcf.write-header", false);

    Path inputPath = new Path(options.getInput());
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);

    if (files.length <= 0) {
        System.err.println("Input dir is empty!");
        return 1;
    }

    Path vcfHeaderPath = files[0].getPath();
    if (options.getVcfHeader() != null)
        vcfHeaderPath = new Path(options.getVcfHeader());

    conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString());
    conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName());

    KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf);

    baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf));
    VCFHeader vcfHeader = baseOF.getHeader();

    Job job = Job.getInstance(conf, "VCFSort");
    job.setJarByClass(SortVcf.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortVcfReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VariantContextWritable.class);

    job.setInputFormatClass(VCFInputFormat.class);
    job.setOutputFormatClass(MyVCFOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(options.getReducerNum());

    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date());
    Path partTmp = new Path(tmpDir + "/temp");
    VCFInputFormat.addInputPath(job, inputPath);
    if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job))
        VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE);
    FileOutputFormat.setOutputPath(job, partTmp);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BGZFCodec.class);

    Path partitionFile;
    if (options.getPartitionFileString() == null) {
        partitionFile = new Path(tmpDir + "/_partitons.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        System.out.println("vcf-sort :: Sampling...");
        int numSamples = options.getNumSamples();
        if (fs.getContentSummary(inputPath).getLength() < 10000000) {
            numSamples = 1;
            job.setNumReduceTasks(1);
        }
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples,
                        numSamples));

    } else {
        System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ...");
        partitionFile = new Path(options.getPartitionFileString());
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    }

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }

    final FileSystem srcFS = partTmp.getFileSystem(conf);
    Path headerPath = new Path(tmpDir + "/header.vcf.gz");
    BGZFCodec bgzfCodec = new BGZFCodec();
    OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath));
    VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
    VariantContextWriter writer;
    writer = builder.setOutputVCFStream(new FilterOutputStream(os) {
        @Override
        public void close() throws IOException {
            this.out.flush();
        }
    }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

    writer.writeHeader(vcfHeader);
    os.close();

    Path outputPath = new Path(options.getOutput());
    final FileSystem dstFS = outputPath.getFileSystem(conf);
    OutputStream vcfgz = dstFS.create(outputPath);
    final FSDataInputStream headerIns = srcFS.open(headerPath);
    IOUtils.copyBytes(headerIns, vcfgz, conf, false);
    headerIns.close();

    final FileStatus[] parts = partTmp.getFileSystem(conf)
            .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*"));
    for (FileStatus p : parts) {
        final FSDataInputStream ins = srcFS.open(p.getPath());
        IOUtils.copyBytes(ins, vcfgz, conf, false);
        ins.close();
    }
    vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    vcfgz.close();
    partTmp.getFileSystem(conf).delete(partTmp, true);
    return 0;
}

From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java

License:Open Source License

@Override
public int run(String[] argv) throws Exception {
    try {
        Configuration conf;
        FileSystem srcFs, outFs, fs;
        Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath;
        int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100;
        FileStatus[] content;
        ClusterStatus status;
        int numNodes, mapSlotsPerNode;
        long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime,
                splitSize;
        float inputBufpcnt;
        FSDataOutputStream out;
        FSDataInputStream in;
        SAMFileReader fileReader;
        InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler;
        double sampling_frequency = 0.01;

        // Job object can be used for Aligner job if enabled
        conf = getConf();
        Job job = new Job(conf);

        parseCommandLineArgs(argv, conf);

        maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks();

        maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks();
        if (!noalign) {
            System.out.println("Starting Alignment Job");
            startTime = System.currentTimeMillis();

            status = new JobClient(new JobConf(conf)).getClusterStatus();
            numNodes = status.getTaskTrackers();
            // Job specific setting of number of Reducers..
            if (nReducers == 0)
                nReducers = numNodes;
            conf.setInt("mapred.reduce.tasks", nReducers);

            Path refPath = new Path(refFileLoc);
            fs = refPath.getFileSystem(conf);
            blockSize = fs.getFileStatus(refPath).getBlockSize();
            splitSize = blockSize; // same block size fetched above; no need to round a long

            if (reads_per_split == 0) {
                inputPath = new Path(readFile1);
                long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen();
                long numSplits = Math.round(readSize / splitSize);

                if (numSplits < maxMapTasks)
                    numSplits = maxMapTasks;

                if (numSplits < nReducers)
                    numSplits = nReducers;

                long numReads = Math.round(readSize / (long) fq_read_size);
                reads_per_split = numReads / numSplits;

                // Total Order Partitioner
                if ((double) reads_per_split <= (1 / sampling_frequency)) {
                    sampling_frequency = 1;
                    granularity = 1;
                } else if (((double) reads_per_split > (1 / sampling_frequency))
                        && ((double) reads_per_split <= (1 / sampling_frequency * 100))) {
                    sampling_frequency = 0.1;
                    granularity = 10;
                }
            }

            job.setJarByClass(GATKJobClient.class);
            job.setInputFormatClass(NLineXInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(fqInput));
            FileOutputFormat.setOutputPath(job, new Path(BWAOutPath));

            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration());
            if (!is_azure) {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"),
                        job.getConfiguration());
            } else {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"),
                        job.getConfiguration());
            }
            DistributedCache.createSymlink(job.getConfiguration());

            // Setting local.cache.size - Add up the size of the files
            // distributed through the cache

            cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen();
            if (!is_azure) {
                cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen();
            }

            if (cacheSize > 8L * 1024 * 1024 * 1024) {
                conf.setLong("local.cache.size", cacheSize + (1L * 1024 * 1024 * 1024));
            }

            conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs..
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setPartitionerClass(BWAPartitioner.class);
            job.setReducerClass(BWAReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            if (job.waitForCompletion(true)) {
                System.out.println("BWA Alignment done");
            }

            content = fs.listStatus(new Path(BWAOutPath));

            for (int i = 0; i < content.length; i++) {
                if (!((content[i].getPath().getName()).endsWith(".bam"))
                        && !((content[i].getPath().getName()).startsWith("_"))) {
                    fs.delete(content[i].getPath(), false);
                }
            }
            endTime = System.currentTimeMillis();
            System.out.println("BWA Alignment took: " + (endTime - startTime));
            startTime = System.currentTimeMillis();
            System.out.println("Starting Splitting BAM Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1");
            FileOutputFormat.setOutputPath(job, output);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setInt("gatk.hadoop.granularity", granularity);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.isindex", false);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("SplittingBAM Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Splitting BAM Indexing took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Sort Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            if (norealign && nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath));
            job.setInputFormatClass(ContigInputFormat.class);
            job.setPartitionerClass(ContigPartitioner.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            for (int i = 0; i < content.length; i++) {
                if (content[i].getPath().getName().endsWith(".bam")) {
                    in = fs.open(content[i].getPath());
                    List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader())
                            .getSequenceDictionary().getSequences();
                    conf.setInt("mapred.reduce.tasks", sequences.size());

                    break;
                }
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            //conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);

            if (job.waitForCompletion(true)) {
                System.out.println("Sort completed successfully");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Sort job took: " + (endTime - startTime));
        }

        if (!norealign) {
            if (!noalign)
                BAMInputPath = SortBWAOutPath;

            startTime = System.currentTimeMillis();
            System.out.println("Starting Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2");
            FileOutputFormat.setOutputPath(job, output);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setBoolean("gatk.hadoop.isindex", true);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Indexing job took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Realigner Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);

            job.setInputFormatClass(BAMInputFormat.class);

            srcFs = new Path(outputDir).getFileSystem(conf);
            if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition")))
                System.out.println("mkdir failed");
            inputDir = new Path(outputDir + Path.SEPARATOR + "Partition");
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
            partition = new Path(inputDir, "_partition");
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(conf, partition);

            try {
                URI partitionURI = new URI(partition.toString() + "#_partition");
                DistributedCache.addCacheFile(partitionURI, conf);
            } catch (URISyntaxException e) {
                assert false;
            }

            if (nReducers == 0) {
                if (!nomarkdup || !noqrecab || !novariant) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10));
                }
            } else {
                conf.setInt("mapred.reduce.tasks", nReducers);
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            if (nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(IndelMapper.class);
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(IndelOutPath));

            sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                    max_splits);
            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
            job.setInputFormatClass(LociInputFormat.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indel realignment done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Indel Realigner took: " + (endTime - startTime));
        }

        if (!nomarkdup || !noqrecab || !novariant) {
            /* 
             * MarkDuplicate and Indexing Job 
             * FixMateInformation is not required as it is handled
             * automatically by GATK after IndelRealignment.
             */
            System.out.println("Starting MarkDup/Indexing job");
            startTime = System.currentTimeMillis();
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            if (!nomarkdup) {
                System.out.println("Starting MarkDuplicates job");
                conf.setBoolean("gatk.hadoop.ismarkdup", true);
                FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath));
            }
            if (!noqrecab || !novariant) {
                conf.setBoolean("gatk.hadoop.issindex", true);
                conf.setBoolean("gatk.hadoop.isindex", true);
                if (nomarkdup) {
                    System.out.println("Starting Indexing job");
                    FileOutputFormat.setOutputPath(job,
                            new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"));
                }
            }
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Markdup/Indexing job done !!!");
            }
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }

            if (!nomarkdup) {
                Path rmdupOutPath = new Path(RmdupOutPath);
                fs = rmdupOutPath.getFileSystem(conf);
                content = fs.listStatus(rmdupOutPath);

                for (int i = 0; i < content.length; i++) {
                    if ((content[i].getPath().getName()).startsWith("part")) {
                        fs.delete(content[i].getPath(), false);
                    }
                }
                endTime = System.currentTimeMillis();
                System.out.println("MarkDuplicates took: " + (endTime - startTime));
            } else {
                endTime = System.currentTimeMillis();
                System.out.println("Indexing took: " + (endTime - startTime));
            }
        }

        if (!noqrecab) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Recal - Count Covariates Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(LociInputFormat.class);

            conf.setLong("local.cache.size", 20106127360L);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.set("gatk.hadoop.outputpath", outputDir);
            // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
            // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ration 3.5:1

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(RecalCovMapper.class);
            job.setCombinerClass(RecalCovCombiner.class);
            job.setReducerClass(RecalCovReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "CovariateOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"),
                    job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("CountCovariates done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("CountCovariates took: " + (endTime - startTime));
        }

        if (!noqrecab || !novariant) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Table Recalibration / Unified Genotyper Job");
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            FileInputFormat.addInputPath(job, inputPath);

            if (!noqrecab) {
                conf.setBoolean("gatk.hadoop.recab", true);
                if (norealign) {
                    job.setInputFormatClass(BAMInputFormat.class);
                    srcFs = new Path(outputDir).getFileSystem(conf);
                    if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition")))
                        System.out.println("mkdir failed");
                } else {
                    job.setInputFormatClass(LociInputFormat.class);
                }
                inputDir = new Path(outputDir + "/" + "Partition");
                inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
                partition = new Path(inputDir, "_partition");
                job.setPartitionerClass(TotalOrderPartitioner.class);
                TotalOrderPartitioner.setPartitionFile(conf, partition);
                try {
                    URI partitionURI = new URI(partition.toString() + "#_partition");
                    DistributedCache.addCacheFile(partitionURI, conf);
                } catch (URISyntaxException e) {
                    assert false;
                }

                if (nReducers == 0) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", nReducers);
                }
                conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                if (!nomresults)
                    conf.setBoolean("gatk.hadoop.ismerge", true);
                job.setReducerClass(SortReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(SAMRecordWritable.class);
                job.setOutputFormatClass(SortOutputFormat.class);
                FileOutputFormat.setOutputPath(job, new Path(RecalOutPath));
            } else {
                job.setInputFormatClass(LociInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 0);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"));
            }

            job.setMapperClass(RecalMapper.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);

            conf.set("gatk.hadoop.outputpath", outputDir);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            if (!novariant) {
                conf.setBoolean("gatk.hadoop.variant", true);
                if (!nofvariant)
                    conf.setBoolean("gatk.hadoop.fvariant", true);
                conf.setInt("gatk.hadoop.nthreads", nThreads);
                conf.setBoolean("gatk.hadoop.xvariant", xVariantCall);
            }

            if (!noqrecab && norealign) {
                sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                        max_splits);
                InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
                job.setInputFormatClass(LociInputFormat.class);
            }

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("TableRecalibration Job done !!");
            }
            endTime = System.currentTimeMillis();
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime));
        }
        if (!novariant && !nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Merge Variant Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut");
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setReducerClass(VariantReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Merge Variants done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("MergeVariant job took: " + (endTime - startTime));

            if (xVariantCall && !novariant && !nomresults) {
                startTime = System.currentTimeMillis();

                System.out.println("Merge INDEL Variant Job");
                job = new Job();
                job.setJarByClass(GATKJobClient.class);
                conf = job.getConfiguration();
                inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut");
                FileInputFormat.addInputPath(job, inputPath);
                job.setInputFormatClass(WholeFileInputFormat.class);

                conf.setInt("mapred.reduce.tasks", 1);
                conf.setLong("mapred.task.timeout", 86400000L);
                conf.setBoolean("mapred.map.tasks.speculative.execution", false);
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

                conf.setBoolean("gatk.hadoop", true);
                conf.setBoolean("gatk.hadoop.isazure", is_azure);
                job.setReducerClass(VariantReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(NullWritable.class);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut"));

                DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
                // Standard inputs
                DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"),
                        job.getConfiguration());

                DistributedCache.createSymlink(job.getConfiguration());

                if (job.waitForCompletion(true)) {
                    System.out.println("Merge INDEL Variants done");
                }
                endTime = System.currentTimeMillis();
                System.out.println("MergeINDELVariant job took: " + (endTime - startTime));
            }
        }

        if (!nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Merge BAM Job");

            outputPath = new Path(FinalBAMPath);
            outFs = outputPath.getFileSystem(conf);

            if (!outFs.mkdirs(outputPath))
                System.out.println("mkdir failed");
            // Currently no support to merge output from MarkDuplicates 
            // from Job Client. Need to have a separate MR job for it.
            if (!noqrecab)
                inputPath = new Path(RecalOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else if (!nomarkdup)
                throw new Exception("Merge not implemented for MarkDuplicates output.");
            else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant)
                inputPath = new Path(BAMInputPath);

            fs = inputPath.getFileSystem(conf);

            content = fs.listStatus(inputPath);
            mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam");

            Path p = null;
            int nfiles = 0;
            for (int i = 0; i < content.length; i++) {
                p = content[i].getPath();
                ++nfiles;
            }

            if (nfiles == 1) {
                boolean rename = fs.rename(p, mergeOutFile);
            } else {
                out = outFs.create(mergeOutFile, true);

                for (int i = 0; i < content.length; i++) {
                    p = content[i].getPath();
                    if ((p.getName()).endsWith(".bam")) {
                        in = fs.open(p);
                        IOUtils.copyBytes(in, out, conf, false);
                        in.close();
                    }
                }

                out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
                out.close();
            }

            endTime = System.currentTimeMillis();
            System.out.println("Final Merge took: " + (endTime - startTime));
        }
        System.out.println("JobCompleted");
    } catch (IOException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (InterruptedException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (ClassNotFoundException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (Exception e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    }
    return 0;
}

From source file:org.deeplearning4j.utils.ShowData2UIServer.java

License:Apache License

public static void main(String[] args) throws Exception {

    // load the saved training stats and display them in the browser at http://localhost:9000

    File statsFile = null;
    if (hdfsPath) {
        statsFile = File.createTempFile("tmp", "dl4j");
        OutputStream os = new FileOutputStream(statsFile);
        FileSystem fs = CommonUtils.openHdfsConnect();
        InputStream in = fs.open(new Path("/user/hadoop/trainlog/AnimalModelByHdfsTrainingStatsSpark2.dl4j"));
        IOUtils.copyBytes(in, os, 4096, false); // copy the stats file out of HDFS; streams are closed explicitly below
        IOUtils.closeStream(in);
        CommonUtils.closeHdfsConnect(fs);
        os.close();
    } else {
        statsFile = new File("/home/AnimalModelByHdfsTrainingStats1.dl4j");
    }
    StatsStorage statsStorage = new FileStatsStorage(statsFile);
    UIServer uiServer = UIServer.getInstance();
    uiServer.attach(statsStorage);
}

From source file:org.hadoop.tdg.TestPseudoHadoop.java

License:Apache License

public void copyFileWithProgress() throws IOException {
    InputStream in = null;
    FSDataOutputStream out = null;
    try {
        in = new BufferedInputStream(new FileInputStream(HOME_FILE));
        //            FileSystem fs = FileSystem.get(URI.create(DST), conf);
        out = fs.create(p, new Progressable() {
            @Override
            public void progress() {
                System.out.print("~");
            }
        });

        IOUtils.copyBytes(in, out, 4096, true);
        //            Assert.assertTrue(fs.getFileStatus(p).getLen() == );
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}

From source file:org.hadoop.tdg.TestPseudoHadoop.java

License:Apache License

private void printStream(InputStream is) throws IOException {
    File f1 = new File(HOME_FILE);
    File f2 = new File(HOME + "/test.cpy");
    FileOutputStream fos = null;
    try {
        fos = new FileOutputStream(f2);
        IOUtils.copyBytes(is, fos, 4096, false);
        Files.equal(f1, f2);
    } finally {
        IOUtils.closeStream(is);
        IOUtils.closeStream(fos);
    }
}

From source file:org.hadoop.tdg.TestPseudoHadoop.java

License:Apache License

@Test
public void writeAndReadBzipCompressed() throws IOException {
    BZip2Codec codec = new BZip2Codec();
    String ext = codec.getDefaultExtension();
    Path p = new Path(DST_FILE + ext);
    File f1 = new File(HOME_FILE);
    File f2 = new File(HOME_FILE + ext);
    //writing compressed to hdfs
    CompressionOutputStream cout = codec.createOutputStream(fs.create(p));
    IOUtils.copyBytes(new FileInputStream(f1), cout, 4096, false);
    Assert.assertTrue(
            fs.getFileStatus(p).getPath().equals(new Path(fs.getUri().toString(), p.toUri().toString())));

    //reading it back, decompressing, and checking the content matches the original
    FSDataInputStream dis = fs.open(p);
    CompressionInputStream cin = codec.createInputStream(dis);
    IOUtils.copyBytes(cin, new FileOutputStream(f2), 4096, false);
    Assert.assertTrue(Files.equal(f1, f2));
}

From source file:org.mrgeo.data.raster.RasterWritable.java

License:Apache License

public static MrGeoRaster toMrGeoRaster(final RasterWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.bytes, 0, writable.getSize());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);

    return toMrGeoRaster(new RasterWritable(baos.toByteArray()));
}

From source file:org.mrgeo.hdfs.utils.HadoopFileUtils.java

License:Apache License

@SuppressWarnings("squid:S2095") // hadoop FileSystem cannot be closed, or else subsequent uses will fail
public static void copyFileToHdfs(final String fromFile, final String toFile, final boolean overwrite)
        throws IOException {
    final Path toPath = new Path(toFile);
    final Path fromPath = new Path(fromFile);
    final FileSystem srcFS = HadoopFileUtils.getFileSystem(toPath);
    final FileSystem dstFS = HadoopFileUtils.getFileSystem(fromPath);

    final Configuration conf = HadoopUtils.createConfiguration();
    InputStream in = null;
    OutputStream out = null;
    try {
        in = srcFS.open(fromPath);
        out = dstFS.create(toPath, overwrite);

        IOUtils.copyBytes(in, out, conf, true);
    } catch (final IOException e) {
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
        throw e;
    }
}

From source file:org.mrgeo.vector.mrsvector.VectorTileWritable.java

License:Apache License

public static VectorTile toMrsVector(final VectorTileWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.getBytes(), 0, writable.getLength());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);

    byte[] data = baos.toByteArray();
    return VectorTile.fromProtobuf(data, 0, data.length);
}

From source file:org.seqdoop.hadoop_bam.cli.plugins.Cat.java

License:Open Source License

@Override
protected int run(final CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("cat :: OUTPATH not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("cat :: no INPATHs given.");
        return 3;
    }

    final Path outPath = new Path(args.get(0));

    final List<String> ins = args.subList(1, args.size());

    final boolean verbose = parser.getBoolean(verboseOpt);

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
    if (stringency == null)
        return 3;

    final Configuration conf = getConf();

    // Expand the glob patterns.

    final List<Path> inputs = new ArrayList<Path>(ins.size());
    for (final String in : ins) {
        try {
            final Path p = new Path(in);
            for (final FileStatus fstat : p.getFileSystem(conf).globStatus(p))
                inputs.add(fstat.getPath());
        } catch (IOException e) {
            System.err.printf("cat :: Could not expand glob pattern '%s': %s\n", in, e.getMessage());
        }
    }

    final Path input0 = inputs.get(0);

    // Infer the format from the first input path or contents.

    SAMFormat format = SAMFormat.inferFromFilePath(input0);
    if (format == null) {
        try {
            format = SAMFormat.inferFromData(input0.getFileSystem(conf).open(input0));
        } catch (IOException e) {
            System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
            return 4;
        }
        if (format == null) {
            System.err.printf("cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
            return 4;
        }
    }

    // Choose the header.

    final SAMFileHeader header;
    try {
        final SAMFileReader r = new SAMFileReader(input0.getFileSystem(conf).open(input0));

        header = r.getFileHeader();
        r.close();
    } catch (IOException e) {
        System.err.printf("cat :: Could not read input '%s': %s\n", input0, e.getMessage());
        return 5;
    }

    // Open the output.

    final OutputStream out;

    try {
        out = outPath.getFileSystem(conf).create(outPath);
    } catch (IOException e) {
        System.err.printf("cat :: Could not create output file: %s\n", e.getMessage());
        return 6;
    }

    // Output the header.

    try {
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(out, format, header);

    } catch (IOException e) {
        System.err.printf("cat :: Outputting header failed: %s\n", e.getMessage());
        return 7;
    }

    // Output the records from each file in the order given, converting if
    // necessary.

    int inIdx = 1;
    try {
        for (final Path inPath : inputs) {
            if (verbose) {
                System.out.printf("cat :: Concatenating path %d of %d...\n", inIdx++, inputs.size());
            }
            switch (format) {
            case SAM: {
                final InputStream in = inPath.getFileSystem(conf).open(inPath);

                // Use SAMFileReader to grab the header, but ignore it, thus
                // ensuring that the header has been skipped.
                new SAMFileReader(in).getFileHeader();

                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            case BAM: {
                final FSDataInputStream in = inPath.getFileSystem(conf).open(inPath);

                // Find the block length, thankfully given to us by the BGZF
                // format. We need it in order to know how much gzipped data to
                // read after skipping the BAM header, so that we can only read
                // that much and then simply copy the remaining gzip blocks
                // directly.

                final ByteBuffer block = ByteBuffer.wrap(new byte[0xffff]).order(ByteOrder.LITTLE_ENDIAN);

                // Don't use readFully here, since EOF is fine.
                for (int read = 0, prev; (prev = in.read(block.array(), read, block.capacity() - read)) < block
                        .capacity();) {
                    // EOF is fine.
                    if (prev == -1)
                        break;
                    read += prev;
                }

                // Find the BGZF subfield and extract the length from it.
                int blockLength = 0;
                for (int xlen = (int) block.getShort(10) & 0xffff, i = 12, end = i + xlen; i < end;) {
                    final int slen = (int) block.getShort(i + 2) & 0xffff;
                    if (block.getShort(i) == 0x4342 && slen == 2) {
                        blockLength = ((int) block.getShort(i + 4) & 0xffff) + 1;
                        break;
                    }
                    i += 4 + slen;
                }
                if (blockLength == 0)
                    throw new IOException("BGZF extra field not found in " + inPath);

                if (verbose) {
                    System.err.printf("cat ::   first block length %d\n", blockLength);
                }

                // Skip the BAM header. Can't use SAMFileReader because it'll
                // use its own BlockCompressedInputStream.

                final ByteArrayInputStream blockIn = new ByteArrayInputStream(block.array(), 0, blockLength);

                final BlockCompressedInputStream bin = new BlockCompressedInputStream(blockIn);

                // Theoretically we could write into the ByteBuffer we already
                // had, since BlockCompressedInputStream needs to read the
                // header before it can decompress any data and thereafter we
                // can freely overwrite the first 8 bytes of the header... but
                // that's a bit too nasty, so let's not.
                final ByteBuffer buf = ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

                // Read the BAM magic number and the SAM header length, verify
                // the magic, and skip the SAM header.

                IOUtils.readFully(bin, buf.array(), 0, 8);

                final int magic = buf.getInt(0), headerLen = buf.getInt(4);

                if (magic != 0x014d4142)
                    throw new IOException("bad BAM magic number in " + inPath);

                IOUtils.skipFully(bin, headerLen);

                // Skip the reference sequences.

                IOUtils.readFully(bin, buf.array(), 0, 4);

                for (int i = buf.getInt(0); i-- > 0;) {
                    // Read the reference name length and skip it along with the
                    // reference length.
                    IOUtils.readFully(bin, buf.array(), 0, 4);
                    IOUtils.skipFully(bin, buf.getInt(0) + 4);
                }

                // Recompress the rest of this gzip block.

                final int remaining = bin.available();

                if (verbose)
                    System.err.printf("cat ::   %d bytes to bgzip\n", remaining);

                if (remaining > 0) {
                    // The overload of IOUtils.copyBytes that takes "long length"
                    // was added only in Hadoop 0.20.205.0, which we don't want
                    // to depend on, so copy manually.
                    final byte[] remBuf = new byte[remaining];
                    IOUtils.readFully(bin, remBuf, 0, remBuf.length);

                    final BlockCompressedOutputStream bout = new BlockCompressedOutputStream(out, null);

                    bout.write(remBuf);
                    bout.flush();
                }

                // Just copy the raw bytes comprising the remaining blocks.

                in.seek(blockLength);
                IOUtils.copyBytes(in, out, conf, false);
                in.close();
                break;
            }
            }
        }
    } catch (IOException e) {
        System.err.printf("cat :: Outputting records failed: %s\n", e.getMessage());
        return 8;
    }

    // For BAM, output the BGZF terminator.

    try {
        if (format == SAMFormat.BAM)
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

        out.close();
    } catch (IOException e) {
        System.err.printf("cat :: Finishing output failed: %s\n", e.getMessage());
        return 9;
    }
    return 0;
}