org.broadinstitute.sting.gatk.hadoop.SortOutputFormat.java Source code

Introduction

Here is the source code for org.broadinstitute.sting.gatk.hadoop.SortOutputFormat.java. SortOutputFormat extends KeyIgnoringBAMOutputFormat<NullWritable>: before writing any records, it reads the SAM header from the first .bam file found in the job's input directory, decides per task whether to emit that header (controlled by the gatk.hadoop.ismerge setting), prefixes each task's output file name with a zero-padded task number, and overrides checkOutputSpecs so that an already-existing output directory is allowed.

Source

/* Copyright (c) 2011 Aalto University
 * Copyright (c) 2012-2013 by Virginia Polytechnic Institute and State
 * University
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.hadoop;

import java.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.broadinstitute.sting.gatk.hadoop.hbamsrc.SAMRecordWritable;

public class SortOutputFormat extends KeyIgnoringBAMOutputFormat<NullWritable> {

    public SortOutputFormat() {
    }

    // Before the first record is written, read the SAM header from an input
    // .bam file and decide whether this task should write it to its output.
    @Override
    public RecordWriter<NullWritable, SAMRecordWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException {
        int pos = 0;
        if (context == null) {
            throw new IOException("context is NULL");
        }

        Path[] p = FileInputFormat.getInputPaths(context);
        assert (p.length > 0);

        FileSystem fs = p[0].getFileSystem(context.getConfiguration());
        FileStatus[] status = fs.listStatus(p[0]);

        // Find the first .bam file under the first input path; its SAM header
        // is reused for the output BAM.
        for (int i = 0; i < status.length; i++) {
            if ((status[i].getPath().getName()).endsWith(".bam")) {
                pos = i;
                break;
            }
        }

        if (super.header == null) {
            readSAMHeaderFrom(status[pos].getPath(), fs);
            // Write the header from every task unless this job is a merge pass
            // (gatk.hadoop.ismerge), in which case only task 0 emits it.
            boolean isMerge = context.getConfiguration().getBoolean("gatk.hadoop.ismerge", false);
            if (!isMerge || context.getTaskAttemptID().getTaskID().getId() == 0) {
                setWriteHeader(true);
            }
        }
        return super.getRecordWriter(context);
    }

    // Name each task's output file with a zero-padded task number prefix
    // followed by the task attempt ID, so the per-task pieces are easy to
    // order and identify.
    @Override
    public Path getDefaultWorkFile(TaskAttemptContext context, String ext) throws IOException {
        String filename = context.getTaskAttemptID().toString();
        String extension = ext.isEmpty() ? ext : "." + ext;
        int part = context.getTaskAttemptID().getTaskID().getId();
        return new Path(super.getDefaultWorkFile(context, ext).getParent(),
                String.format("%06d", part) + "-" + filename + extension);
    }

    // Allow the output directory to exist already, so that multiple jobs can
    // write into it.
    @Override
    public void checkOutputSpecs(JobContext job) throws FileAlreadyExistsException, IOException {
    }
}
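
Example

The listing above does not include a job driver. Below is a minimal, hypothetical sketch of how a Hadoop 2.x driver might wire in SortOutputFormat. The SortJobDriver class name, the command-line paths, and the bare job setup are illustrative assumptions (the project's own input format and mapper classes are omitted); only the gatk.hadoop.ismerge key and the NullWritable/SAMRecordWritable output types are taken from the source above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.broadinstitute.sting.gatk.hadoop.SortOutputFormat;
import org.broadinstitute.sting.gatk.hadoop.hbamsrc.SAMRecordWritable;

// Hypothetical driver: class name and job structure are illustrative only.
public class SortJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Read by SortOutputFormat.getRecordWriter(): when false, every task
        // writes the SAM header; when true, only task 0 writes it.
        conf.setBoolean("gatk.hadoop.ismerge", false);

        Job job = Job.getInstance(conf, "bam-sort");
        job.setJarByClass(SortJobDriver.class);

        // The input directory must contain at least one .bam file, because
        // SortOutputFormat reads its SAM header from the first one it finds.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputFormatClass(SortOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

In such a setup, leaving gatk.hadoop.ismerge at false makes every task write a header into its own output file, while setting it to true restricts the header to the file produced by task 0, which matches the behaviour visible in getRecordWriter above.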