fi.tkk.ics.hadoop.bam.cli.Utils.java Source code


Introduction

Here is the source code for fi.tkk.ics.hadoop.bam.cli.Utils.java, a collection of command-line helper utilities from the Hadoop-BAM project.

Source

// Copyright (c) 2011 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2011-06-14 14:52:20

package fi.tkk.ics.hadoop.bam.cli;

import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.CodeSource;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

import net.sf.picard.sam.ReservedTagConstants;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.BlockCompressedStreamConstants;

import fi.tkk.ics.hadoop.bam.SAMFormat;
import fi.tkk.ics.hadoop.bam.util.SAMOutputPreparer;
import fi.tkk.ics.hadoop.bam.util.Timer;

import hbparquet.hadoop.util.ContextUtil;

public final class Utils {
    public static void printWrapped(PrintStream out, String str) {
        printWrapped(out, str, 0);
    }

    public static void printWrapped(PrintStream out, String str, int indent) {
        printWrapped(out, str, indent, 80);
    }

    public static void printWrapped(PrintStream out, String str, int indent, int wrapAt) {
        int pos = indent;

        final Scanner words = new Scanner(str);

        // Print the first word out here so that a first word which would
        // wrap on its own doesn't cause a spurious leading line break.
        if (words.hasNext()) {
            final String word = words.next();
            out.print(word);
            pos += word.length();
        }
        boolean addSpace = true;
        while (words.hasNext()) {
            final String word = words.next();
            final int wend = words.match().end();

            pos += word.length();
            if (addSpace)
                ++pos;

            if (pos < wrapAt) {
                if (addSpace)
                    out.print(' ');
                out.print(word);
            } else {
                pos = indent + word.length();
                out.print('\n');
                for (int i = indent; i-- > 0;)
                    out.print(' ');
                out.print(word);
            }

            if (wend < str.length() && str.charAt(wend) == '\n') {
                pos = indent;
                addSpace = false;

                int i = wend;
                do
                    out.print('\n');
                while (++i < str.length() && str.charAt(i) == '\n');

                for (i = indent; i-- > 0;)
                    out.print(' ');
            } else
                addSpace = true;
        }
        out.print('\n');
    }
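
    // Hypothetical usage (not part of the original file): wrap a long help
    // string at the default 80 columns with a 4-space hanging indent. Note
    // that the caller is expected to have already printed the first `indent`
    // columns; only continuation lines are padded with spaces.
    //
    //     Utils.printWrapped(System.out,
    //             "--verbose  print a progress report for each input file", 4);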

    private static String argv0 = null;
    private static Class<?> argv0Class = null;

    // For printing something intelligent in "Usage: argv0 <args>" messages.
    public static String getArgv0() {
        if (argv0 != null)
            return argv0;
        if (argv0Class == null)
            return null;

        final CodeSource cs = argv0Class.getProtectionDomain().getCodeSource();
        if (cs == null)
            return null;

        final String path = cs.getLocation().getPath();
        if (path.endsWith("/")) {
            // Typically (always?) a .class file loaded from the directory in
            // path.
            argv0 = argv0Class.getSimpleName();
        } else {
            // Typically (always?) a .jar file.
            argv0 = new File(path).getName();
        }

        return argv0;
    }

    public static void setArgv0Class(Class<?> cl) {
        argv0Class = cl;
        argv0 = null;
    }
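
    // Hypothetical usage (MyTool is made up): a main class registers itself
    // once at startup so that usage messages can name the .jar file or class
    // it was launched from:
    //
    //     Utils.setArgv0Class(MyTool.class);
    //     System.err.println("Usage: " + Utils.getArgv0() + " [options] <input.bam>");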

    public static void configureSampling(Path workDir, String outName, Configuration conf) throws IOException {
        final Path partition = workDir.getFileSystem(conf)
                .makeQualified(new Path(workDir, "_partitioning" + outName));

        TotalOrderPartitioner.setPartitionFile(conf, partition);
        try {
            final URI partitionURI = new URI(partition.toString() + "#" + partition.getName());

            if (partitionURI.getScheme().equals("file"))
                return;

            DistributedCache.addCacheFile(partitionURI, conf);
            DistributedCache.createSymlink(conf);
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }
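
    // Sketch of a caller (the job, work directory and output name are made
    // up): the driver points TotalOrderPartitioner at the partition file
    // before submitting a sorting job; the file is then shipped to the tasks
    // through the distributed cache unless it lives on the local file system:
    //
    //     final Configuration conf = job.getConfiguration();
    //     Utils.configureSampling(new Path("/tmp/sort-work"), "sorted.bam", conf);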

    public static final String WORK_FILENAME_PROPERTY = "hadoopbam.work.filename";

    /** Returns a name that mergeInto() will recognize as a file to be merged.
     *
     * The filename is the value of WORK_FILENAME_PROPERTY surrounded by
     * basePrefix and basePostfix, followed by a six-digit task number and,
     * if the extension is non-empty, a dot and the given extension.
     */
    public static Path getMergeableWorkFile(Path directory, String basePrefix, String basePostfix,
            TaskAttemptContext ctx, String extension) {
        return new Path(directory,
                basePrefix + ContextUtil.getConfiguration(ctx).get(WORK_FILENAME_PROPERTY) + basePostfix + "-"
                        + String.format("%06d", ctx.getTaskAttemptID().getTaskID().getId())
                        + (extension.isEmpty() ? extension : "." + extension));
    }
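
    // Sketch (workDir and the TaskAttemptContext ctx come from the caller):
    // task 4 of a job writing BAM parts would get a path of the form
    // workDir/<prefix><filename><postfix>-000004.bam:
    //
    //     final Path part = Utils.getMergeableWorkFile(workDir, "", "", ctx, "bam");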

    /** Merges the files in the given directory that have names given by
     * getMergeableWorkFile() into out.
     *
     * Outputs progress reports if commandName is non-null.
     */
    public static void mergeInto(OutputStream out, Path directory, String basePrefix, String basePostfix,
            Configuration conf, String commandName) throws IOException {
        final FileSystem fs = directory.getFileSystem(conf);

        final FileStatus[] parts = fs.globStatus(new Path(directory,
                basePrefix + conf.get(WORK_FILENAME_PROPERTY) + basePostfix + "-[0-9][0-9][0-9][0-9][0-9][0-9]*"));

        int i = 0;
        Timer t = new Timer();
        for (final FileStatus part : parts) {
            if (commandName != null) {
                System.out.printf("%s :: Merging part %d (size %d)...", commandName, ++i, part.getLen());
                System.out.flush();

                t.start();
            }

            final InputStream in = fs.open(part.getPath());
            IOUtils.copyBytes(in, out, conf, false);
            in.close();

            if (commandName != null)
                System.out.printf(" done in %d.%03d s.\n", t.stopS(), t.fms());
        }
        for (final FileStatus part : parts)
            fs.delete(part.getPath(), false);
    }
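
    // Sketch (dest is a made-up local target path): concatenate every part
    // under workDir into a single file, printing one progress line per part
    // for the command "sort":
    //
    //     final OutputStream out = FileSystem.getLocal(conf).create(dest);
    //     Utils.mergeInto(out, workDir, "", "", conf, "sort");
    //     out.close();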

    private static final String HEADERMERGER_SORTORDER_PROP = "hadoopbam.headermerger.sortorder";

    public static void setHeaderMergerSortOrder(Configuration conf, SAMFileHeader.SortOrder order) {
        conf.set(HEADERMERGER_SORTORDER_PROP, order.name());
    }

    public static final String HEADERMERGER_INPUTS_PROPERTY = "hadoopbam.headermerger.inputs";

    private static SamFileHeaderMerger headerMerger = null;

    /** Computes the merger of the SAM headers in the files listed in
     * HEADERMERGER_INPUTS_PROPERTY. The sort order of the result is set
     * according to the last call to setHeaderMergerSortOrder, or otherwise
     * to "unsorted".
     *
     * The result is cached locally to prevent it from being recomputed too
     * often.
     */
    public static SamFileHeaderMerger getSAMHeaderMerger(Configuration conf) throws IOException {
        // TODO: it would be preferable to cache this beforehand instead of
        // having every task read the header block of every input file. But that
        // would be trickier, given that SamFileHeaderMerger isn't trivially
        // serializable.

        // Save it in a static field, though, in case that helps anything.
        if (headerMerger != null)
            return headerMerger;

        final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

        for (final String in : conf.getStrings(HEADERMERGER_INPUTS_PROPERTY)) {
            final Path p = new Path(in);

            final SAMFileReader r = new SAMFileReader(p.getFileSystem(conf).open(p));
            headers.add(r.getFileHeader());
            r.close();
        }

        final String orderStr = conf.get(HEADERMERGER_SORTORDER_PROP);
        final SAMFileHeader.SortOrder order = orderStr == null ? SAMFileHeader.SortOrder.unsorted
                : SAMFileHeader.SortOrder.valueOf(orderStr);

        return headerMerger = new SamFileHeaderMerger(order, headers, true);
    }
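
    // Sketch (the input paths are made up): the driver lists the inputs and
    // the desired sort order in the configuration; every task then obtains
    // the same cached merger:
    //
    //     conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY,
    //                     "hdfs:///data/a.bam", "hdfs:///data/b.bam");
    //     Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.coordinate);
    //     final SAMFileHeader merged =
    //         Utils.getSAMHeaderMerger(conf).getMergedHeader();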

    /** Changes the given SAMRecord as appropriate for being placed in a file
     * whose header is getSAMHeaderMerger(conf).getMergedHeader().
     */
    public static void correctSAMRecordForMerging(SAMRecord r, Configuration conf) throws IOException {
        if (headerMerger == null)
            getSAMHeaderMerger(conf); // initializes the cached headerMerger

        final SAMFileHeader h = r.getHeader();

        // Correct the reference indices, and thus the key, if necessary.
        if (headerMerger.hasMergedSequenceDictionary()) {
            final int ri = headerMerger.getMergedSequenceIndex(h, r.getReferenceIndex());

            r.setReferenceIndex(ri);
            if (r.getReadPairedFlag())
                r.setMateReferenceIndex(headerMerger.getMergedSequenceIndex(h, r.getMateReferenceIndex()));
        }

        // Correct the program group if necessary.
        if (headerMerger.hasProgramGroupCollisions()) {
            final String pg = (String) r.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
            if (pg != null)
                r.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, headerMerger.getProgramGroupId(h, pg));
        }

        // Correct the read group if necessary.
        if (headerMerger.hasReadGroupCollisions()) {
            final String rg = (String) r.getAttribute(ReservedTagConstants.READ_GROUP_ID);
            if (rg != null)
                r.setAttribute(ReservedTagConstants.READ_GROUP_ID, headerMerger.getReadGroupId(h, rg));
        }
    }
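
    // Sketch: a map task would rewrite each record against the merged header
    // before emitting it, so that reference indices and RG/PG tags remain
    // valid in the merged output:
    //
    //     Utils.correctSAMRecordForMerging(record, conf);
    //     // record is now consistent with
    //     // getSAMHeaderMerger(conf).getMergedHeader().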

    /** Merges the files in the given directory that have names given by
     * getMergeableWorkFile() into out in the given SAMFormat, using
     * getSAMHeaderMerger().getMergedHeader() as the header.
     *
     * Outputs progress reports if commandName is non-null.
     */
    public static void mergeSAMInto(Path out, Path directory, String basePrefix, String basePostfix,
            SAMFormat format, Configuration conf, String commandName) throws IOException {
        Timer t = new Timer();
        if (commandName != null) {
            System.out.printf("%s :: Merging output...\n", commandName);
            t.start();
        }

        final OutputStream outs = out.getFileSystem(conf).create(out);

        // First, place the SAM or BAM header.
        //
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(outs, format, getSAMHeaderMerger(conf).getMergedHeader());

        // Then, the actual SAM or BAM contents.
        mergeInto(outs, directory, basePrefix, basePostfix, conf, commandName);

        // And if BAM, the BGZF terminator.
        if (format == SAMFormat.BAM)
            outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

        outs.close();

        System.out.printf("%s :: Merging complete in %d.%03d s.\n", commandName, t.stopS(), t.fms());
    }
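
    // Sketch (the output path and command name are made up): after the job
    // completes, the driver collapses the task outputs into one BAM file,
    // header first, BGZF terminator last:
    //
    //     Utils.mergeSAMInto(new Path("hdfs:///results/out.bam"), workDir,
    //                        "", "", SAMFormat.BAM, conf, "sort");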

    private static String stringencyOptHelp = null;

    public static String getStringencyOptHelp() {
        if (stringencyOptHelp != null)
            return stringencyOptHelp;

        final StringBuilder sb = new StringBuilder("set Picard's validation stringency to S (");

        String last = null;
        for (final SAMFileReader.ValidationStringency v : SAMFileReader.ValidationStringency.values()) {
            if (last != null) {
                sb.append(last);
                sb.append(", ");
            }
            last = v.name();
        }
        sb.append("or ");
        sb.append(last);
        sb.append(')');
        return stringencyOptHelp = sb.toString();
    }

    /** Sets the default validation stringency in addition to returning it. */
    public static SAMFileReader.ValidationStringency toStringency(Object o, String cmdName) {
        final String s = (String) o;
        try {
            final SAMFileReader.ValidationStringency stringency = SAMFileReader.ValidationStringency.valueOf(s);
            SAMFileReader.setDefaultValidationStringency(stringency);
            return stringency;
        } catch (IllegalArgumentException e) {
            System.err.printf("%s :: invalid stringency '%s'\n", cmdName, s);
            return null;
        }
    }
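
    // Sketch: parse a --validation-stringency option value; an invalid value
    // yields null after an error message prefixed with the command name:
    //
    //     final SAMFileReader.ValidationStringency v =
    //         Utils.toStringency("SILENT", "sort");
    //     if (v == null)
    //         return; // hypothetical error handling in the caller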
}