edu.cornell.med.icb.goby.reads.ColorSpaceConverter.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.reads.ColorSpaceConverter.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This file is part of the Goby IO API.
 *
 *     The Goby IO API is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU Lesser General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     The Goby IO API is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU Lesser General Public License for more details.
 *
 *     You should have received a copy of the GNU Lesser General Public License
 *     along with the Goby IO API.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.reads;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.stringparsers.FileStringParser;
import edu.cornell.med.icb.goby.modes.CompactToFastaMode;
import edu.cornell.med.icb.parsers.FastaParser;
import it.unimi.dsi.fastutil.chars.Char2IntArrayMap;
import it.unimi.dsi.fastutil.chars.Char2IntMap;
import it.unimi.dsi.fastutil.chars.Char2ObjectArrayMap;
import it.unimi.dsi.fastutil.chars.Char2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2CharArrayMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Date;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;

/**
 * Convert a sequence to color space. Bases that are not recognized result in code 7
 * (i.e., digram 'A?' will be encoded as 7).
 *
 * @author Fabien Campagne
 *         Date: May 18, 2009
 *         Time: 5:32:52 PM
 */
public final class ColorSpaceConverter {
    private static final Char2ObjectMap<Char2IntMap> CONVERSION_MAP;
    private static final Int2ObjectMap<Int2CharArrayMap> DECODING_MAP;
    public static final int UNKNOWN = 7;
    public static final char UNKNOWN_BASE = 'N';
    static {
        CONVERSION_MAP = new Char2ObjectArrayMap<Char2IntMap>();
        CONVERSION_MAP.put('A', new Char2IntArrayMap());
        CONVERSION_MAP.put('C', new Char2IntArrayMap());
        CONVERSION_MAP.put('T', new Char2IntArrayMap());
        CONVERSION_MAP.put('G', new Char2IntArrayMap());
        CONVERSION_MAP.put('N', new Char2IntArrayMap());
        for (final Char2IntMap map : CONVERSION_MAP.values()) {
            map.defaultReturnValue(UNKNOWN);
        }
        DECODING_MAP = new Int2ObjectArrayMap<Int2CharArrayMap>();
        DECODING_MAP.put(0, new Int2CharArrayMap());
        DECODING_MAP.put(1, new Int2CharArrayMap());
        DECODING_MAP.put(2, new Int2CharArrayMap());
        DECODING_MAP.put(3, new Int2CharArrayMap());
        DECODING_MAP.put(4, new Int2CharArrayMap());
        DECODING_MAP.put(5, new Int2CharArrayMap());
        DECODING_MAP.put(6, new Int2CharArrayMap());

        for (final Int2CharArrayMap map : DECODING_MAP.values()) {
            map.defaultReturnValue(UNKNOWN_BASE);
        }
        push("AA", 0);
        push("CC", 0);
        push("GG", 0);
        push("TT", 0);
        push("AC", 1);
        push("CA", 1);
        push("GT", 1);
        push("TG", 1);
        push("AG", 2);
        push("CT", 2);
        push("GA", 2);
        push("TC", 2);
        push("AT", 3);
        push("CG", 3);
        push("GC", 3);
        push("TA", 3);
        push("AN", 4);
        push("CN", 4);
        push("GN", 4);
        push("TN", 4);
        push("NA", 5);
        push("NC", 5);
        push("NG", 5);
        push("NT", 5);
        push("NN", 6);

    }

    /**
     * Private constructor for utility class.
     */
    private ColorSpaceConverter() {
        super();
    }

    private static void push(final String s, final int colorCode) {
        final char firstBase = s.charAt(0);
        final char secondBase = s.charAt(1);
        CONVERSION_MAP.get(firstBase).put(secondBase, colorCode);
        DECODING_MAP.get(colorCode).put(firstBase, secondBase);
    }

    /**
     * Return the base corresponding to the previous base and color code transition.
     *
     * @param previousBase Base in sequence space at the previous position.
     * @param colorCode    Transition in color space.
     * @return
     */
    public static char decodeColor(final char previousBase, final char colorCode) {
        return DECODING_MAP.get(colorCode).get(previousBase);
    }

    private static int getColorCode(final char firstBase, final char secondBase) {
        final Char2IntMap intMap = CONVERSION_MAP.get(firstBase);
        if (intMap == null) {
            return UNKNOWN;
        }
        return intMap.get(secondBase);
    }

    /**
     * Converts a sequence into the equivalent sequence in color space.
     *
     * @param input  The sequence to be converted
     * @param output where the converted sequence should be placed
     */
    public static void convert(final CharSequence input, final MutableString output) {
        convert(input, output, false);
    }

    /**
     * Converts a sequence into the equivalent sequence in color space.
     *
     * @param input  The sequence to be converted
     * @param output where the converted sequence should be placed
     * @param anyN   if true, converts color space codes larger or equal to 4 to 'N' characters.
     */
    public static void convert(final CharSequence input, final MutableString output, final boolean anyN) {
        assert output != null : "The output location must not be null";

        output.setLength(0);
        if (input != null) {
            int position = 0;
            final int length = input.length() - 1; // -1 since we enumerate digrams
            if (input.length() > 0) {

                output.setLength(position + 1);
                output.setCharAt(position++, input.charAt(0));
            }
            for (int index = 0; index < length; ++index) {
                final char code = Character.forDigit(getColorCode(input.charAt(index), input.charAt(index + 1)),
                        10);
                output.setLength(position + 1);
                output.setCharAt(position++, (code >= '4') ? (anyN ? 'N' : code) : code);
            }
            output.setLength(position);
        }
    }

    public static String getColorSpaceSubstitutionMatrix() {
        return "# Substitution matrix to align in color space. Perfect \n"
                + "# matches score 2, mimatches between valid base bigrams (0-3) score -1.\n" + "\n"
                + "      0     1     2     3\n" + "0     2   -1    -1    -1\n" + "1    -1    2    -1    -1\n"
                + "2    -1   -1     2    -1\n" + "3    -1   -1    -1     2\n";
    }

    public static void main(final String[] args) throws JSAPException, IOException {
        final JSAP jsap = new JSAP();

        final FlaggedOption sequenceOption = new FlaggedOption("input");
        sequenceOption.setRequired(true);
        sequenceOption.setLongFlag("input");
        sequenceOption.setShortFlag('i');
        sequenceOption.setStringParser(FileStringParser.getParser().setMustBeFile(true).setMustExist(true));
        sequenceOption.setHelp("The input file (in Fasta format) to convert");
        jsap.registerParameter(sequenceOption);

        final FlaggedOption outputOption = new FlaggedOption("output");
        outputOption.setRequired(false);
        outputOption.setLongFlag("output");
        outputOption.setShortFlag('o');
        outputOption.setStringParser(FileStringParser.getParser().setMustBeFile(true));
        outputOption.setHelp("The output file to write to (default = stdout)");
        jsap.registerParameter(outputOption);

        final FlaggedOption titleOption = new FlaggedOption("title");
        titleOption.setRequired(false);
        titleOption.setLongFlag("title");
        titleOption.setShortFlag('t');
        titleOption.setHelp("Title for this conversion");
        jsap.registerParameter(titleOption);

        final Switch verboseOption = new Switch("verbose");
        verboseOption.setLongFlag("verbose");
        verboseOption.setShortFlag('v');
        verboseOption.setHelp("Verbose output");
        jsap.registerParameter(verboseOption);

        final Switch helpOption = new Switch("help");
        helpOption.setLongFlag("help");
        helpOption.setShortFlag('h');
        helpOption.setHelp("Print this message");
        jsap.registerParameter(helpOption);

        jsap.setUsage("Usage: " + ColorSpaceConverter.class.getName() + " " + jsap.getUsage());

        final JSAPResult result = jsap.parse(args);

        if (result.getBoolean("help")) {
            System.out.println(jsap.getHelp());
            System.exit(0);
        }

        if (!result.success()) {
            final Iterator<String> errors = result.getErrorMessageIterator();
            while (errors.hasNext()) {
                System.err.println(errors.next());
            }
            System.err.println(jsap.getUsage());
            System.exit(1);
        }

        final boolean verbose = result.getBoolean("verbose");

        final File sequenceFile = result.getFile("input");
        if (verbose) {
            System.out.println("Reading sequence from: " + sequenceFile);
        }

        // extract the title to use for the output header
        final String title;
        if (result.contains("title")) {
            title = result.getString("title");
        } else {
            title = sequenceFile.getName();
        }

        Reader inputReader = null;
        PrintWriter outputWriter = null;

        try {
            if ("gz".equals(FilenameUtils.getExtension(sequenceFile.getName()))) {
                inputReader = new InputStreamReader(new GZIPInputStream(FileUtils.openInputStream(sequenceFile)));
            } else {
                inputReader = new FileReader(sequenceFile);
            }
            final FastaParser fastaParser = new FastaParser(inputReader);

            final File outputFile = result.getFile("output");
            final OutputStream outputStream;
            if (outputFile != null) {
                outputStream = FileUtils.openOutputStream(outputFile);
                if (verbose) {
                    System.out.println("Writing sequence : " + outputFile);
                }
            } else {
                outputStream = System.out;
            }
            outputWriter = new PrintWriter(outputStream);

            // write the header portion of the output
            outputWriter.print("# ");
            outputWriter.print(new Date());
            outputWriter.print(' ');
            outputWriter.print(ColorSpaceConverter.class.getName());
            for (final String arg : args) {
                outputWriter.print(' ');
                outputWriter.print(arg);
            }
            outputWriter.println();
            outputWriter.print("# Cwd: ");
            outputWriter.println(new File(".").getCanonicalPath());
            outputWriter.print("# Title: ");
            outputWriter.println(title);

            // now parse the input sequence
            long sequenceCount = 0;
            final MutableString descriptionLine = new MutableString();
            final MutableString sequence = new MutableString();
            final MutableString colorSpaceSequence = new MutableString();

            while (fastaParser.hasNext()) {
                fastaParser.next(descriptionLine, sequence);
                outputWriter.print('>');
                outputWriter.println(descriptionLine);

                convert(sequence, colorSpaceSequence);
                CompactToFastaMode.writeSequence(outputWriter, colorSpaceSequence);

                sequenceCount++;
                if (verbose && sequenceCount % 10000 == 0) {
                    System.out.println("Converted " + sequenceCount + " entries");
                }
            }

            if (verbose) {
                System.out.println("Conversion complete!");
            }
        } finally {
            IOUtils.closeQuietly(inputReader);
            IOUtils.closeQuietly(outputWriter);
        }
    }
}