org.mitre.bio.mapred.Fasta2SequenceFile.java Source code

Introduction

Here is the source code for org.mitre.bio.mapred.Fasta2SequenceFile.java
Source

/**
 * Created on March 25, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.bio.mapred;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.mitre.bio.mapred.io.FastaInputFormat;

/**
 * A map-reduce class that maps sequences (strings) from a FASTA-formatted file into a {@link SequenceFile} with {@link Text} for both keys and values.
 *
 * <p>The key contains the user defined FASTA header and the sequence is the value.
 * 
 * @author Marc Colosimo
 */
public class Fasta2SequenceFile extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(Fasta2SequenceFile.class);

    /** Configuration key under which the header parsing mode (the '-n' option) is stored. */
    private static final String HEADER_FORMAT = "bio.fasta.header.format";

    /**
     * Maps each FASTA record to a (name, sequence) pair of {@link Text} objects, where the
     * name is parsed from the header line according to the configured header format.
     *
     * <p>Header formats (read from the job configuration in {@link #configure(JobConf)}):
     * <ul>
     *   <li>0 (default, also used for unknown values): the part of the header before the
     *       first space, without the leading '&gt;'.</li>
     *   <li>1: as 0, additionally cut at the first pipe character '|'.</li>
     *   <li>2: the whole header line, without the leading '&gt;'.</li>
     * </ul>
     */
    public static class FastaMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

        private static final Log MAP_LOG = LogFactory.getLog(FastaMapper.class);

        // Reused for every record so that no new Text objects are allocated per call.
        private final Text seq = new Text();
        private final Text name = new Text();

        private int headerFormat;
        private String jobName = "";

        /**
         * Reads the header format (defaulting to 0) and the job name from the job configuration.
         *
         * @param conf the job configuration
         */
        @Override
        public void configure(JobConf conf) {
            this.headerFormat = conf.getInt(HEADER_FORMAT, 0);
            this.jobName = conf.getJobName();
        }

        /**
         * Returns the first part of the header (before any pipe character '|') without the
         * leading '&gt;'. If there is no pipe, or cutting at the pipe would leave an empty
         * name, the whole string (minus the leading '&gt;') is returned.
         *
         * @param header the header text, with or without a leading '&gt;'
         * @return the name part of the header
         */
        public String getName(String header) {
            int offset = header.startsWith(">") ? 1 : 0;
            int index = header.indexOf('|');
            // Compare against the offset (not a fixed 1) so that headers without a
            // leading '>' are cut at the pipe as well.
            if (index > offset) {
                return header.substring(offset, index);
            }
            return header.substring(offset);
        }

        /**
         * Extracts the sequence name from a FASTA header according to the configured
         * header format (see the class description for the meaning of each format).
         *
         * <p>If the resulting name is empty, a placeholder of the form
         * {@code [No Name <hex timestamp>]} is returned instead.
         *
         * @see org.mitre.bio.io.FastaIterator
         * @param header the FASTA header line, normally starting with '&gt;'
         * @return the trimmed sequence name, never empty
         */
        public String getSequenceName(String header) {
            int offset = header.startsWith(">") ? 1 : 0;

            // Split apart the sample name and the description at the first space.
            int endname = header.indexOf(' ');
            String sn = (endname == -1) ? header : header.substring(0, endname);

            // Grab the part that the user wants.
            switch (headerFormat) {
            case 2:
                // Whole header; must not fall through into the pipe-splitting case.
                sn = header.substring(offset);
                break;
            case 1:
                sn = getName(sn);
                break;
            case 0:
            default:
                sn = sn.substring(offset);
                break;
            }

            // Trim before testing for emptiness so a whitespace-only name also gets the placeholder.
            sn = sn.trim();
            if (sn.length() == 0) {
                // how do we get our node name/task id?
                sn = "[No Name " + Long.toHexString(System.currentTimeMillis()) + "]";
            }
            return sn;
        }

        /**
         * Parses one FASTA record and emits the sequence name as the key and the
         * concatenated sequence lines as the value.
         *
         * <p>Records that are empty or whose first line does not start with '&gt;' are
         * skipped, as are records that contain a header but no sequence data.
         *
         * @param key       file position
         * @param value     FASTA record text (expecting only one record; every line after
         *                  the header is treated as sequence data)
         * @param output    collector receiving the (name, sequence) pair
         * @param reporter  receives a status message naming the record being processed
         * @throws java.io.IOException if reading the record text or collecting the output fails
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            /** Simple FASTA Parser - should use CCV's FastaIterator incase we get more than one sequence. **/
            BufferedReader buffer = new BufferedReader(new StringReader(value.toString()));
            String header = buffer.readLine();
            if (header == null) {
                // readLine() returns null for an empty record; nothing to parse.
                MAP_LOG.warn("Empty record at position " + key + ", skipping");
                return;
            }
            header = header.trim();
            if (!header.startsWith(">")) {
                // Not a FASTA header: skipped silently, as before.
                return;
            }

            name.set(this.getSequenceName(header));
            reporter.setStatus("Recieved " + name);

            // Join all remaining lines into one sequence string, dropping surrounding whitespace.
            StringBuilder sb = new StringBuilder();
            for (String line = buffer.readLine(); line != null; line = buffer.readLine()) {
                sb.append(line.trim());
            }

            if (sb.length() == 0) {
                MAP_LOG.warn(name + " contains no sequence!");
            } else {
                seq.set(sb.toString());
                output.collect(name, seq);
            }
        }
    }

    /**
     * Init the job with the given parameters and run it.
     *
     * @param jobConf   the hadoop job configuration
     * @param input     input fasta file path
     * @param output    output {@link SequenceFile} path
     * @param cleanLogs if {@code true}, the "_logs" directory under the output path is
     *                  deleted after the job has run
     * @return          zero if successful
     * @throws java.lang.Exception if running the job or removing the log directory fails
     */
    public int initJob(JobConf jobConf, String input, String output, boolean cleanLogs) throws Exception {
        JobConf conf = new JobConf(jobConf, Fasta2SequenceFile.class);
        conf.setJobName("Fasta2SequenceFile");

        // Set Input Format to use FastaInputFormat
        FastaInputFormat.setInputPaths(conf, new Path(input));
        conf.setInputFormat(FastaInputFormat.class);
        conf.setMapperClass(FastaMapper.class);
        conf.setOutputKeyClass(Text.class); // map output key class
        conf.setOutputValueClass(Text.class); // map output value class

        // Set Output Format to use SequenceFile
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        if (cleanLogs) {
            LOG.info("removing log directory");
            Path path = new Path(output, "_logs");
            FileSystem fs = path.getFileSystem(jobConf);
            if (!fs.delete(path, true)) {
                // Best effort only: report it, but do not fail a job that has already run.
                LOG.warn("log directory " + path + " was not removed (it may not exist)");
            }
        }
        return 0;
    }

    /**
     * Prints the command line usage to standard output.
     *
     * @return always -1, so callers can return the result directly as an error code
     */
    static int printUsage() {
        System.out.println(
                "Fasta2SequenceFile [-m <maps>] [-r <reduces>] [-c] [-n number] <fasta input [dir]> <output dir>");
        System.out.println("\twhere\n'-c' will clean (remove) the logs when done.\n"
                + "'-n' sets how to parse the header for the sample's name:\n\t0(default) is part before any spaces."
                + "\n\t1 is 0 plus anything before a pipe character." + "\n\t2 is the whole header unprocessed.");
        return -1;
    }

    /**
     * Parses the command line, configures the job accordingly and runs it.
     *
     * <p>Recognized options are '-m' (number of map tasks), '-r' (number of reduce tasks),
     * '-c' (remove the logs when done) and '-n' (header format). Exactly two further
     * arguments are required: the input path and the output path.
     *
     * @param args the command line arguments
     * @return the result of {@link #initJob}, or -1 if the arguments are invalid
     * @throws Exception if the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());
        boolean cleanLogs = false;

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    conf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("-r".equals(args[i])) {
                    conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else if ("-c".equals(args[i])) {
                    cleanLogs = true;
                } else if ("-n".equals(args[i])) {
                    conf.setInt(HEADER_FORMAT, Integer.parseInt(args[++i]));
                } else {
                    otherArgs.add(args[i]);
                }
            } catch (NumberFormatException except) {
                // i was already advanced, so args[i] is the value that failed to parse.
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                // i was advanced past the end, so args[i - 1] is the option missing its value.
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }
        // Make sure there are exactly 2 parameters left.
        if (otherArgs.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
            return printUsage();
        }

        return initJob(conf, otherArgs.get(0), otherArgs.get(1), cleanLogs);
    }

    /**
     * Command line entry point: runs this tool through {@link ToolRunner} and exits
     * with the tool's return code.
     *
     * @param args the command line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Fasta2SequenceFile(), args);
        System.exit(res);
    }
}