Corrector.PCorrection.java Source code

Java tutorial

Introduction

Here is the source code for Corrector.PCorrection.java

Source

/*
PCorrection.java
2012  ReadStackCorrector, developed by Chien-Chih Chen (rocky@iis.sinica.edu.tw), 
released under Apache License 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 
at: https://github.com/ice91/ReadStackCorrector
*/
package Corrector;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class PCorrection extends Configured implements Tool {
    private static final Logger sLogger = Logger.getLogger(PCorrection.class);

    // PCorrectionMapper
    ///////////////////////////////////////////////////////////////////////////

    public static class PCorrectionMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
        private static int K = 0;
        private static Node node = new Node();

        public void configure(JobConf job) {
            K = Integer.parseInt(job.get("K"));
        }

        public void map(LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String vals[] = nodetxt.toString().split("\t");
            if (vals[1].equals(Node.NODEMSG)) {
                node.fromNodeMsg(nodetxt.toString());
                output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
            } else {
                output.collect(new Text(vals[0]), new Text(Node.CORRECTMSG + "\t" + vals[1]));
            }
            reporter.incrCounter("Brush", "nodes", 1);
            /*node.fromNodeMsg(nodetxt.toString());
            if (node.str_raw().equals("X")) {
            List<String> corrections = node.getCorrections();
            if (corrections != null)
            {
                for(String correction : corrections)
                {
                    String [] vals = correction.split("\\|");
                    String id    = vals[0];
                    String correct_msg   = vals[1];
                
                    output.collect(new Text(id),
                                   new Text(Node.CORRECTMSG + "\t" + correct_msg));
                
                }
                
                node.clearCorrections();
            }   
            } else {
            output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
            reporter.incrCounter("Brush", "nodes", 1);
            }*/
        }
    }

    // PCorrectionReducer
    ///////////////////////////////////////////////////////////////////////////

    public static class PCorrectionReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        private static int K = 0;

        public void configure(JobConf job) {
            K = Integer.parseInt(job.get("K"));
        }

        public class Fix {
            //public String node_id;
            public List<String> pos;

            public Fix(String[] vals, int offset) throws IOException {
                if (!vals[offset].equals(Node.CORRECTMSG)) {
                    throw new IOException("Unknown msg");
                }
                pos = new ArrayList<String>();
                String pos_lists = vals[offset + 1];
                String[] val2s = pos_lists.split("!");
                for (int i = 0; i < val2s.length; i++) {
                    pos.add(val2s[i]);
                }
            }
        }

        public class Correct {
            public char chr;
            public int pos;

            public Correct(int pos1, char chr1) throws IOException {
                pos = pos1;
                chr = chr1;
            }
        }

        public void reduce(Text nodeid, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node(nodeid.toString());

            int sawnode = 0;

            boolean killnode = false;
            float extracov = 0;
            //List<Fix> fixs = new ArrayList<Fix>();
            List<Correct> corrects = new ArrayList<Correct>();

            while (iter.hasNext()) {
                String msg = iter.next().toString();

                //System.err.println(nodeid.toString() + "\t" + msg);

                String[] vals = msg.split("\t");

                if (vals[0].equals(Node.NODEMSG)) {
                    node.parseNodeMsg(vals, 0);
                    sawnode++;
                } else if (vals[0].equals(Node.CORRECTMSG)) {
                    String pos_lists = vals[1];
                    String[] val2s = pos_lists.split("!");
                    for (int i = 0; i < val2s.length; i++) {
                        String[] vals3 = val2s[i].split(",");
                        corrects.add(new Correct(Integer.parseInt(vals3[0]), vals3[1].charAt(0)));
                    }
                } else {
                    throw new IOException("Unknown msgtype: " + msg);
                }
            }

            if (sawnode != 1) {
                throw new IOException(
                        "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + nodeid.toString());
            }

            boolean failed_reads = false;
            Correct[] c_array = corrects.toArray(new Correct[corrects.size()]);
            corrects.clear();
            if (c_array.length > 0) {
                //\\ 0:A 1:T 2:C 3:G 4:Sum 
                boolean exclusive[] = new boolean[node.len()];
                int[][] array = new int[node.len()][5];
                for (int i = 0; i < node.len(); i++) {
                    exclusive[i] = true;
                    for (int j = 0; j < 5; j++) {
                        array[i][j] = 0;
                    }
                }
                //\\\
                for (int i = 0; i < c_array.length; i++) {
                    //String [] vals = fix_msg.split(",");
                    int pos = c_array[i].pos;
                    char fix_chr = c_array[i].chr;
                    array[pos][4] = array[pos][4] + 1;
                    if (fix_chr == 'A') {
                        array[pos][0] = array[pos][0] + 1;
                    } else if (fix_chr == 'T') {
                        array[pos][1] = array[pos][1] + 1;
                    } else if (fix_chr == 'C') {
                        array[pos][2] = array[pos][2] + 1;
                    } else if (fix_chr == 'G') {
                        array[pos][3] = array[pos][3] + 1;
                    }
                }
                //\\\

                // fix str content
                float majority = 0.99f;
                String fix_str = ""; //node.str().substring(0, pos) + fix_char + node.str().substring(pos+1); 
                String fix_qv = "";
                //\\determine exclusive
                for (int i = 0; i < array.length; i++) {
                    int left_corrects = 0;
                    int right_corrects = 0;
                    // add left_side correction
                    for (int j = i - 1; j >= 0 && j >= i - 12; j--) {
                        left_corrects = left_corrects + array[j][4];
                    }
                    // add right_sum correction
                    for (int j = i + 1; j < array.length && j <= i + 12; j++) {
                        right_corrects = right_corrects + array[j][4];
                    }
                    if (right_corrects == 0 && left_corrects == 0) {
                        exclusive[i] = true;
                    }
                }
                //\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
                for (int i = 0; i < array.length; i++) {
                    if (array[i][4] > 0 && array[i][0] == array[i][4] && exclusive[i] == true) {
                        fix_str = fix_str + "A";
                        fix_qv = fix_qv + "A"; // A == 0
                        reporter.incrCounter("Brush", "fix_char", 1);
                    } else if (array[i][4] > 0 && array[i][1] == array[i][4] && exclusive[i] == true) {
                        fix_str = fix_str + "T";
                        fix_qv = fix_qv + "A"; // A == 0
                        reporter.incrCounter("Brush", "fix_char", 1);
                    } else if (array[i][4] > 0 && array[i][2] == array[i][4] && exclusive[i] == true) {
                        fix_str = fix_str + "C";
                        fix_qv = fix_qv + "A"; // A == 0
                        reporter.incrCounter("Brush", "fix_char", 1);
                    } else if (array[i][4] > 0 && array[i][3] == array[i][4] && exclusive[i] == true) {
                        fix_str = fix_str + "G";
                        fix_qv = fix_qv + "A"; // A == 0
                        reporter.incrCounter("Brush", "fix_char", 1);
                    } else {
                        fix_str = fix_str + node.str().charAt(i);
                        fix_qv = fix_qv + node.QV().charAt(i);
                    }
                }
                node.setstr(fix_str);
                node.setQV(fix_qv);
            }
            output.collect(nodeid, new Text(node.toNodeMsg()));
        }
    }

    // Run Tool
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Configures the PCorrection job and hands it to {@code JobClient.runJob}.
     *
     * <p>When this tool has been given a configuration (for example by {@code ToolRunner},
     * which passes the parsed generic options through {@code setConf}), that configuration
     * is used as the base of the job so those options are not lost. Without one, a fresh
     * {@code JobConf} is created exactly as before.
     *
     * <p>Any existing output directory is deleted recursively before the job is started.
     *
     * @param inputPath  input path string passed to {@code FileInputFormat.addInputPaths}
     * @param outputPath output directory; removed first if it already exists
     * @return the handle returned by {@code JobClient.runJob}
     * @throws Exception whatever job setup, the delete, or the job run throws
     */
    public RunningJob run(String inputPath, String outputPath) throws Exception {
        sLogger.info("Tool name: PCorrection");
        sLogger.info(" - input: " + inputPath);
        sLogger.info(" - output: " + outputPath);

        // getConf() is null when the tool is driven directly (new PCorrection().run(in, out)).
        Configuration base = getConf();
        JobConf conf = (base == null)
                ? new JobConf(PCorrection.class)
                : new JobConf(base, PCorrection.class);
        conf.setJobName("PCorrection " + inputPath + " " + Config.K);

        Config.initializeConfiguration(conf);

        Path outputDir = new Path(outputPath);
        FileInputFormat.addInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, outputDir);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        //conf.setBoolean("mapred.output.compress", true);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(PCorrectionMapper.class);
        conf.setReducerClass(PCorrectionReducer.class);

        // Delete the output directory if it exists already. Resolve the filesystem from the
        // path itself so an output on a non-default filesystem is handled too.
        outputDir.getFileSystem(conf).delete(outputDir, true);

        return JobClient.runJob(conf);
    }

    // Parse Arguments and run
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Command-line entry point used by {@code ToolRunner}.
     *
     * <p>Expects {@code args[0]} = input path and {@code args[1]} = output path. The original
     * body ignored {@code args} and always passed empty strings on, so the job could never be
     * started from the command line. {@code Config.K} is set to 21, as before.
     *
     * @param args remaining command-line arguments after generic option parsing
     * @return 0 after the job has been run, -1 when the two paths are missing
     * @throws Exception whatever {@link #run(String, String)} throws
     */
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: PCorrection <inputPath> <outputPath>");
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        String inputPath = args[0];
        String outputPath = args[1];

        Config.K = 21;

        run(inputPath, outputPath);
        return 0;
    }

    // Main
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Runs this tool through {@code ToolRunner} and exits the JVM with the tool's return code.
     *
     * @param args command-line arguments, forwarded unchanged
     * @throws Exception whatever the tool run throws
     */
    public static void main(String[] args) throws Exception {
        Configuration baseConf = new Configuration();
        PCorrection tool = new PCorrection();
        System.exit(ToolRunner.run(baseConf, tool, args));
    }
}