org.mitre.bio.mapred.TotalSequenceLength.java Source code

Java tutorial

Introduction

Here is the source code for org.mitre.bio.mapred.TotalSequenceLength.java

Source

/**
 * Created on March 25, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * $Id$
 */

package org.mitre.bio.mapred;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.mitre.mapred.fs.FileUtils;

/**
 * Map-Reduce job that calculates the total length of all of the sequence values stored in a
 * Hadoop {@code SequenceFile}.
 *
 * <P>This expects the {@code SequenceFile} to have the type {@link Text} for both keys and values.
 * The length of each value is taken from {@link Text#getLength()}.</P>
 *
 * <P>Lengths are carried as {@link IntWritable}s, so the grand total must fit in an {@code int};
 * the reducer fails the job rather than emitting a wrapped-around total.</P>
 *
 * @author Marc Colosimo
 */
public class TotalSequenceLength extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(TotalSequenceLength.class);

    /** The single key every mapper emits, so that all lengths meet in one reduce group. */
    private static final IntWritable OUTPUT_KEY = new IntWritable(-1);

    /**
     * Mapper that emits the length of each sequence value under one shared key.
     */
    public static class SequenceMapClass extends MapReduceBase
            implements Mapper<Text, Text, IntWritable, IntWritable> {

        /**
         * Emits {@code (OUTPUT_KEY, value.getLength())} for the given record.
         *
         * @param key      the record key (only used for the status message)
         * @param value    the sequence whose length is measured
         * @param output   collector receiving the length
         * @param reporter used to publish the task status
         * @throws IOException if the collector fails
         */
        @Override
        public void map(Text key, Text value, OutputCollector<IntWritable, IntWritable> output, Reporter reporter)
                throws IOException {
            reporter.setStatus("Calculating the length of sequences in " + key.toString() + "...");
            output.collect(TotalSequenceLength.OUTPUT_KEY, new IntWritable(value.getLength()));
        }
    }

    /**
     * Both a combiner and reducer for summing the lengths. This expects only one key value.
     */
    public static class LengthReduceClass extends MapReduceBase
            implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        /**
         * Sums all values seen for {@code key} and emits the total under the same key.
         *
         * @param key      the key shared by all of the lengths
         * @param values   the (partial) lengths to add up
         * @param output   collector receiving the sum
         * @param reporter unused
         * @throws IOException if the sum no longer fits in an {@code int}, or if the collector fails
         */
        @Override
        public void reduce(IntWritable key, Iterator<IntWritable> values,
                OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException {
            // Accumulate in a long so that an overflow of the int output type is detected
            // instead of silently wrapping around to a wrong (possibly negative) total.
            long sum = 0L;
            while (values.hasNext()) {
                sum += values.next().get();
                if (sum > Integer.MAX_VALUE) {
                    throw new IOException("Total sequence length exceeds the maximum int value ("
                            + Integer.MAX_VALUE + ")");
                }
            }
            output.collect(key, new IntWritable((int) sum));
        }
    }

    /**
     * Reads the total length back from the job output.
     *
     * <P>Only the first line of {@code part-00000} is examined; it is expected to hold a
     * tab-separated pair whose second field is the total.</P>
     *
     * @param jobConf       configuration used to resolve the file system of the output path
     * @param pathString    the job output directory
     * @return the total length, or zero if the part file is empty or blank
     * @throws IOException if the part file cannot be read or its first line has no second field
     * @throws NumberFormatException if the second field is not an integer
     */
    public int getCount(JobConf jobConf, String pathString) throws IOException {
        // assume we only have one part (part-00000)
        Path outputPath = new Path(pathString, "part-00000");
        FileSystem fs = outputPath.getFileSystem(jobConf);

        String contents;
        InputStream in = fs.open(outputPath);
        try {
            contents = FileUtils.convertStreamToString(in);
        } finally {
            // NOTE(review): not visible here whether convertStreamToString closes the stream;
            // closing explicitly guarantees the handle is released either way.
            in.close();
        }

        // Nothing was written when the reducer was never invoked, so the total is zero.
        if (contents == null || contents.trim().length() == 0) {
            return 0;
        }

        int newline = contents.indexOf('\n');
        String firstLine = (newline < 0) ? contents : contents.substring(0, newline);
        String[] fields = firstLine.split("\t");
        if (fields.length < 2) {
            throw new IOException("Unexpected output format in " + outputPath + ": '" + firstLine + "'");
        }
        return Integer.parseInt(fields[1].trim());
    }

    /**
     * Init the job with the given parameters and run it.
     *
     * @param jobConf   the hadoop job configuration
     * @param input     input {@code SequenceFile} path
     * @param output    output path (this will contain ONE part with the length)
     * @param cleanLogs if {@code true}, the {@code _logs} directory under {@code output} is
     *                  deleted after the job has run
     * @return zero if successful
     * @throws java.lang.Exception
     */
    public int initJob(JobConf jobConf, String input, String output, boolean cleanLogs) throws Exception {
        JobConf conf = new JobConf(jobConf, TotalSequenceLength.class);
        conf.setJobName("TotalSequenceLength");

        // We can only handle one reducer
        if (conf.getNumReduceTasks() != 1) {
            conf.setNumReduceTasks(1);
            LOG.info("Setting number of reducers to ONE!");
        }

        SequenceFileInputFormat.setInputPaths(conf, new Path(input));
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapperClass(SequenceMapClass.class);
        conf.setOutputKeyClass(IntWritable.class); // map output key class
        conf.setOutputValueClass(IntWritable.class); // map output value class

        conf.setCombinerClass(LengthReduceClass.class);
        conf.setReducerClass(LengthReduceClass.class);
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        if (cleanLogs) {
            LOG.info("removing log directory");
            Path path = new Path(output, "_logs");
            FileSystem fs = path.getFileSystem(jobConf);
            fs.delete(path, true);
        }

        return 0;
    }

    /**
     * Prints the command-line usage, followed by the generic Hadoop options.
     *
     * @return always {@code -1}, so callers can {@code return printUsage();}
     */
    static int printUsage() {
        System.out.println("TotalSequenceLength [-m <maps>] [-r <reduces>]  [-c] <input> <output>");
        System.out.println("\twhere '-c' will clean (remove) the logs when done");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * Parses the command line, runs the job and prints the resulting total length.
     *
     * @param args command-line arguments; see {@link #printUsage()}
     * @return {@code -1} if the arguments are invalid, otherwise the result of
     *         {@link #initJob(JobConf, String, String, boolean)}
     * @throws Exception if the job or reading back its output fails
     */
    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());
        boolean cleanLogs = false;

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            boolean isMaps = "-m".equals(arg);
            boolean isReduces = "-r".equals(arg);

            if (isMaps || isReduces) {
                // Both options take an integer value; check for it explicitly rather than
                // relying on an ArrayIndexOutOfBoundsException.
                if (i + 1 >= args.length) {
                    System.out.println("ERROR: Required parameter missing from " + arg);
                    return printUsage();
                }
                String raw = args[++i];
                int tasks;
                try {
                    tasks = Integer.parseInt(raw);
                } catch (NumberFormatException except) {
                    System.out.println("ERROR: Integer expected instead of " + raw);
                    return printUsage();
                }
                if (isMaps) {
                    conf.setNumMapTasks(tasks);
                } else {
                    conf.setNumReduceTasks(tasks);
                }
            } else if ("-c".equals(arg)) {
                cleanLogs = true;
            } else {
                otherArgs.add(arg);
            }
        }
        // Make sure there are exactly 2 parameters left.
        if (otherArgs.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
            return printUsage();
        }

        String input = otherArgs.get(0);
        String output = otherArgs.get(1);
        int res = initJob(conf, input, output, cleanLogs);
        int cnt = this.getCount(conf, output);
        System.out.printf("Total length of sequences is %d\n", cnt);
        return res;
    }

    /**
     * Command-line entry point; exits the JVM with the status returned by {@link #run(String[])}.
     *
     * @param args command-line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new TotalSequenceLength(), args);
        System.exit(res);
    }

}