org.mitre.ccv.mapred.CompleteCompositionVectors.java Source code

Java tutorial

Introduction

Here is the source code for org.mitre.ccv.mapred.CompleteCompositionVectors.java

Source

/**
 * Created on April 8, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions andlimitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.mapred;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;

import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.GenericOptionsParser;

import org.mitre.bio.mapred.TotalSequenceLength;
import org.mitre.mapred.fs.FileUtils;

/**
 * Class that generates complete compostion vectors and feature vectors.
 *
 * <p>Some properties (<code>-D property=value</code>) supported are:
 * <ul>
 * <li>None</li>
 * </ul>
 *
 * @author Marc Colosimo
 */
public class CompleteCompositionVectors extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(CompleteCompositionVectors.class);
    /**
     * File name for storing k-mer counts.
     */
    public static final String KMER_COUNTS = "kmer_cnts";
    /**
     * File name for storing total  sequence length (all of the samples).
     */
    public static final String TOTAL_LENGTH = "length";
    /**
     * File name for storing k-mer probabilities
     */
    public static final String KMER_PROBABILITIES = "kmer_probs";
    /**
     * File name for storing inverted k-mer probabilities
     */
    public static final String INVERTED_KMER_PROBABILITIES = "invt_kmer_probs";
    /**
     * File name for storing k-mer pi values
     */
    public static final String KMER_PI_VALUES = "kmer_pi_values";
    /**
     * File name for storing sequence composition vectors
     */
    public static final String COMPOSITION_VECTORS = "composition_vectors";
    /**
     * File name for storing k-mer entropy values
     */
    public static final String ENTROPY_VALUES = "kmer_entropy_values";
    /**
     * File name for storing sorted (reverse natural order) k-mer entropy values
     */
    public static final String SORTED_ENTROPY_VALUES = "sorted_kmer_entropy_values";
    /**
     * File name for storing reduce set of k-mers in sorted order (usually prefixed with the number)
     */
    public static final String KMER_ENTROPY_SET = "_kmer_entroy_set";
    /**
     * File name for storing generated sparse k-mer features vectors
     * (usually a reduced k-mer set from the composition vectors).
     */
    public static final String FEATURE_VECTORS = "feature_vectors";
    /**
     * File name for writting out feature vectors to a JSON file.
     */
    public static final String JSON = "json";

    /**
     *
     * The JSO data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
     * will be in a different order. This version, by default sorts, only by entropy values, whereas the
     * ccv in-memory version sorts by the k-mer natural order (i.e., lexigraphic).
     * @param argv
     * @return
     * @throws java.lang.Exception
     */
    @Override
    @SuppressWarnings("static-access") // For OptionBuilder
    public int run(String[] argv) throws Exception {
        JobConf conf = new JobConf(getConf());
        String cli_title = "CompleteCompositionVectorHadoop";

        int start = CalculateKmerCounts.DEFAULT_START;
        int end = CalculateKmerCounts.DEFAULT_END;
        int topkmers = 0;

        String input = null;
        String output = null;
        String vectorJsonOutput = null;
        //String kmerJsonOutput = null;

        boolean cleanLogs = false;

        /** create the Options */
        Options options = new Options();

        /** Hadoop Options */
        options.addOption(
                OptionBuilder.withArgName("number").hasArg(true).withDescription("number of maps").create("m"));
        options.addOption(
                OptionBuilder.withArgName("number").hasArg(true).withDescription("number of reducers").create("r"));

        // org.hadoop.util.GenericOptionsParser should captures this, but it doesn't
        options.addOption(OptionBuilder.withArgName("property=value").hasArg(true).withValueSeparator()
                .withDescription("use value for given property").create("D"));

        /** CompleteCompositionVector Options */
        options.addOption(OptionBuilder.withArgName("number").hasArg(true)
                .withDescription("number of top k-mers to use in calculations").create("topKmers"));
        options.addOption(OptionBuilder.withArgName("start").hasArg(true).withDescription("starting length of tile")
                .create("start"));
        options.addOption(OptionBuilder.withArgName("end").hasArg(true).withDescription("ending length of title")
                .create("end"));
        options.addOption(OptionBuilder.hasArg(true).withArgName("file")
                .withDescription("JSON file to write out k-mers to").create("kmersfile"));

        options.addOption(OptionBuilder.hasArg(true).withArgName("file")
                .withDescription("JSON file to write out feature vectors to "
                        + "(Overrides kmersout, only one file will be written).")
                .create("vectorsfile"));

        options.addOption(OptionBuilder.withArgName("number").hasArg(true)
                .withDescription("What preference to use: 0-min 1-median 2-avg(min,med): default is median")
                .create("prefval"));

        options.addOption(OptionBuilder.withArgName("help").hasArg(false).withDescription("print this message")
                .create("help"));

        // automatically generate the help statement
        HelpFormatter formatter = new HelpFormatter();

        //GenericOptionsParser gop = new GenericOptionsParser(conf, options, argv);
        GenericOptionsParser gop = new GenericOptionsParser(conf, argv);

        String[] remaining_args = gop.getRemainingArgs();

        // create the parser
        CommandLineParser parser = new GnuParser();
        //CommandLine line = gop.getCommandLine();
        String[] other_args = new String[] {};

        try {
            CommandLine line = parser.parse(options, remaining_args);
            other_args = line.getArgs();

            // Make sure there is a parameter left.
            if (other_args.length == 0) {
                System.out.println(cli_title);
                System.out.println("Missing input path!");
                formatter.printHelp("hccv [options] <input> [<output>] ", options);
                GenericOptionsParser.printGenericCommandUsage(System.out);
                return -1;
            }

            Option[] opts = line.getOptions();
            if (line.hasOption("help")) {
                System.out.println(cli_title);
                formatter.printHelp("hccv [options] <input> [<output>] ", options);
                GenericOptionsParser.printGenericCommandUsage(System.out);
                return -1;
            }

            // could also use line.iterator()
            for (Option opt : opts) {
                if (opt.getOpt().equals("m")) {
                    conf.setNumMapTasks(Integer.parseInt(opt.getValue()));
                }
                if (opt.getOpt().equals("r")) {
                    conf.setNumReduceTasks(Integer.parseInt(opt.getValue()));
                }
                if (opt.getOpt().equals("D")) {
                    // We can have multiple properties we want to set
                    String[] properties = opt.getValues();
                    for (String property : properties) {
                        String[] keyval = property.split("=");
                        conf.set(keyval[0], keyval[1]);
                    }
                }
                if (opt.getOpt().equals("start")) {
                    start = Integer.parseInt(opt.getValue());
                }
                if (opt.getOpt().equals("end")) {
                    end = Integer.parseInt(opt.getValue());
                }
                if (opt.getOpt().equals("topKmers")) {
                    topkmers = Integer.parseInt(opt.getValue());
                }
                if (opt.getOpt().equals("vectorsfile")) {
                    vectorJsonOutput = opt.getValue();
                }
            }
        } catch (ParseException e) {
            LOG.warn("options parsing faild: " + e.getMessage());
            System.out.println(cli_title);
            formatter.printHelp("hccv [options] <input> [<output>] ", options);
            GenericOptionsParser.printGenericCommandUsage(System.out);
        }
        if (start <= 2) {
            throw new IllegalArgumentException("Value of 'start' argument must be larger than 2");
        }

        input = other_args[0];
        if (other_args.length < 2) {
            output = input + "_" + FileUtils.getSimpleDate();
        } else {
            output = other_args[2];
        }

        /**
         * Check output path. Either needs to exist as a directory or not exist
         */
        Path outputPath = new Path(output);
        FileSystem fs = outputPath.getFileSystem(conf);
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        } else if (fs.exists(outputPath) || !fs.getFileStatus(outputPath).isDir()) {
            LOG.fatal(String.format("Output directory %s already exists", outputPath.makeQualified(fs)));
            throw new FileAlreadyExistsException(
                    String.format("Output directory %s already exists", outputPath.makeQualified(fs)));
        }

        String outputDir = output + Path.SEPARATOR;

        int res;
        /**
         * Zero, CalculateCompositionVectors
         */
        LOG.info("Starting CalculateCompositionVectors Map-Reduce job");
        CalculateCompositionVectors cv = new CalculateCompositionVectors();
        res = cv.initJob(conf, start, end, input, outputDir + COMPOSITION_VECTORS, cleanLogs);
        if (res != 0) {
            LOG.info("CalculateCompositionVectors returned non-zero result!");
            return res;
        }
        // We can stop now or continue to reduce dimensionallity using RRE or other means

        /**
         * First, CalculateKmerCounts
         */
        LOG.info("Starting CalculateKmerCounts Map-Reduce job");
        // FastMap option for CalculateKmers!?!
        CalculateKmerCounts ckc = new CalculateKmerCounts();
        res = ckc.initJob(conf, start, end, input, outputDir + KMER_COUNTS);
        if (res != 0) {
            LOG.fatal("CalculateKmerCounts returned non-zero result!");
            return res;
        }

        /**
         * Second, TotalSequenceLength
         */
        LOG.info("Starting TotalSequenceLength Map-Reduce job");
        TotalSequenceLength tsl = new TotalSequenceLength();
        res = tsl.initJob(conf, input, outputDir + TOTAL_LENGTH, cleanLogs);
        if (res != 0) {
            LOG.fatal("TotalSequenceLength returned non-zero result!");
            return res;
        }
        int length = tsl.getCount(conf, outputDir + TOTAL_LENGTH);

        if (length < 3) {
            LOG.fatal("TotalSequenceLength returned a total sequence length of less than 3.");
            return -1;
        } else {
            LOG.info(String.format("TotalSequenceLength returned a total sequence length of %d.", length));
        }

        /**
         * Third, CalculateKmerProbabilities
         */
        LOG.info("Starting CalculateKmerProbabilities Map-Reduce job");
        CalculateKmerProbabilities ckp = new CalculateKmerProbabilities();
        res = ckp.initJob(conf, start, end, length, outputDir + KMER_COUNTS, outputDir + KMER_PROBABILITIES,
                cleanLogs);
        if (res != 0) {
            LOG.fatal("CalculateKmerProbabilities returned non-zero result!");
            return res;
        }

        /**
         * Fourth, InvertKmerProbabilities
         */
        LOG.info("Starting InvertKmerProbabilities Map-Reduce job");
        InvertKmerProbabilities ikp = new InvertKmerProbabilities();
        res = ikp.initJob(conf, outputDir + KMER_PROBABILITIES, outputDir + INVERTED_KMER_PROBABILITIES, cleanLogs);
        if (res != 0) {
            LOG.fatal("InvertKmerProbabilities returned non-zero result!");
            return res;
        }

        /**
         * Fifth, CalculateKmerPiValues
         */
        LOG.info("Starting CalculateKmerPiValues Map-Reduce job");
        CalculateKmerPiValues kpv = new CalculateKmerPiValues();
        res = kpv.initJob(conf, start, end, outputDir + INVERTED_KMER_PROBABILITIES, outputDir + KMER_PI_VALUES,
                cleanLogs);
        if (res != 0) {
            LOG.fatal("CalculateKmerPiValues returned non-zero result!");
            return res;
        }

        /**
         * Sixth,CalculateKmerRevisedRelativeEntropy
         */
        LOG.info("Starting CalculateKmerRevisedRelativeEntropy Map-Reduce job");
        CalculateKmerRevisedRelativeEntropy krre = new CalculateKmerRevisedRelativeEntropy();
        res = krre.initJob(conf, outputDir + KMER_PI_VALUES, outputDir + COMPOSITION_VECTORS,
                outputDir + ENTROPY_VALUES, cleanLogs);
        if (res != 0) {
            LOG.fatal("CalculateKmerRevisedRelativeEntropy returned non-zero result!");
            return res;
        }

        /**
         * Seventh, SortKmerRevisedRelativeEntropies
         */
        SortKmerRevisedRelativeEntropies srre = new SortKmerRevisedRelativeEntropies();
        res = srre.initJob(conf, outputDir + ENTROPY_VALUES, outputDir + SORTED_ENTROPY_VALUES, cleanLogs);
        if (res != 0) {
            LOG.fatal("SortKmerRevisedRelativeEntropies returned non-zero result!");
            return res;
        }

        /**
         * Eigth, GenerateFeatureVectors
         *
         * Generate a flatten list to add to the cache to be distributed to the map-tasks.
         */
        Path listOutputPath = new Path(outputDir + Integer.toString(topkmers) + KMER_ENTROPY_SET);
        LOG.info(String.format("Loading %d sorted k-mers from %s to %s", topkmers,
                outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString()));
        int num = CompleteCompositionVectorUtils.flattenKmerEntropySequenceFile(conf, topkmers,
                outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString(), cleanLogs);

        if (num != topkmers) {
            LOG.fatal(String.format("Requested %d k-mers, but got %d. Using %d", topkmers, num, num));
            topkmers = num;
        }
        GenerateFeatureVectors fv = new GenerateFeatureVectors();
        res = fv.initJob(conf, listOutputPath.toString(), topkmers, outputDir + COMPOSITION_VECTORS,
                outputDir + FEATURE_VECTORS, cleanLogs);
        if (res != 0) {
            LOG.fatal("GenerateFeatureVectors returned non-zero result!");
            return res;
        }

        /**
         * Save feature vectors, features (k-mers), and properties to a JSON file.
         *
         * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
         * will be in a different order. This version, by default sorts, only by entropy values, whereas the
         * ccv in-memory version sorts by the k-mer natural order (i.e., lexigraphic).
         */
        if (vectorJsonOutput != null && vectorJsonOutput.length() > 0) {
            LOG.info("Writing features out to " + vectorJsonOutput);
            CompleteCompositionVectorUtils.featureVectors2Json(conf, start, end, topkmers,
                    outputDir + SORTED_ENTROPY_VALUES, outputDir + FEATURE_VECTORS, vectorJsonOutput);
        }

        LOG.info("All done generating complete composition vectors and feature vectors.");
        return res;
    }

    static public void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CompleteCompositionVectors(), args);
        System.exit(res);
    }
}