org.semanticscience.narf.graphs.main.CycleExtractor.java Source code

Introduction

Here is the source code for org.semanticscience.narf.graphs.main.CycleExtractor.java
Source

/**
 * Copyright (c) 2013  Jose Cruz-Toledo
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package org.semanticscience.narf.graphs.main;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.semanticscience.narf.graphs.lib.CycleSerializer;
import org.semanticscience.narf.graphs.lib.cycles.Cycle;
import org.semanticscience.narf.graphs.lib.cycles.exceptions.CycleException;
import org.semanticscience.narf.graphs.nucleicacid.ExtractedNucleicAcid;
import org.semanticscience.narf.graphs.nucleicacid.InteractionEdge;
import org.semanticscience.narf.graphs.nucleicacid.NucleicAcid;
import org.semanticscience.narf.graphs.nucleicacid.PredictedNucleicAcid;
import org.semanticscience.narf.structures.lib.exceptions.InvalidDotBracketNotationException;
import org.semanticscience.narf.structures.lib.exceptions.InvalidResidueException;
import org.semanticscience.narf.structures.lib.exceptions.InvalidSequenceException;
import org.semanticscience.narf.structures.parts.Nucleotide;
import org.semanticscience.narf.structures.parts.Sequence;

import com.hp.hpl.jena.rdf.model.Model;

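/**
 * Command-line tool that computes the minimum cycle basis of nucleic acid
 * structures and serialises the cycles (RDF or TSV) together with a
 * plain-text summary. Structures are either extracted from the PDB files
 * found in an input directory or predicted from the sequences listed in an
 * input file.
 * 
 * @author Jose Cruz-Toledo
 * 
 */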
public class CycleExtractor {
    /**
     * Command-line entry point.
     * <p>
     * Recognised options: <code>inputPDBDir</code>, <code>inputSeqFile</code>,
     * <code>outputDir</code> and <code>outputFormat</code> (<code>RDF</code> or
     * <code>tsv</code>). When both inputs are given the PDB directory takes
     * precedence. Usage errors print a message plus the usage text and exit
     * with status 1; a {@link CycleException} or {@link IOException} during
     * processing prints its stack trace and also exits with status 1.
     * 
     * @param args
     *            the raw command-line arguments
     */
    public static void main(String[] args) {
        Options options = createOptions();
        CommandLineParser parser = createCliParser();
        File inputSeqFile = null;
        File inputPDBDir = null;
        File outputDir = null;
        String format = null;
        try {
            CommandLine cmd = parser.parse(options, args);
            // NOTE(review): createOptions() in this file does not register a
            // "help" option, so this branch is not expected to fire - confirm
            // whether a help flag should be added there.
            if (cmd.hasOption("help")) {
                printUsage();
                System.exit(1);
            }
            if (cmd.hasOption("inputPDBDir")) {
                inputPDBDir = new File(cmd.getOptionValue("inputPDBDir"));
            }
            if (cmd.hasOption("inputSeqFile")) {
                inputSeqFile = new File(cmd.getOptionValue("inputSeqFile"));
            }
            // at least one of the two input sources is mandatory
            if (inputSeqFile == null && inputPDBDir == null) {
                exitWithUsage("Either an input PDB directory or an input sequence file must be specified!");
            }
            // fail early instead of hitting a NullPointerException while listing
            if (inputPDBDir != null && !inputPDBDir.isDirectory()) {
                exitWithUsage("The input PDB directory does not exist or is not a directory: "
                        + inputPDBDir.getAbsolutePath());
            }
            if (cmd.hasOption("outputDir")) {
                outputDir = new File(cmd.getOptionValue("outputDir"));
            } else {
                exitWithUsage("You must specify an output directory!");
            }
            if (cmd.hasOption("outputFormat")) {
                format = cmd.getOptionValue("outputFormat");
            } else {
                exitWithUsage("You must specify an output format!");
            }
            // an unknown format used to run the whole pipeline and write nothing
            if (!"RDF".equals(format) && !"tsv".equals(format)) {
                exitWithUsage("Unsupported output format '" + format + "' (expected RDF or tsv)!");
            }
            // FileOutputStream does not create missing parent directories
            FileUtils.forceMkdir(outputDir);

            //TODO: be able to change default parser for pdb structures
            CycleSerializer cs = new CycleSerializer("x3dna-dssr", "beta-r21-on-20130903");
            if (inputPDBDir != null) {
                extractFromPdbDir(cs, inputPDBDir, outputDir, format);
            } else {
                extractFromSequenceFile(cs, inputSeqFile, outputDir, format);
            }
            // finally write the serializer's summary next to the cycle files
            FileUtils.writeStringToFile(new File(outputDir, "cycle_summary.txt"), cs.makeSummary());
        } catch (ParseException e) {
            System.out.println("Unable to parse specified options.");
            printUsage();
            System.exit(1);
        } catch (CycleException e) {
            e.printStackTrace();
            // signal the failure to the calling shell
            System.exit(1);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Prints a message followed by the usage text and terminates the JVM with
     * exit status 1.
     * 
     * @param message
     *            the explanation shown before the usage text
     */
    private static void exitWithUsage(String message) {
        System.out.println(message);
        printUsage();
        System.exit(1);
    }

    /**
     * Processes every <code>pdb</code> file of a directory and writes one cycle
     * file per structure that yields exactly one nucleic acid with a non-empty
     * minimum cycle basis.
     * 
     * @param cs
     *            the serializer used to build the RDF model or the TSV text
     * @param inputPDBDir
     *            the directory that is searched for PDB files
     * @param outputDir
     *            the directory that receives the cycle files
     * @param format
     *            <code>RDF</code> or <code>tsv</code>
     * @throws CycleException
     *             if no nucleic acid could be extracted from one of the files
     * @throws IOException
     *             if an output file cannot be written
     */
    private static void extractFromPdbDir(CycleSerializer cs, File inputPDBDir, File outputDir, String format)
            throws CycleException, IOException {
        List<String> inputFiles = CycleExtractor.getFilePathsFromDir(inputPDBDir, "pdb");
        for (String aFilePath : inputFiles) {
            Set<NucleicAcid> nucs = CycleExtractor.runX3DNADSSR(aFilePath);
            if (nucs == null || nucs.size() == 0) {
                throw new CycleException("Could not extract cycles from :" + aFilePath);
            }
            // only structures with exactly one model are serialised
            if (nucs.size() != 1) {
                continue;
            }
            for (NucleicAcid aNuc : nucs) {
                List<Cycle<Nucleotide, InteractionEdge>> ccb = aNuc.getMinimumCycleBasis();
                if (ccb.size() == 0) {
                    System.out.println("PDBID:" + aFilePath + " has an empty cycle basis!");
                    continue;
                }
                String aPdbId = CycleExtractor.getPdbIdFromFilePath(aFilePath);
                if (format.equals("RDF")) {
                    Model m = cs.createNarfModelFromPDB(aPdbId, aNuc, ccb, false);
                    writeRdfModel(m, new File(outputDir, aPdbId + "_cycles.rdf"));
                } else if (format.equals("tsv")) {
                    String tsv = cs.createNarfTsv(aPdbId, aNuc, ccb, null, -1, false);
                    FileUtils.writeStringToFile(new File(outputDir, aPdbId + "_cycles.tsv"), tsv);
                }
            }
        }
    }

    /**
     * Predicts a structure for every non-blank line of a sequence file and
     * writes one RDF cycle file per line that yields exactly one nucleic acid.
     * TSV output is not implemented for this input type.
     * 
     * @param cs
     *            the serializer used to build the RDF model
     * @param inputSeqFile
     *            the file holding one sequence per line
     * @param outputDir
     *            the directory that receives the cycle files
     * @param format
     *            <code>RDF</code> or <code>tsv</code>
     * @throws CycleException
     *             if no nucleic acid could be predicted for one of the lines
     * @throws IOException
     *             if the input cannot be read or an output file cannot be
     *             written
     */
    private static void extractFromSequenceFile(CycleSerializer cs, File inputSeqFile, File outputDir,
            String format) throws CycleException, IOException {
        if (format.equals("tsv")) {
            // make the missing feature visible instead of silently writing nothing
            System.out.println(
                    "TSV output is not implemented for sequence input; no per-sequence cycle files will be written.");
        }
        List<String> lines = FileUtils.readLines(inputSeqFile);
        Random random = new Random();
        // the counter is incremented before use, so the first line counts as 2
        int lineCounter = 1;
        for (String aLine : lines) {
            lineCounter++;
            // a blank (e.g. trailing) line is not a sequence and must not abort the run
            if (aLine.trim().length() == 0) {
                continue;
            }
            Set<NucleicAcid> nas = CycleExtractor.runMfold(aLine);
            if (nas == null || nas.size() == 0) {
                throw new CycleException("Could not extract cycles from :" + aLine);
            }
            if (nas.size() != 1) {
                continue;
            }
            for (NucleicAcid aNuc : nas) {
                // random identifier in [0, 32768] used for the model and the file name
                // NOTE(review): identifiers are not guaranteed unique, so two
                // sequences can end up with the same output file name.
                int rand = Math.abs(random.nextInt(65536) - 32768);
                String rd = Integer.toString(rand);
                // get the MCB of the prediction
                List<Cycle<Nucleotide, InteractionEdge>> ccb = aNuc.getMinimumCycleBasis();
                if (format.equals("RDF")) {
                    Model m = cs.createNarfModelFromAB(rd, aNuc, ccb);
                    writeRdfModel(m, new File(outputDir, Math.abs(lineCounter + rand) + rd + "_cycles.rdf"));
                }
            }
        }
    }

    /**
     * Writes a model to a file and always releases the stream, even when the
     * write fails.
     * 
     * @param model
     *            the model to serialise
     * @param outputFile
     *            the file to (over)write
     * @throws IOException
     *             if the file cannot be opened or closed
     */
    private static void writeRdfModel(Model model, File outputFile) throws IOException {
        FileOutputStream out = new FileOutputStream(outputFile);
        try {
            model.write(out);
        } finally {
            out.close();
        }
    }

    /**
     * Extracts the nucleic acids of a PDB file by delegating to
     * {@link ExtractedNucleicAcid#x3dnaDssr(File)}.
     * 
     * @param aPathToPDBFile
     *            the path of the PDB file to process
     * @return whatever the extractor returns, or <code>null</code> when the
     *         file is missing or another I/O error occurs (the stack trace is
     *         printed in that case)
     */
    private static Set<NucleicAcid> runX3DNADSSR(String aPathToPDBFile) {
        Set<NucleicAcid> extracted = null;
        try {
            extracted = ExtractedNucleicAcid.x3dnaDssr(new File(aPathToPDBFile));
        } catch (IOException ioProblem) {
            // FileNotFoundException is an IOException, so one handler covers both
            ioProblem.printStackTrace();
        }
        return extracted;
    }

    /**
     * Predicts the nucleic acid structure(s) of a sequence string. Despite the
     * method name, the prediction is delegated to
     * {@link PredictedNucleicAcid#rnafold(Sequence)}.
     * 
     * @param aSequence
     *            a string of sequence characters; it is expected to contain
     *            only these characters:
     *            [AaCcGgTtUuRrYyKkMmSsWwBbDdHhVvNnXx-]+
     * @return the predicted nucleic acids, or <code>null</code> when the
     *         sequence is rejected as invalid or the prediction fails
     */
    private static Set<NucleicAcid> runMfold(String aSequence) {
        Set<NucleicAcid> predicted = null;
        try {
            predicted = PredictedNucleicAcid.rnafold(new Sequence(aSequence));
        } catch (InvalidSequenceException badSequence) {
            System.out.println("invalid sequence: " + aSequence);
        } catch (InvalidResidueException badResidue) {
            System.out.println("invalid sequence: " + aSequence);
        } catch (InvalidDotBracketNotationException badNotation) {
            badNotation.printStackTrace();
        } catch (IOException ioProblem) {
            ioProblem.printStackTrace();
        }
        // still null here whenever one of the handlers above ran
        return predicted;
    }

    /**
     * Create a list of file path strings of a given extension from a given
     * directory. Only regular files directly inside the directory are
     * considered; the extension is the text after the last dot of the file
     * name and is compared case-insensitively. Names whose only dot is the
     * leading character are ignored.
     * 
     * @param aDir
     *            the directory to search
     * @param anExtension
     *            the file extension to use as filter (without the dot)
     * @return a list of absolute file paths that have the parameter extension,
     *         sorted by path; empty when <code>aDir</code> is
     *         <code>null</code>, is not a directory or cannot be listed
     */
    private static List<String> getFilePathsFromDir(File aDir, String anExtension) {
        List<String> fns = new ArrayList<String>();
        // listFiles() yields null for a non-directory or on an I/O error
        File[] entries = (aDir == null) ? null : aDir.listFiles();
        if (entries == null) {
            return fns;
        }
        // the listing order is unspecified; sort it so runs are reproducible
        Arrays.sort(entries);
        for (File entry : entries) {
            if (!entry.isFile()) {
                continue;
            }
            String name = entry.getName();
            int dotIndex = name.lastIndexOf('.');
            if (dotIndex > 0 && name.substring(dotIndex + 1).equalsIgnoreCase(anExtension)) {
                fns.add(entry.getAbsolutePath());
            }
        }
        return fns;
    }

    /**
     * Derives a PDB identifier from a file path: the file name (the text after
     * the last <code>/</code> or <code>\</code>) up to, but not including, its
     * first dot. A file name without a dot is returned unchanged.
     * 
     * @param aPath
     *            the path of a PDB file
     * @return the file name stripped of everything from its first dot onwards
     */
    private static String getPdbIdFromFilePath(String aPath) {
        // look for the dot in the file name only: a dot in a directory name
        // (or no dot at all) used to cause a StringIndexOutOfBoundsException
        int lastSeparator = Math.max(aPath.lastIndexOf('/'), aPath.lastIndexOf('\\'));
        String fileName = aPath.substring(lastSeparator + 1);
        int firstDot = fileName.indexOf('.');
        return (firstDot < 0) ? fileName : fileName.substring(0, firstDot);
    }

    /**
     * Builds the command-line options understood by this tool:
     * <code>inputPDBDir</code>, <code>inputSeqFile</code>,
     * <code>outputDir</code> and the required <code>outputFormat</code>. Every
     * option takes exactly one argument.
     * 
     * @return a new {@link Options} instance holding the four options
     */
    @SuppressWarnings("static-access")
    private static Options createOptions() {
        // each builder chain is self-contained and ends with create(), so no
        // option depends on settings made in a separate statement
        Option inputPDBDir = OptionBuilder.withArgName("/path/to/input/dir").hasArg(true)
                .withDescription("The directory where your input PDB files are located").create("inputPDBDir");
        // give the sequence file a descriptive argument name like its siblings
        Option inputSeqFile = OptionBuilder.withArgName("/path/to/input/file").hasArg(true)
                .withDescription("The path to the file of one aptamer sequence per line").create("inputSeqFile");
        Option outputDir = OptionBuilder.withArgName("/path/to/output/dir").hasArg(true)
                .withDescription("The directory where the cycle output will be stored").create("outputDir");
        Option outputFormat = OptionBuilder.withArgName("outputFormat").hasArg(true)
                .withDescription("The output format for the cycles (RDF|tsv)").isRequired().create("outputFormat");

        Options o = new Options();
        o.addOption(inputSeqFile);
        o.addOption(outputFormat);
        o.addOption(inputPDBDir);
        o.addOption(outputDir);
        return o;
    }

    /**
     * Prints the usage text for the options returned by
     * {@link #createOptions()} using a default {@link HelpFormatter}.
     */
    private static void printUsage() {
        Options usageOptions = createOptions();
        new HelpFormatter().printHelp("cycleExtractor [OPTIONS]", usageOptions);
    }

    /**
     * Creates the parser used to read the command line.
     * 
     * @return a new {@link GnuParser}
     */
    private static CommandLineParser createCliParser() {
        CommandLineParser gnuStyleParser = new GnuParser();
        return gnuStyleParser;
    }
}