mzmatch.ipeak.normalisation.VanDeSompele.java Source code

Introduction

Here is the source code for mzmatch.ipeak.normalisation.VanDeSompele.java
Source

/* Copyright (C) 2008, Groningen Bioinformatics Centre (http://gbic.biol.rug.nl/)
 * This file is part of mzmatch.
 * 
 * PeakML is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * mzmatch is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PeakML; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

package mzmatch.ipeak.normalisation;

// java
import java.io.*;
import java.awt.*;

import java.util.*;
import java.util.zip.*;

// jfreechart
import org.jfree.chart.*;
import org.jfree.chart.axis.*;
import org.jfree.chart.plot.*;
import org.jfree.chart.renderer.category.*;

import org.jfree.data.category.*;

// libraries
import cmdline.*;

// peakml
import peakml.*;
import peakml.math.*;
import peakml.graphics.*;
import peakml.chemistry.*;

import peakml.io.*;
import peakml.io.peakml.*;
import peakml.io.chemistry.*;

// mzmatch
import mzmatch.util.*;

@SuppressWarnings("unchecked")
public class VanDeSompele {
    // implementation
    /**
     * Counts all non IPeakSet instances hidden in the given IPeak instance.
     * 
     * @param peak      The IPeak instance to evaluate.
     * @return         Number of non IPeakSet instances.
     */
    public static int count(IPeak peak) {
        if (peak.getClass().equals(IPeakSet.class)) {
            int cnt = 0;
            for (IPeak p : (IPeakSet<IPeak>) peak)
                cnt += count(p);
            return cnt;
        }

        return 1;
    }

    // main entrance
    final static String version = "1.0.0";
    final static String application = "VanDeSompele";

    @OptionsClass(name = application, version = version, author = "RA Scheltema (r.a.scheltema@rug.nl)", description = "Applies a basic normalisation scheme to the PeakML file resulting from the combine process. "
            + "The method looks for house-hold metabolites not expected to change. These metabolites are "
            + "defined as the metabolites showing the least changes compared to the detections in all "
            + "of the other measurements. This is reflected by a stability score (stored as an annotation "
            + "for each of the entries labeled 'stability factor'). These stability scores are calculated "
            + "from all the metabolites that could be identified with the given database, within the given "
            + "ppm-range. The identified metabolites need to have been identified in all of the used "
            + "measurements." + "\n\n"
            + "From the top 10% (at least 10) most stable, identifed metabolites the normalisation factors "
            + "are calculated and applied to the rest of the data.", example = "Windows batch-file:\n"
                    + "SET JAVA=java -cp mzmatch.jar -da -dsa -Xmn1g -Xms1425m -Xmx1425m -Xss128k -XX:+UseParallelGC -XX:ParallelGCThreads=10\n"
                    + "\n" + "REM extract all the mass chromatograms\n"
                    + "%JAVA% mzmatch.ipeak.normalisaton.VanDeSompele -v -i combined.peakml -o normalized.peakml -selection selection.peakml -ppm 3 \n", references = "1. Vandesompele J, De Preter K, Pattyn F, Poppe B, Van Roy N, De Paepe A, Speleman F. Accurate normalization of real-time quantitative RT-PCR data by geometric averaging of multiple internal control genes. Genome Biol. 2002.")
    public static class Options {
        @Option(name = "i", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the input file. The only allowed format is PeakML and when it is "
                + "not set the input is read from standard in. The tool expects a combination "
                + "of peaks from different sets and will exit when this is not encountered.")
        public String input = null;
        @Option(name = "o", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the ouput file. The file is written in the PeakML file format and "
                + "contains all the peaks with normalized intensities. When this option is not "
                + "set the output is written to the standard out. Be sure to unset the verbose "
                + "option when setting up a pipeline reading and writing from the standard in- " + "and outputs.")
        public String output = null;
        @Option(name = "selection", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the file where the un-normalized selection of peaks is written.")
        public String selection = null;
        @Option(name = "selection_normalized", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the file where the normalizaed selection of peaks is written.")
        public String selection_normalized = null;
        @Option(name = "img", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the file where a graph of the normalization factors is written. This "
                + "file is in PDF format.")
        public String img = null;
        @Option(name = "factors", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the file where the normalization factors are written.")
        public String factors = null;

        @Option(name = "ppm", param = "double", type = Option.Type.REQUIRED_ARGUMENT, usage = "The accuracy of the measurement in parts-per-milion. This value is used for "
                + "matching the masses to those found in the supplied databases. This value is " + "obligitory.")
        public double ppm = -1;
        @Option(name = "database", param = "filename", type = Option.Type.REQUIRED_ARGUMENT, usage = "Option for the molecule databases to match the contents of the input file to. "
                + "This file should adhere to the compound-xml format.")
        public String database = null;

        @Option(name = "v", param = "", type = Option.Type.NO_ARGUMENT, usage = "When this is set, the progress is shown on the standard output.")
        public boolean verbose = false;
        @Option(name = "h", param = "", type = Option.Type.NO_ARGUMENT, usage = "When this is set, the help is shown.")
        public boolean help = false;
    }

    public static void main(String args[]) {
        try {
            Tool.init();

            // parse the commandline options
            Options options = new Options();
            CmdLineParser cmdline = new CmdLineParser(options);

            // check whether we need to show the help
            cmdline.parse(args);
            if (options.help) {
                Tool.printHeader(System.out, application, version);
                cmdline.printUsage(System.out, "");
                return;
            }

            if (options.verbose) {
                Tool.printHeader(System.out, application, version);
                cmdline.printOptions();
            }

            // check the command-line parameters
            {
                // if the output directories do not exist, create them
                if (options.output != null)
                    Tool.createFilePath(options.output, true);
            }

            // load the data
            if (options.verbose)
                System.out.println("Loading data");
            ParseResult result = PeakMLParser.parse(new FileInputStream(options.input), true);

            Header header = result.header;
            IPeakSet<IPeakSet<? extends IPeak>> peaksets = (IPeakSet<IPeakSet<? extends IPeak>>) result.measurement;

            int nrmeasurements = header.getNrMeasurementInfos();

            // remove the stability factor annotation
            for (IPeak peak : peaksets)
                peak.removeAnnotation("stability factor");

            // load the database
            if (options.verbose)
                System.out.println("Loading the molecule database");
            HashMap<String, Molecule> database = MoleculeIO.parseXml(new FileInputStream(options.database));

            // filter the set to include only identifiable metabolites
            if (options.verbose)
                System.out.println("Creating selection");
            Vector<IPeakSet<? extends IPeak>> selection = new Vector<IPeakSet<? extends IPeak>>();
            for (Molecule molecule : database.values()) {
                double mass = molecule.getMass(Mass.MONOISOTOPIC);
                double delta = PeriodicTable.PPM(mass, options.ppm);

                // get the most intense peak containing all the measurements
                Vector<IPeakSet<? extends IPeak>> neighbourhoud = peaksets.getPeaksInMassRange(mass - delta,
                        mass + delta);
                Collections.sort(neighbourhoud, IPeak.sort_intensity_descending);
                for (IPeakSet<? extends IPeak> neighbour : neighbourhoud)
                    if (count(neighbour) == nrmeasurements) {
                        selection.add(neighbour);
                        break;
                    }
            }

            // calculate the stability factor for each peak in the selection
            if (options.verbose)
                System.out.println("Calculating stability factors");
            for (int peakid1 = 0; peakid1 < selection.size(); ++peakid1) {
                double stddeviations[] = new double[selection.size()];

                IPeakSet<? extends IPeak> peakset1 = selection.get(peakid1);
                for (int peakid2 = 0; peakid2 < selection.size(); ++peakid2) {
                    IPeakSet<? extends IPeak> peakset2 = selection.get(peakid2);

                    double values[] = new double[nrmeasurements];
                    for (int measurementid = 0; measurementid < nrmeasurements; ++measurementid) {
                        int measurementid1 = peakset1.get(measurementid).getMeasurementID();
                        int setid1 = header.indexOfSetInfo(header.getSetInfoForMeasurementID(measurementid1));
                        int measurementid2 = peakset2.get(measurementid).getMeasurementID();
                        int setid2 = header.indexOfSetInfo(header.getSetInfoForMeasurementID(measurementid2));
                        if (setid1 != setid2 || measurementid1 != measurementid2)
                            System.err.println("[WARNING]: differing setid or spectrumid for comparison");

                        values[measurementid] = Math.log(peakset1.get(measurementid).getIntensity()
                                / peakset2.get(measurementid).getIntensity()) / Math.log(2);
                    }
                    stddeviations[peakid2] = Statistical.stddev(values);
                }

                peakset1.addAnnotation("stability factor", Statistical.mean(stddeviations));
            }

            // sort on the stability factor
            Collections.sort(selection, new IPeak.AnnotationAscending("stability factor"));

            // take the top 10% and calculate the geometric mean
            if (options.verbose)
                System.out.println("Calculating normalisation factors");
            int nrselected = (int) (0.1 * selection.size());
            if (nrselected < 10)
                nrselected = (10 < selection.size() ? 10 : selection.size());
            double normalization_factors[] = new double[nrmeasurements];
            for (int measurementid = 0; measurementid < nrmeasurements; ++measurementid) {
                double values[] = new double[nrselected];
                for (int i = 0; i < nrselected; ++i) {
                    IPeak peak = selection.get(i).get(measurementid);
                    values[i] = peak.getIntensity();
                }
                normalization_factors[measurementid] = Statistical.geomean(values);
            }

            // scale the found normalization factors
            double maxnf = Statistical.max(normalization_factors);
            for (int sampleid = 0; sampleid < nrmeasurements; ++sampleid)
                normalization_factors[sampleid] /= maxnf;

            // write the selection if needed
            if (options.selection != null) {
                if (options.verbose)
                    System.out.println("Writing original selection data");

                PeakMLWriter.write(result.header, selection, null,
                        new GZIPOutputStream(new FileOutputStream(options.selection)), null);
            }

            // normalize all the peaks
            if (options.verbose)
                System.out.println("Normalizing all the entries");
            for (IPeakSet<? extends IPeak> peakset : peaksets) {
                for (int measurementid = 0; measurementid < nrmeasurements; ++measurementid) {
                    // TODO why did I do this again ?
                    int id = 0;
                    int setid = 0;
                    int spectrumid = 0;
                    for (int i = 0; i < header.getNrSetInfos(); ++i) {
                        SetInfo set = header.getSetInfos().get(i);

                        if (id + set.getNrMeasurementIDs() > measurementid) {
                            setid = i;
                            spectrumid = measurementid - id;
                            break;
                        } else
                            id += set.getNrMeasurementIDs();
                    }

                    MassChromatogram<Peak> masschromatogram = null;
                    for (IPeak p : peakset) {
                        int mymeasurementid = p.getMeasurementID();
                        int mysetid = header.indexOfSetInfo(header.getSetInfoForMeasurementID(mymeasurementid));
                        if (mysetid == setid && mymeasurementid == spectrumid) {
                            masschromatogram = (MassChromatogram<Peak>) p;
                            break;
                        }
                    }
                    if (masschromatogram == null)
                        continue;

                    for (IPeak peak : masschromatogram.getPeaks())
                        peak.setIntensity(peak.getIntensity() / normalization_factors[measurementid]);
                }
            }

            // write the selection if needed
            if (options.selection_normalized != null) {
                if (options.verbose)
                    System.out.println("Writing the normalized selection data");

                PeakMLWriter.write(result.header, selection, null,
                        new GZIPOutputStream(new FileOutputStream(options.selection_normalized)), null);
            }

            // write the factors if needed
            if (options.factors != null) {
                if (options.verbose)
                    System.out.println("Writing the normalization factors");

                PrintStream out = new PrintStream(options.factors);
                for (int measurementid = 0; measurementid < nrmeasurements; ++measurementid)
                    out.println(header.getMeasurementInfo(measurementid).getLabel() + "\t"
                            + normalization_factors[measurementid]);
            }

            // write the plot if needed
            if (options.img != null) {
                if (options.verbose)
                    System.out.println("Writing the graph");

                DefaultCategoryDataset dataset = new DefaultCategoryDataset();
                JFreeChart linechart = ChartFactory.createLineChart(null, "measurement", "normalization factor",
                        dataset, PlotOrientation.VERTICAL, false, // legend
                        false, // tooltips
                        false // urls
                );

                CategoryPlot plot = (CategoryPlot) linechart.getPlot();
                CategoryAxis axis = (CategoryAxis) plot.getDomainAxis();
                axis.setCategoryLabelPositions(CategoryLabelPositions.UP_45);
                LineAndShapeRenderer renderer = (LineAndShapeRenderer) plot.getRenderer();

                renderer.setSeriesShapesFilled(0, true);
                renderer.setSeriesShapesVisible(0, true);

                linechart.setBackgroundPaint(Color.WHITE);
                linechart.setBorderVisible(false);
                linechart.setAntiAlias(true);

                plot.setBackgroundPaint(Color.WHITE);
                plot.setDomainGridlinesVisible(true);
                plot.setRangeGridlinesVisible(true);

                // create the datasets
                for (int measurementid = 0; measurementid < nrmeasurements; ++measurementid)
                    dataset.addValue(normalization_factors[measurementid], "",
                            header.getMeasurementInfo(measurementid).getLabel());
                JFreeChartTools.writeAsPDF(new FileOutputStream(options.img), linechart, 800, 500);
            }

            // write the normalized values
            if (options.verbose)
                System.out.println("Writing the normalized data");
            PeakMLWriter.write(result.header, peaksets.getPeaks(), null,
                    new GZIPOutputStream(new FileOutputStream(options.output)), null);
        } catch (Exception e) {
            Tool.unexpectedError(e, application);
        }
    }
}