ubic.gemma.core.apps.DifferentialExpressionAnalysisCli.java Source code

Java tutorial


Here is the source code for ubic.gemma.core.apps.DifferentialExpressionAnalysisCli.java


/*
 * The Gemma project
 *
 * Copyright (c) 2006-2011 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ubic.gemma.core.apps;

import gemma.gsec.SecurityService;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import ubic.gemma.core.analysis.expression.diff.DifferentialExpressionAnalysisConfig;
import ubic.gemma.core.analysis.expression.diff.DifferentialExpressionAnalyzerService;
import ubic.gemma.core.analysis.expression.diff.DifferentialExpressionAnalyzerServiceImpl.AnalysisType;
import ubic.gemma.core.analysis.preprocess.batcheffects.BatchInfoPopulationServiceImpl;
import ubic.gemma.core.analysis.service.ExpressionDataFileService;
import ubic.gemma.core.analysis.util.ExperimentalDesignUtils;
import ubic.gemma.core.util.AbstractCLI;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis;
import ubic.gemma.model.common.auditAndSecurity.eventType.DifferentialExpressionAnalysisEvent;
import ubic.gemma.model.expression.experiment.BioAssaySet;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.ExperimentalFactorValueObject;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.persistence.service.analysis.expression.diff.DifferentialExpressionAnalysisService;
import ubic.gemma.persistence.service.expression.experiment.ExperimentalFactorService;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

/**
 * A command line interface to the {@link DifferentialExpressionAnalysis}.
 *
 * @author keshav
 */
public class DifferentialExpressionAnalysisCli extends ExpressionExperimentManipulatingCLI {

    private final List<Long> factorIds = new ArrayList<>();
    private final List<String> factorNames = new ArrayList<>();
    private DifferentialExpressionAnalyzerService differentialExpressionAnalyzerService = null;
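    /**
     * Whether batch factors should be left out of the analysis. Set to false by the 'usebatch' option, in which
     * case batch factors are included (if they exist).
     */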
    private boolean ignoreBatch = true;
    private boolean delete = false;
    private DifferentialExpressionAnalysisService differentialExpressionAnalysisService;
    private ExpressionDataFileService expressionDataFileService;
    private Long subsetFactorId;
    private String subsetFactorName;
    private boolean tryToCopyOld = false;
    /**
     * Used when processing a single experiment.
     */
    private AnalysisType type = null;

    /**
     * Use moderated statistics.
     */
    private boolean ebayes = DifferentialExpressionAnalysisConfig.DEFAULT_EBAYES;

    private boolean persist = true;

    /**
     * Command-line entry point: creates a new instance of this CLI and hands it, together with the raw arguments,
     * to {@code executeCommand}.
     *
     * @param args the command-line arguments, passed through unchanged
     */
    public static void main(String[] args) {
        DifferentialExpressionAnalysisCli p = new DifferentialExpressionAnalysisCli();
        executeCommand(p, args);


    /**
     * @return the name of this command, {@code "diffExAnalyze"}
     */
    public String getCommandName() {
        return "diffExAnalyze";

    /**
     * Parses the command line and then processes the entries of {@code expressionExperiments} one by one via
     * {@code processExperiment}.
     * <p>
     * NOTE(review): this listing shows no closing braces and the bodies of some branches are missing; the
     * skipping behaviour mentioned in the inline comments is an assumption to confirm against the upstream file.
     *
     * @param args the raw command-line arguments, forwarded to {@code processCommandLine}
     * @return the exception reported by {@code processCommandLine} when it is non-null; otherwise {@code null}
     * after the loop over the experiments
     */
    protected Exception doWork(String[] args) {
        // Stop right away if option parsing reported a problem.
        Exception err = this.processCommandLine(args);
        if (err != null) {
            return err;

        // Looked up once; used below to check ownership of each experiment.
        SecurityService securityService = this.getBean(SecurityService.class);

        for (BioAssaySet ee : expressionExperiments) {
            // Entries are cast to ExpressionExperiment further down. The body of this check is not visible here;
            // presumably other BioAssaySet types are skipped -- TODO confirm.
            if (!(ee instanceof ExpressionExperiment)) {

            // Announce each experiment only when more than one is being processed.
            if (expressionExperiments.size() > 1) {
                AbstractCLI.log.info(">>>>>> Begin processing: " + ee);

            // NOTE(review): the next line reads like comment text whose delimiters were lost; left untouched.
             * This is really only important when running as admin and in a batch mode.

            // In a multi-experiment run, warn about experiments not owned by the current user. Whether the
            // experiment is then actually skipped (as the message says) is not visible here -- TODO confirm.
            if (!securityService.isOwnedByCurrentUser(ee) && this.expressionExperiments.size() > 1) {
                AbstractCLI.log.warn("Experiment is not owned by current user, skipping: " + ee);

            this.processExperiment((ExpressionExperiment) ee);


        return null;

    /**
     * @return a one-sentence description of what this command does
     */
    public String getShortDesc() {
        return "Analyze expression data sets for differentially expressed genes.";

    /**
     * Declares the command-line options of this tool. The visible declarations are {@code factors},
     * {@code subset}, {@code type}, {@code usebatch}, {@code nodb}, {@code redo} and {@code ebayes}; the method also
     * sets {@code autoSeekEventType} to {@link DifferentialExpressionAnalysisEvent}.
     * <p>
     * NOTE(review): this listing has lost lines. The {@code Option.builder(...)} chains are cut off before their
     * end, the calls that register those options are not visible, and a description string near the end has no
     * surrounding call. Verify against the upstream file before relying on this method.
     */
    protected void buildOptions() {

        // NOTE(review): the next two lines read like comment text whose delimiters were lost; left untouched.
         * These options from the super class support: running on one or more data sets from the command line, running
         * on list of data sets from a file, running on all data sets.

        /* Supports: running on all data sets that have not been run since a given date. */

        //        Option topOpt = Option.builder( "top" ).hasArg().argName( "number" ).desc( "The top (most significant) results to display." )
        //                .build();
        //        super.addOption( topOpt );

        this.autoSeekEventType = DifferentialExpressionAnalysisEvent.class;

        // Factors to analyze; processOptions() splits the value on commas.
        // The rest of this builder chain is not visible here.
        Option factors = Option.builder("factors").desc(
                "ID numbers, categories or names of the factor(s) to use, comma-delimited, with spaces replaced by underscores")


        // Factor used for subsetting; the rest of this builder chain is not visible here.
        Option subsetFactor = Option.builder("subset").desc(
                "ID number, category or name of the factor to use for subsetting the analysis; must also use with -factors")

        // Explicit analysis type; takes an argument. The end of this builder chain is not visible here.
        Option analysisType = Option.builder("type").hasArg().desc(
                "Type of analysis to perform. If omitted, the system will try to guess based on the experimental design. "
                        + "Choices are : TWO_WAY_ANOVA_WITH_INTERACTION, "
                        + "TWO_WAY_ANOVA_NO_INTERACTION , OWA (one-way ANOVA), TTEST, OSTTEST (one-sample t-test),"
                        + " GENERICLM (generic LM, no interactions); default: auto-detect")


        // Despite the variable name, the option is called "usebatch": processOptions() turns ignoreBatch off
        // when it is present. The end of this builder chain is not visible here.
        Option ignoreBatchOption = Option.builder("usebatch").desc(
                "If a 'batch' factor is available, use it. Otherwise, batch information can/will be ignored in the analysis.")


        super.addOption("nodb", "Output files only to your gemma.appdata.home instead of database");

        super.addOption("redo", "If using automatic analysis "
                + "try to base analysis on previous analyses. Will re-run all analyses for the experiment");

        // NOTE(review): the call this description belongs to is not visible; since processOptions() checks for a
        // "delete" option, this is presumably its description -- TODO confirm.
                "Instead of running the analysis on the given experiments, remove the old analyses. Use with care!");

        super.addOption("ebayes", "Use empirical-Bayes moderated statistics. Default: "
                + DifferentialExpressionAnalysisConfig.DEFAULT_EBAYES);


    /**
     * Looks up the services this tool needs and copies the parsed command-line options into the fields of this
     * object, rejecting option combinations that are not supported.
     * <p>
     * The visible checks throw {@link IllegalArgumentException} when {@code type}, {@code subset} or
     * {@code factors} is used with more than one experiment, when {@code type} or {@code subset} is given
     * without {@code factors}, and when {@code factors} is combined with {@code redo}.
     * <p>
     * NOTE(review): this listing shows no closing braces, and the handling of the individual factor tokens at the
     * end of the method is cut off; verify against the upstream file.
     */
    protected void processOptions() {
        differentialExpressionAnalyzerService = this.getBean(DifferentialExpressionAnalyzerService.class);
        differentialExpressionAnalysisService = this.getBean(DifferentialExpressionAnalysisService.class);
        expressionDataFileService = this.getBean(ExpressionDataFileService.class);
        // An explicit analysis type is only accepted for a single experiment with explicitly given factors.
        if (this.hasOption("type")) {

            if (this.expressionExperiments.size() > 1) {
                throw new IllegalArgumentException(
                        "You can only specify the analysis type when analyzing a single experiment");

            if (!this.hasOption("factors")) {
                throw new IllegalArgumentException(
                        "Please specify the factor(s) when specifying the analysis type.");
            // The option value must match an AnalysisType constant name exactly.
            this.type = AnalysisType.valueOf(this.getOptionValue("type"));

        // A subset factor has the same restrictions: single experiment, factors given explicitly.
        if (this.hasOption("subset")) {
            if (this.expressionExperiments.size() > 1) {
                throw new IllegalArgumentException(
                        "You can only specify the subset factor when analyzing a single experiment");

            if (!this.hasOption("factors")) {
                throw new IllegalArgumentException(
                        "You have to specify the factors if you also specify the subset");

            // A numeric value is taken as a factor id; anything else is kept as a name.
            String subsetFactor = this.getOptionValue("subset");
            try {
                this.subsetFactorId = Long.parseLong(subsetFactor);

            } catch (NumberFormatException e) {
                this.subsetFactorName = subsetFactor;

        // Simple flags.
        if (this.hasOption("usebatch")) {
            this.ignoreBatch = false;

        if (this.hasOption("delete")) {
            this.delete = true;

        if (this.hasOption("ebayes")) {
            this.ebayes = true;

        if (this.hasOption("nodb")) {
            this.persist = false;

        this.tryToCopyOld = this.hasOption("redo");

        if (this.hasOption("factors")) {

            if (this.tryToCopyOld) {
                throw new IllegalArgumentException("You can't specify 'redo' and 'factors' together");

            if (this.expressionExperiments.size() > 1) {
                throw new IllegalArgumentException(
                        "You can only specify the factors when analyzing a single experiment");

            // Comma-separated list; each token is first tried as a numeric factor id.
            String rawFactors = this.getOptionValue("factors");
            String[] factorIDst = StringUtils.split(rawFactors, ",");
            if (factorIDst != null && factorIDst.length > 0) {
                for (String string : factorIDst) {
                    // NOTE(review): what happens with the parsed id and in the catch branch is not visible here;
                    // presumably ids go to factorIds and other tokens to factorNames -- TODO confirm.
                    try {
                        Long factorId = Long.parseLong(string);
                    } catch (NumberFormatException e) {

    /**
     * Handles a single experiment: runs the differential expression analysis on it, either with the factors
     * returned by {@code guessFactors} or, when that collection is empty, in the automatic branch. Any exception
     * raised in the body is caught, logged and recorded in {@code errorObjects} instead of being propagated.
     * <p>
     * NOTE(review): this listing has lost many lines (closing braces, the delete call, the configuration of
     * {@code config}, parts of several statements). The inline comments describe only what is visible; verify
     * against the upstream file.
     *
     * @param ee the experiment to process; it is replaced by the result of {@code eeService.thawLite(ee)}
     */
    private void processExperiment(ExpressionExperiment ee) {
        Collection<DifferentialExpressionAnalysis> results;
        DifferentialExpressionAnalysisConfig config = new DifferentialExpressionAnalysisConfig();

        try {

            ee = this.eeService.thawLite(ee);

            // Delete mode: only the log message and the success record are visible here; the actual removal and
            // any early return are not -- TODO confirm.
            if (delete) {
                AbstractCLI.log.info("Deleting any analyses for experiment=" + ee);
                successObjects.add("Deleted analysis for: " + ee.toString());

            // NOTE(review): the end of this statement is not visible in this listing.
            Collection<ExperimentalFactor> experimentalFactors = ee.getExperimentalDesign()
            // No factors at all: fail for a single-experiment run. The matching else branch (a warning, judging by
            // the ".warn(...)" fragment below) is only partly visible.
            if (experimentalFactors.size() == 0) {
                if (this.expressionExperiments.size() == 1) {
                    // NOTE(review): the next line reads like comment text whose delimiters were lost.
                     * Only need to be noisy if this is the only ee. Batch processing should be less so.
                    throw new RuntimeException(
                            "Experiment does not have an experimental design populated: " + ee.getShortName());
                        .warn("Experiment does not have an experimental design populated: " + ee.getShortName());

            // Factors resolved from the command-line arguments; empty when none were given.
            Collection<ExperimentalFactor> factors = this.guessFactors(ee);

            if (factors.size() > 0) {
                // NOTE(review): the next line reads like comment text whose delimiters were lost.
                 * Manual selection of factors
                ExperimentalFactor subsetFactor = this.getSubsetFactor(ee);

                AbstractCLI.log.info("Using " + factors.size() + " factors provided as arguments");

                // The subset factor must not be one of the factors being analyzed.
                if (subsetFactor != null) {
                    if (factors.contains(subsetFactor)) {
                        throw new IllegalArgumentException(
                                "Subset factor cannot also be included as factor to analyze");
                    AbstractCLI.log.info("Subsetting by " + subsetFactor);


                // NOTE(review): how rnaSeq is used is not visible in this listing.
                boolean rnaSeq = super.eeService.isRNASeq(ee);
                // NOTE(review): the next line reads like comment text whose delimiters were lost.
                 * Interactions included by default. It's actually only complicated if there is a subset factor.
                // Body of this check is not visible here.
                if (type == null && factors.size() == 2) {

                results = this.differentialExpressionAnalyzerService.runDifferentialExpressionAnalyses(ee, config);

            } else {
                // NOTE(review): the next line reads like comment text whose delimiters were lost.
                 * Automatically

                // Body of this check is not visible here.
                if (tryToCopyOld) {

                Collection<ExperimentalFactor> factorsToUse = new HashSet<>();

                // When batch factors are ignored, each factor is tested with isBatch(); the statements that fill
                // factorsToUse (here and in the else branch) are not visible.
                if (this.ignoreBatch) {
                    for (ExperimentalFactor ef : experimentalFactors) {
                        if (!ExperimentalDesignUtils.isBatch(ef)) {
                } else {

                if (factorsToUse.isEmpty()) {
                    throw new RuntimeException("No factors available for " + ee.getShortName());

                // More than three factors cannot be handled automatically unless an old analysis is redone.
                if (factorsToUse.size() > 3) {

                    if (!tryToCopyOld) {
                        throw new RuntimeException("Experiment has too many factors to run automatically: "
                                + ee.getShortName()
                                + "; try using the -redo flag to base it on an old analysis, or select factors manually");
                    results = this.tryToRedoBasedOnOldAnalysis(ee);

                } else {


                    if (factorsToUse.size() == 2) {
                        // include interactions by default

                    boolean rnaSeq = super.eeService.isRNASeq(ee);

                    // NOTE(review): the remaining arguments of this call are not visible in this listing.
                    results = this.differentialExpressionAnalyzerService.runDifferentialExpressionAnalyses(ee,


            // A null result is reported as a failure through the catch block below.
            if (results == null) {
                throw new Exception(
                        "Failed to process differential expression for experiment " + ee.getShortName());

            // When results are not persisted (persist is cleared by the "nodb" option), write an archive file
            // for each analysis instead.
            if (!this.persist) {
                AbstractCLI.log.info("Writing results to disk");
                for (DifferentialExpressionAnalysis r : results) {
                    expressionDataFileService.writeDiffExArchiveFile(ee, r, config);


        } catch (Exception e) {
            // Only the message is logged and recorded; the exception itself is not rethrown.
            AbstractCLI.log.error("Error while processing " + ee + ": " + e.getMessage());
            errorObjects.add(ee + ": " + e.getMessage());


    /**
     * Resolves the factor to subset the analysis by, based on {@code subsetFactorName} or, failing that,
     * {@code subsetFactorId}.
     * <p>
     * NOTE(review): this listing shows no closing braces and one statement is cut off; verify against the
     * upstream file.
     *
     * @param ee the experiment whose experimental design is searched when a name was given
     * @return the matching factor, or {@code null} when neither a subset factor name nor an id is set
     * @throws IllegalArgumentException if a name was given but no factor matched it, or if an id was given and
     * {@code efs.load} returned {@code null} for it
     */
    private ExperimentalFactor getSubsetFactor(ExpressionExperiment ee) {
        ExperimentalFactorService efs = this.getBean(ExperimentalFactorService.class);
        ExperimentalFactor subsetFactor = null;
        if (StringUtils.isNotBlank(this.subsetFactorName)) {
            // NOTE(review): the end of this statement is not visible in this listing.
            Collection<ExperimentalFactor> experimentalFactors = ee.getExperimentalDesign()
            for (ExperimentalFactor experimentalFactor : experimentalFactors) {

                // has already implemented way of figuring out human-friendly name of factor value.
                ExperimentalFactorValueObject fvo = new ExperimentalFactorValueObject(experimentalFactor);

                // Batch factors are logged when ignoreBatch is set; whether they are then skipped is not visible
                // here -- TODO confirm.
                if (ignoreBatch && BatchInfoPopulationServiceImpl.isBatchFactor(experimentalFactor)) {
                    AbstractCLI.log.info("Ignoring batch factor:" + experimentalFactor);

                // Match on the factor name first, then on the category, both with spaces replaced by underscores.
                if (subsetFactorName.equals(experimentalFactor.getName().replaceAll(" ", "_"))) {
                    subsetFactor = experimentalFactor;
                } else if (fvo.getCategory() != null
                        && subsetFactorName.equals(fvo.getCategory().replaceAll(" ", "_"))) {
                    subsetFactor = experimentalFactor;

            if (subsetFactor == null)
                throw new IllegalArgumentException("Didn't find factor for provided subset factor name");

            return subsetFactor;

        } else if (this.subsetFactorId != null) {
            // No name given: load the factor directly by id.
            subsetFactor = efs.load(subsetFactorId);
            if (subsetFactor == null) {
                throw new IllegalArgumentException("No factor for id=" + subsetFactorId);
            return subsetFactor;
        return null;

    /**
     * Determines which factors to use when they were given on the command line. Only applicable when the analysis
     * is run on a single data set.
     */
    private Collection<ExperimentalFactor> guessFactors(ExpressionExperiment ee) {
        // Resolves the factors named in factorNames or identified in factorIds against the given experiment and
        // returns them; the returned collection stays empty when neither list has entries. Throws
        // IllegalArgumentException when names and ids are mixed, when not every name could be matched, when an id
        // cannot be loaded, or when a loaded factor belongs to a different experimental design.
        // NOTE(review): this listing shows no closing braces and the statements that add to 'factors' are not
        // visible; verify against the upstream file.
        Collection<ExperimentalFactor> factors = new HashSet<>();

        ExperimentalFactorService efs = this.getBean(ExperimentalFactorService.class);
        if (this.factorNames.size() > 0) {
            if (this.factorIds.size() > 0) {
                throw new IllegalArgumentException("Please provide factor names or ids, not a mixture of each");
            // NOTE(review): the end of this statement is not visible in this listing.
            Collection<ExperimentalFactor> experimentalFactors = ee.getExperimentalDesign()
            for (ExperimentalFactor experimentalFactor : experimentalFactors) {

                // has already implemented way of figuring out human-friendly name of factor value.
                ExperimentalFactorValueObject fvo = new ExperimentalFactorValueObject(experimentalFactor);

                // Batch factors are logged when ignoreBatch is set; whether they are then skipped is not visible
                // here -- TODO confirm.
                if (ignoreBatch && BatchInfoPopulationServiceImpl.isBatchFactor(experimentalFactor)) {
                    AbstractCLI.log.info("Ignoring batch factor:" + experimentalFactor);

                // Match on the factor name first, then on the category, both with spaces replaced by underscores.
                // The branch bodies are not visible; presumably the factor is added to 'factors' -- TODO confirm.
                if (factorNames.contains(experimentalFactor.getName().replaceAll(" ", "_"))) {
                } else if (fvo.getCategory() != null
                        && factorNames.contains(fvo.getCategory().replaceAll(" ", "_"))) {

            // Every requested name must have produced exactly one factor.
            if (factors.size() != factorNames.size()) {
                throw new IllegalArgumentException("Didn't find factors for all the provided factor names");

        } else if (this.factorIds.size() > 0) {
            for (Long factorId : factorIds) {
                // This branch is only reached when the factorNames check above was false.
                if (this.factorNames.size() > 0) {
                    throw new IllegalArgumentException("Please provide factor names or ids, not a mixture of each");
                ExperimentalFactor factor = efs.load(factorId);
                // NOTE(review): thaw() is invoked before the null check below, so it may receive null when the
                // id is unknown -- confirm that thaw() tolerates that.
                factor = efs.thaw(factor);
                if (factor == null) {
                    throw new IllegalArgumentException("No factor for id=" + factorId);
                if (!factor.getExperimentalDesign().equals(ee.getExperimentalDesign())) {
                    throw new IllegalArgumentException("Factor with id=" + factorId + " does not belong to " + ee);

                // NOTE(review): the call that uses the message below is not visible in this listing.
                if (ignoreBatch && BatchInfoPopulationServiceImpl.isBatchFactor(factor)) {
                            "Selected factor looks like a batch, and 'ignoreBatch' is true, skipping:" + factor);


        return factors;

    /**
     * Runs the analysis using a configuration based on an old analysis.
     */
    private Collection<DifferentialExpressionAnalysis> tryToRedoBasedOnOldAnalysis(ExpressionExperiment ee) {
        // Redoes each previously stored analysis of the experiment through redoAnalysis() and returns the combined
        // results as a set. Throws IllegalArgumentException when there is no old analysis to start from.
        // NOTE(review): the end of the next statement (the service call that fetches the old analyses) and the
        // closing braces are not visible in this listing.
        Collection<DifferentialExpressionAnalysis> oldAnalyses = differentialExpressionAnalysisService

        if (oldAnalyses.isEmpty()) {
            throw new IllegalArgumentException("There are no old analyses to redo");

        AbstractCLI.log.info("Will attempt to redo " + oldAnalyses.size() + " analyses for " + ee);
        Collection<DifferentialExpressionAnalysis> results = new HashSet<>();
        for (DifferentialExpressionAnalysis copyMe : oldAnalyses) {
            // The persist flag is passed through to redoAnalysis().
            results.addAll(this.differentialExpressionAnalyzerService.redoAnalysis(ee, copyMe, this.persist));
        return results;

