org.bdval.DistributionDifferenceByFeatureMode.java Source code

Java tutorial

Introduction

Here is the source code for org.bdval.DistributionDifferenceByFeatureMode.java

Source

/*
 * Copyright (C) 2008-2010 Institute for Computational Biomedicine,
 *                         Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.bdval;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import edu.cornell.med.icb.geo.tools.ClassificationTask;
import edu.cornell.med.icb.geo.tools.ConditionIdentifiers;
import edu.cornell.med.icb.iterators.RecursiveFileListIterator;
import edu.cornell.med.icb.iterators.TextFileLineIterator;
import edu.cornell.med.icb.util.ICBStringUtils;
import edu.cornell.med.icb.util.ProcessEstimator;
import edu.mssm.crover.tables.ColumnTypeException;
import edu.mssm.crover.tables.InvalidColumnException;
import edu.mssm.crover.tables.Table;
import edu.mssm.crover.tables.TypeMismatchException;
import edu.mssm.crover.tables.readers.SyntaxErrorException;
import edu.mssm.crover.tables.readers.UnsupportedFormatException;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.doubles.DoubleList;
import it.unimi.dsi.fastutil.objects.Object2ObjectLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.util.Properties;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bdval.modelconditions.ProcessModelConditionsMode;
import org.bdval.signalquality.BaseSignalQualityCalculator;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Compare the distribution of each feature used in a specific set of biomarker models. Distribution differences are
 * quantified for feature signal between two sample sets (i.e., training set vs. validation set). A P-value
 * (Kolmogorov-Smirnov) and a ratio of rank statistics are evaluated for each feature in each model processed.
 *
 * @author Kevin Dorff
 */
public class DistributionDifferenceByFeatureMode extends DAVMode {

    /**
     * The logger for this class.
     */
    private static final Log LOG = LogFactory.getLog(DistributionDifferenceByFeatureMode.class);

    /**
     * Map of dataset name to dataset details. Populated in interpretArguments()
     * from the maqcii properties file.
     */
    private Map<String, Map<String, String>> datasetName2DetailsMap;

    /**
     * The signal quality calculator object. Created in interpretArguments()
     * from the "signal-quality-calc-class" option and closed at the end of process().
     */
    private BaseSignalQualityCalculator signalQualityCalcObj;

    /**
     * Map of model id's to model filename prefixes.
     */
    private Map<String, String> modelIdToModelPrefixMap;

    /**
     * Map of model id's to model conditions.
     */
    private Map<String, Map<String, String>> modelIdToModelConditionsMap;

    /**
     * The "eval-dataset-root" directory given on the command line, or null when
     * '-' was given. When null, process() uses the "dataset-root" value from each
     * model's conditions instead.
     */
    private String evalDatasetRoot;

    /**
     * The label used to denote training values in the properties file.
     */
    private String propertiesTrainingLabel;

    /**
     * The label used to denote validation values in the properties file.
     */
    private String propertiesValidationLabel;

    /**
     * Cache tables by filename in memory.
     */
    private final Map<String, Table> tableCache = new Object2ObjectOpenHashMap<String, Table>();

    /**
     * Models to exclude from processing. An empty set means no model is excluded.
     */
    private Set<String> excludeModelSet;

    /**
     * True if we should write extended output.
     */
    private boolean extendedOutput;

    /**
     * If true classes will be merged, otherwise the classes will be written separately.
     */
    private boolean mergeClasses;

    /**
     * The maximum number of classes, important for the output file header to be correct.
     * Forced to 1 in interpretArguments() when classes are merged.
     */
    private int maxNumClasses;

    /**
     * Cache of the filenames to ClassificationTask maps.
     */
    private final Map<String, ClassificationTask> filenamesToClassificationTaskMap = new Object2ObjectOpenHashMap<String, ClassificationTask>();

    /**
     * Register the command line options understood by this mode and neutralize the
     * inherited options ("input", "task-list", "platform-filenames", "conditions")
     * that this mode does not use by giving them an "N/A" default.
     *
     * @param jsap the JSAP command line parser to register the options with
     * @throws JSAPException if an option cannot be registered
     */
    @Override
    public void defineOptions(final JSAP jsap) throws JSAPException {
        // Inherited options that are meaningless for this mode: input file,
        // task definitions, platform filenames and condition ids.
        jsap.getByID("input").addDefault("N/A");
        jsap.getByID("task-list").addDefault("N/A");
        jsap.getByID("platform-filenames").addDefault("N/A");
        jsap.getByID("conditions").addDefault("N/A");

        // --- Required inputs -------------------------------------------------
        final Parameter propertiesFile = new FlaggedOption("maqcii-properties-file")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault(JSAP.NO_DEFAULT)
                .setRequired(true)
                .setLongFlag("maqcii-properties-file")
                .setHelp("The maqcii properties file such as 'maqcii-c.properties'.");
        jsap.registerParameter(propertiesFile);

        final Parameter conditionsFile = new FlaggedOption("model-conditions-file")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault(JSAP.NO_DEFAULT)
                .setRequired(true)
                .setLongFlag("model-conditions-file")
                .setHelp("The model-conditions-file such as 'model-conditions.txt'.");
        jsap.registerParameter(conditionsFile);

        final Parameter modelsDirectory = new FlaggedOption("models-dir")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault(JSAP.NO_DEFAULT)
                .setRequired(true)
                .setLongFlag("models-dir")
                .setHelp("The directory containing models (may be within sub-directories).");
        jsap.registerParameter(modelsDirectory);

        // --- Model selection -------------------------------------------------
        final Parameter includeModels = new FlaggedOption("model-list")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault("all")
                .setRequired(false)
                .setLongFlag("model-list")
                .setHelp("The models to process (or 'all' to process all models). Comma separated, such as 'DUDTR,YTNJM'.");
        jsap.registerParameter(includeModels);

        final Parameter excludeModels = new FlaggedOption("model-exclude-list")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault("none")
                .setRequired(false)
                .setLongFlag("model-exclude-list")
                .setHelp("The models to NOT process (or 'none' to process all models). Comma separated, such as 'DUDTR,YTNJM'.");
        jsap.registerParameter(excludeModels);

        // --- Calculator and dataset location ----------------------------------
        final Parameter calculatorClass = new FlaggedOption("signal-quality-calc-class")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault(JSAP.NO_DEFAULT)
                .setRequired(true)
                .setLongFlag("signal-quality-calc-class")
                .setHelp("Fully qualified classname for an AbstractSignalQualityCalculator class.");
        jsap.registerParameter(calculatorClass);

        final Parameter evalRoot = new FlaggedOption("eval-dataset-root")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault("-")
                .setRequired(false)
                .setLongFlag("eval-dataset-root")
                .setHelp("The eval-dataset-root directory or specify '-' to use the dataset-root directory specified in the model-conditions file");
        jsap.registerParameter(evalRoot);

        // --- Labels used in the properties file --------------------------------
        final Parameter trainingLabel = new FlaggedOption("properties-training-label")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault("training")
                .setRequired(false)
                .setLongFlag("properties-training-label")
                .setHelp("The label used to denote training values in the properties file.");
        jsap.registerParameter(trainingLabel);

        final Parameter validationLabel = new FlaggedOption("properties-validation-label")
                .setStringParser(JSAP.STRING_PARSER)
                .setDefault("validation")
                .setRequired(false)
                .setLongFlag("properties-validation-label")
                .setHelp("The label used to denote validation values in the properties file.");
        jsap.registerParameter(validationLabel);

        // --- Output shaping ----------------------------------------------------
        final Parameter extended = new FlaggedOption("extended-output")
                .setStringParser(JSAP.BOOLEAN_PARSER)
                .setDefault("false")
                .setRequired(true)
                .setLongFlag("extended-output")
                .setHelp("If true, extra output will be included.");
        jsap.registerParameter(extended);

        final Parameter merge = new FlaggedOption("merge-classes")
                .setStringParser(JSAP.BOOLEAN_PARSER)
                .setDefault("false")
                .setRequired(true)
                .setLongFlag("merge-classes")
                .setHelp("If true, all classes will be merged.");
        jsap.registerParameter(merge);

        final Parameter classLimit = new FlaggedOption("max-num-classes")
                .setStringParser(JSAP.INTEGER_PARSER)
                .setDefault("2")
                .setRequired(true)
                .setLongFlag("max-num-classes")
                .setHelp("The maximum number of classes (for the output file header)");
        jsap.registerParameter(classLimit);
    }

    /**
     * Interpret the command line arguments and load everything {@link #process(DAVOptions)}
     * needs: the signal quality calculator, the model conditions, the model filename
     * prefixes found under the models directory and the dataset details from the
     * maqcii properties file.
     * <p>
     * Any unrecoverable problem (invalid eval-dataset-root, calculator could not be
     * created, no models to load, properties could not be read) terminates the JVM
     * with exit code 10.
     *
     * @param jsap    the JSAP command line parser
     * @param result  the results of command line parsing
     * @param options the DAVOptions
     */
    @Override
    public void interpretArguments(final JSAP jsap, final JSAPResult result, final DAVOptions options) {
        checkArgumentsSound(jsap, result, false);
        setupPathwayOptions(result, options);
        setupRservePort(result, options);
        setupClassifier(result, options);
        setExceptionOnCheckPostFilteringFail(true);

        final String maqciiPropertiesFile = verifyFilenameOption(result, "maqcii-properties-file");
        final String modelConditionsFile = verifyFilenameOption(result, "model-conditions-file");
        final String modelsDir = verifyDirectoryOption(result, "models-dir");
        final String keepModelSetStr = result.getString("model-list");
        final String excludeModelSetStr = result.getString("model-exclude-list");
        extendedOutput = result.getBoolean("extended-output");
        mergeClasses = result.getBoolean("merge-classes");
        maxNumClasses = result.getInt("max-num-classes");
        if (mergeClasses) {
            // All classes collapse into one, so the output header needs a single class.
            maxNumClasses = 1;
        }

        propertiesTrainingLabel = result.getString("properties-training-label");
        propertiesValidationLabel = result.getString("properties-validation-label");

        // '-' means "use the dataset-root from the model conditions", stored as null.
        evalDatasetRoot = result.getString("eval-dataset-root");
        if (!evalDatasetRoot.equals("-")) {
            if (!isValidDirectory(evalDatasetRoot)) {
                LOG.fatal("eval-dataset-root must either be '-' or a valid directory");
                System.exit(10);
            }
        } else {
            evalDatasetRoot = null;
        }

        // Must happen after extendedOutput / maxNumClasses are set: createCalculator reads them.
        LOG.info("Creating the signal quality calculator...");
        signalQualityCalcObj = createCalculator(result);
        if (signalQualityCalcObj == null) {
            // createCalculator has already logged the reason.
            System.exit(10);
        }

        // Populate keepModelSet (empty set for all)
        Set<String> keepModelSet = modelList(keepModelSetStr, "all");
        excludeModelSet = modelList(excludeModelSetStr, "none");

        // Only retain model conditions in keepModelSet
        LOG.info("Reading the model conditions file...");
        modelIdToModelConditionsMap = ProcessModelConditionsMode.readModelConditionsFile(modelConditionsFile,
                keepModelSet);
        LOG.info(String.format("... found model conditions for %d models",
                modelIdToModelConditionsMap.keySet().size()));

        // Only save model prefixes that we have model conditions for
        keepModelSet = modelIdToModelConditionsMap.keySet();
        if (keepModelSet.size() == 0) {
            LOG.fatal("No models to load");
            System.exit(10);
        }
        LOG.info("Scanning the models directory for models to keep...");
        modelIdToModelPrefixMap = scanModelsDirectory(modelsDir, keepModelSet);
        if (modelIdToModelPrefixMap.keySet().size() == 0) {
            LOG.fatal("No models were loaded.");
            System.exit(10);
        }
        keepModelSet = modelIdToModelPrefixMap.keySet();

        // Reduce model conditions to the models we actually have
        reduceMap(modelIdToModelConditionsMap, keepModelSet);

        LOG.info(String.format("... Finished scanning models directory. Found %d models.",
                modelIdToModelPrefixMap.keySet().size()));
        final Set<String> datasetNames = extractDatasetNamesFromModelConditions(modelIdToModelConditionsMap);
        LOG.info(String.format("Models exist in %d datasets", datasetNames.size()));

        // Fixed: the '=' used to trail the value ("[%d]%s="), unlike the sibling messages.
        LOG.debug(String.format("modelIdToModelConditionsMap[%d]=%s", modelIdToModelConditionsMap.keySet().size(),
                ArrayUtils.toString(modelIdToModelConditionsMap.keySet())));
        LOG.debug(String.format("modelIdToModelPrefixMap[%d]=%s", modelIdToModelPrefixMap.keySet().size(),
                ArrayUtils.toString(modelIdToModelPrefixMap.keySet())));
        LOG.info(String.format("datasetNames[%d]=%s", datasetNames.size(), ArrayUtils.toString(datasetNames)));

        LOG.info("Reading the maqcii properties file...");
        datasetName2DetailsMap = readMaqciiProperties(maqciiPropertiesFile, datasetNames);
        if (datasetName2DetailsMap == null) {
            System.exit(10);
        }

        LOG.debug(String.format("datasetName2DetailsMap[%d]=%s", datasetName2DetailsMap.keySet().size(),
                ArrayUtils.toString(datasetName2DetailsMap)));
    }

    /**
     * Take a string and make a model list from it (string should be comma separated).
     * Each entry is trimmed; entries that are blank after trimming are skipped so
     * that input such as "A, ,B" or " , " never puts an empty model id in the set.
     *
     * @param modelListStr the list of models, comma separated
     * @param allValue     the marker string (such as "all" or "none") for which an
     *                     empty set should be returned
     * @return the set of model ids in the order given; empty when modelListStr is
     *         blank or equals allValue (ignoring surrounding whitespace)
     */
    public Set<String> modelList(final String modelListStr, final String allValue) {
        final Set<String> resultSet = new ObjectLinkedOpenHashSet<String>();
        if (StringUtils.isBlank(modelListStr)) {
            return resultSet;
        }
        // Compare the trimmed value so " all " is still recognized as the marker.
        if (modelListStr.trim().equals(allValue)) {
            return resultSet;
        }
        for (final String part : StringUtils.split(modelListStr, ',')) {
            final String modelId = part.trim();
            if (modelId.length() > 0) {
                resultSet.add(modelId);
            }
        }
        return resultSet;
    }

    /**
     * Remove from the map every entry whose key is not in keepKeySet. The map is
     * modified in place.
     *
     * @param map        the map to reduce
     * @param keepKeySet the keys that are allowed to remain in the map
     */
    public static void reduceMap(final Map<String, ?> map, final Set<String> keepKeySet) {
        // Walk a snapshot of the keys so entries can be removed from the live map
        // without disturbing the iteration.
        for (final String candidate : new ArrayList<String>(map.keySet())) {
            if (keepKeySet.contains(candidate)) {
                continue;
            }
            map.remove(candidate);
        }
    }

    /**
     * Recursively walk the given directory looking for model files (files whose
     * canonical path ends in ".model") and build a map from model id to model
     * filename prefix (the canonical path with the ".model" suffix removed).
     * A model is only recorded when its id is in keepModelIdsSet; when that set is
     * null or empty every model found is recorded. If the canonical path of a file
     * cannot be determined the JVM exits with code 10.
     *
     * @param modelDirName    the directory that contains the model files
     * @param keepModelIdsSet the set of model ids to keep (or keep ALL if keepModelIdsSet
     *                        is empty or null)
     * @return the map of model-id's to model filename prefixes
     */
    public static Map<String, String> scanModelsDirectory(final String modelDirName,
            final Set<String> keepModelIdsSet) {
        final Map<String, String> prefixByModelId = new Object2ObjectLinkedOpenHashMap<String, String>();
        int scannedCount = 0;
        int keptCount = 0;
        final File rootDir = new File(modelDirName);
        for (final File entry : new RecursiveFileListIterator(rootDir)) {
            final String path = getFilename(entry);
            if (path == null) {
                LOG.fatal("Could not determine full path filename for file " + entry.toString());
                System.exit(10);
            }
            if (path.endsWith(".model")) {
                final String prefix = BDVModel.removeSuffix(path, ".model");
                final String id = modelIdFromPrefix(prefix);
                // A null or empty filter means "keep everything".
                final boolean noFilter = keepModelIdsSet == null || keepModelIdsSet.size() == 0;
                if (noFilter || keepModelIdsSet.contains(id)) {
                    prefixByModelId.put(id, prefix);
                    keptCount++;
                }
            }
            scannedCount++;
            // Periodic progress report for very large model directories.
            if (scannedCount % 5000 == 0) {
                LOG.debug(String.format("... Looked at %d files, found %d models", scannedCount, keptCount));
            }
        }
        return prefixByModelId;
    }

    /**
     * Return the canonical path of the given file. This is a deliberate
     * best-effort helper: an IOException raised by getCanonicalPath() is not
     * propagated, null is returned instead and the caller decides what to do.
     *
     * @param file File to get the name of
     * @return the full (canonical) path filename, or null if it could not be determined
     */
    public static String getFilename(final File file) {
        String canonicalPath;
        try {
            canonicalPath = file.getCanonicalPath();
        } catch (IOException ignored) {
            // Intentionally swallowed; null signals "could not be determined".
            canonicalPath = null;
        }
        return canonicalPath;
    }

    /**
     * Verify that the option value names an existing, readable, regular file or
     * fail out: on any violation a fatal message is logged and the JVM exits with
     * code 10.
     *
     * @param result    the JSAPResults
     * @param optionKey the key that should contain an existing file
     * @return the filename
     */
    final String verifyFilenameOption(final JSAPResult result, final String optionKey) {
        final String filename = result.getString(optionKey);
        if (filename == null) {
            // new File(null) would throw a bare NullPointerException; fail out like the other checks.
            LOG.fatal("Required file option " + optionKey + " was not specified.");
            System.exit(10);
        }
        final File file = new File(filename);
        if (!file.exists()) {
            LOG.fatal("Required file named " + filename + " does not exist.");
            System.exit(10);
        }
        // isFile() is false for directories, so no separate isDirectory() check is needed.
        if (!file.isFile()) {
            LOG.fatal("Required file named " + filename + " is not a file.");
            System.exit(10);
        }
        if (!file.canRead()) {
            LOG.fatal("Required file named " + filename + " is not readable.");
            System.exit(10);
        }
        return filename;
    }

    /**
     * Verify that the option value names a valid directory (as decided by
     * {@link #isValidDirectory(String)}) or fail out by exiting the JVM with code 10.
     *
     * @param result    the JSAPResults
     * @param optionKey the key that should contain an existing directory
     * @return the directory name
     */
    final String verifyDirectoryOption(final JSAPResult result, final String optionKey) {
        final String directory = result.getString(optionKey);
        final boolean usable = isValidDirectory(directory);
        if (usable) {
            return directory;
        }
        // isValidDirectory has already logged why the directory was rejected.
        System.exit(10);
        return directory;
    }

    /**
     * Check that the given name refers to an existing, readable directory. The
     * reason for a rejection is logged at fatal level; the caller decides whether
     * to terminate.
     *
     * @param dirName the directory name to check (null is rejected)
     * @return true if dirName exists, is a directory and is readable
     */
    public static boolean isValidDirectory(final String dirName) {
        if (dirName == null) {
            // new File(null) would throw a NullPointerException; report it like any other invalid value.
            LOG.fatal("Required directory name was not specified.");
            return false;
        }
        final File file = new File(dirName);
        if (!file.exists()) {
            LOG.fatal("Required directory named " + dirName + " does not exist.");
            return false;
        }
        // Covers regular files as well as anything else that is not a directory.
        if (!file.isDirectory()) {
            LOG.fatal("Required directory named " + dirName + " is not a directory.");
            return false;
        }
        if (!file.canRead()) {
            LOG.fatal("Required directory named " + dirName + " is not readable.");
            return false;
        }
        return true;
    }

    /**
     * Create the calculator object specified by the command line argument
     * signal-quality-calc-class and configure it for P-value output using the
     * current extendedOutput and maxNumClasses settings (so those fields must be
     * set before this is called).
     * <p>
     * Every failure is logged at fatal level and reported by returning null. This
     * includes a class that exists but is not a BaseSignalQualityCalculator, which
     * is checked up front rather than left to surface as a ClassCastException.
     *
     * @param result the JSAPResult object
     * @return the newly created and configured calculator object or null if it
     *         couldn't be created.
     */
    final BaseSignalQualityCalculator createCalculator(final JSAPResult result) {
        final String classname = result.getString("signal-quality-calc-class");
        try {
            final Class<?> calculatorClass = Class.forName(classname);
            if (!BaseSignalQualityCalculator.class.isAssignableFrom(calculatorClass)) {
                LOG.fatal("Error creating ISignalQualityCalculator object: " + classname
                        + " is not a " + BaseSignalQualityCalculator.class.getName());
                return null;
            }
            final BaseSignalQualityCalculator calculator = calculatorClass
                    .asSubclass(BaseSignalQualityCalculator.class).newInstance();
            calculator.configure(result, BaseSignalQualityCalculator.OutputFileHeader.P_VALUES, extendedOutput,
                    maxNumClasses);
            return calculator;
        } catch (InstantiationException e) {
            LOG.fatal("Error creating ISignalQualityCalculator object", e);
        } catch (IllegalAccessException e) {
            LOG.fatal("Error creating ISignalQualityCalculator object", e);
        } catch (ClassNotFoundException e) {
            LOG.fatal("Error creating ISignalQualityCalculator object", e);
        } catch (FileNotFoundException e) {
            LOG.fatal("Error creating output file", e);
        }
        return null;
    }

    /**
     * Run this mode: for every model found by interpretArguments() that is not in
     * the exclude set, load its datasets and hand them to the signal quality
     * calculator. Progress and an estimate of the remaining time are printed to
     * standard output after each model.
     * <p>
     * An IllegalArgumentException or IOException for one model is written to the
     * calculator output as a comment line and processing continues with the next
     * model. The calculator is always closed, even if an unexpected exception
     * aborts the loop.
     *
     * @param options program options.
     */
    @Override
    public void process(final DAVOptions options) {
        super.process(options);

        final ProcessEstimator estimator = new ProcessEstimator(modelIdToModelPrefixMap.size());
        try {
            for (final Map.Entry<String, String> modelEntry : modelIdToModelPrefixMap.entrySet()) {
                final String modelId = modelEntry.getKey();
                if (excludeModelSet.contains(modelId)) {
                    // Count skipped models as done so the time estimate stays meaningful.
                    estimator.unitCompleted();
                    continue;
                }
                final String modelFilenamePrefix = modelEntry.getValue();
                final Map<String, String> modelConditionsMap = modelIdToModelConditionsMap.get(modelId);
                final String datasetName = modelConditionsMap.get("dataset-name");
                // A command line eval-dataset-root overrides the root recorded with the model.
                final String datasetRoot;
                if (evalDatasetRoot == null) {
                    datasetRoot = modelConditionsMap.get("dataset-root");
                } else {
                    datasetRoot = evalDatasetRoot;
                }

                final Map<String, String> datasetDetailsMap = localizeDatasetDetailsMap(
                        datasetName2DetailsMap.get(datasetName), datasetRoot);

                try {
                    loadFilesAndCalculateQuality(options, modelId, modelFilenamePrefix, datasetDetailsMap);
                } catch (IllegalArgumentException e) {
                    signalQualityCalcObj
                            .writeData("# " + modelId + " error loading table for model: " + e.getMessage());
                } catch (IOException e) {
                    signalQualityCalcObj.writeData("# " + modelId + " error reading file: " + e.getMessage());
                } finally {
                    // Work completed, estimate time remaining
                    final long estimate = estimator.unitCompleted();
                    if (estimate == Long.MAX_VALUE) {
                        System.out.println("## Waiting for second data point to estimate time");
                    } else {
                        final long finishAt = System.currentTimeMillis() + estimate;
                        System.out.printf("## Processed model %d of %d, Time remaining %s, finish at %s%n",
                                estimator.getUnitsCompleted(), estimator.getTotalUnits(),
                                ICBStringUtils.millis2hms(estimate), DateFormatUtils.format(finishAt, "HH:mm:ss"));
                    }
                }
            }
        } finally {
            // Close the calculator output even when an unchecked exception escapes the loop.
            signalQualityCalcObj.close();
        }
    }

    /**
     * Perform a signal quality assessment on a single set of files (one model).
     * <p>
     * Steps: load the classification tasks and condition identifiers for the
     * training and validation labels, record the class-to-label mapping comment on
     * the calculator, load the training and validation sample ids, load the model,
     * build the training and validation tables restricted to the model's features
     * and the given samples, and finally pass the per-class (or merged) data to the
     * calculator's calculatePValues().
     * <p>
     * If the number of rows in either table differs from the number of sample ids,
     * an error comment is written to the calculator output and the model is skipped.
     * Errors while loading the model or reading the dataset tables are logged at
     * fatal level and terminate the JVM with exit code 10. An IOException raised
     * while loading the tasks or sample id files is propagated to the caller.
     *
     * @param options             the options to run with; options.inputTable is
     *                            overwritten with the training and then the validation table
     * @param modelId             the model id being processed
     * @param modelFilenamePrefix the model filename prefix
     * @param datasetDetailsMap   the map of dataset details (filenames, etc.)
     * @throws IOException error reading or writing
     */
    public void loadFilesAndCalculateQuality(final DAVOptions options, final String modelId,
            final String modelFilenamePrefix, final Map<String, String> datasetDetailsMap) throws IOException {

        final String trainingDatasetFilename = datasetDetailsMap.get("training.dataset-file");
        final String validationDatasetFilename = datasetDetailsMap.get("validation.dataset-file");
        final String trainingSamplesFilename = datasetDetailsMap.get("training.test-samples");
        final String validationSamplesFilename = datasetDetailsMap.get("validation.test-samples");
        final String trainingTrueLabelsFilename = datasetDetailsMap.get("training.true-labels");
        final String validationTrueLabelsFilename = datasetDetailsMap.get("validation.true-labels");
        final String tasksFilename = datasetDetailsMap.get("tasks-file");

        LOG.info("Running loadFilesAndCalculateQuality for:");
        LOG.info("  modelId=" + modelId);
        LOG.info("  modelFilenamePrefix=" + modelFilenamePrefix);
        LOG.info("  trainingDatasetFilename=" + trainingDatasetFilename);
        LOG.info("  validationDatasetFilename=" + validationDatasetFilename);
        LOG.info("  trainingSamplesFilename=" + trainingSamplesFilename);
        LOG.info("  validationSamplesFilename=" + validationSamplesFilename);
        LOG.info("  trainingTrueLabelsFilename=" + trainingTrueLabelsFilename);
        LOG.info("  validationTrueLabelsFilename=" + validationTrueLabelsFilename);
        LOG.info("  tasksFilename=" + tasksFilename);

        // Load the tasks and CIDs files using the official way...
        final ClassificationTask tClassTasks = loadCachedTaskAndConditions(tasksFilename,
                trainingTrueLabelsFilename);
        final ClassificationTask vClassTasks = loadCachedTaskAndConditions(tasksFilename,
                validationTrueLabelsFilename);

        final ConditionIdentifiers tConditionIdentifiers = tClassTasks.getConditionsIdentifiers();
        final ConditionIdentifiers vConditionIdentifiers = vClassTasks.getConditionsIdentifiers();

        // Training and validation are expected to share the same class names.
        final String[] allClasses = tClassTasks.getConditionNames();
        final String[] vClasses = vClassTasks.getConditionNames();
        assert Arrays.equals(allClasses, vClasses);

        // Describe how original class names map onto the calculator's output labels.
        final StringBuilder classMapcomment = new StringBuilder();
        if (mergeClasses) {
            classMapcomment.append(
                    String.format("## All classes merge to %s", BaseSignalQualityCalculator.CLASS_TRANSLATION[0]));
        } else {
            for (int i = 0; i < allClasses.length; i++) {
                if (classMapcomment.length() > 0) {
                    classMapcomment.append('\n');
                }
                classMapcomment.append(String.format("## Class %s becomes %s", allClasses[i],
                        BaseSignalQualityCalculator.CLASS_TRANSLATION[i]));
            }
        }
        signalQualityCalcObj.setClassMapComment(classMapcomment.toString());

        final ObjectSet<String> trainingSampleIds = loadSampleIds(trainingSamplesFilename);
        final ObjectSet<String> validationSampleIds = loadSampleIds(validationSamplesFilename);

        try {
            final BDVModel model = new BDVModel(modelFilenamePrefix);

            final boolean scaleFeaturesFromCommandLine = options.scaleFeatures;
            model.load(options);
            // Force scaleFeature to respect the command line option (default is true)
            options.scaleFeatures = scaleFeaturesFromCommandLine;

            assert model.getGeneList() != null : " gene list must not be null";

            // options.inputTable must be set before loadTestSet is called for each dataset.
            final List<Set<String>> trainingLabelValueGroups = new ArrayList<Set<String>>();
            options.inputTable = readMemoryCachedInputFile(trainingDatasetFilename);
            final Table trainingTable = model.loadTestSet(this, options, model.getGeneList(),
                    trainingLabelValueGroups, trainingSampleIds);
            final int trainingFilteredNumberOfSamples = trainingTable.getRowNumber();
            LOG.info("Training dataset has " + trainingFilteredNumberOfSamples + " samples.");

            if (trainingFilteredNumberOfSamples != trainingSampleIds.size()) {
                signalQualityCalcObj.writeData(String.format(
                        "# error with model-id=%s - number of samples "
                                + "doesn't match. trainingTable has %d but trainingSampleIds " + "has %d",
                        modelId, trainingFilteredNumberOfSamples, trainingSampleIds.size()));
                return;
            }

            final List<Set<String>> validationLabelValueGroups = new ArrayList<Set<String>>();
            options.inputTable = readMemoryCachedInputFile(validationDatasetFilename);
            final Table validationTable = model.loadTestSet(this, options, model.getGeneList(),
                    validationLabelValueGroups, validationSampleIds);
            final int validationFilteredNumberOfSamples = validationTable.getRowNumber();
            LOG.info("Validation dataset has " + validationFilteredNumberOfSamples + " samples.");

            if (validationFilteredNumberOfSamples != validationSampleIds.size()) {
                signalQualityCalcObj.writeData(String.format(
                        "# error with model-id=%s - number of samples "
                                + "doesn't match. validationTable has %d but validationSampleIds " + "has %d",
                        modelId, validationFilteredNumberOfSamples, validationSampleIds.size()));
                return;
            }

            // Keys are "<class>-training" / "<class>-validation" (or "merged-..." when merging).
            final Map<String, Map<String, double[]>> classToDataMapMap = new Object2ObjectOpenHashMap<String, Map<String, double[]>>();

            System.out.printf("There are %d classes, %s%n", allClasses.length, ArrayUtils.toString(allClasses));
            if (mergeClasses) {
                classToDataMapMap.put("merged-training",
                        retrieveDataAsMap(trainingTable, tConditionIdentifiers, null));
                System.out.println("Loading filtered validation data");
                classToDataMapMap.put("merged-validation",
                        retrieveDataAsMap(validationTable, vConditionIdentifiers, null));

                System.out.printf("Loaded data for model=%s merged classes%n", modelId);

                signalQualityCalcObj.calculatePValues(model, modelId, new String[] { "merged" }, classToDataMapMap);
            } else {
                for (final String classId : allClasses) {
                    // For each CLASS
                    classToDataMapMap.put(classId + "-training",
                            retrieveDataAsMap(trainingTable, tConditionIdentifiers, classId));
                    classToDataMapMap.put(classId + "-validation",
                            retrieveDataAsMap(validationTable, vConditionIdentifiers, classId));

                    System.out.printf("Loaded data for model=%s/class=%s%n", modelId, classId);
                }
                signalQualityCalcObj.calculatePValues(model, modelId, allClasses, classToDataMapMap);
            }

        } catch (IOException e) {
            // Logged at fatal like the other handlers below: every one of them terminates the JVM.
            LOG.fatal("Error loading model " + modelFilenamePrefix, e);
            System.exit(10);
        } catch (ClassNotFoundException e) {
            LOG.fatal("Error loading model " + modelFilenamePrefix, e);
            System.exit(10);
        } catch (ColumnTypeException e) {
            LOG.fatal("Error processing input file ", e);
            System.exit(10);
        } catch (TypeMismatchException e) {
            LOG.fatal("Error processing input file ", e);
            System.exit(10);
        } catch (InvalidColumnException e) {
            LOG.fatal("Error processing input file ", e);
            System.exit(10);
        } catch (SyntaxErrorException e) {
            LOG.fatal("Error reading dataset file ", e);
            System.exit(10);
        } catch (UnsupportedFormatException e) {
            LOG.fatal("Error reading dataset file ", e);
            System.exit(10);
        }
    }

    /**
     * Return the ClassificationTask for the given task file / cids file pair, parsing the
     * files only the first time a given pair is requested and serving later requests
     * from the in-memory cache. The cache key is the concatenation of the two filenames.
     *
     * @param tasksFilename      the task file
     * @param trueLabelsFilename the cids file with the true labels
     * @return a single ClassificationTask object that contains information about
     *         the tasks and sample ids and labels, etc.
     * @throws IOException error reading either the tasks file or the cids file
     */
    private synchronized ClassificationTask loadCachedTaskAndConditions(final String tasksFilename,
            final String trueLabelsFilename) throws IOException {
        final String cacheKey = tasksFilename + trueLabelsFilename;
        final ClassificationTask cachedTask = filenamesToClassificationTaskMap.get(cacheKey);
        if (cachedTask != null) {
            // Already parsed for this pair of files.
            return cachedTask;
        }

        final ClassificationTask[] parsedTasks =
                ClassificationTask.parseTaskAndConditions(tasksFilename, trueLabelsFilename);
        // Exactly one task is expected per task file.
        assert parsedTasks.length == 1;
        final ClassificationTask parsedTask = parsedTasks[0];
        filenamesToClassificationTaskMap.put(cacheKey, parsedTask);
        return parsedTask;
    }

    /**
     * Read a table by filename, keeping successfully read tables in an in-memory cache
     * keyed by filename so that later requests for the same file skip the read.
     * A null result from the reader is returned as-is and is not cached.
     *
     * @param fileName Name of the file to read
     * @return A table that contains data read from the input file
     * @throws SyntaxErrorException       if there is an error in the file
     * @throws IOException                if the input file cannot be read
     * @throws UnsupportedFormatException if the file format is not recognized
     */
    private Table readMemoryCachedInputFile(final String fileName)
            throws IOException, SyntaxErrorException, UnsupportedFormatException {
        final Table cachedTable = tableCache.get(fileName);
        if (cachedTable != null) {
            return cachedTable;
        }

        final Table freshTable = readInputFile(fileName);
        if (freshTable != null) {
            tableCache.put(fileName, freshTable);
        }
        return freshTable;
    }

    /**
     * Extract the model id from a model filename prefix. The prefix is split on
     * "-" and the last resulting piece is the model id.
     *
     * @param modelFilenamePrefix the model filename prefix
     * @return the model id
     */
    private static String modelIdFromPrefix(final String modelFilenamePrefix) {
        final String[] dashSeparatedPieces = StringUtils.split(modelFilenamePrefix, '-');
        final int lastPieceIndex = dashSeparatedPieces.length - 1;
        return dashSeparatedPieces[lastPieceIndex];
    }

    /**
     * Load sample id's from a given file, one id per line. Each line is trimmed
     * before being stored. Lines that are empty after trimming are skipped, so
     * blank or trailing lines in the file do not add an empty-string id to the set
     * (which would also inflate the set's size).
     *
     * @param sampleFilename the sample id's filename, may be null
     * @return the sample id's set, or null when sampleFilename is null
     * @throws IOException error reading file
     */
    public static ObjectSet<String> loadSampleIds(final String sampleFilename) throws IOException {
        if (sampleFilename == null) {
            return null;
        }
        LOG.info("Reading test sample filename: " + sampleFilename);

        final ObjectSet<String> sampleIds = new ObjectOpenHashSet<String>();
        for (final String line : new TextFileLineIterator(sampleFilename)) {
            final String sampleId = line.trim();
            // A blank line is not a sample id; do not record "" as one.
            if (sampleId.length() > 0) {
                sampleIds.add(sampleId);
            }
        }
        return sampleIds;
    }

    /**
     * Retrieve the data from the table into Map[column_name, double[]].
     * Column 0 is read as the sample ids and decides which rows are kept: a row is
     * kept when its sample id has a label in conditionIdentifiers and either
     * mergeClasses is set or that label equals classToKeep. Every following column
     * is read as doubles and reduced to the kept rows. Column 0 itself is not
     * placed in the returned map.
     *
     * @param table                the input Table which contains the data
     * @param conditionIdentifiers maps sample ids to their class
     * @param classToKeep          the class whose samples should be kept; not consulted
     *                             when mergeClasses is true (all labelled samples are kept)
     * @return data in map form
     */
    private Map<String, double[]> retrieveDataAsMap(final Table table,
            final ConditionIdentifiers conditionIdentifiers, final String classToKeep) {
        final Map<String, double[]> columnNameToValues = new Object2ObjectLinkedOpenHashMap<String, double[]>();
        final DoubleList selectedValues = new DoubleArrayList();
        boolean[] rowIsSelected = ArrayUtils.EMPTY_BOOLEAN_ARRAY;
        int selectedRowCount = 0;

        for (int columnIndex = 0; columnIndex < table.getColumnNumber(); columnIndex++) {
            final String columnName = table.getIdentifier(columnIndex);
            try {
                if (columnIndex != 0) {
                    // Data column: copy out only the values from the selected rows.
                    selectedValues.clear();
                    if (selectedRowCount > 0) {
                        final double[] columnValues = table.getDoubles(columnName);
                        for (int row = 0; row < rowIsSelected.length; row++) {
                            if (rowIsSelected[row]) {
                                selectedValues.add(columnValues[row]);
                            }
                        }
                    }
                    columnNameToValues.put(columnName, selectedValues.toDoubleArray());
                    continue;
                }

                // Sample id column: work out which rows to keep for all later columns.
                final String[] rowSampleIds = table.getStrings(columnName);
                rowIsSelected = new boolean[rowSampleIds.length];
                for (int row = 0; row < rowSampleIds.length; row++) {
                    final String trueLabel = conditionIdentifiers.conditionForIdentifier(rowSampleIds[row]);
                    // A sample without an entry in the true-labels data is NEVER kept.
                    final boolean selected = trueLabel != null
                            && (mergeClasses || trueLabel.equals(classToKeep));
                    rowIsSelected[row] = selected;
                    if (selected) {
                        selectedRowCount++;
                    }
                }
            } catch (InvalidColumnException e) {
                LOG.error(e);
            }
        }
        return columnNameToValues;
    }

    /**
     * Collect the distinct "dataset-name" values found in the model conditions
     * of every model. Models whose conditions have no "dataset-name" entry are ignored.
     *
     * @param modelIdToModelConditionsMap the map of model id's to model conditions
     * @return the set of dataset names
     */
    private static Set<String> extractDatasetNamesFromModelConditions(
            final Map<String, Map<String, String>> modelIdToModelConditionsMap) {
        final Set<String> collectedNames = new ObjectLinkedOpenHashSet<String>();
        for (final Map<String, String> modelConditions : modelIdToModelConditionsMap.values()) {
            final String name = modelConditions.get("dataset-name");
            if (name == null) {
                continue;
            }
            collectedNames.add(name);
        }
        return collectedNames;
    }

    /**
     * Read the maqcii properties file (such as maqcii-c.properties) and load the filenames
     * for dataset and samples files for training and validation for each of the datasets
     * listed in datasetNames. The result is a Map[datasetName, Map[key, value]] where key
     * identifies which of the files and value is the path to the file.
     * <p>
     * The keys stored for each dataset are: "training.dataset-file",
     * "validation.dataset-file", "training.test-samples", "validation.test-samples",
     * "training.true-labels", "validation.true-labels" and "tasks-file".
     * Each looked-up value is checked for non-null with an assert only, so the check
     * is active only when assertions are enabled.
     * <p>
     * If the properties file cannot be read the error (including its cause) is logged
     * and the JVM is terminated with exit code 10.
     *
     * @param maqciiPropertiesFilename the maqcii properties file
     * @param datasetNames             the datasets to obtain the filenames for
     * @return a map so we can determine the dataset and samples files for each specific dataset
     *         name.
     */
    public Map<String, Map<String, String>> readMaqciiProperties(final String maqciiPropertiesFilename,
            final Set<String> datasetNames) {
        final Properties maqciiProperties;
        try {
            maqciiProperties = new Properties(new File(maqciiPropertiesFilename));
        } catch (ConfigurationException e) {
            // Pass the exception along so the reason the file could not be read is
            // recorded, as the other fatal handlers in this class do.
            LOG.fatal("Error reading properties file " + maqciiPropertiesFilename, e);
            System.exit(10);
            return null; // Not reached after exit; needed so the final local is definitely assigned.
        }

        final Map<String, Map<String, String>> resultsMap = new Object2ObjectLinkedOpenHashMap<String, Map<String, String>>();
        // Every dataset is expected to define all of the entries below;
        // each one is asserted to be present.
        for (final String datasetName : datasetNames) {

            final Map<String, String> datasetResultsMap = new Object2ObjectLinkedOpenHashMap<String, String>();
            resultsMap.put(datasetName, datasetResultsMap);

            // The dataset file may be given per training/validation label or, failing
            // that, once for the whole dataset.
            datasetResultsMap.put("training.dataset-file", readPropertyOneOf(maqciiProperties,
                    datasetName + "." + propertiesTrainingLabel + ".dataset-file", datasetName + ".dataset-file"));
            assert datasetResultsMap.get("training.dataset-file") != null;

            datasetResultsMap.put("validation.dataset-file",
                    readPropertyOneOf(maqciiProperties,
                            datasetName + "." + propertiesValidationLabel + ".dataset-file",
                            datasetName + ".dataset-file"));
            assert datasetResultsMap.get("validation.dataset-file") != null;

            datasetResultsMap.put("training.test-samples", maqciiProperties
                    .getString(datasetName + "." + propertiesTrainingLabel + ".test-samples", null));
            assert datasetResultsMap.get("training.test-samples") != null;

            datasetResultsMap.put("validation.test-samples", maqciiProperties
                    .getString(datasetName + "." + propertiesValidationLabel + ".test-samples", null));
            assert datasetResultsMap.get("validation.test-samples") != null;

            datasetResultsMap.put("training.true-labels",
                    maqciiProperties.getString(datasetName + "." + propertiesTrainingLabel + ".true-labels", null));
            assert datasetResultsMap.get("training.true-labels") != null;

            datasetResultsMap.put("validation.true-labels", maqciiProperties
                    .getString(datasetName + "." + propertiesValidationLabel + ".true-labels", null));
            assert datasetResultsMap.get("validation.true-labels") != null;

            datasetResultsMap.put("tasks-file", maqciiProperties.getString(datasetName + ".tasks-file", null));
            assert datasetResultsMap.get("tasks-file") != null;
        }
        return resultsMap;
    }

    /**
     * Try each of propertyNames in order and return the first value that is
     * non-null. Returns null if none of the propertyNames yields a value.
     * Each name is logged at info level as it is tried.
     *
     * @param properties    the properties to read from
     * @param propertyNames the list of property names to try to find the first
     *                      non-null value of
     * @return the found property value or null if no values were found
     */
    private static String readPropertyOneOf(final Properties properties, final String... propertyNames) {
        String foundValue = null;
        // Stop as soon as one of the names produces a value.
        for (int i = 0; foundValue == null && i < propertyNames.length; i++) {
            final String candidateName = propertyNames[i];
            LOG.info("Looking for property named " + candidateName);
            foundValue = properties.getString(candidateName);
        }
        return foundValue;
    }

    /**
     * Build a localized copy of the datasetDetailsMap in which every occurrence of
     * ${eval-dataset-root} in a value is replaced with datasetRoot. Keys are copied
     * unchanged and the input map is not modified.
     *
     * @param details     the datasetDetailsMap to localize
     * @param datasetRoot the datasetRoot to use to localize the map
     * @return a localized map
     */
    private Map<String, String> localizeDatasetDetailsMap(final Map<String, String> details,
            final String datasetRoot) {
        final String rootPlaceholder = "${eval-dataset-root}";
        final Map<String, String> localized = new Object2ObjectOpenHashMap<String, String>();
        for (final Map.Entry<String, String> detail : details.entrySet()) {
            final String detailKey = detail.getKey();
            final String unlocalizedValue = detail.getValue();
            localized.put(detailKey, unlocalizedValue.replace(rootPlaceholder, datasetRoot));
        }
        return localized;
    }
}