org.mskcc.cbio.cgds.scripts.ImportTabDelimData.java Source code

Java tutorial

Introduction

Here is the source code for org.mskcc.cbio.cgds.scripts.ImportTabDelimData.java

Source

/** Copyright (c) 2012 Memorial Sloan-Kettering Cancer Center.
**
** This library is free software; you can redistribute it and/or modify it
** under the terms of the GNU Lesser General Public License as published
** by the Free Software Foundation; either version 2.1 of the License, or
** any later version.
**
** This library is distributed in the hope that it will be useful, but
** WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF
** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  The software and
** documentation provided hereunder is on an "as is" basis, and
** Memorial Sloan-Kettering Cancer Center 
** has no obligations to provide maintenance, support,
** updates, enhancements or modifications.  In no event shall
** Memorial Sloan-Kettering Cancer Center
** be liable to any party for direct, indirect, special,
** incidental or consequential damages, including lost profits, arising
** out of the use of this software and its documentation, even if
** Memorial Sloan-Kettering Cancer Center 
** has been advised of the possibility of such damage.  See
** the GNU Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public License
** along with this library; if not, write to the Free Software Foundation,
** Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
**/

package org.mskcc.cbio.cgds.scripts;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.mskcc.cbio.cgds.dao.*;
import org.mskcc.cbio.cgds.model.CanonicalGene;
import org.mskcc.cbio.cgds.model.CnaEvent;
import org.mskcc.cbio.cgds.model.GeneticAlterationType;
import org.mskcc.cbio.cgds.model.GeneticProfile;
import org.mskcc.cbio.cgds.util.ConsoleUtil;
import org.mskcc.cbio.cgds.util.ProgressMonitor;

/**
 * Code to Import Copy Number Alteration or MRNA Expression Data.
 *
 * @author Ethan Cerami
 */
public class ImportTabDelimData {
    private HashSet<Long> importedGeneSet = new HashSet<Long>();
    private HashSet<String> importedMicroRNASet = new HashSet<String>();
    private static Logger logger = Logger.getLogger(ImportTabDelimData.class);

    /**
     * Barry Target Line:  A constant currently used to indicate the RAE method.
     */
    public static final String BARRY_TARGET = "Barry";

    /**
     * Consensus Target Line:  A constant currently used to indicate consensus of multiple
     * CNA calling algorithms.
     */
    public static final String CONSENSUS_TARGET = "consensus";

    private ProgressMonitor pMonitor;
    private File mutationFile;
    private String targetLine;
    private int geneticProfileId;
    private GeneticProfile geneticProfile;
    private HashSet<String> microRnaIdSet;

    /**
     * Constructor.
     *
     * @param dataFile         Data File containing CNA data.
     * @param targetLine       The line we want to import.
     *                         If null, all lines are imported.
     * @param geneticProfileId GeneticProfile ID.
     * @param pMonitor         Progress Monitor Object.
     */
    public ImportTabDelimData(File dataFile, String targetLine, int geneticProfileId, ProgressMonitor pMonitor) {
        this.mutationFile = dataFile;
        this.targetLine = targetLine;
        this.geneticProfileId = geneticProfileId;
        this.pMonitor = pMonitor;
    }

    /**
     * Constructor.
     *
     * @param dataFile         Data File containing CNA data.
     * @param geneticProfileId GeneticProfile ID.
     * @param pMonitor         Progress Monitor Object.
     */
    public ImportTabDelimData(File dataFile, int geneticProfileId, ProgressMonitor pMonitor) {
        this.mutationFile = dataFile;
        this.geneticProfileId = geneticProfileId;
        this.pMonitor = pMonitor;
    }

    /**
     * Import the CNA Data.
     *
     * @throws IOException  IO Error.
     * @throws DaoException Database Error.
     */
    public void importData() throws IOException, DaoException {
        DaoMicroRna daoMicroRna = new DaoMicroRna();
        microRnaIdSet = daoMicroRna.getEntireSet();

        geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);

        FileReader reader = new FileReader(mutationFile);
        BufferedReader buf = new BufferedReader(reader);
        String headerLine = buf.readLine();
        String parts[] = headerLine.split("\t");
        String caseIds[];

        //  Branch, depending on targetLine setting
        if (targetLine == null) {
            caseIds = new String[parts.length - 1];
            System.arraycopy(parts, 1, caseIds, 0, parts.length - 1);
        } else {
            caseIds = new String[parts.length - 2];
            System.arraycopy(parts, 2, caseIds, 0, parts.length - 2);
        }
        pMonitor.setCurrentMessage("Import tab delimited data for " + caseIds.length + " cases.");

        // Add Cases to the Database
        ArrayList<String> orderedCaseList = new ArrayList<String>();
        for (int i = 0; i < caseIds.length; i++) {
            if (!DaoCaseProfile.caseExistsInGeneticProfile(caseIds[i], geneticProfileId)) {
                DaoCaseProfile.addCaseProfile(caseIds[i], geneticProfileId);
            }
            orderedCaseList.add(caseIds[i]);
        }
        DaoGeneticProfileCases daoGeneticProfileCases = new DaoGeneticProfileCases();
        daoGeneticProfileCases.addGeneticProfileCases(geneticProfileId, orderedCaseList);

        String line = buf.readLine();
        int numRecordsStored = 0;

        DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance();

        DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance();
        DaoMicroRnaAlteration daoMicroRnaAlteration = DaoMicroRnaAlteration.getInstance();

        while (line != null) {
            if (pMonitor != null) {
                pMonitor.incrementCurValue();
                ConsoleUtil.showProgress(pMonitor);
            }

            //  Ignore lines starting with #
            if (!line.startsWith("#") && line.trim().length() > 0) {
                parts = line.split("\t", -1);

                int startIndex = getStartIndex();
                String values[] = (String[]) ArrayUtils.subarray(parts, startIndex, parts.length);

                String method = null;
                String geneId = null;
                if (targetLine != null) {
                    method = parts[0];
                    geneId = parts[1];
                } else {
                    geneId = parts[0];
                }

                if (geneId != null) {
                    if (geneId.contains("///") || geneId.contains("---")) {
                        //  Ignore gene IDs separated by ///.  This indicates that
                        //  the line contains information regarding multiple genes, and
                        //  we cannot currently handle this.
                        //  Also, ignore gene IDs that are specified as ---.  This indicates
                        //  the line contains information regarding an unknown gene, and
                        //  we cannot currently handle this.
                        logger.debug("Ignoring gene ID:  " + geneId);
                    } else {
                        // deal with multiple symbols separate by |, use the first one
                        int ix = geneId.indexOf("|");
                        if (ix > 0) {
                            geneId = geneId.substring(0, ix);
                        }

                        //  Assume we are dealing with Entrez Gene Ids or Symbols.
                        List<CanonicalGene> genes = daoGene.guessGene(geneId);

                        //  If no target line is specified or we match the target, process.
                        if (targetLine == null || method.equals(targetLine)) {
                            if (genes.isEmpty()) {
                                //  if gene is null, we might be dealing with a micro RNA ID
                                if (geneId.toLowerCase().contains("-mir-")) {
                                    //                                    if (microRnaIdSet.contains(geneId)) {
                                    //                                        storeMicroRnaAlterations(values, daoMicroRnaAlteration, geneId);
                                    //                                        numRecordsStored++;
                                    //                                    } else {
                                    pMonitor.logWarning("microRNA is not known to me:  [" + geneId
                                            + "]. Ignoring it " + "and all tab-delimited data associated with it!");
                                    //                                    }
                                } else {
                                    pMonitor.logWarning("Gene not found:  [" + geneId + "]. Ignoring it "
                                            + "and all tab-delimited data associated with it!");
                                }
                            } else if (genes.size() == 1) {
                                storeGeneticAlterations(values, daoGeneticAlteration, genes.get(0));
                                if (geneticProfile != null && geneticProfile
                                        .getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION
                                        && geneticProfile.showProfileInAnalysisTab()) {
                                    storeCna(genes.get(0).getEntrezGeneId(), orderedCaseList, values);
                                }

                                numRecordsStored++;
                            } else {
                                for (CanonicalGene gene : genes) {
                                    if (gene.isMicroRNA()) { // for micro rna, duplicate the data
                                        storeGeneticAlterations(values, daoGeneticAlteration, gene);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            line = buf.readLine();
        }
        if (MySQLbulkLoader.isBulkLoad()) {
            daoGeneticAlteration.flushGeneticAlteration();
            daoMicroRnaAlteration.flushMicroRnaAlteration();
        }

        if (numRecordsStored == 0) {
            throw new DaoException("Something has gone wrong!  I did not save any records" + " to the database!");
        }
    }

    private void storeMicroRnaAlterations(String[] values, DaoMicroRnaAlteration daoMicroRnaAlteration,
            String microRnaId) throws DaoException {

        //  Check that we have not already imported information regarding this microRNA.
        //  This is an important check, because a GISTIC or RAE file may contain
        //  multiple rows for the same gene, and we only want to import the first row.
        if (!importedMicroRNASet.contains(microRnaId)) {
            daoMicroRnaAlteration.addMicroRnaAlterations(geneticProfileId, microRnaId, values);
            importedMicroRNASet.add(microRnaId);
        }
    }

    private void storeGeneticAlterations(String[] values, DaoGeneticAlteration daoGeneticAlteration,
            CanonicalGene gene) throws DaoException {

        //  Check that we have not already imported information regarding this gene.
        //  This is an important check, because a GISTIC or RAE file may contain
        //  multiple rows for the same gene, and we only want to import the first row.
        if (!importedGeneSet.contains(gene.getEntrezGeneId())) {
            daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values);
            importedGeneSet.add(gene.getEntrezGeneId());
        }
    }

    private void storeCna(long entrezGeneId, ArrayList<String> cases, String[] values) throws DaoException {
        int n = values.length;
        if (n == 0)
            System.out.println();
        int i = values[0].equals("" + entrezGeneId) ? 1 : 0;
        for (; i < n; i++) {
            if (values[i].equals(GeneticAlterationType.AMPLIFICATION)
                    || values[i].equals(GeneticAlterationType.HOMOZYGOUS_DELETION)) {
                CnaEvent event = new CnaEvent(cases.get(i), geneticProfileId, entrezGeneId,
                        Short.parseShort(values[i]));
                DaoCnaEvent.addCaseCnaEvent(event);
            }
        }
    }

    private int getStartIndex() {
        int startIndex = 2;
        if (targetLine == null) {
            startIndex = 1;
        }
        return startIndex;
    }
}