fr.cirad.mgdb.exporting.individualoriented.PLinkExportHandler.java Source code

Java tutorial

Introduction

Here is the source code for fr.cirad.mgdb.exporting.individualoriented.PLinkExportHandler.java

Source

/*******************************************************************************
 * MGDB Export - Mongo Genotype DataBase, export handlers
 * Copyright (C) 2016 <CIRAD>
 *     
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * See <http://www.gnu.org/licenses/gpl-3.0.html> for details about
 * GNU Affero General Public License V3.
 *******************************************************************************/
package fr.cirad.mgdb.exporting.individualoriented;

import fr.cirad.mgdb.model.mongo.maintypes.Individual;
import fr.cirad.mgdb.model.mongo.maintypes.VariantData;
import fr.cirad.mgdb.model.mongo.subtypes.ReferencePosition;
import fr.cirad.mgdb.model.mongo.subtypes.VariantRunData;
import fr.cirad.mgdb.model.mongodao.MgdbDao;
import fr.cirad.tools.ProgressIndicator;
import fr.cirad.tools.mongo.MongoTemplateManager;
import htsjdk.variant.variantcontext.VariantContext.Type;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.log4j.Logger;
import org.springframework.data.mongodb.core.MongoTemplate;

import com.mongodb.DBCursor;
import com.mongodb.DBObject;

/**
 * The Class PLinkExportHandler.
 */
public class PLinkExportHandler extends AbstractIndividualOrientedExportHandler {

    /** The Constant LOG. */
    private static final Logger LOG = Logger.getLogger(PLinkExportHandler.class);

    /** The supported variant types. */
    private static List<String> supportedVariantTypes;

    static {
        supportedVariantTypes = new ArrayList<String>();
        supportedVariantTypes.add(Type.SNP.toString());
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getExportFormatName()
     */
    @Override
    public String getExportFormatName() {
        return "PLINK";
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getExportFormatDescription()
     */
    @Override
    public String getExportFormatDescription() {
        return "Exports zipped PED and MAP files. See <a target='_blank' href='http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml'>http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml</a> for more details";
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.individualoriented.AbstractIndividualOrientedExportHandler#getSupportedVariantTypes()
     */
    @Override
    public List<String> getSupportedVariantTypes() {
        return supportedVariantTypes;
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.individualoriented.AbstractIndividualOrientedExportHandler#exportData(java.io.OutputStream, java.lang.String, java.util.Collection, boolean, fr.cirad.tools.ProgressIndicator, com.mongodb.DBCursor, java.util.Map, java.util.Map)
     */
    @Override
    public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles,
            boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor,
            Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles)
            throws Exception {
        File warningFile = File.createTempFile("export_warnings_", "");
        FileWriter warningFileWriter = new FileWriter(warningFile);

        ZipOutputStream zos = new ZipOutputStream(outputStream);

        if (readyToExportFiles != null)
            for (String readyToExportFile : readyToExportFiles.keySet()) {
                zos.putNextEntry(new ZipEntry(readyToExportFile));
                InputStream inputStream = readyToExportFiles.get(readyToExportFile);
                byte[] dataBlock = new byte[1024];
                int count = inputStream.read(dataBlock, 0, 1024);
                while (count != -1) {
                    zos.write(dataBlock, 0, count);
                    count = inputStream.read(dataBlock, 0, 1024);
                }
            }

        MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
        int markerCount = markerCursor.count();

        String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size()
                + "individuals";
        zos.putNextEntry(new ZipEntry(exportName + ".ped"));

        TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>();
        short nProgress = 0, nPreviousProgress = 0;
        int i = 0;
        for (File f : individualExportFiles) {
            BufferedReader in = new BufferedReader(new FileReader(f));
            try {
                String individualId, line = in.readLine(); // read sample id
                if (line != null) {
                    individualId = line;
                    String population = getIndividualPopulation(sModule, line);
                    String individualInfo = (population == null ? "." : population) + " " + individualId;
                    zos.write((individualInfo + " 0 0 0 " + getIndividualGenderCode(sModule, individualId))
                            .getBytes());
                } else
                    throw new Exception("Unable to read first line of temp export file " + f.getName());

                int nMarkerIndex = 0;
                while ((line = in.readLine()) != null) {
                    List<String> genotypes = MgdbDao.split(line, "|");
                    HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes
                    int highestGenotypeCount = 0;
                    String mostFrequentGenotype = null;
                    for (String genotype : genotypes) {
                        if (genotype.length() == 0)
                            continue; /* skip missing genotypes */

                        int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                        if (gtCount > highestGenotypeCount) {
                            highestGenotypeCount = gtCount;
                            mostFrequentGenotype = genotype;
                        }
                        genotypeCounts.put(genotype, gtCount);
                    }

                    if (genotypeCounts.size() > 1) {
                        warningFileWriter.write("- Dissimilar genotypes found for variant " + nMarkerIndex
                                + ", individual " + individualId + ". Exporting most frequent: "
                                + mostFrequentGenotype + "\n");
                        problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                    }

                    String[] alleles = mostFrequentGenotype == null ? new String[0]
                            : mostFrequentGenotype.split(" ");
                    if (alleles.length > 2) {
                        warningFileWriter.write("- More than 2 alleles found for variant " + nMarkerIndex
                                + ", individual " + individualId + ". Exporting only the first 2 alleles.\n");
                        problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                    }

                    String all1 = alleles.length == 0 ? "0" : alleles[0];
                    String all2 = alleles.length == 0 ? "0" : alleles[alleles.length == 1 ? 0 : 1];
                    if (all1.length() != 1 || all2.length() != 1) {
                        warningFileWriter
                                .write("- SNP expected, but alleles are not coded on a single char for variant "
                                        + nMarkerIndex + ", individual " + individualId
                                        + ". Ignoring this genotype.\n");
                        problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                    } else
                        zos.write((" " + all1 + " " + all2).getBytes());

                    nMarkerIndex++;
                }
            } catch (Exception e) {
                LOG.error("Error exporting data", e);
                progress.setError("Error exporting data: " + e.getClass().getSimpleName()
                        + (e.getMessage() != null ? " - " + e.getMessage() : ""));
                return;
            } finally {
                in.close();
            }

            if (progress.hasAborted())
                return;

            nProgress = (short) (++i * 100 / individualExportFiles.size());
            if (nProgress > nPreviousProgress) {
                progress.setCurrentStepProgress(nProgress);
                nPreviousProgress = nProgress;
            }
            zos.write('\n');

            if (!f.delete()) {
                f.deleteOnExit();
                LOG.info("Unable to delete tmp export file " + f.getAbsolutePath());
            }
        }
        warningFileWriter.close();

        zos.putNextEntry(new ZipEntry(exportName + ".map"));

        int avgObjSize = (Integer) mongoTemplate
                .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
        int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;

        markerCursor.batchSize(nChunkSize);
        int nMarkerIndex = 0;
        while (markerCursor.hasNext()) {
            DBObject exportVariant = markerCursor.next();
            DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
            Comparable markerId = (Comparable) exportVariant.get("_id");
            String chrom = (String) refPos.get(ReferencePosition.FIELDNAME_SEQUENCE);
            Long pos = ((Number) refPos.get(ReferencePosition.FIELDNAME_START_SITE)).longValue();

            if (chrom == null)
                LOG.warn("Chromosomal position not found for marker " + markerId);
            Comparable exportedId = markerSynonyms == null ? markerId : markerSynonyms.get(markerId);
            zos.write(((chrom == null ? "0" : chrom) + " " + exportedId + " " + 0 + " " + (pos == null ? 0 : pos)
                    + LINE_SEPARATOR).getBytes());

            if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) { // we are going to need this marker's name for the warning file
                Comparable variantName = markerId;
                if (markerSynonyms != null) {
                    Comparable syn = markerSynonyms.get(markerId);
                    if (syn != null)
                        variantName = syn;
                }
                problematicMarkerIndexToNameMap.put(nMarkerIndex, variantName);
            }
            nMarkerIndex++;
        }

        if (warningFile.length() > 0) {
            zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
            int nWarningCount = 0;
            BufferedReader in = new BufferedReader(new FileReader(warningFile));
            String sLine;
            while ((sLine = in.readLine()) != null) {
                for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet())
                    sLine = sLine.replaceAll("__" + aMarkerIndex + "__",
                            problematicMarkerIndexToNameMap.get(aMarkerIndex).toString());
                zos.write((sLine + "\n").getBytes());
                in.readLine();
                nWarningCount++;
            }
            LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
            in.close();
        }
        warningFile.delete();

        zos.close();
        progress.setCurrentStepProgress((short) 100);
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getStepList()
     */
    @Override
    public List<String> getStepList() {
        return Arrays.asList(new String[] { "Exporting data to PLINK format" });
    }

    /**
     * Gets the individual population.
     *
     * @param sModule the module
     * @param individual the individual
     * @return the individual population
     */
    protected String getIndividualPopulation(final String sModule, final String individual) {
        MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
        return mongoTemplate.findById(individual, Individual.class).getPopulation();
    }

    /**
     * Gets the individual gender code.
     *
     * @param sModule the module
     * @param individual the individual
     * @return the individual gender code
     */
    protected String getIndividualGenderCode(String sModule, String individual) {
        return "U";
    }
}