fr.ens.transcriptome.aozan.fastqscreen.FastqScreenGenomeMapper.java Source code

Java tutorial

Introduction

Here is the source code for fr.ens.transcriptome.aozan.fastqscreen.FastqScreenGenomeMapper.java

Source

/*
 *                  Aozan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU General Public License version 3 or later 
 * and CeCILL. This should be distributed with the code. If you 
 * do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/gpl-3.0-standalone.html
 *      http://www.cecill.info/licences/Licence_CeCILL_V2-en.html
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École Normale Supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Aozan project and its aims,
 * or to join the Aozan Google group, visit the home page at:
 *
 *      http://www.transcriptome.ens.fr/aozan
 *
 */

package fr.ens.transcriptome.aozan.fastqscreen;

import static fr.ens.transcriptome.eoulsan.EoulsanRuntime.getSettings;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.FileWriteMode;
import com.google.common.io.Files;

import fr.ens.transcriptome.aozan.AozanException;
import fr.ens.transcriptome.aozan.Common;
import fr.ens.transcriptome.aozan.Globals;
import fr.ens.transcriptome.aozan.QC;
import fr.ens.transcriptome.aozan.Settings;
import fr.ens.transcriptome.eoulsan.bio.BadBioEntryException;
import fr.ens.transcriptome.eoulsan.bio.GenomeDescription;
import fr.ens.transcriptome.eoulsan.data.DataFile;
import fr.ens.transcriptome.eoulsan.data.protocols.DataProtocolService;
import fr.ens.transcriptome.eoulsan.data.storages.GenomeDescStorage;
import fr.ens.transcriptome.eoulsan.data.storages.SimpleGenomeDescStorage;
import fr.ens.transcriptome.eoulsan.illumina.CasavaDesign;
import fr.ens.transcriptome.eoulsan.illumina.CasavaSample;
import fr.ens.transcriptome.eoulsan.illumina.io.CasavaDesignCSVReader;

/**
 * This class reads the alias genome file. It maps each genome name found in
 * the Casava design file to the reference genome name used to identify the
 * index of the Bowtie mapper.
 * @since 1.3
 * @author Sandrine Perrin
 */
public class FastqScreenGenomeMapper {

    /** Logger obtained from {@link Common#getLogger()}. */
    private static final Logger LOGGER = Common.getLogger();

    /**
     * Splitter for comma-separated values: each entry is trimmed and empty
     * entries are dropped.
     */
    private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults().omitEmptyStrings();

    /**
     * Pattern over punctuation symbols.
     * NOTE(review): this expression is not a character class. The leading '.'
     * is the regex wildcard and the remaining characters are literals, so it
     * only matches one arbitrary character followed by the exact text
     * {@code ,;:/-_'}. Presumably a character class such as
     * {@code [.,;:/_'-]} was intended - confirm before relying on it.
     */
    private static final Pattern PATTERN = Pattern.compile(".,;:/-_'");

    /** Unique instance; stays null until {@code getInstance(Map)} creates it. */
    private static FastqScreenGenomeMapper singleton;

    /** Configuration properties received by the constructor. */
    private final Map<String, String> properties;

    // Genome names of the samples, as collected from the casava design file
    private final Set<String> genomesReferencesSample;

    // Contaminant genomes from the configuration for which a genome description
    // could be created
    private final Set<String> genomesContaminants;

    // Content of the alias genomes file: genome name of a sample -> reference
    // genome name (entries with an empty value are not loaded)
    private final Map<String, String> genomesNamesConvertor;

    // Genome name from the design file -> genome name retained for the mapping
    // (the name itself when a genome description exists, otherwise its alias)
    private final Map<String, String> genomesReferencesSampleRenamed;

    // Unmodifiable union of the contaminant genomes and of the retained sample
    // genomes
    private final Set<String> genomesToMapping;

    // Storage of genome descriptions; createGenomeDescription() accepts a null
    // storage and then always computes the description
    private final GenomeDescStorage storage;

    /**
     * Build the collection of genomes to use for the mapping. Each genome name
     * collected from the casava design file is kept as is when a genome
     * description can be created for it; otherwise it is replaced by its alias
     * from the alias genomes file. Names with neither a description nor an alias
     * are appended to the alias genomes file so that they can be completed
     * later. The contaminant genomes are always part of the result.
     * @return unmodifiable collection of the genome names usable for mapping
     * @throws AozanException if an error occurs during updating alias genomes
     *           file
     */
    private Set<String> collectGenomesForMapping() throws AozanException {

        final Set<String> mappableReferences = new HashSet<>();
        final Set<String> unknownGenomes = new HashSet<>();

        for (final String sampleGenome : this.genomesReferencesSample) {
            final DataFile source = new DataFile("genome://" + sampleGenome);

            // A failure only means that the name is not directly usable
            GenomeDescription description;
            try {
                description = this.createGenomeDescription(source);
            } catch (final Exception ignored) {
                description = null;
            }

            if (description != null) {
                // The name from the design file is already a valid genome name
                mappableReferences.add(sampleGenome);
                this.genomesReferencesSampleRenamed.put(sampleGenome, sampleGenome);
                continue;
            }

            // Fall back on the alias genomes file
            final String alias = this.genomesNamesConvertor.get(sampleGenome);
            final boolean hasAlias = alias != null && !alias.isEmpty();

            if (hasAlias) {
                mappableReferences.add(alias);
                this.genomesReferencesSampleRenamed.put(sampleGenome, alias);
            } else {
                // Unknown genome: it will be added to the alias genomes file
                unknownGenomes.add(sampleGenome);
            }
        }

        this.updateAliasGenomeFile(Collections.unmodifiableSet(unknownGenomes));

        // Contaminant genomes first, then the genomes of the samples
        final Set<String> union = Sets.newLinkedHashSet(this.genomesContaminants);
        union.addAll(mappableReferences);

        return Collections.unmodifiableSet(union);
    }

    /**
     * Get the description of a genome. The description is read from the genome
     * description storage when one is available and already contains it;
     * otherwise it is computed from the Fasta file and, when a storage is
     * available, saved in it. A warning is logged when the genome file does not
     * exist, without stopping the lookup.
     * @param genomeFile file used for create index
     * @return genomeDescription description of the genome
     * @throws BadBioEntryException if an error occurs during create genome
     *           description object
     * @throws IOException if an error occurs during access genome file
     */
    public GenomeDescription createGenomeDescription(final DataFile genomeFile)
            throws BadBioEntryException, IOException {

        if (!genomeFile.exists()) {
            LOGGER.warning("Fastqscreen " + genomeFile.getBasename() + " not exists.");
        }

        final boolean hasStorage = this.storage != null;

        // Description already known by the storage
        final GenomeDescription stored = hasStorage ? this.storage.get(genomeFile) : null;
        if (stored != null) {
            return stored;
        }

        // Compute the description and remember it
        final GenomeDescription computed =
                GenomeDescription.createGenomeDescFromFasta(genomeFile.open(), genomeFile.getName());

        if (hasStorage) {
            this.storage.put(genomeFile, computed);
        }

        return computed;
    }

    /**
     * Append to the alias genomes file one line {@code name=} per genome that
     * has no known reference genome, so that the alias can be filled in later.
     * Nothing is done when there is no genome to add, when no alias genomes file
     * is configured or when the configured file does not exist. A write failure
     * is logged and otherwise ignored: updating the file is best effort.
     * @param genomesToAdd genomes must be added in alias genomes file
     */
    private void updateAliasGenomeFile(final Set<String> genomesToAdd) {

        // None genome to add
        if (genomesToAdd == null || genomesToAdd.isEmpty()) {
            return;
        }

        // The alias genomes file is optional: without a path there is nothing to
        // update (new File(null) would throw a NullPointerException)
        final String aliasPath =
                this.properties.get(Settings.QC_CONF_FASTQSCREEN_SETTINGS_GENOMES_ALIAS_PATH_KEY);
        if (aliasPath == null || aliasPath.trim().isEmpty()) {
            return;
        }

        // Same trimmed path as the one used to load the alias genomes file
        final File aliasGenomesFile = new File(aliasPath.trim());
        if (!aliasGenomesFile.exists()) {
            return;
        }

        // try-with-resources closes (and so flushes) the writer even if a write
        // fails
        try (Writer fw = Files
                .asCharSink(aliasGenomesFile, Globals.DEFAULT_FILE_ENCODING, FileWriteMode.APPEND)
                .openStream()) {

            for (final String genomeSample : genomesToAdd) {
                fw.write(genomeSample + "=\n");
            }

        } catch (final IOException e) {
            LOGGER.warning("Writing alias genomes file failed : file can not be updated.");
        }
    }

    /**
     * Collect the reference genome names of the samples from the casava design
     * file. Each name is stripped of its double quotes, lower-cased and trimmed,
     * which is the normalization also applied by
     * {@link #getGenomeReferenceCorresponding(String)} before its lookup.
     * @return collection on genomes reference names for the samples, empty if
     *         the design file is not configured, is not a regular file or can
     *         not be read
     */
    private Set<String> initGenomesReferencesSample() {

        // Samplesheet file for demultiplexing
        final String designPath = this.properties.get(QC.CASAVA_DESIGN_PATH);
        if (designPath == null) {
            // No design file configured (new File(null) would throw a
            // NullPointerException)
            return Collections.emptySet();
        }

        final File designFile = new File(designPath);
        if (!(designFile.exists() && designFile.isFile())) {
            // Fail to read design file
            return Collections.emptySet();
        }

        final CasavaDesign casavaDesign;
        try {
            // Reading casava design file in format csv
            casavaDesign = new CasavaDesignCSVReader(designFile).read();

        } catch (final Exception e) {
            // Best effort: without design only the contaminant genomes are used
            LOGGER.warning("FastqScreen : reading casava design file failed, none genome sample collected: "
                    + designFile.getAbsolutePath() + " (" + e.getMessage() + ")");
            return Collections.emptySet();
        }

        final Set<String> genomesFromDesign = new HashSet<>();

        // Retrieve all genome sample included in casava design file
        for (final CasavaSample casavaSample : casavaDesign) {
            final String sampleRef = casavaSample.getSampleRef();
            if (sampleRef == null) {
                // Defensive: a sample without reference has no genome to collect
                continue;
            }

            // Symbols are deliberately left untouched. The former replacement
            // statement discarded its result (String is immutable), so names
            // never had their symbols replaced; applying it now would change
            // the names used for the genome and alias lookups.
            genomesFromDesign.add(sampleRef.replaceAll("\"", "").toLowerCase().trim());
        }

        return genomesFromDesign;
    }

    /**
     * Initialize the contaminant genomes (or datasets) defined in the Aozan
     * configuration as a comma-separated list. A genome is retained only if a
     * genome description can be created for it; a rejected genome is reported in
     * the log.
     * @return unmodifiable collection of the retained genomes, in the order of
     *         the configuration
     * @throws AozanException if no contaminant genome is defined in the
     *           configuration or if none of the defined genomes can be used
     */
    private Set<String> initGenomesContaminant() throws AozanException {

        // Set genomes in configuration file
        final String val = this.properties.get(Settings.QC_CONF_FASTQSCREEN_GENOMES_KEY);

        if (val == null || val.isEmpty()) {
            throw new AozanException("FastqScreen : none genomes contaminant define.");
        }

        // Insertion-ordered set: iteration follows the configuration order
        // instead of depending on hash codes
        final Set<String> genomes = Sets.newLinkedHashSet();

        for (final String genome : COMMA_SPLITTER.split(val)) {
            final DataFile genomeFile = new DataFile("genome://" + genome);

            // Retrieve genome description if it exists
            GenomeDescription gdesc = null;
            try {
                gdesc = this.createGenomeDescription(genomeFile);
            } catch (final Exception e) {
                // Best effort: the genome is skipped, but say why so that a
                // misconfigured genome does not disappear silently
                LOGGER.warning("FastqScreen : genome contaminant " + genome
                        + " ignored, genome description can not be created: " + e.getMessage());
            }

            // Check genomes can be use for mapping
            if (gdesc != null) {
                genomes.add(genome);
            }
        }

        // No genome can be use for mapping
        if (genomes.isEmpty()) {
            throw new AozanException(
                    "FastqScreen : none genomes contaminant can be use from configuration file: " + val);
        }

        return Collections.unmodifiableSet(genomes);
    }

    /**
     * Load the alias genomes file, whose path is read from the Aozan
     * configuration. Each line {@code name=reference} associates the genome
     * name of a sample with a reference genome name. Lines without '=' and
     * lines with an empty reference are skipped.
     * @return unmodifiable map between the genome name of a sample and the
     *         valid genome name for mapping; empty if no alias genomes file is
     *         configured or if reading the file fails
     * @throws AozanException if alias genomes file doesn't exist.
     */
    private Map<String, String> loadAliasGenomesFile() throws AozanException {

        // Extract property on alias genomes path; the null check comes before
        // trim() so that a missing property does not throw a NullPointerException
        final String rawPath =
                this.properties.get(Settings.QC_CONF_FASTQSCREEN_SETTINGS_GENOMES_ALIAS_PATH_KEY);
        final String val = rawPath == null ? "" : rawPath.trim();

        if (val.isEmpty()) {
            LOGGER.fine("FastqScreen no alias genome file parameter define.");
            return Collections.emptyMap();
        }

        final File aliasGenomesFile = new File(val);

        // Not found alias genomes file
        if (!aliasGenomesFile.exists()) {
            throw new AozanException(
                    "FastqScreen alias genome file doesn't exists " + aliasGenomesFile.getAbsolutePath());
        }

        final Map<String, String> genomes = new HashMap<>();

        // try-with-resources closes the reader even if a read fails
        try (BufferedReader br = Files.newReader(aliasGenomesFile, Globals.DEFAULT_FILE_ENCODING)) {

            String line;
            while ((line = br.readLine()) != null) {

                final int pos = line.indexOf('=');
                if (pos == -1) {
                    continue;
                }

                final String key = line.substring(0, pos);
                final String value = line.substring(pos + 1);

                // Entries added for unknown genomes have no reference yet: skip
                // them
                if (!value.isEmpty()) {
                    genomes.put(key, value);
                }
            }

        } catch (final IOException e) {
            LOGGER.warning(
                    "Reading alias genomes file failed : none genome sample can be used for detection contamination.");
            return Collections.emptyMap();
        }

        return Collections.unmodifiableMap(genomes);
    }

    /**
     * Get the genome name retained for the mapping of a sample genome. The
     * given name is stripped of its double quotes, trimmed and lower-cased
     * before the lookup.
     * @param genome name of genome sample
     * @return genome name used for the mapping of this genome, or null if none
     *         has been associated with it
     */
    public String getGenomeReferenceCorresponding(final String genome) {

        final String unquoted = genome.replaceAll("\"", "");
        final String lookupKey = unquoted.trim().toLowerCase();

        return this.genomesReferencesSampleRenamed.get(lookupKey);
    }

    /**
     * Get the names of the genomes to use for the mapping, as computed when the
     * instance was created.
     * @return collection genomes name for mapping
     */
    public Set<String> getGenomesToMapping() {
        final Set<String> toMapping = this.genomesToMapping;
        return toMapping;
    }

    /**
     * Test whether a genome name belongs to the collection of contaminant
     * genomes.
     * @param genome genome name
     * @return true if is a genome contaminant otherwise false
     */
    public boolean isGenomeContamination(final String genome) {
        final boolean contaminant = this.genomesContaminants.contains(genome);
        return contaminant;
    }

    /**
     * Get the contaminant genomes retained for the mapping.
     * @return collection of genomes contaminants
     */
    public Set<String> getGenomesContaminants() {
        final Set<String> contaminants = this.genomesContaminants;
        return contaminants;
    }

    /**
     * Get the instance of FastqScreenGenomeMapper, creating it from the given
     * properties on the first call. Once the instance exists, the properties
     * given to later calls are not used.
     * @param props the props
     * @return instance of FastqScreenGenomeMapper
     * @throws AozanException if the initialization of instance fail.
     */
    public static FastqScreenGenomeMapper getInstance(final Map<String, String> props) throws AozanException {

        if (singleton != null) {
            return singleton;
        }

        singleton = new FastqScreenGenomeMapper(props);
        return singleton;
    }

    /**
     * Get the existing instance of FastqScreenGenomeMapper. This method never
     * creates the instance.
     * @return instance of FastqScreenGenomeMapper
     * @throws AozanException if the instance doesn't exist
     */
    public static FastqScreenGenomeMapper getInstance() throws AozanException {

        if (singleton != null) {
            return singleton;
        }

        throw new AozanException(
                "FastqScreenGenomeMapper instance doesn't exist. It should be initialize with congfiguration Aozan properties.");
    }

    //
    // Constructor
    //

    /**
     * Private constructor of FastqScreenGenomeMapper. It configures the Eoulsan
     * settings with the storage paths from the Aozan properties, then loads the
     * alias genomes file, the genomes of the samples, the contaminant genomes
     * and finally computes the genomes to use for the mapping.
     * @param props Aozan configuration properties
     * @throws AozanException if the initialization of instance fail.
     */
    private FastqScreenGenomeMapper(final Map<String, String> props) throws AozanException {
        this.properties = props;

        // Eoulsan settings give access to the genome storages
        final fr.ens.transcriptome.eoulsan.Settings eoulsanSettings = getSettings();

        final String genomeDescPath =
                this.properties.get(Settings.QC_CONF_FASTQSCREEN_SETTINGS_GENOMES_DESC_PATH_KEY);
        eoulsanSettings.setGenomeDescStoragePath(genomeDescPath);

        final String mapperIndexPath =
                this.properties.get(Settings.QC_CONF_FASTQSCREEN_SETTINGS_MAPPERS_INDEXES_PATH_KEY);
        eoulsanSettings.setGenomeMapperIndexStoragePath(mapperIndexPath);

        final String genomePath = this.properties.get(Settings.QC_CONF_FASTQSCREEN_SETTINGS_GENOMES_KEY);
        eoulsanSettings.setGenomeStoragePath(genomePath);

        // These Eoulsan data protocols are not loaded in Aozan because they need
        // additional dependencies
        DataProtocolService.getInstance().addClassesToNotLoad(
                Lists.newArrayList("fr.ens.transcriptome.eoulsan.data.protocols.S3DataProtocol",
                        "fr.ens.transcriptome.eoulsan.data.protocols.S3NDataProtocol"));

        // Storage of the genome descriptions
        final DataFile descStorageLocation = new DataFile(eoulsanSettings.getGenomeDescStoragePath());
        this.storage = SimpleGenomeDescStorage.getInstance(descStorageLocation);

        // Aliases between sample genome names and reference genome names
        this.genomesNamesConvertor = this.loadAliasGenomesFile();

        // Genomes of the samples, from the design file
        this.genomesReferencesSample = this.initGenomesReferencesSample();

        // Contaminant genomes, from the configuration
        this.genomesContaminants = this.initGenomesContaminant();

        // Filled by collectGenomesForMapping(): initial genome name -> valid
        // genome name for mapping; it must exist before that call
        this.genomesReferencesSampleRenamed = new HashMap<>();

        // Genomes useful to contaminant detection
        this.genomesToMapping = this.collectGenomesForMapping();
    }
}