ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.expression.arrayDesign;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.core.context.SecurityContext;
import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

import ubic.gemma.analysis.report.ArrayDesignReportService;
import ubic.gemma.analysis.sequence.ProbeMapper;
import ubic.gemma.analysis.sequence.ProbeMapperConfig;
import ubic.gemma.analysis.service.ArrayDesignAnnotationService;
import ubic.gemma.analysis.service.ExpressionDataFileService;
import ubic.gemma.externalDb.GoldenPathSequenceAnalysis;
import ubic.gemma.genome.gene.service.GeneService;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.model.expression.arrayDesign.TechnologyType;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.designElement.CompositeSequenceService;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.BioSequenceService;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.model.genome.gene.GeneAlias;
import ubic.gemma.model.genome.gene.GeneProduct;
import ubic.gemma.model.genome.gene.GeneProductService;
import ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation;
import ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociationService;
import ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResultService;
import ubic.gemma.persistence.Persister;

/**
 * For an array design, generate gene product mappings for the sequences.
 * 
 * @author pavlidis
 * @version $Id: ArrayDesignProbeMapperServiceImpl.java,v 1.10 2013/04/25 16:53:48 paul Exp $
 */
@Component
public class ArrayDesignProbeMapperServiceImpl implements ArrayDesignProbeMapperService {

    private static Log log = LogFactory.getLog(ArrayDesignProbeMapperServiceImpl.class.getName());

    private static final int QUEUE_SIZE = 20000;

    @Autowired
    private AnnotationAssociationService annotationAssociationService;

    @Autowired
    private ArrayDesignService arrayDesignService;

    @Autowired
    private BioSequenceService bioSequenceService;

    @Autowired
    private BlatResultService blatResultService;

    @Autowired
    private CompositeSequenceService compositeSequenceService;

    @Autowired
    private GeneService geneService;

    @Autowired
    private GeneProductService geneProductService;

    @Autowired
    private Persister persisterHelper;

    @Autowired
    private ProbeMapper probeMapper;

    @Autowired
    private ArrayDesignReportService arrayDesignReportService;

    @Autowired
    private ArrayDesignAnnotationService arrayDesignAnnotationService;

    @Autowired
    private ExpressionDataFileService expressionDataFileService;

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperService#printResult(ubic.gemma.model.expression
     * .designElement.CompositeSequence, java.util.Collection)
     */
    @Override
    public void printResult(CompositeSequence compositeSequence, Collection<BlatAssociation> col) {
        for (BlatAssociation blatAssociation : col) {
            printResult(compositeSequence, blatAssociation);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperService#processArrayDesign(ubic.gemma.model.expression
     * .arrayDesign.ArrayDesign)
     */
    @Override
    public void processArrayDesign(ArrayDesign arrayDesign) {
        this.processArrayDesign(arrayDesign, new ProbeMapperConfig(), true);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperService#processArrayDesign(ubic.gemma.model.expression
     * .arrayDesign.ArrayDesign, ubic.gemma.analysis.sequence.ProbeMapperConfig, boolean)
     */
    @Override
    public void processArrayDesign(ArrayDesign arrayDesign, ProbeMapperConfig config, boolean useDB) {

        assert config != null;

        if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
            throw new IllegalArgumentException(
                    "Do not use this service to process platforms that do not use an probe-based technology.");
        }

        Collection<Taxon> taxa = arrayDesignService.getTaxa(arrayDesign.getId());

        Taxon taxon = arrayDesign.getPrimaryTaxon();
        if (taxa.size() > 1 && taxon == null) {
            throw new IllegalArgumentException(
                    "Array design has sequence from multiple taxa and has no primary taxon set: " + arrayDesign);
        }

        GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(taxon);

        BlockingQueue<BlatAssociation> persistingQueue = new ArrayBlockingQueue<BlatAssociation>(QUEUE_SIZE);
        AtomicBoolean generatorDone = new AtomicBoolean(false);
        AtomicBoolean loaderDone = new AtomicBoolean(false);

        load(persistingQueue, generatorDone, loaderDone);

        if (useDB) {
            log.info("Removing any old associations");
            arrayDesignService.deleteGeneProductAssociations(arrayDesign);
        }

        int count = 0;
        int hits = 0;
        log.info("Start processing " + arrayDesign.getCompositeSequences().size() + " probes ...");
        for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {

            Map<String, Collection<BlatAssociation>> results = processCompositeSequence(config, taxon, goldenPathDb,
                    compositeSequence);

            if (results == null)
                continue;

            for (Collection<BlatAssociation> col : results.values()) {
                for (BlatAssociation association : col) {
                    if (log.isDebugEnabled())
                        log.debug(association);
                }

                if (useDB) {
                    // persisting is done in a separate thread.
                    persistingQueue.addAll(col);
                } else {
                    printResult(compositeSequence, col);
                }
                ++hits;
            }

            if (++count % 200 == 0) {
                log.info("Processed " + count + " composite sequences" + " with blat results; " + hits
                        + " mappings found.");
            }
        }

        generatorDone.set(true);

        log.info("Waiting for loading to complete ...");
        while (!loaderDone.get()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        log.info("Processed " + count + " composite sequences with blat results; " + hits + " mappings found.");
        try {
            this.deleteOldFiles(arrayDesign);
        } catch (IOException e) {
            log.error("Failed to delete all old files associated with " + arrayDesign
                    + ", be sure to clean them up manually or regenerate them");
        }
        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

    }

    /**
     * Delete outdated annotation and associated experiment files.
     * 
     * @param arrayDesign
     */
    private void deleteOldFiles(ArrayDesign arrayDesign) throws IOException {
        arrayDesignAnnotationService.deleteExistingFiles(arrayDesign);
        Collection<ExpressionExperiment> ees4platform = arrayDesignService.getExpressionExperiments(arrayDesign);
        for (ExpressionExperiment ee : ees4platform) {
            expressionDataFileService.deleteAllFiles(ee);
        }

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperService#processArrayDesign(ubic.gemma.model.expression
     * .arrayDesign.ArrayDesign, ubic.gemma.model.genome.Taxon, java.io.File,
     * ubic.gemma.model.common.description.ExternalDatabase, boolean)
     */
    @Override
    public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB,
            boolean ncbiIds) throws IOException {

        if (taxon == null && !ncbiIds) {
            throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
        }

        if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
            throw new IllegalArgumentException(
                    "Do not use this service to process platforms that do not use an probe-based technology.");
        }

        BufferedReader b = new BufferedReader(new FileReader(source));
        String line = null;
        int numSkipped = 0;

        log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);

        while ((line = b.readLine()) != null) {

            if (StringUtils.isBlank(line)) {
                continue;
            }
            if (line.startsWith("#")) {
                continue;
            }

            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length != 3) {
                throw new IOException("Illegal format, expected three columns, got " + fields.length);
            }

            String probeId = fields[0];
            String seqName = fields[1];

            /*
             * FIXME. We have to allow NCBI gene ids here.
             */
            String geneSymbol = fields[2];

            if (StringUtils.isBlank(geneSymbol)) {
                numSkipped++;
                continue;
            }

            CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);

            if (c == null) {
                if (log.isDebugEnabled())
                    log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                numSkipped++;
                continue;
            }

            // a probe can have more than one gene associated with it if so they are piped |
            Collection<Gene> geneListProbe = new HashSet<Gene>();

            // indicate multiple genes
            Gene geneDetails = null;

            StringTokenizer st = new StringTokenizer(geneSymbol, "|");
            while (st.hasMoreTokens()) {
                String geneToken = st.nextToken().trim();
                if (ncbiIds) {
                    geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
                } else {
                    geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
                }
                if (geneDetails != null) {
                    geneListProbe.add(geneDetails);
                }
            }

            if (geneListProbe.size() == 0) {
                log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                numSkipped++;
                continue;
            } else if (geneListProbe.size() > 1) {
                // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to make a
                // lot of fuss.
                log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
            }

            BioSequence bs = c.getBiologicalCharacteristic();

            if (bs != null) {
                if (StringUtils.isNotBlank(seqName)) {
                    bs = bioSequenceService.thaw(bs);
                    if (!bs.getName().equals(seqName)) {
                        log.warn("Sequence name '" + seqName + "' given for " + probeId
                                + " does not match existing entry " + bs.getName() + ", skipping");
                        numSkipped++;
                        continue;
                    }

                }
                // otherwise we assume everything is okay.
            } else {
                // create one based on the text provided.
                if (StringUtils.isBlank(seqName)) {
                    log.warn("You must provide sequence names for probes which are not already mapped. probeName="
                            + probeId + " had no sequence associated and no name provided; skipping");
                    numSkipped++;
                    continue;
                }

                bs = BioSequence.Factory.newInstance();
                bs.setName(seqName);
                bs.setTaxon(taxon);
                bs.setDescription("Imported from annotation file");

                // Placeholder.
                bs.setType(SequenceType.OTHER);

                bs = bioSequenceService.create(bs);

                c.setBiologicalCharacteristic(bs);

                // fixme: possibly move outside the loop if that's faster.
                compositeSequenceService.update(c);
            }

            assert bs.getId() != null;
            for (Gene gene : geneListProbe) {
                gene = geneService.thaw(gene);
                if (gene.getProducts().size() == 0) {
                    log.warn(
                            "There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
                    numSkipped++;
                    continue;
                }
                for (GeneProduct gp : gene.getProducts()) {
                    AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                    association.setBioSequence(bs);
                    association.setGeneProduct(gp);
                    association.setSource(sourceDB);
                    annotationAssociationService.create(association);
                }

            }

        }

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

        this.deleteOldFiles(arrayDesign);

        log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
        b.close();
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperService#processCompositeSequence(ubic.gemma.analysis
     * .sequence.ProbeMapperConfig, ubic.gemma.model.genome.Taxon, ubic.gemma.externalDb.GoldenPathSequenceAnalysis,
     * ubic.gemma.model.expression.designElement.CompositeSequence)
     */
    @Override
    @Transactional
    public Map<String, Collection<BlatAssociation>> processCompositeSequence(ProbeMapperConfig config, Taxon taxon,
            GoldenPathSequenceAnalysis goldenPathDb, CompositeSequence compositeSequence) {
        BioSequence bs = compositeSequence.getBiologicalCharacteristic();
        if (bs == null)
            return null;

        /*
         * It isn't 100% clear what the right thing to do is. But this seems at least _reasonable_ when there is a
         * mismatch
         */
        if (taxon != null && !bs.getTaxon().equals(taxon)) {
            return null;
        }

        GoldenPathSequenceAnalysis db;
        if (goldenPathDb == null) {
            db = new GoldenPathSequenceAnalysis(bs.getTaxon());
        } else {
            db = goldenPathDb;
        }

        final Collection<BlatResult> blatResults = blatResultService.findByBioSequence(bs);

        if (blatResults == null || blatResults.isEmpty())
            return null;

        Map<String, Collection<BlatAssociation>> results = probeMapper.processBlatResults(db, blatResults, config);

        if (log.isDebugEnabled())
            log.debug("Found " + results.size() + " mappings for " + compositeSequence + " (" + blatResults.size()
                    + " BLAT results)");
        return results;
    }

    /**
     * @param queue
     * @param generatorDone
     * @param loaderDone
     */
    void doLoad(final BlockingQueue<BlatAssociation> queue, AtomicBoolean generatorDone, AtomicBoolean loaderDone) {
        int loadedAssociationCount = 0;
        while (!(generatorDone.get() && queue.isEmpty())) {

            try {
                BlatAssociation ba = queue.poll();
                if (ba == null) {
                    continue;
                }

                GeneProduct geneProduct = ba.getGeneProduct();
                if (geneProduct.getId() == null) {
                    GeneProduct existing = geneProductService.find(geneProduct);

                    if (existing == null) {

                        existing = checkForAlias(geneProduct);
                        if (existing == null) {
                            /*
                             * We have to be careful not to cruft up the gene table now that I so carefully cleaned it.
                             * But this is a problem if we aren't adding some other association to the gene at least.
                             * But generally the mRNAs that GP has that NCBI doesn't are "alternative" or "additional".
                             */
                            if (log.isDebugEnabled())
                                log.debug("New gene product from GoldenPath is not in Gemma: " + geneProduct
                                        + " skipping association to " + ba.getBioSequence()
                                        + " [skipping policy in place]");
                            continue;
                        }
                    }
                    ba.setGeneProduct(existing);
                }

                persisterHelper.persist(ba);

                if (++loadedAssociationCount % 1000 == 0) {
                    log.info("Persisted " + loadedAssociationCount + " blat associations. " + "Current queue has "
                            + queue.size() + " items.");
                }

            } catch (Exception e) {
                log.error(e, e);
                loaderDone.set(true);
                throw new RuntimeException(e);
            }
        }
        log.info("Load thread done: loaded " + loadedAssociationCount + " blat associations. ");
        loaderDone.set(true);
    }

    /**
     * @param ba
     * @param geneProduct
     */
    private final GeneProduct checkForAlias(GeneProduct geneProduct) {
        Collection<GeneProduct> candidates = geneProductService.findByName(geneProduct.getName(),
                geneProduct.getGene().getTaxon());

        if (candidates.isEmpty())
            return null;

        Gene gene = geneProduct.getGene();
        for (GeneProduct existing2 : candidates) {
            Collection<GeneAlias> aliases = existing2.getGene().getAliases();
            for (GeneAlias geneAlias : aliases) {
                if (geneAlias.getAlias().equalsIgnoreCase(gene.getOfficialSymbol())) {
                    /*
                     * So, our gene products match, and the genes match but via an alias. That's pretty solid.
                     */
                    log.info("Associated gene product " + geneProduct
                            + " has a match in Gemma through an aliased gene: " + existing2);
                    return existing2;
                }
            }

        }
        return null;
    }

    /**
     * @param queue
     * @param generatorDone
     * @param loaderDone
     */
    private void load(final BlockingQueue<BlatAssociation> queue, final AtomicBoolean generatorDone,
            final AtomicBoolean loaderDone) {
        final SecurityContext context = SecurityContextHolder.getContext();
        assert context != null;

        Thread loadThread = new Thread(new Runnable() {
            @Override
            public void run() {
                SecurityContextHolder.setContext(context);
                doLoad(queue, generatorDone, loaderDone);
            }

        }, "PersistBlatAssociations");

        loadThread.start();

    }

    /**
     * Print line of result to STDOUT.
     * 
     * @param cs
     * @param blatAssociation
     */
    private void printResult(CompositeSequence cs, BlatAssociation blatAssociation) {

        GeneProduct geneProduct = blatAssociation.getGeneProduct();
        Gene gene = geneProduct.getGene();
        System.out.println(cs.getName() + '\t' + blatAssociation.getBioSequence().getName() + '\t'
                + geneProduct.getName() + '\t' + gene.getOfficialSymbol() + "\t" + gene.getClass().getSimpleName());
    }

}