uk.ac.ebi.ep.parser.parsers.ChEBICompounds.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ebi.ep.parser.parsers.ChEBICompounds.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package uk.ac.ebi.ep.parser.parsers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.apache.log4j.Logger;
import org.springframework.util.StringUtils;
import uk.ac.ebi.chebi.webapps.chebiWS.client.ChebiWebServiceClient;
import uk.ac.ebi.chebi.webapps.chebiWS.model.ChebiWebServiceFault_Exception;
import uk.ac.ebi.chebi.webapps.chebiWS.model.DataItem;
import uk.ac.ebi.chebi.webapps.chebiWS.model.Entity;
import uk.ac.ebi.chebi.webapps.chebiWS.model.LiteEntity;
import uk.ac.ebi.chebi.webapps.chebiWS.model.LiteEntityList;
import uk.ac.ebi.chebi.webapps.chebiWS.model.SearchCategory;
import uk.ac.ebi.chebi.webapps.chebiWS.model.StarsCategory;
import uk.ac.ebi.ep.data.domain.EnzymePortalCompound;
import uk.ac.ebi.ep.data.domain.EnzymePortalSummary;
import uk.ac.ebi.ep.data.domain.UniprotEntry;
import uk.ac.ebi.ep.data.repositories.EnzymePortalCompoundRepository;
import uk.ac.ebi.ep.data.repositories.EnzymePortalSummaryRepository;
import uk.ac.ebi.ep.parser.helper.CompoundUtil;
import uk.ac.ebi.ep.parser.helper.EPUtil;
import uk.ac.ebi.ep.parser.helper.MmDatabase;
import uk.ac.ebi.ep.parser.helper.Relationship;
import static uk.ac.ebi.ep.parser.inbatch.PartitioningSpliterator.partition;

/**
 *
 * @author joseph
 */
public class ChEBICompounds {

    // use this when parsing the molecule name - we don't want a molecule with parentesis eg sucrose (DTTI)
    private static final Pattern COMPOUND_NAME_PATTERN = Pattern.compile("(.*?)(?: \\((.*?)\\))?");

    private final Logger LOGGER = Logger.getLogger(ChEBICompounds.class);
    private final ChebiWebServiceClient chebiWsClient;

    Map<UniprotEntry, Set<String>> inhibitors = new LinkedHashMap<>();
    Map<UniprotEntry, Set<String>> activators = new LinkedHashMap<>();

    List<EnzymePortalCompound> compounds = new LinkedList<>();

    //private static final String COMMENT_TYPE = "ENZYME_REGULATION";
    private static final String COMMENT_TYPE = "REGULATION";

    private final EnzymePortalCompoundRepository compoundRepository;

    private final EnzymePortalSummaryRepository enzymeSummaryRepository;

    public static final String[] BLACKLISTED_COMPOUNDS = { "ACID", "acid", "H(2)O", "H(+)", "ACID", "WATER",
            "water", "ion", "ION", "", " " };
    List<String> blackList = Arrays.asList(BLACKLISTED_COMPOUNDS);

    public ChEBICompounds(EnzymePortalSummaryRepository enzymeSummaryRepository,
            EnzymePortalCompoundRepository repository) {
        this.compoundRepository = repository;
        this.enzymeSummaryRepository = enzymeSummaryRepository;
        chebiWsClient = new ChebiWebServiceClient();

    }

    public void computeAndLoadChEBICompounds() {

        List<EnzymePortalSummary> enzymeSummary = enzymeSummaryRepository.findSummariesByCommentType(COMMENT_TYPE);
        LOGGER.warn("Number of Regulation Text from EnzymeSummary Table " + enzymeSummary.size());

        //String text = "Activated by cell stresses such as DNA damage, heat shock, osmotic shock, anisomycin and sodium arsenite, as well as pro-inflammatory stimuli such as bacterial lipopolysaccharide (LPS) and interleukin-1. Activation occurs through dual phosphorylation of Thr-180 and Tyr-182 by either of two dual specificity kinases, MAP2K3/MKK3 or MAP2K6/MKK6, and potentially also MAP2K4/MKK4, as well as by TAB1-mediated autophosphorylation. MAPK14 phosphorylated on both Thr-180 and Tyr-182 is 10-20-fold more active than MAPK14 phosphorylated only on Thr-180, whereas MAPK14 phosphorylated on Tyr-182 alone is inactive. whereas Thr-180 is necessary for catalysis, Tyr-182 may be required for auto-activation and substrate recognition. Phosphorylated at Tyr-323 by ZAP70 in an alternative activation pathway in response to TCR signaling in T-cells. This alternative pathway is inhibited by GADD45A. Inhibited by dual specificity phosphatases, such as DUSP1, DUSP10, and DUSP16. Specifically inhibited by the binding of pyridinyl-imidazole compounds, which are cytokine-suppressive anti-inflammatory drugs (CSAID). Isoform Mxi2 is 100-fold less sensitive to these agents than the other isoforms and is not inhibited by DUSP1. Isoform Exip is not activated by MAP2K6. SB203580 is an inhibitor of MAPK14.";   
        //Java 7 and before only. uncomment if Java 8 is not available in your env
        //        for (EnzymePortalSummary summary : enzymeSummary) {
        //            String enzyme_regulation_text = summary.getCommentText();
        //            
        //         
        //            inhibitors.put(summary.getUniprotAccession(), EPUtil.parseTextForInhibitors(enzyme_regulation_text));
        //            activators.put(summary.getUniprotAccession(), EPUtil.parseTextForActivators(enzyme_regulation_text));
        //        }
        //        
        //
        //        for (Map.Entry<UniprotEntry, Set<String>> map : inhibitors.entrySet()) {
        //            UniprotEntry key = map.getKey();
        //            for (String inhibitor : map.getValue()) {
        //                EnzymePortalCompound inhibitor_from_chebi = searchMoleculeInChEBI(inhibitor);
        //               
        //                if (inhibitor_from_chebi != null) {
        //                    
        //                    inhibitor_from_chebi.setRelationship(Relationship.is_inhibitor_of.name());
        //                    inhibitor_from_chebi.setUniprotAccession(key);
        //                    compounds.add(inhibitor_from_chebi);
        //                }
        //            }
        //
        //        }
        //
        //        for (Map.Entry<UniprotEntry, Set<String>> map : activators.entrySet()) {
        //            UniprotEntry key = map.getKey();
        //            for (String activator : map.getValue()) {
        //                EnzymePortalCompound activator_from_chebi = searchMoleculeInChEBI(activator);
        //                if (activator_from_chebi != null) {
        //                 
        //                    activator_from_chebi.setRelationship(Relationship.is_activator_of.name());
        //                    activator_from_chebi.setUniprotAccession(key);
        //                    compounds.add(activator_from_chebi);
        //                }
        //            }
        //
        //        }
        //Java 8 specifics - comment out  and uncomment above if java 8 is not found in env
        //        enzymeSummary.stream().forEach((summary) -> {
        //            String enzyme_regulation_text = summary.getCommentText();
        //            inhibitors.put(summary.getUniprotAccession(), EPUtil.parseTextForInhibitors(enzyme_regulation_text));
        //            activators.put(summary.getUniprotAccession(), EPUtil.parseTextForActivators(enzyme_regulation_text));
        //        });

        Stream<EnzymePortalSummary> existingStream = enzymeSummary.stream();
        Stream<List<EnzymePortalSummary>> partitioned = partition(existingStream, 500, 1);
        AtomicInteger count = new AtomicInteger(1);
        partitioned.parallel().forEach((chunk) -> {
            //System.out.println(count.getAndIncrement() + " BATCH SIZE" + chunk.size());
            chunk.stream().forEach((summary) -> {
                String enzyme_regulation_text = summary.getCommentText();

                inhibitors.put(summary.getUniprotAccession(),
                        EPUtil.parseTextForInhibitors(enzyme_regulation_text));
                activators.put(summary.getUniprotAccession(),
                        EPUtil.parseTextForActivators(enzyme_regulation_text));

            });
        });

        LOGGER.debug("number of inhibitors and activators to process are : " + inhibitors.size() + ": "
                + activators.size());
        inhibitors.entrySet().stream().forEach((map) -> {
            map.getValue().stream().map((inhibitor) -> searchMoleculeInChEBI(inhibitor))
                    .filter((inhibitor_from_chebi) -> (inhibitor_from_chebi != null))
                    .map((inhibitor_from_chebi) -> {
                        inhibitor_from_chebi.setRelationship(Relationship.is_inhibitor_of.name());
                        inhibitor_from_chebi = CompoundUtil.computeRole(inhibitor_from_chebi,
                                inhibitor_from_chebi.getRelationship());
                        return inhibitor_from_chebi;
                    }).map((inhibitor_from_chebi) -> {
                        inhibitor_from_chebi.setUniprotAccession(map.getKey());
                        return inhibitor_from_chebi;
                    }).forEach((inhibitor_from_chebi) -> {
                        compounds.add(inhibitor_from_chebi);
                    });
        });

        activators.entrySet().stream().forEach((map) -> {
            map.getValue().stream().map((activator) -> searchMoleculeInChEBI(activator))
                    .filter((activator_from_chebi) -> (activator_from_chebi != null))
                    .map((activator_from_chebi) -> {
                        activator_from_chebi.setRelationship(Relationship.is_activator_of.name());
                        activator_from_chebi = CompoundUtil.computeRole(activator_from_chebi,
                                activator_from_chebi.getRelationship());
                        return activator_from_chebi;
                    }).map((activator_from_chebi) -> {
                        activator_from_chebi.setUniprotAccession(map.getKey());
                        return activator_from_chebi;
                    }).forEach((activator_from_chebi) -> {
                        compounds.add(activator_from_chebi);
                    });
        });

        LOGGER.warn("Number of compounds before filtering : " + compounds.size());

        compounds.removeIf(c -> (c.getCompoundId().equalsIgnoreCase("CHEBI:338412")
                || c.getCompoundId().equalsIgnoreCase("CHEBI:16412")
                || c.getCompoundId().equalsIgnoreCase("CHEBI:29678"))
                && c.getUniprotAccession().getAccession().equalsIgnoreCase("Q16539"));

        LOGGER.warn("Writing to Enzyme Portal database... Number of compounds to write : " + compounds.size());

        compoundRepository.save(compounds);

        inhibitors.clear();
        activators.clear();
        compounds.clear();
    }

    /**
     * Searches a compound name in ChEBI. Please note that if the name does not
     * match <i>exactly</i> any names/synonyms returned by ChEBI, the result
     * will be <code>null</code>.
     *
     * @param moleculeName the compound name.
     * @return an entry with a ChEBI ID, or <code>null</code> if not found.
     */
    protected EnzymePortalCompound searchMoleculeInChEBI(String moleculeName) {

        EnzymePortalCompound entry = null;
        // Sometimes moleculeName comes as "moleculeName (ACRONYM)"
        // sometimes as "moleculeName (concentration)":
        Matcher m = COMPOUND_NAME_PATTERN.matcher(moleculeName);
        m.matches(); // always
        String[] nameAcronym = { m.group(1), m.group(2) };
        // first name, then acronym (if any):
        nameLoop: for (String name : nameAcronym) {
            if (name == null) {
                continue; // acronym, usually
            }
            try {
                LiteEntityList lites = chebiWsClient.getLiteEntity(name, SearchCategory.ALL_NAMES, 25,
                        StarsCategory.ALL);
                String chebiId = null;

                if (lites != null) {
                    liteLoop: for (LiteEntity lite : lites.getListElement()) {
                        Entity completeEntity = chebiWsClient.getCompleteEntity(lite.getChebiId());
                        List<String> synonyms = new ArrayList<>();
                        for (DataItem dataItem : completeEntity.getSynonyms()) {
                            synonyms.add(dataItem.getData().toLowerCase());
                        }
                        List<String> formulae = new ArrayList<>();
                        for (DataItem formula : completeEntity.getFormulae()) {
                            formulae.add(formula.getData());
                        }
                        if (completeEntity.getChebiAsciiName().equalsIgnoreCase(name)
                                || synonyms.contains(name.toLowerCase()) || formulae.contains(name)) {
                            chebiId = completeEntity.getChebiId();
                        }
                        if (chebiId != null) {
                            break liteLoop;
                        }
                    }
                }

                if ((chebiId == null || blackList.contains(name)) || StringUtils.isEmpty(name)) {
                    LOGGER.warn("Not found in ChEBI: " + name);
                } else {
                    entry = new EnzymePortalCompound();
                    entry.setCompoundSource(MmDatabase.ChEBI.name());
                    entry.setCompoundId(chebiId);
                    entry.setCompoundName(name);
                    break;
                }
            } catch (ChebiWebServiceFault_Exception e) {
                LOGGER.error("Searching for " + name, e);
            }
        }
        return entry;
    }
}