net.sf.mzmine.modules.peaklistmethods.identification.onlinedbsearch.databases.HMDBGateway.java Source code

Introduction

Here is the source code for net.sf.mzmine.modules.peaklistmethods.identification.onlinedbsearch.databases.HMDBGateway.java
Source

/*
 * Copyright 2006-2015 The MZmine 2 Development Team
 * 
 * This file is part of MZmine 2.
 * 
 * MZmine 2 is free software; you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 * 
 * MZmine 2 is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * MZmine 2; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
 * Fifth Floor, Boston, MA 02110-1301 USA
 */

package net.sf.mzmine.modules.peaklistmethods.identification.onlinedbsearch.databases;

import java.io.IOException;
import java.net.URL;
import java.util.TreeSet;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import net.sf.mzmine.modules.peaklistmethods.identification.onlinedbsearch.DBCompound;
import net.sf.mzmine.modules.peaklistmethods.identification.onlinedbsearch.DBGateway;
import net.sf.mzmine.modules.peaklistmethods.identification.onlinedbsearch.OnlineDatabase;
import net.sf.mzmine.parameters.ParameterSet;
import net.sf.mzmine.parameters.parametertypes.tolerances.MZTolerance;
import net.sf.mzmine.util.InetUtils;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import com.google.common.collect.Range;

public class HMDBGateway implements DBGateway {

    private Logger logger = Logger.getLogger(this.getClass().getName());

    private static final String hmdbSeachAddress = "http://www.hmdb.ca/structures/search/metabolites/mass?";
    private static final String hmdbStructureAddress = "http://structures.wishartlab.com/molecules/";
    private static final String hmdbEntryAddress = "http://www.hmdb.ca/metabolites/";

    public String[] findCompounds(double mass, MZTolerance mzTolerance, int numOfResults, ParameterSet parameters)
            throws IOException {

        Range<Double> toleranceRange = mzTolerance.getToleranceRange(mass);

        String queryAddress = hmdbSeachAddress + "&query_from=" + toleranceRange.lowerEndpoint() + "&query_to="
                + toleranceRange.upperEndpoint();

        URL queryURL = new URL(queryAddress);

        // Submit the query
        String queryResult = InetUtils.retrieveData(queryURL);

        // Organize the IDs as a TreeSet to keep them sorted
        TreeSet<String> results = new TreeSet<String>();

        // Find IDs in the HTML data
        Pattern pat = Pattern.compile("metabolites/(HMDB[0-9]{5})");
        Matcher matcher = pat.matcher(queryResult);
        while (matcher.find()) {
            String hmdbID = matcher.group(1);
            results.add(hmdbID);
        }

        // Remove all except first numOfResults IDs. The reason why we first
        // retrieve all results and then remove those above numOfResults is to
        // keep the lowest HDMB IDs - these may be the most interesting ones.
        while (results.size() > numOfResults) {
            String lastItem = results.last();
            results.remove(lastItem);
        }

        return results.toArray(new String[0]);

    }

    /**
     * This method retrieves the details about HMDB compound
     * 
     */
    public DBCompound getCompound(String ID, ParameterSet parameters) throws IOException {

        logger.finest("Obtaining information about HMDB compound id " + ID);

        Element nameElement, formulaElement;

        try {

            final String url = hmdbEntryAddress + ID + ".xml";
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = dbf.newDocumentBuilder();
            Document parsedResult = builder.parse(url);

            XPathFactory factory = XPathFactory.newInstance();
            XPath xpath = factory.newXPath();

            XPathExpression expr = xpath.compile("//metabolite/name");
            NodeList nameElementNL = (NodeList) expr.evaluate(parsedResult, XPathConstants.NODESET);
            nameElement = (Element) nameElementNL.item(0);

            if (nameElement == null)
                throw new IOException("Could not parse compound name");

            expr = xpath.compile("//metabolite/chemical_formula");
            NodeList formulaElementNL = (NodeList) expr.evaluate(parsedResult, XPathConstants.NODESET);
            formulaElement = (Element) formulaElementNL.item(0);
            if (formulaElement == null)
                throw new IOException("Could not parse compound formula");

        } catch (Exception e) {
            throw new IOException(e);
        }

        final String compoundName = nameElement.getTextContent();
        final String compoundFormula = formulaElement.getTextContent();
        final URL structure2DURL = new URL(hmdbStructureAddress + ID + ".sdf");
        final URL structure3DURL = new URL(hmdbStructureAddress + ID + ".pdb");
        final URL entryURL = new URL(hmdbEntryAddress + ID);

        DBCompound newCompound = new DBCompound(OnlineDatabase.HMDB, ID, compoundName, compoundFormula, entryURL,
                structure2DURL, structure3DURL);

        return newCompound;

    }
}