act.installer.pubchem.PubchemSynonymFinder.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.pubchem.PubchemSynonymFinder.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.pubchem;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.RocksDB;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Command-line tool that looks up Pubchem synonym data for one or more Pubchem compound ids (CIDs) in an
 * existing RocksDB index and emits the results as a single pretty-printed JSON document.
 *
 * <p>The compound ids come either from a single command-line option or from a text file with one id per
 * line.  Output goes to stdout unless an output file is specified.
 */
public class PubchemSynonymFinder {
    private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemSynonymFinder.class);
    private static final Charset UTF8 = StandardCharsets.UTF_8;
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static final String OPTION_INDEX_PATH = "x";
    public static final String OPTION_PUBCHEM_COMPOUND_ID = "c";
    public static final String OPTION_IDS_FILE = "f";
    public static final String OPTION_OUTPUT = "o";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class finds and prints Pubchem synonym data from a RocksDB index created from Pubchem RDF files. ",
            "Specify one or more Pubchem compound ids to find." }, "");

    public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    // Used only to warn about suspicious ids; ids that do not match are still looked up.
    private static final Pattern PC_CID_PATTERN = Pattern.compile("^CID\\d+$");

    /**
     * Builds the list of command-line option builders for this tool.
     *
     * @return a new mutable list holding one builder per supported option
     */
    private static List<Option.Builder> buildOptionBuilders() {
        List<Option.Builder> builders = new ArrayList<>();
        builders.add(Option.builder(OPTION_INDEX_PATH).argName("index path")
                // This tool only reads the index, so the directory has to be there already (checked in main).
                .desc("A path to the directory where the on-disk index is stored; must already exist")
                .hasArg().required().longOpt("index"));
        builders.add(Option.builder(OPTION_PUBCHEM_COMPOUND_ID).argName("compound id")
                .desc("Lookup one compound ID in the database").hasArg().longOpt("pc-cid"));
        builders.add(Option.builder(OPTION_IDS_FILE).argName("compound ids file").desc(
                "Lookup a list of compound ids and print them as one large JSON document; comments (#) will be ignored")
                .hasArg().longOpt("pc-cids-file"));
        builders.add(Option.builder(OPTION_OUTPUT).argName("output file")
                .desc("Write output to a file; default is stdout").hasArg().longOpt("output"));
        builders.add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
        return builders;
    }

    /**
     * Entry point: parses arguments, resolves the compound ids to search for, looks each one up in the
     * index, and writes the results as JSON.
     *
     * <p>Exits the JVM with status 1 when arguments cannot be parsed, the index directory or ids file is
     * missing, the ids file yields no ids, or neither a compound id nor an ids file was specified.
     *
     * @param args command-line arguments; see {@link #OPTION_BUILDERS}
     * @throws Exception if opening/reading the index, deserializing a value, or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        org.apache.commons.cli.Options opts = new org.apache.commons.cli.Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            printUsage(opts);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            printUsage(opts);
            return;
        }

        File rocksDBFile = new File(cl.getOptionValue(OPTION_INDEX_PATH));
        if (!rocksDBFile.isDirectory()) {
            System.err.format("Index directory does not exist or is not a directory at '%s'\n",
                    rocksDBFile.getAbsolutePath());
            printUsage(opts);
            System.exit(1);
        }

        // A single id on the command line takes precedence over an ids file.
        List<String> compoundIds = null;
        if (cl.hasOption(OPTION_PUBCHEM_COMPOUND_ID)) {
            compoundIds = Collections.singletonList(cl.getOptionValue(OPTION_PUBCHEM_COMPOUND_ID));
        } else if (cl.hasOption(OPTION_IDS_FILE)) {
            File idsFile = new File(cl.getOptionValue(OPTION_IDS_FILE));
            if (!idsFile.exists()) {
                System.err.format("Cannot find Pubchem CIDs file at %s\n", idsFile.getAbsolutePath());
                printUsage(opts);
                System.exit(1);
            }

            compoundIds = getCIDsFromFile(idsFile);

            if (compoundIds.isEmpty()) {
                System.err.format("Found zero Pubchem CIDs to process in file at '%s', exiting\n",
                        idsFile.getAbsolutePath());
                printUsage(opts);
                System.exit(1);
            }
        } else {
            System.err.format("Must specify one of '%s' or '%s'; index is too big to print all synonyms.\n",
                    OPTION_PUBCHEM_COMPOUND_ID, OPTION_IDS_FILE);
            printUsage(opts);
            System.exit(1);
        }

        // Run a quick check to warn users of malformed ids.
        for (String cid : compoundIds) {
            if (!PC_CID_PATTERN.matcher(cid).matches()) { // Use matches() for complete matching.
                LOGGER.warn("Specified compound id does not match expected format: %s", cid);
            }
        }

        LOGGER.info("Opening DB and searching for %d Pubchem CIDs", compoundIds.size());
        Map<String, PubchemSynonyms> results = fetchSynonyms(rocksDBFile, compoundIds);

        writeResults(results, cl.hasOption(OPTION_OUTPUT) ? cl.getOptionValue(OPTION_OUTPUT) : null);
        LOGGER.info("Done searching for Pubchem synonyms");
    }

    /**
     * Prints the usage/help text for this tool.
     *
     * @param opts the options to describe
     */
    private static void printUsage(org.apache.commons.cli.Options opts) {
        HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    }

    /**
     * Opens the index and looks up the serialized synonyms object for every requested compound id.
     *
     * @param rocksDBFile directory of the existing index
     * @param compoundIds ids to look up; each id's UTF-8 bytes are used as the key
     * @return a map from compound id to its synonyms, in the order the ids were given; ids with no entry
     *         in the index map to {@code null}
     * @throws Exception if the index cannot be opened or read, or a stored value cannot be deserialized
     */
    private static Map<String, PubchemSynonyms> fetchSynonyms(File rocksDBFile, List<String> compoundIds)
            throws Exception {
        Map<String, PubchemSynonyms> results = new LinkedHashMap<>(compoundIds.size());
        Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = null;
        try {
            dbAndHandles = PubchemTTLMerger.openExistingRocksDB(rocksDBFile);
            RocksDB db = dbAndHandles.getLeft();
            ColumnFamilyHandle cidToSynonymsCfh = dbAndHandles.getRight()
                    .get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS);

            for (String cid : compoundIds) {
                PubchemSynonyms synonyms = null;
                byte[] val = db.get(cidToSynonymsCfh, cid.getBytes(UTF8));
                if (val != null) {
                    try (ObjectInputStream oi = new ObjectInputStream(new ByteArrayInputStream(val))) {
                        // We're relying on our use of a one-value-type per index model here so we can skip the
                        // instanceof check.
                        synonyms = (PubchemSynonyms) oi.readObject();
                    }
                } else {
                    LOGGER.warn("No synonyms available for compound id '%s'", cid);
                }
                results.put(cid, synonyms);
            }
        } finally {
            if (dbAndHandles != null) {
                dbAndHandles.getLeft().close();
            }
        }
        return results;
    }

    /**
     * Writes the results as pretty-printed JSON followed by a newline.
     *
     * <p>The JSON is serialized to a byte array first rather than handing the target stream to Jackson:
     * Jackson closes the stream it is given by default, which would prevent the trailing newline from
     * being written and would close {@code System.out} when no output file is used.
     *
     * @param results    the lookup results to serialize
     * @param outputPath path of the file to write, or {@code null} to write to stdout
     * @throws IOException if serialization or writing fails
     */
    private static void writeResults(Map<String, PubchemSynonyms> results, String outputPath)
            throws IOException {
        byte[] json = OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsBytes(results);

        if (outputPath == null) {
            // Never close System.out; just flush what we wrote.
            System.out.write(json, 0, json.length);
            System.out.write('\n');
            System.out.flush();
            return;
        }

        try (OutputStream outputStream = new FileOutputStream(outputPath)) {
            outputStream.write(json);
            outputStream.write('\n');
        }
    }

    /**
     * Reads compound ids from a text file, one per line.
     *
     * <p>Each line is trimmed; blank lines and lines starting with {@code #} are skipped.  The file is
     * decoded as UTF-8, matching the encoding used for index keys.
     *
     * @param idsFile the file to read
     * @return the ids found, in file order; empty if the file contains none
     * @throws IOException if the file cannot be read
     */
    private static List<String> getCIDsFromFile(File idsFile) throws IOException {
        List<String> compoundIds = new ArrayList<>();
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(new FileInputStream(idsFile), UTF8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("#")) { // skip blank lines and comments
                    continue;
                }
                compoundIds.add(line);
            }
        }
        return compoundIds;
    }
}