com.act.biointerpretation.Utils.ReactionProjector.java Source code

Introduction

Here is the source code for com.act.biointerpretation.Utils.ReactionProjector.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.Utils;

import chemaxon.marvin.io.MolExportException;
import chemaxon.reaction.ConcurrentReactorProcessor;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.struc.Molecule;
import chemaxon.struc.RxnMolecule;
import chemaxon.util.iterator.MoleculeIterator;
import chemaxon.util.iterator.MoleculeIteratorFactory;
import com.act.analysis.chemicals.molecules.MoleculeExporter;
import com.act.analysis.chemicals.molecules.MoleculeFormat;
import com.act.analysis.chemicals.molecules.MoleculeFormat$;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import org.apache.commons.collections4.iterators.PermutationIterator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

public class ReactionProjector {

    private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionProjector.class);
    private static final String MOL_NOT_FOUND = "NOT_FOUND";

    private static final MolSearchOptions LAX_SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
    private static final MolSearch DEFAULT_SEARCHER = new MolSearch();

    private static final String DEFAULT_MOLECULE_FORMAT = MoleculeFormat.noAuxInchi$.MODULE$.toString();
    private final String moleculeFormat;

    /**
     * Set searcher to ignore stereo and bond type.  This will allow us to optimistically match products so that we don't
     * end up throwing out a reactor that produces the right compound in a slightly different form.
     */
    static {
        LAX_SEARCH_OPTIONS.setStereoSearchType(SearchConstants.STEREO_IGNORE);
        LAX_SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
        DEFAULT_SEARCHER.setSearchOptions(LAX_SEARCH_OPTIONS);
    }

    private transient ThreadLocal<MolSearch> substrateSearcher = new ThreadLocal<MolSearch>() {
        @Override
        protected MolSearch initialValue() {
            MolSearch search = new MolSearch();
            /* These parameters were identified by trial and error as reasonable defaults for doing substructure searches for
             * reactor substrate matching.  */
            MolSearchOptions options = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
            // This allows H's in RO strings to match implicit hydrogens in our target molecules.
            options.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_ENABLED);
            /* This allows for vague bond matching in ring structures.  From the Chemaxon Docs:
             *    In the query all single ring bonds are replaced by "single or aromatic" and all double ring bonds are
             *    replaced by "double or aromatic" prior to search.
             *    (https://www.chemaxon.com/jchem/doc/dev/java/api/chemaxon/sss/SearchConstants.html)
             *
             * This should allow us to handle aromatized molecules gracefully without handling non-ring single and double
             * bonds ambiguously. */
            options.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL2);
            // Few if any of our ROs concern stereo chemistry, so we can just ignore it.
            options.setStereoSearchType(SearchConstants.STEREO_IGNORE);
            /* Chemaxon's tautomer handling is weird, as sometimes it picks a non-representative tautomer as its default.
             * As such, we'll allow tautomer matches to avoid excluding viable candidates. */
            options.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON);
            search.setSearchOptions(options);
            return search;
        }
    };

    private boolean useSubstructureFiltering = false;

    private MolSearch searcher;

    public ReactionProjector() {
        this(DEFAULT_SEARCHER, DEFAULT_MOLECULE_FORMAT);
    }

    public ReactionProjector(MolSearch searcher) {
        this(searcher, DEFAULT_MOLECULE_FORMAT);
    }

    public ReactionProjector(String moleculeFormat) {
        this(DEFAULT_SEARCHER, moleculeFormat);
    }

    public ReactionProjector(MolSearch searcher, String moleculeFormat) {
        this.searcher = searcher;
        this.moleculeFormat = moleculeFormat;
    }

    public ReactionProjector(boolean useSubstructureFiltering) {
        // I don't anticipate either of these ever being used if this is the constructor employed.
        this.searcher = DEFAULT_SEARCHER;
        this.moleculeFormat = DEFAULT_MOLECULE_FORMAT;
        this.useSubstructureFiltering = useSubstructureFiltering;
    }

    /**
     * This method should be called if projecting on more chemicals than should be stored in memory
     * simultaneously.  Until this is called, all chemicals acted on by the projector will be cached along
     * with their inchis.
     */
    public void clearInchiCache() {
        MoleculeExporter.clearCache();
    }

    /**
     * Run the given reactor until it produces the expected product. Reactor must produce one product at a time.
     *
     * @param reactor         The reactor to run.
     * @param expectedProduct The product we expect to see.
     * @return The produced product; this is necessary because the reactor will produce the product with atom maps
     * corresponding to the substrate, whereas the expectedProduct Molecule will not have such atom maps.
     * @throws ReactionException If the expected product is not produced at all by the Reactor.
     * @throws IOException
     */
    public Molecule reactUntilProducesProduct(Reactor reactor, Molecule expectedProduct) throws ReactionException {
        if (reactor.getProductCount() != 1) {
            throw new IllegalArgumentException("Reactor must produce exactly one product.");
        }

        Molecule[] products;
        while ((products = reactor.react()) != null) {
            if (testEquality(products[0], expectedProduct)) {
                return products[0];
            }
        }

        throw new ReactionException("Expected product not among Reactor's predictions.");
    }

    /**
     * Tests equality of molecules based on two substructure queries. This is desirable because it makes the
     * equality comparison very configurable as compared to comparing inchis.
     *
     * @param A One molecule.
     * @param B Another molecule.
     * @return True if they are equivalent.
     */
    private boolean testEquality(Molecule A, Molecule B) {
        return substructureTest(A, B) && substructureTest(B, A);
    }

    private boolean substructureTest(Molecule substructure, Molecule superstructure) {
        searcher.setQuery(substructure);
        searcher.setTarget(superstructure);
        try {
            return searcher.findFirst() != null;
        } catch (SearchException e) {
            // Log error but don't propagate upward. Have never seen this before.
            LOGGER.error("Error on ReactionProjector.substructureTest(), %s", e.getMessage());
            return false;
        }
    }

    /**
     * Get the results of a reaction in list form, rather than as a map from substrates to products.
     *
     * @param mols    The substrates.
     * @param reactor The reactor.
     * @param maxProducts The maximum number of products to compute before exiting.  No limit is imposed for max values
     *                    of null or 0.
     * @return A list of product sets produced by this reaction.
     * @throws IOException
     * @throws ReactionException
     */
    public List<Molecule[]> getAllProjectedProductSets(Molecule[] mols, Reactor reactor, Integer maxProducts)
            throws IOException, ReactionException {
        Map<Molecule[], List<Molecule[]>> map = getRoProjectionMap(mols, reactor, maxProducts);

        List<Molecule[]> allProductSets = new ArrayList<>();

        for (Map.Entry<Molecule[], List<Molecule[]>> entry : map.entrySet()) {
            allProductSets.addAll(entry.getValue());
        }

        return allProductSets;
    }

    public List<Molecule[]> getAllProjectedProductSets(Molecule[] mols, Reactor reactor)
            throws IOException, ReactionException {
        return getAllProjectedProductSets(mols, reactor, null);
    }

    /**
     * This function takes as input an array of molecules and a Reactor and outputs the products of the transformation.
     * The results are returned as a map from orderings of the substrates to the products produced by those orderings.
     * In most cases the map will have only one entry, but in some cases different orderings of substrates can lead to
     * different valid predictions.
     * <p>
     * Note that, for efficient two substrate expansion, the specialized method
     * fastProjectionOfTwoSubstrateRoOntoTwoMolecules should be used instead of this one.
     *
     * @param mols    An array of molecules representing the chemical reactants.
     * @param reactor A Reactor representing the reaction to apply.
     * @param maxProducts The maximum number of products to compute before exiting.  No limit is imposed for max values
     *                    of null or 0.
     * @return The substrate -> product map.
     */
    public Map<Molecule[], List<Molecule[]>> getRoProjectionMap(Molecule[] mols, Reactor reactor,
            Integer maxProducts) throws ReactionException, IOException {

        Map<Molecule[], List<Molecule[]>> resultsMap = new HashMap<>();

        if (mols.length != reactor.getReactantCount()) {
            LOGGER.debug("Tried to project RO with %d substrates on set of %d substrates",
                    reactor.getReactantCount(), mols.length);
            return resultsMap;
        }

        // If there is only one reactant, we can do just a simple reaction computation. However, if we have multiple reactants,
        // we have to use the ConcurrentReactorProcessor API since it gives us the ability to combinatorially explore all
        // possible matching combinations of reactants on the substrates of the RO.
        if (mols.length == 1) {
            List<Molecule[]> productSets = getProductsFixedOrder(reactor, mols, maxProducts);
            if (!productSets.isEmpty()) {
                resultsMap.put(mols, productSets);
            }
        } else if (useSubstructureFiltering) {
            Optional<List<Set<Integer>>> viableSubstrateIndexes;
            try {
                viableSubstrateIndexes = matchCandidatesToSubstrateStructures(reactor, mols);
            } catch (SearchException e) {
                throw new ReactionException("Caught exception when performing pre-reaction substructure search", e);
            }

            if (viableSubstrateIndexes.isPresent()) {
                List<Integer> allIndexes = new ArrayList<Integer>(mols.length) {
                    {
                        for (int i = 0; i < mols.length; i++) {
                            add(i);
                        }
                    }
                };

                int permutationIndex = 0;
                PermutationIterator<Integer> iter = new PermutationIterator<>(allIndexes);
                while (iter.hasNext()) {
                    permutationIndex++;
                    List<Integer> permutation = iter.next();
                    if (permutationFitsSubstructureMatches(permutation, viableSubstrateIndexes.get())) {
                        Molecule[] substrates = indexPermutationToMolecules(mols, permutation);
                        reactor.setReactants(substrates);
                        Molecule[] products;
                        List<Molecule[]> results = new ArrayList<>();
                        while ((products = reactor.react()) != null) {
                            results.add(products);
                            if (maxProducts != null && maxProducts > 0 && results.size() >= maxProducts) {
                                break;
                            }
                        }
                        if (results.size() > 0) {
                            resultsMap.put(substrates, results);
                        }
                    }
                }
            } // Otherwise just return the empty map.
        } else {
            // TODO: why not make one of these per ReactionProjector object?
            // TODO: replace this with Apache commons PermutationIterator for clean iteration over distinct permutations.
            ConcurrentReactorProcessor reactorProcessor = new ConcurrentReactorProcessor();
            reactorProcessor.setReactor(reactor);

            // This step is needed by ConcurrentReactorProcessor for the combinatorial exploration.
            // The iterator takes in the same substrates in each iteration step to build a matrix of combinations.
            // For example, if we have A + B -> C, the iterator creates this array: [[A,B], [A,B]]. Based on this,
            // ConcurrentReactorProcessor will cross the elements in the first index with the second, creating the combinations
            // A+A, A+B, B+A, B+B and operates all of those on the RO, which is what is desired.
            MoleculeIterator[] iterator = new MoleculeIterator[mols.length];
            for (int i = 0; i < mols.length; i++) {
                iterator[i] = MoleculeIteratorFactory.createMoleculeIterator(mols);
            }

            reactorProcessor.setReactantIterators(iterator, ConcurrentReactorProcessor.MODE_COMBINATORIAL);

            // Bag is a multi-set class that ships with Apache commons collection, and we already use many commons libs--easy!
            Bag<Molecule> originalReactantsSet = new HashBag<>(Arrays.asList(mols));

            // This set keeps track of substrate combinations we've used, and avoids repeats.  Repeats can occur
            // when several substrates are identical, and can be put in "different" but symmetric orderings.
            Set<String> substrateHashes = new HashSet<>();

            List<Molecule[]> results = null;
            int reactantCombination = 0;
            while ((results = reactorProcessor.reactNext()) != null) {
                reactantCombination++;
                Molecule[] reactants = reactorProcessor.getReactants();

                if (results.isEmpty()) {
                    LOGGER.debug("No results found for reactants combination %d, skipping", reactantCombination);
                    continue;
                }

                Bag<Molecule> thisReactantSet = new HashBag<>(Arrays.asList(reactants));
                if (!originalReactantsSet.equals(thisReactantSet)) {
                    LOGGER.debug("Reactant set %d does not represent original, complete reactant sets, skipping",
                            reactantCombination);
                    continue;
                }

                String thisHash = getStringHash(reactants);
                if (substrateHashes.contains(thisHash)) {
                    continue;
                }

                substrateHashes.add(thisHash);
                resultsMap.put(reactants, results);
                if (maxProducts != null && maxProducts > 0 && resultsMap.size() >= maxProducts) {
                    break;
                }
            }
        }
        return resultsMap;
    }

    private Optional<List<Set<Integer>>> matchCandidatesToSubstrateStructures(Reactor reactor,
            Molecule[] candidates) throws SearchException, MolExportException {

        RxnMolecule rxnMol = reactor.getReaction();
        Molecule[] substrateStructures = rxnMol.getReactants();

        MolSearch search = substrateSearcher.get();

        /* Each list in `results` is a set of viable indexes into candidates for a given substrate position in the reactor.
         * So if candidates contains three molecules and results looks like:
         *   0: 0, 1
         *   1: 1
         *   2: 0, 2
         *
         * Then candidates[0] can appear as substrate 0 or 2, candidates[1] as substrate 0 or 1, and candidates[2] only as
         * substrate 2. */
        List<Set<Integer>> results = new ArrayList<>(substrateStructures.length);

        for (int i = 0; i < substrateStructures.length; i++) {
            search.setQuery(substrateStructures[i]);
            results.add(new HashSet<>());

            /* Test each candidate molecule against each substrate structure of the reactor.  If we can't find a match,
             * there's no way that the molecule could appear in a particular position in the reaction and we can later
             * eliminate input molecule permutations that don't conform to this structure match. */
            for (int j = 0; j < candidates.length; j++) {
                search.setTarget(candidates[j]);
                // Any match will do, so just call findFirst and ensure we get some sort of results.
                int[] res = search.findFirst();
                if (res != null) {
                    results.get(i).add(j);
                }
            }

            /* If one of the substrate patterns can't be matched to any input structure, then there are no viable permutations
             * to consider.  We tell the call as much by returning `empty`. */
            if (results.get(i).size() == 0) {
                return Optional.empty();
            }
        }

        return Optional.of(results);
    }

    private Boolean permutationFitsSubstructureMatches(List<Integer> permutation,
            List<Set<Integer>> viableIndexes) {
        /* For each permutation, match its value at each location against the list of viable indexes for that location.
         * If a given permutation is 0, 2, 1 and viableIndexes looks like
         *   0: 0, 1
         *   1: 1
         *   2: 0, 2
         * as above, then then the second structure index doesn't pass a substructure search and we can skip it. */
        for (int i = 0; i < permutation.size(); i++) {
            if (!viableIndexes.get(i).contains(permutation.get(i))) {
                return false;
            }
        }
        return true;
    }

    private Molecule[] indexPermutationToMolecules(Molecule[] molecules, List<Integer> permutation) {
        Molecule[] results = new Molecule[molecules.length];
        for (int i = 0; i < permutation.size(); i++) {
            results[i] = molecules[permutation.get(i)];
        }
        return results;
    }

    public Map<Molecule[], List<Molecule[]>> getRoProjectionMap(Molecule[] mols, Reactor reactor)
            throws ReactionException, IOException {
        return getRoProjectionMap(mols, reactor, null);
    }

    /**
     * This function projects all possible combinations of two input substrates onto a 2 substrate RO, then
     * cleans and returns the results of that projection.
     * TODO: Expand this class to handle 3 or 4 substrate reactions.
     *
     * @param mols    Substrate molecules
     * @param reactor The two substrate reactor
     * @return A list of product arrays, where each array represents the products of a given reaction combination.
     * @throws ReactionException
     * @throws IOException
     */
    public Map<Molecule[], List<Molecule[]>> fastProjectionOfTwoSubstrateRoOntoTwoMolecules(Molecule[] mols,
            Reactor reactor) throws ReactionException, IOException {
        Map<Molecule[], List<Molecule[]>> results = new HashMap<>();

        Molecule[] firstCombinationOfSubstrates = new Molecule[] { mols[0], mols[1] };
        List<Molecule[]> productSets = getProductsFixedOrder(reactor, firstCombinationOfSubstrates);
        if (!productSets.isEmpty()) {
            results.put(firstCombinationOfSubstrates, productSets);
        }

        // Second ordering is same if two molecules are equal.
        if (getMoleculeString(mols[0]).equals(getMoleculeString(mols[1]))) {
            return results;
        }

        Molecule[] secondCombinationOfSubstrates = new Molecule[] { mols[1], mols[0] };
        productSets = getProductsFixedOrder(reactor, firstCombinationOfSubstrates);
        if (!productSets.isEmpty()) {
            results.put(secondCombinationOfSubstrates, productSets);
        }

        return results;
    }

    /**
     * Gets all product sets that a Reactor produces when applies to a given array of substrates, in only the order
     * that they are already in.
     *
     * @param reactor    The Reactor.
     * @param substrates The substrates.
     * @param maxProducts The maximum number of products to compute before exiting.  No limit is imposed for max values
     *                    of null or 0.
     * @return A list of product arrays returned by the Reactor.
     * @throws ReactionException
     */
    public List<Molecule[]> getProductsFixedOrder(Reactor reactor, Molecule[] substrates, Integer maxProducts)
            throws ReactionException {

        reactor.setReactants(substrates);
        List<Molecule[]> results = new ArrayList<>();

        Molecule[] products;
        while ((products = reactor.react()) != null) {
            results.add(products);
            if (maxProducts != null && maxProducts > 0 && results.size() >= maxProducts) {
                break;
            }
        }

        return results;
    }

    public List<Molecule[]> getProductsFixedOrder(Reactor reactor, Molecule[] substrates) throws ReactionException {
        return getProductsFixedOrder(reactor, substrates, null);
    }

    /**
     * Builds a string which will be identical for two molecule arrays which represent the same molecules
     * in the same order, and different otherwise.
     *
     * @param mols The array of molecules.
     * @return The string representation.
     * @throws IOException
     */
    private String getStringHash(Molecule[] mols) throws IOException {
        StringBuilder builder = new StringBuilder();
        for (Molecule molecule : mols) {
            builder.append(getMoleculeString(molecule));
        }
        return builder.toString();
    }

    /**
     * Gets a a string of the molecule.  The exporter takes care of all caching so we don't need to worry about it
     *
     * @param molecule The molecule.
     * @return The molecule's string presentation in the format that this class declares..
     * @throws IOException
     */
    private String getMoleculeString(Molecule molecule) throws IOException {
        return MoleculeExporter.exportMolecule(molecule, MoleculeFormat$.MODULE$.getName(this.moleculeFormat));
    }
}