RGroupQueryReader.java :  » Science » Chemistry-Development-Kit-1.3.5 » org » openscience » cdk » io » Java Open Source

Java Open Source » Science » Chemistry Development Kit 1.3.5 
Chemistry Development Kit 1.3.5 » org » openscience » cdk » io » RGroupQueryReader.java
/*
 * Copyright (C) 2010  Mark Rijnbeek <mark_rynbeek@users.sf.net>
 *
 * Contact: cdk-devel@lists.sourceforge.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may
 * distribute with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 */
package org.openscience.cdk.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.RGroupQueryFormat;
import org.openscience.cdk.isomorphism.matchers.IRGroupQuery;
import org.openscience.cdk.isomorphism.matchers.RGroup;
import org.openscience.cdk.isomorphism.matchers.RGroupList;
import org.openscience.cdk.isomorphism.matchers.RGroupQuery;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

/**
 * A reader for Symyx' Rgroup files (RGFiles).
 * An RGfile describes a single molecular query with Rgroups.
 * Each RGfile is a combination of Ctabs defining the root molecule and each
 * member of each Rgroup in the query.
 *
 * <p>The RGFile format is described in the manual
 * <a href="http://www.symyx.com/downloads/public/ctfile/ctfile.pdf">
 * "CTFile Formats"</a>, Chapter 5.
 *
 * @cdk.module io
 * @cdk.githash
 *
 * @cdk.keyword Rgroup
 * @cdk.keyword R group
 * @cdk.keyword R-group
 * @author Mark Rijnbeek
 */
@TestClass("org.openscience.cdk.io.RGroupQueryReaderTest")
public class RGroupQueryReader extends DefaultChemObjectReader {

    /**
     * Private bean style class to capture LOG (logic) lines.
     */
    private class RGroupLogic {
        int rgoupNumberRequired;
        boolean restH;
        String occurence;
    }

    // Buffered source the RGFile lines are read from; (re)assigned by the
    // constructors and by setReader(). Package-private (no access modifier).
    BufferedReader input = null;
    // Logging tool shared by all instances of this reader.
    private static ILoggingTool logger = LoggingToolFactory.createLoggingTool(RGroupQueryReader.class);

    /**
     * Default constructor. Delegates to {@link #RGroupQueryReader(Reader)}
     * with a reader over the empty string, so no real input is available
     * until one of the <code>setReader</code> methods is called.
     */
    public RGroupQueryReader() {
        this(new StringReader(""));
    }

    /**
     * Constructs a new RGroupQueryReader that reads an RGFile from the
     * given InputStream. The stream is wrapped in an
     * {@link InputStreamReader} created without an explicit charset.
     * @param in The InputStream to read from.
     */
    public RGroupQueryReader(InputStream in) {
        this(new InputStreamReader(in));
    }

    /**
     * Constructs a new RGroupQueryReader that reads an RGFile from the
     * given Reader. The reader is always wrapped in a new
     * {@link BufferedReader}, which is kept as this reader's input.
     * @param  in  The Reader to read from.
     */
    public RGroupQueryReader(Reader in) {
        input = new BufferedReader(in);
    }


    /**
     * Replaces the input this reader parses from.
     * A {@link BufferedReader} is used as-is; any other Reader is wrapped
     * in a new BufferedReader first.
     * @param input Reader object supplying the RGFile text
     * @throws CDKException declared by the signature; not thrown by this implementation
     */
    @TestMethod("testSetReader_Reader")
    public void setReader(Reader input) throws CDKException {
        this.input = (input instanceof BufferedReader)
            ? (BufferedReader) input
            : new BufferedReader(input);
    }

    /**
     * Sets the input from an InputStream by wrapping it in an
     * {@link InputStreamReader} (no explicit charset) and delegating to
     * {@link #setReader(Reader)}.
     * @param input InputStream supplying the RGFile text
     * @throws CDKException declared by the signature, as on the delegate
     */
    @TestMethod("testSetReader_InputStream")
    public void setReader(InputStream input) throws CDKException {
        setReader(new InputStreamReader(input));
    }

    /**
     * Returns the resource format handled by this reader.
     * @return the shared {@link RGroupQueryFormat} instance
     */
    @TestMethod("testGetFormat")
    public IResourceFormat getFormat() {
        return RGroupQueryFormat.getInstance();
    }

    /**
     * Tells whether the given class is an {@link IRGroupQuery} type.
     *
     * <p>Uses {@link Class#isAssignableFrom(Class)}, so the answer is
     * <code>true</code> for <code>IRGroupQuery</code> itself, for classes
     * implementing it directly or through a superclass, and for types that
     * inherit it via another interface. Walking only the directly declared
     * interfaces of each superclass misses the first and last of these.
     *
     * @param classObject class to test; <code>null</code> is answered with <code>false</code>
     * @return <code>true</code> if instances of the class are IRGroupQuery objects
     */
    @TestMethod("testAccepts")
    public boolean accepts(Class classObject) {
        if (classObject == null) {
            return false;
        }
        return IRGroupQuery.class.isAssignableFrom(classObject);
    }

    /**
     * Closes the underlying buffered input.
     * @throws IOException if closing the input fails
     */
    @TestMethod("testClose")
    public void close() throws IOException {
        input.close();
    }

    /**
     * Entry point for reading: validates the type of the supplied object
     * and, when it is an RGroupQuery, fills it from the RGFile input.
     * @param object container to populate; must be an instance of RGroupQuery
     * @return the populated RGroupQuery
     * @throws CDKException if the object is of any other type, or if parsing fails
     */
    public IChemObject read(IChemObject object) throws CDKException {
        if (!(object instanceof RGroupQuery)) {
            throw new CDKException
               ("Reader only supports "+RGroupQuery.class.getName()+" objects");
        }
        return parseRGFile((RGroupQuery)object);
    }


    /**
     * Parses the RGFile available on the current input into the given query.
     * Uses {@link org.openscience.cdk.io.MDLV2000Reader} to parse the
     * individual $CTAB blocks (the root structure and every Rgroup member).
     *
     * <p>Processing order:
     * <ol>
     * <li>header block ($MDL, $MOL, $HDR, three header lines, $END HDR);</li>
     * <li>root $CTAB block, collecting "M  LOG" lines on the way;</li>
     * <li>attachment order of the root's R# atoms, from "M  AAL" lines or,
     *     when absent, from the order of the atom block;</li>
     * <li>every $RGP block with its $CTAB members and their "M  APO" lines,
     *     until $END MOL.</li>
     * </ol>
     *
     * <p>A premature end of input, or a $CTAB block that is never closed by
     * "$END CTAB", is reported as a CDKException naming the line number.
     *
     * @param rGroupQuery empty query to populate
     * @return the same query instance, populated
     * @throws CDKException if the input violates the RGFile layout or any
     *         other error occurs while parsing (the cause is attached)
     */
    private RGroupQuery parseRGFile(RGroupQuery rGroupQuery) throws CDKException {
        String line = "";
        int lineCount = 0;
        String eol = System.getProperty("line.separator");
        StringTokenizer strTk = null;
        /* Variable to capture the LOG line(s), keyed by Rgroup number */
        Map<Integer, RGroupLogic> logicDefinitions = new HashMap<Integer, RGroupLogic>();

        /* Variable to capture the attachment order for Rgroups.
         * Contains:
         *    - pseudo atom (Rgroup)
         *    - map with (integer,bond) meaning "bond" has attachment
         *      order "integer" (1,2,3) for the Rgroup
         *  The order is based on the atom block, unless there is an AAL line
         *  for the pseudo atom.
         */
        Map<IAtom, Map<Integer, IBond>> attachmentPoints = new HashMap<IAtom, Map<Integer, IBond>>();

        try {
            // Process the Header block_________________________________________
            //__________________________________________________________________
            logger.info("Process the Header block");
            checkLineBeginsWith(input.readLine(), "$MDL", ++lineCount);
            checkLineBeginsWith(input.readLine(), "$MOL", ++lineCount);
            checkLineBeginsWith(input.readLine(), "$HDR", ++lineCount);

            for (int i = 1; i <= 3; i++) {
                lineCount++;
                if (input.readLine() == null) {
                    throw new CDKException("RGFile invalid, empty/null header line at #" + lineCount);
                }
                //optional: parse header info here (not implemented)
            }
            checkLineBeginsWith(input.readLine(), "$END HDR", ++lineCount);


            //Process the root structure (scaffold)_____________________________
            //__________________________________________________________________
            logger.info("Process the root structure (scaffold)");
            checkLineBeginsWith(input.readLine(), "$CTAB", ++lineCount);
            //Force header: a label line plus two empty lines precede the block
            StringBuilder sb = new StringBuilder(RGroup.ROOT_LABEL+"\n\n\n");
            line = input.readLine();
            ++lineCount;
            // startsWith (not equals) so the terminator is recognised the same
            // way as in the Rgroup $CTAB loop further down.
            while (line != null && !line.startsWith("$END CTAB")) {
                sb.append(line).append(eol);

                //LOG lines: Logic, Unsatisfied Sites, Range of Occurrence.
                if (line.startsWith("M  LOG")) {
                    strTk = new StringTokenizer(line);
                    // Skip the three leading tokens ("M", "LOG" and the token after it).
                    strTk.nextToken();
                    strTk.nextToken();
                    strTk.nextToken();

                    RGroupLogic log = new RGroupLogic();
                    int rgroupNumber = Integer.parseInt(strTk.nextToken());
                    log.rgoupNumberRequired = Integer.parseInt(strTk.nextToken());
                    log.restH = "1".equals(strTk.nextToken());
                    // Whatever remains is the occurrence expression; glue the
                    // tokens together so embedded whitespace is dropped.
                    StringBuilder occurrence = new StringBuilder();
                    while (strTk.hasMoreTokens()) {
                        occurrence.append(strTk.nextToken());
                    }
                    log.occurence = occurrence.toString();
                    logicDefinitions.put(rgroupNumber, log);
                }

                line = input.readLine();
                ++lineCount;
            }
            if (line == null) {
                throw new CDKException("RGFile invalid, root $CTAB block not closed by $END CTAB before end of input at #" + lineCount);
            }
            String rootStr = sb.toString();

            //Let MDL reader process $CTAB block of the root structure.
            MDLV2000Reader reader = new MDLV2000Reader(new StringReader(rootStr), ISimpleChemObjectReader.Mode.STRICT);
            IMolecule root = (IMolecule)reader.read(rGroupQuery.getBuilder().newInstance(IMolecule.class));
            rGroupQuery.setRootStructure(root);
            // Atoms addressable by the atom numbers used in AAL lines.
            // NOTE(review): the numbers are used as list indexes unchanged, so
            // this presumes getAtomsByLinePosition() is 1-based - confirm.
            List<IAtom> atomsByLinePosition = reader.getAtomsByLinePosition();

            //Atom attachment order: parse AAL lines first
            strTk = new StringTokenizer(rootStr, eol);
            while (strTk.hasMoreTokens()) {
                line = strTk.nextToken();
                if (line.startsWith("M  AAL")) {
                    StringTokenizer stAAL = new StringTokenizer(line);
                    stAAL.nextToken();
                    stAAL.nextToken();
                    int pos = Integer.parseInt(stAAL.nextToken());
                    IAtom rGroup = atomsByLinePosition.get(pos);
                    stAAL.nextToken();
                    Map<Integer, IBond> bondMap = new HashMap<Integer, IBond>();
                    // Remaining tokens come in pairs: neighbour atom, attachment order.
                    while (stAAL.hasMoreTokens()) {
                        pos = Integer.parseInt(stAAL.nextToken());
                        IAtom partner = atomsByLinePosition.get(pos);
                        IBond bond = root.getBond(rGroup, partner);
                        int order = Integer.parseInt(stAAL.nextToken());
                        bondMap.put(order, bond);
                        logger.info("AAL " + order + " " + ((IPseudoAtom)rGroup).getLabel() +
                                           "-" + partner.getSymbol());
                    }
                    if (!bondMap.isEmpty()) {
                        attachmentPoints.put(rGroup, bondMap);
                    }
                }
            }
            //Deal with remaining attachment points (non AAL)
            for (IAtom atom : root.atoms()) {
                if (atom instanceof IPseudoAtom) {
                    IPseudoAtom rGroup = (IPseudoAtom)atom;
                    if (rGroup.getLabel().startsWith("R") &&
                        !rGroup.getLabel().equals("R") && // only numbered ones
                        !attachmentPoints.containsKey(rGroup)) {
                        //Order reflects the order of atoms in the Atom Block
                        int order = 0;
                        Map<Integer, IBond> bondMap = new HashMap<Integer, IBond>();
                        for (IAtom atom2 : atomsByLinePosition) {
                            if (!atom.equals(atom2)) {
                                for (IBond bond : root.bonds()) {
                                    if (bond.contains(atom) && bond.contains(atom2)) {
                                        bondMap.put(++order, bond);
                                        logger.info("Def " + order + " " + rGroup.getLabel() + "-" +
                                                           atom2.getSymbol());
                                        break;
                                    }
                                }
                            }
                        }
                        if (!bondMap.isEmpty()) {
                            attachmentPoints.put(rGroup, bondMap);
                        }
                    }
                }
            }
            //Done with attachment points
            rGroupQuery.setRootAttachmentPoints(attachmentPoints);
            logger.info("Attachm.points defined for " + attachmentPoints.size() + " R# atoms");


            //Process each Rgroup's $CTAB block(s)_____________________________
            //__________________________________________________________________

            //Set up the RgroupLists, one for each unique R# (# = 1..32 max)
            Map<Integer,RGroupList> rGroupDefinitions = new HashMap<Integer,RGroupList>();

            for (IAtom atom : root.atoms()) {
                if (atom instanceof IPseudoAtom) {
                    IPseudoAtom rGroup = (IPseudoAtom)atom;
                    if (RGroupQuery.isValidRgroupQueryLabel(rGroup.getLabel())) {
                        int rgroupNum = Integer.parseInt(rGroup.getLabel().substring(1));
                        // Only the first atom carrying a given R# defines the list.
                        if (!rGroupDefinitions.containsKey(rgroupNum)) {
                            logger.info("Define Rgroup R" + rgroupNum);
                            RGroupList rgroupList = new RGroupList(rgroupNum);
                            RGroupLogic logic = logicDefinitions.get(rgroupNum);
                            if (logic != null) {
                                rgroupList.setRestH(logic.restH);
                                rgroupList.setOccurrence(logic.occurence);
                                rgroupList.setRequiredRGroupNumber(logic.rgoupNumberRequired);
                            } else {
                                // No LOG line for this R#: fall back to these values.
                                rgroupList.setRestH(false);
                                rgroupList.setOccurrence(">0");
                                rgroupList.setRequiredRGroupNumber(0);
                            }
                            rgroupList.setRGroups(new ArrayList<RGroup>());
                            rGroupDefinitions.put(rgroupNum, rgroupList);
                        }
                    }
                }
            }

            //Parse all $CTAB blocks per Rgroup (there can be more than one)
            line = input.readLine();
            ++lineCount;
            boolean hasMoreRGP = true;
            while (hasMoreRGP) {

                checkLineBeginsWith(line, "$RGP", lineCount);
                line = input.readLine();
                ++lineCount;
                logger.info("line for num is " + line);
                if (line == null) {
                    throw new CDKException("RGFile invalid, Rgroup number expected after $RGP but end of input reached at #" + lineCount);
                }
                int rgroupNum = Integer.parseInt(line.trim());
                line = input.readLine();
                ++lineCount;

                boolean hasMoreCTAB = true;
                while (hasMoreCTAB) {

                    checkLineBeginsWith(line, "$CTAB", lineCount);
                    sb = new StringBuilder(RGroup.makeLabel(rgroupNum)+"\n\n\n");
                    line = input.readLine();
                    ++lineCount; // keep the counter in step with every line read
                    while (line != null && !line.startsWith("$END CTAB")) {
                        sb.append(line).append(eol);
                        line = input.readLine();
                        ++lineCount;
                    }
                    if (line == null) {
                        throw new CDKException("RGFile invalid, $CTAB block of R" + rgroupNum
                                + " not closed by $END CTAB before end of input at #" + lineCount);
                    }
                    String groupStr = sb.toString();
                    reader = new MDLV2000Reader
                        (new StringReader(groupStr), ISimpleChemObjectReader.Mode.STRICT);
                    IMolecule group = (IMolecule)reader.read(rGroupQuery.getBuilder().newInstance(IMolecule.class));
                    atomsByLinePosition = reader.getAtomsByLinePosition();
                    RGroup rGroup = new RGroup();
                    rGroup.setGroup(group);

                    //Parse the Rgroup's attachment points (APO)
                    strTk = new StringTokenizer(groupStr, eol);
                    while (strTk.hasMoreTokens()) {
                        line = strTk.nextToken();
                        if (line.startsWith("M  APO")) {
                            StringTokenizer stAPO = new StringTokenizer(line);
                            stAPO.nextToken();
                            stAPO.nextToken();
                            stAPO.nextToken();
                            // Remaining tokens come in pairs: atom number, attachment code.
                            while (stAPO.hasMoreTokens()) {
                                int pos = Integer.parseInt(stAPO.nextToken());
                                int apo = Integer.parseInt(stAPO.nextToken());
                                IAtom at = atomsByLinePosition.get(pos);
                                switch (apo) {
                                    case 1:
                                        rGroup.setFirstAttachmentPoint(at);
                                        break;
                                    case 2:
                                        rGroup.setSecondAttachmentPoint(at);
                                        break;
                                    case 3:
                                        // Code 3: the atom serves as both attachment points.
                                        rGroup.setFirstAttachmentPoint(at);
                                        rGroup.setSecondAttachmentPoint(at);
                                        break;
                                }
                            }
                        }
                    }
                    RGroupList rList = rGroupDefinitions.get(rgroupNum);
                    if (rList == null) {
                        throw new CDKException("R"+rgroupNum+" not defined but referenced in $RGP.");
                    }
                    rList.getRGroups().add(rGroup);

                    // Next line either opens another $CTAB member or closes the $RGP block.
                    line = input.readLine();
                    ++lineCount;
                    if (line == null) {
                        throw new CDKException("RGFile invalid, $CTAB or $END RGP expected but end of input reached at #" + lineCount);
                    }
                    if (line.startsWith("$END RGP")) {
                        logger.info("end of RGP block");
                        hasMoreCTAB = false;
                    }
                }

                // Next line either opens another $RGP block or ends the file.
                line = input.readLine();
                ++lineCount;
                if (line == null) {
                    throw new CDKException("RGFile invalid, $RGP or $END MOL expected but end of input reached at #" + lineCount);
                }
                if (line.startsWith("$END MOL")) {
                    hasMoreRGP = false;
                }
            }

            rGroupQuery.setRGroupDefinitions(rGroupDefinitions);
            logger.info("Number of lines was " + lineCount);
            return rGroupQuery;

        } catch (CDKException exception) {
            String error = "CDK Error while parsing line " + lineCount + ": " + line + " -> " + exception.getMessage();
            logger.error(error);
            logger.debug(exception);
            throw exception;
        } catch (Exception exception) {
            // The exception goes to the logger and travels on as the cause of
            // the CDKException, so nothing is printed to stderr here.
            String error =
                exception.getClass() + " Error while parsing line " + lineCount + ": " + line + " -> " + exception.getMessage();
            logger.error(error);
            logger.debug(exception);
            throw new CDKException(error, exception);
        }
    }

    /**
     * Verifies that a line read from the RGFile is present and starts with
     * the text the format requires at that position.
     * @param line      the line that was read, or null at end of input
     * @param expect    the prefix the line has to start with
     * @param lineCount number of the line, used in the error message only
     * @throws CDKException if the line is null or does not start with the prefix
     */
    private void checkLineBeginsWith(String line, String expect, int lineCount) throws CDKException {
        String failure = null;
        if (line == null) {
            failure = "RGFile invalid, empty/null line at #" + lineCount;
        } else if (!line.startsWith(expect)) {
            failure = "RGFile invalid, line #" + lineCount + " should start with:" + expect + ".";
        }
        if (failure != null) {
            throw new CDKException(failure);
        }
    }


}


java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.