org.eurocarbdb.util.carbbank.CarbbankManager.java Source code

Java tutorial

Introduction

Here is the source code for org.eurocarbdb.util.carbbank.CarbbankManager.java

Source

/*
*   EuroCarbDB, a framework for carbohydrate bioinformatics
*
*   Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
*   indicated by the @author tags or express copyright attribution
*   statements applied by the authors.  
*
*   This copyrighted material is made available to anyone wishing to use, modify,
*   copy, or redistribute it subject to the terms and conditions of the GNU
*   Lesser General Public License, as published by the Free Software Foundation.
*   A copy of this license accompanies this distribution in the file LICENSE.txt.
*
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
*   for more details.
*
*   Last commit: $Rev: 1870 $ by $Author: david@nixbioinf.org $ on $Date:: 2010-02-23 #$  
*/

package org.eurocarbdb.util.carbbank;

//  stdlib imports
import java.util.*;
import java.io.*;

//  3rd party imports
import org.apache.log4j.Logger;

import org.hibernate.Session;
import org.hibernate.EntityMode;

import org.dom4j.Element;
import org.dom4j.io.XMLWriter;
import org.dom4j.io.OutputFormat;

//  eurocarb imports
import org.eurocarbdb.dataaccess.Eurocarb;
import org.eurocarbdb.dataaccess.EntityManager;
import org.eurocarbdb.dataaccess.HibernateEntityManager;

import org.eurocarbdb.dataaccess.core.Reference;
import org.eurocarbdb.dataaccess.core.JournalReference;
import org.eurocarbdb.dataaccess.core.Contributor;
import org.eurocarbdb.dataaccess.core.GlycanSequence;
import org.eurocarbdb.dataaccess.core.BiologicalContext;
import org.eurocarbdb.dataaccess.core.Disease;
import org.eurocarbdb.dataaccess.exception.*;

import org.eurocarbdb.util.carbbank.CarbbankParser;
import org.eurocarbdb.util.carbbank.CarbbankRecord;

// import org.eurocarbdb.dataaccess.hibernate.HibernateUtil;

//  static imports
import static org.eurocarbdb.util.StringUtils.join;
import static org.eurocarbdb.dataaccess.Eurocarb.getEntityManager;

/*  class CarbbankManager  */
/**
 *   A data loader and unloader for Carbbank structures, including
 *   sequence, reference &amp; biological context information.
 *
 *   @see      org.eurocarbdb.util.carbbank.CarbbankParser
 *   @see      org.eurocarbdb.util.carbbank.CarbbankRecord
 *   @author   mjh
 *   @version  $Rev: 1870 $
 */
public class CarbbankManager {

    //~~~~~~~~~~~~~~~~~~~~~~~~~~ FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~//

    /** Log4j handle for this class. */
    static final Logger log = Logger.getLogger(CarbbankManager.class);

    /** Upper bound on the number of problem records tolerated; once 
    *   {@link #records_with_errors} exceeds it the load is aborted.  */
    private static final int ERROR_TOLERANCE = 100000;

    /** Source of raw Carbbank records; stays null until it is opened 
    *   or set explicitly. @see #getInputStream */
    private InputStream instream;

    /** Sink for Carbbank records that could not be loaded; stays null until 
    *   it is opened or set explicitly. @see #getOutputStreamErrorSequences */
    private PrintStream outstreamErrorSequences;

    /** The contributor under which Carbbank structures are loaded 
    *   (or unloaded); holds the result of the most recent lookup. 
    *   @see #getCarbbankContributor  */
    private static Contributor carbbankContributor;

    /** Parser used to turn the raw input into Carbbank records. */
    private CarbbankParser parser = new CarbbankParser();

    /** Number of entries that may still be loaded. A negative 
    *   value means there is no limit. */
    private int loadLimit = -1;

    /** Index of the first record that is fully processed; records 
    *   with a lower index are skipped. For example, firstRecord=10 
    *   means the first record loaded will be record 10. */
    private int firstRecord = 1;

    /** Name of the named query for fetching all Carbbank structures. */
    private static final String QUERY_GET_ALL_CARBBANK_STRUCTURES = "org.eurocarbdb.dataaccess.core.Contributor.GET_ALL_CARBBANK_STRUCTURES";

    /** Running total of records that were written to the error stream. */
    private int records_with_errors;

    /** How many records to process between transaction commits. */
    private int save_after = 25;

    //~~~~~~~~~~~~~~~~~~~~~~~~~ METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~~//    

    /**
    *   Gives access to the {@link CarbbankParser} owned by this manager, 
    *   which may be used for parsing a raw Carbbank file.
    *   @return this manager's parser instance
    */
    public CarbbankParser getCarbbankParser() {
        final CarbbankParser owned = this.parser;
        assert owned != null;
        return owned;
    }

    /** 
    *   Returns an {@link InputStream} to a Carbbank raw data file. If no 
    *   stream has been set or opened yet, the file named by the Eurocarb 
    *   property 'carbbank.raw.file' is opened and remembered for 
    *   subsequent calls.
    *   @return the input stream, or null if the property is not configured 
    *           or the file it names cannot be opened (a warning is logged 
    *           in both cases)
    *   @see Eurocarb#getProperty
    */
    public InputStream getInputStream() {
        if (instream != null) {
            return instream;
        }

        String filename = Eurocarb.getProperty("carbbank.raw.file");

        //  a missing property would otherwise surface as a bare 
        //  NullPointerException from the FileInputStream constructor.
        if (filename == null) {
            log.warn("There is no value for property 'carbbank.raw.file' configured! "
                    + "Cannot open a Carbbank file");
            return null;
        }

        log.info("opening local Carbbank file '" + filename + "'");
        try {
            instream = new FileInputStream(filename);
        } catch (FileNotFoundException e) {
            log.warn("Couldn't open file '" + filename + "': " + e);
            return null;
        }

        return instream;
    }

    /**
    *   Sets the index of the first record to be fully processed.
    *   Negative values are treated as 0.
    */
    public void setFirstRecord(int index) {
        firstRecord = Math.max(index, 0);
    }

    /** 
    *   Supplies the {@link InputStream} from which raw Carbbank 
    *   data will be read, replacing any stream held previously.
    *   @param in the stream to read from; must not be null
    *   @see #parseAndLoadCarbbank
    */
    public void setInputStream(InputStream in) {
        assert in != null;
        this.instream = in;
    }

    /**
    *   Returns the {@link PrintStream} that will be used to output Carbbank
    *   records that produce errors. If not set explicitly by 
    *   {@link #setOutputStreamErrorSequences} then a buffered stream to 
    *   the file named by the Eurocarb property 
    *   <tt>'carbbank.errors.file'</tt> is opened on first use and 
    *   remembered for subsequent calls.
    *   @see Eurocarb#getProperty
    *   @throws DataAccessException if method cannot open file for writing
    */
    public PrintStream getOutputStreamErrorSequences() throws DataAccessException {
        if (outstreamErrorSequences != null) {
            return outstreamErrorSequences;
        }

        String filename = Eurocarb.getProperty("carbbank.errors.file");

        if (log.isDebugEnabled())
            log.debug("creating cache file '" + filename + "'");

        try {
            outstreamErrorSequences = new PrintStream(new BufferedOutputStream(new FileOutputStream(filename)));
        } catch (Exception e) {
            //  the field is still null here, as the assignment above never completed
            String msg = "Caught exception while trying to open file '" + filename + "' for writing: " + e;

            //  pass the exception itself so that its stack trace reaches the log
            log.warn(msg, e);

            throw new DataAccessException(msg);
        }

        return outstreamErrorSequences;
    }

    /** 
    *   Supplies the {@link PrintStream} to which Carbbank records that 
    *   could not be loaded are written, replacing any stream held previously.
    *   @param out the stream to write to; must not be null
    *   @see #getOutputStreamErrorSequences
    */
    public void setOutputStreamErrorSequences(PrintStream out) {
        assert out != null;
        this.outstreamErrorSequences = out;
    }

    /**
    *   Looks up the canonical "Carbbank" contributor, creating and 
    *   storing it if the current data store does not have one yet. 
    *   The contributor name is taken from the Eurocarb property 
    *   'carbbank.contributor.name'; when that property has no value 
    *   the name 'Carbbank' is used instead.
    *   <p>
    *   The lookup is performed on every call; the result is also 
    *   recorded in a static field of this class.
    *   @return the Carbbank contributor
    */
    public static Contributor getCarbbankContributor() {
        final String property = "carbbank.contributor.name";

        String name = Eurocarb.getProperty(property);
        if (name == null) {
            log.warn("There is no value for property '" + property
                    + "' configured! Using last-resort value of 'Carbbank'");
            name = "Carbbank";
        }

        if (log.isDebugEnabled()) {
            log.debug("Looking up the canonical Carbbank contributor (contributor name '" + name + "')");
        }

        carbbankContributor = Contributor.lookupExactName(name);
        if (carbbankContributor != null) {
            return carbbankContributor;
        }

        //  not in the data store yet: create it
        log.debug("A Carbbank contributor could not be found in the current data store, creating it");

        final Contributor created = new Contributor();
        carbbankContributor = created;
        created.setContributorName(name);

        getEntityManager().store(created);

        if (log.isDebugEnabled()) {
            log.debug("Carbbank contributor with name '" + created.getContributorName() + "', id '"
                    + created.getContributorId() + "' successfully added to the data store");
        }

        return created;
    }

    /** 
    *   Limits how many carbbank entries the {@link #parseAndLoadCarbbank} 
    *   method will load. A value below zero means 'load all'.
    *   @param nmb_of_entries the maximum number of entries to load
    */
    public void setLoadLimit(int nmb_of_entries) {
        this.loadLimit = nmb_of_entries;
    }

    /** 
    *   Parses Carbbank records from the stream given by 
    *   {@link #getInputStream} and saves each usable record (sequence, 
    *   references and biological contexts) to the data store under the 
    *   Carbbank contributor. See also {@link #loadCarbank}.
    *   <p>
    *   Records are skipped when their index is below the configured first 
    *   record or when they already exist in the data store. Records whose 
    *   sequence or journal reference cannot be obtained, or which fail to 
    *   save, are written to the error stream via {@link #logErrorRecord}. 
    *   Progress is committed via {@link #periodicSaveProgress} after every 
    *   <tt>save_after</tt> records that reached the save step. 
    *   <p>
    *   Note that the load limit set by {@link #setLoadLimit} is consumed 
    *   by this method: it is decremented for every record that reaches the 
    *   save step, and loading stops when it reaches zero.
    *
    *   @return number of carbbank entries parsed (including skipped ones)
    *   @throws IOException if no input stream is available 
    *   @throws DataAccessException if too many records produce errors
    */
    public int parseAndLoadCarbbank() throws IOException, DataAccessException {
        if (firstRecord < 0) {
            log.info("Nothing to do!");
            return 0;
        }

        //  fail early and explicitly rather than handing the parser a null stream
        InputStream in = getInputStream();
        if (in == null) {
            throw new IOException("No Carbbank input stream available: set one explicitly "
                    + "or check the 'carbbank.raw.file' property");
        }

        //  all entries parsed will be added to eurocarb db under this contributor.
        Contributor c = getCarbbankContributor();
        assert c != null;

        parser.setInputStream(in);

        int count = 0;

        //  records that reached the save step since the last commit
        int pendingSaves = 0;

        try {
            while (true) {
                CarbbankRecord r = parser.parse();
                if (r == null)
                    break;
                count++;

                //  skip records until we reach the first record specified by firstRecord            
                if (count < firstRecord) {
                    if (log.isDebugEnabled())
                        log.debug("skipping record " + count + "(<" + firstRecord + ")...");
                    continue;
                }

                //  stop parsing if we've loaded more than loadLimit records.
                if (loadLimit == 0) {
                    log.debug("Load limit reached, stopping...");
                    break;
                }

                //  check seq has not already been added to DB
                if (recordAlreadySaved(r)) {
                    log.debug("record already exists in DB, skipping...");
                    continue;
                }

                //  otherwise process records as usual.
                //  skip records with unparseable sequences
                GlycanSequence gs = null;
                try {
                    gs = r.getGlycanSequence();

                    if (gs == null)
                        throw new RuntimeException("GlycanSequence returned null");

                    if (gs.getSequenceCt() == null)
                        throw new RuntimeException("GlycanSequence returned a null Glycoct sequence");
                } catch (Exception ex) {
                    logErrorRecord(r, ex, "Sequence unparseable");
                    continue;
                }

                //  get references for entry
                JournalReference jref = r.getJournalReference();
                if (jref == null) {
                    logErrorRecord(r, null, "Couldn't get a valid JournalReference");
                    continue;
                }

                Reference ref = r.getEntryReference();
                assert ref != null;

                //  heavyweight biological context lookup
                List<BiologicalContext> bcs = r.getContexts();

                //  everythings ok so far, set associations between
                //  objects and then save them all
                jref.setContributor(c);
                ref.setContributor(c);
                gs.setContributor(c);

                gs.addReference(ref);
                gs.addReference(jref);

                for (BiologicalContext bc : bcs) {
                    bc.addContributor(c, "");
                    gs.addBiologicalContext(bc);
                }

                //  save the whole object graph
                log.debug("attempting to save carbbank record...");

                try {
                    //  always go through the current entity manager, as 
                    //  periodicSaveProgress() does, instead of mixing it 
                    //  with a handle obtained before the loop started
                    getEntityManager().update(jref);
                    getEntityManager().update(ref);
                    getEntityManager().update(gs);

                    log.debug("record was saved successfully");
                } catch (Exception ex) {
                    log.warn("record not saved: " + ex.getMessage());
                    logErrorRecord(r, ex, "caught exception while trying to save");
                }

                loadLimit--;

                //  commit on the number of records actually processed, not on 
                //  the raw record index: skipped records never reach this point, 
                //  so an index-based check could miss its commit points.
                pendingSaves++;
                if (pendingSaves >= save_after) {
                    periodicSaveProgress();
                    pendingSaves = 0;
                }

            } // end while
        } finally {
            //  the error stream may be buffered; make sure logged records are 
            //  written out even if the load is aborted. Only touch the stream 
            //  if it already exists, so that no error file is created needlessly.
            if (outstreamErrorSequences != null) {
                outstreamErrorSequences.flush();
            }
        }

        if (log.isInfoEnabled()) {
            log.info("Parsed " + count + " records, " + records_with_errors + " load error(s)");
        }

        return count;
    }

    /** 
    *   Ends the current unit of work of the entity manager and 
    *   immediately begins a new one. 
    */
    protected void periodicSaveProgress() {
        log.info("saving progress...");

        //  the entity manager is looked up afresh for each call
        Eurocarb.getEntityManager().endUnitOfWork();
        Eurocarb.getEntityManager().beginUnitOfWork();
    }

    /** 
    *   Tells whether a sequence with the Carbbank id of the given 
    *   record can already be found in the DB.
    *   @param r the record to check; must not be null
    *   @return true if a matching sequence exists, false otherwise
    */
    protected boolean recordAlreadySaved(CarbbankRecord r) {
        assert r != null;

        final int carbbankId = r.getCarbbankId();
        assert carbbankId > 0;

        return GlycanSequence.lookupByExternalRef("Carbbank", carbbankId) != null;
    }

    /** 
    *   Records that the given Carbbank record had a problem and/or threw 
    *   an error during import: a short header (Carbbank id, reason and, 
    *   if given, the exception) followed by the raw entry is written to 
    *   the stream given by {@link #getOutputStreamErrorSequences}, and 
    *   the error counter is incremented.
    *   @param r the offending record; must not be null
    *   @param ex can be null
    *   @param msg the reason for the failure
    *   @throws DataAccessException if the error stream cannot be opened, 
    *           or if the number of errors exceeds the error tolerance
    */
    protected void logErrorRecord(CarbbankRecord r, Exception ex, String msg) {
        assert r != null;
        PrintStream out = this.getOutputStreamErrorSequences();

        out.println(";~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        out.println("; carbbank id " + r.getCarbbankId());
        out.println("; eurocarb reason for failure: " + msg);

        if (ex != null) {
            out.println("; exception was: " + ex.getClass().getSimpleName() + " - " + ex.getMessage());
        }

        out.println(r.getRawEntry());

        out.println();
        out.println();

        //  the stream may be buffered and nothing else flushes it; push 
        //  each record out right away so it is not lost if the load 
        //  is aborted or the JVM exits.
        out.flush();

        records_with_errors++;

        if (records_with_errors > ERROR_TOLERANCE)
            throw new DataAccessException("Aborting load, too many errors");
    }

    /**
    *   Unfinished export of Carbbank data. 
    *   <p>
    *   NOTE: this method is a work in progress and starts with an 
    *   <tt>assert false</tt>, so it fails immediately when assertions 
    *   are enabled. In its current state it does not export Carbbank 
    *   structures: it loads a single hard-coded {@link Disease} entity 
    *   as a DOM4J element and writes it as pretty-printed XML to the 
    *   stream given by {@link #getOutputStreamErrorSequences}. The query 
    *   for all Carbbank structures is still commented out.
    *
    *   @return always 1
    *   @throws RuntimeException if the current entity manager is not 
    *           Hibernate-backed, or if writing the XML fails
    */
    public int exportCarbbank() {
        assert false : "TODO";

        //PrintWriter out = getOutputWriter();
        OutputStream out = this.getOutputStreamErrorSequences();
        assert out != null;

        Session s = null;
        EntityManager em = Eurocarb.getEntityManager();
        if (em instanceof HibernateEntityManager) {
            s = ((HibernateEntityManager) em).getHibernateSession();
        } else {
            throw new RuntimeException(
                    "Only Hibernate-backed EntityManagers " + "support bulk exporting Carbbank data");
        }

        assert s != null;
        Session dom4j = s.getSession(EntityMode.DOM4J);

        //  only needed by the commented-out query below; the lookup is 
        //  kept because it may create the contributor as a side effect.
        String contrib_name = getCarbbankContributor().getContributorName();

        log.debug("query for all carbbank structures...");
        /*
                List structures = dom4j.getNamedQuery( QUERY_GET_ALL_CARBBANK_STRUCTURES )
                           .setParameter("name", contrib_name )
                           .list();
                    
                if ( log.isDebugEnabled() ) 
        log.debug( "found " 
                 + structures.size() 
                 + " carbbank structures..."
                 );
            
                Element e = (Element) structures.get(0);
        */
        //  TODO: placeholder entity until the query above is enabled
        Element e = (Element) dom4j.load(Disease.class, 9538);

        try {
            log.debug("generating XML...");
            OutputFormat format = OutputFormat.createPrettyPrint();
            XMLWriter writer = new XMLWriter(out, format);
            writer.write(e);

            //  without an explicit flush the XML may stay in the writer's 
            //  buffer and never reach the stream. The stream itself is 
            //  not closed here as it is not owned by this method.
            writer.flush();
        } catch (IOException ioex) {
            log.warn("Caught " + ioex.getClass().getName() + " while generating export XML", ioex);
            throw new RuntimeException(ioex);
        }

        return 1;
    }

    /** 
    *   Placeholder for loading previously-parsed Carbbank structures 
    *   and associated data into the current data store. 
    *   <p>
    *   NOTE: not implemented yet. The method currently does nothing: 
    *   no data is read or stored and no exception is thrown. Use 
    *   {@link #parseAndLoadCarbbank} to load Carbbank from raw data.
    *
    *   @return always 0
    *   @see    EntityManager
    */
    public int loadCarbank() {
        //  nothing is loaded yet
        return 0;
    }

    /**
    *   Intended to unload (delete!) Carbbank structures and associated 
    *   data from the current data store. 
    *   <p>
    *   NOTE: the deletion itself is still a TODO. Currently the Carbbank 
    *   contributor is only looked up and the static reference to it 
    *   held by this class is then cleared.
    *   @return always 0
    */
    public int unloadCarbbank() {
        //  look up the contributor that is to be deleted
        getCarbbankContributor();

        //TODO: getEntityManager().delete( c );

        carbbankContributor = null;

        return 0;
    }

    /**
    *   Placeholder for saving the passed CarbbankRecord to the current 
    *   data store. Currently only asserts that the record is not null; 
    *   nothing is stored.
    *   @param r the record to save; must not be null
    */
    protected void storeCarbbankRecord(CarbbankRecord r) {
        assert r != null;
        //  nothing is stored yet
    }

    /** 
    *   Command-line entry point: parses and loads Carbbank with a 
    *   default-configured {@link CarbbankManager} inside a single 
    *   surrounding unit of work. 
    */
    public static class CLI {
        /** 
        *   Runs the load. 
        *   @param args not used 
        */
        public static void main(String[] args) throws IOException {
            Eurocarb.getEntityManager().beginUnitOfWork();

            //  the number of parsed records is not used here
            new CarbbankManager().parseAndLoadCarbbank();

            Eurocarb.getEntityManager().endUnitOfWork();
        }
    }

} // end class