Java tutorial
/* * EuroCarbDB, a framework for carbohydrate bioinformatics * * Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as * indicated by the @author tags or express copyright attribution * statements applied by the authors. * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * A copy of this license accompanies this distribution in the file LICENSE.txt. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * Last commit: $Rev: 1870 $ by $Author: david@nixbioinf.org $ on $Date:: 2010-02-23 #$ */ package org.eurocarbdb.util.carbbank; // stdlib imports import java.util.*; import java.io.*; // 3rd party imports import org.apache.log4j.Logger; import org.hibernate.Session; import org.hibernate.EntityMode; import org.dom4j.Element; import org.dom4j.io.XMLWriter; import org.dom4j.io.OutputFormat; // eurocarb imports import org.eurocarbdb.dataaccess.Eurocarb; import org.eurocarbdb.dataaccess.EntityManager; import org.eurocarbdb.dataaccess.HibernateEntityManager; import org.eurocarbdb.dataaccess.core.Reference; import org.eurocarbdb.dataaccess.core.JournalReference; import org.eurocarbdb.dataaccess.core.Contributor; import org.eurocarbdb.dataaccess.core.GlycanSequence; import org.eurocarbdb.dataaccess.core.BiologicalContext; import org.eurocarbdb.dataaccess.core.Disease; import org.eurocarbdb.dataaccess.exception.*; import org.eurocarbdb.util.carbbank.CarbbankParser; import org.eurocarbdb.util.carbbank.CarbbankRecord; // import org.eurocarbdb.dataaccess.hibernate.HibernateUtil; // static imports import static org.eurocarbdb.util.StringUtils.join; import static org.eurocarbdb.dataaccess.Eurocarb.getEntityManager; /* class CarbbankManager *//*************************************** * * A data loader and unloader for Carbbank structures, including * sequence, reference & biological context information. * * @see org.eurocarbdb.util.carbbank.CarbbankParser * @see org.eurocarbdb.util.carbbank.CarbbankRecord * @author mjh * @version $Rev: 1870 $ */ public class CarbbankManager { //~~~~~~~~~~~~~~~~~~~~~~~~~~ FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~// /** Logging handle. */ static final Logger log = Logger.getLogger(CarbbankManager.class); /** Number of parsing/loading exceptions to tolerate until the * load process is aborted. */ private static final int ERROR_TOLERANCE = 100000; /** The stream from which we read Carbbank records. @see #getInputStream */ private InputStream instream = null; /** The stream to which we output Carbbank CSV once parsed. @see #getOutputStreamErrorSequences */ private PrintStream outstreamErrorSequences = null; /** This is the contributor that will be used when loading * (or unloading) Carbbank structures to the data store. * @see #getCarbbankContributor */ private static Contributor carbbankContributor = null; /** Carbbank parser instance. */ private CarbbankParser parser = new CarbbankParser(); /** Max number of entries to parse. Negative means parse all. */ private int loadLimit = -1; /** Specifies the first record that will be fully parsed. For example, * firstRecord=10 means the first record loaded will be record 10. */ private int firstRecord = 1; private static final String QUERY_GET_ALL_CARBBANK_STRUCTURES = "org.eurocarbdb.dataaccess.core.Contributor.GET_ALL_CARBBANK_STRUCTURES"; /** Number of records that parsed with errors. */ private int records_with_errors = 0; /** Number of records to save before committing a transaction. */ private int save_after = 25; //~~~~~~~~~~~~~~~~~~~~~~~~~ METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~~// /** * Returns a {@link CarbbankParser} that may be used for parsing * a raw Carbbank file. */ public CarbbankParser getCarbbankParser() { assert parser != null; return parser; } /** * Returns an {@link InputStream} to a Carbbank raw data file. The data * file used is determined at runtime by the value of the Eurocarb * property 'carbbank.raw.file'. * @see Eurocarb#getProperty */ public InputStream getInputStream() { if (instream == null) { String filename = Eurocarb.getProperty("carbbank.raw.file"); log.info("opening local Carbbank file '" + filename + "'"); try { instream = new FileInputStream(filename); } catch (FileNotFoundException e) { log.warn("Couldn't open file '" + filename + "': " + e); return null; } } return instream; } public void setFirstRecord(int index) { if (index < 0) index = 0; firstRecord = index; } /** * Sets the passed {@link InputStream} from which Carbbank * raw data will be read. * @see #parseAndLoadCarbbank */ public void setInputStream(InputStream in) { assert in != null; instream = in; } /** * Returns the {@link PrintStream} that will be used to output Carbbank * records that produce errors. If not set explicitly by * {@link #setOutputStreamErrorSequences} then the stream returned * will be directed to a file named by the Eurocarb property * <tt>'carbbank.errors.file'</tt>. * @see Eurocarb#getProperty * @throws DataAccessException if method cannot open file for writing */ public PrintStream getOutputStreamErrorSequences() throws DataAccessException { if (outstreamErrorSequences == null) { String filename = Eurocarb.getProperty("carbbank.errors.file"); if (log.isDebugEnabled()) log.debug("creating cache file '" + filename + "'"); try { outstreamErrorSequences = new PrintStream(new BufferedOutputStream(new FileOutputStream(filename))); } catch (Exception e) { outstreamErrorSequences = null; String msg = "Caught exception while trying to open file '" + filename + "' for writing: " + e; log.warn(msg); throw new DataAccessException(msg); } } return outstreamErrorSequences; } /** * Sets the passed {@link OutputStream} to which pre-parsed & cached * Carbbank data will be read (by the method {@link #parseAndLoadCarbbank}). */ public void setOutputStreamErrorSequences(PrintStream out) { assert out != null; outstreamErrorSequences = out; } /** * Returns the canonical "Carbbank" contributor. * If a "Carbbank" contributor does not exist in the * current data store at the time this method is called, * then it will be created. The name of this contributor * is given by the Eurocarb property 'carbbank.contributor.name'. */ public static Contributor getCarbbankContributor() { // if ( carbbankContributor != null ) // return carbbankContributor; String contributor_name = Eurocarb.getProperty("carbbank.contributor.name"); if (contributor_name == null) { log.warn("There is no value for property '" + "carbbank.contributor.name" + "' configured! Using last-resort value of 'Carbbank'"); contributor_name = "Carbbank"; } if (log.isDebugEnabled()) log.debug("Looking up the canonical Carbbank contributor " + "(contributor name '" + contributor_name + "')"); carbbankContributor = Contributor.lookupExactName(contributor_name); if (carbbankContributor == null) { log.debug("A Carbbank contributor could not be found " + "in the current data store, creating it"); carbbankContributor = new Contributor(); carbbankContributor.setContributorName(contributor_name); getEntityManager().store(carbbankContributor); if (log.isDebugEnabled()) log.debug("Carbbank contributor with name '" + carbbankContributor.getContributorName() + "', id '" + carbbankContributor.getContributorId() + "' successfully added to the data store"); } return carbbankContributor; } /** * Sets a limit on the number of carbbank entries that will be * parsed and loaded via the {@link #parseAndLoadCarbbank} method. * Less than zero means 'load all'. */ public void setLoadLimit(int nmb_of_entries) { loadLimit = nmb_of_entries; } /** * Parses and loads Carbbank data from raw file. Note that * this is much slower than the {@link #loadCarbbank} method, * which loads a pre-parsed version of Carbbank data. * @return number of carbbank entries parsed */ public int parseAndLoadCarbbank() throws IOException, DataAccessException { if (firstRecord < 0) { log.info("Nothing to do!"); return 0; } EntityManager em = getEntityManager(); // all entries parsed will be added to eurocarb db under this contributor. Contributor c = this.getCarbbankContributor(); InputStream in = getInputStream(); parser.setInputStream(in); int count = 0; assert c != null; while (true) { CarbbankRecord r = parser.parse(); if (r == null) break; count++; // skip records until we reach the first record specified by firstRecord if (count < firstRecord) { if (log.isDebugEnabled()) log.debug("skipping record " + count + "(<" + firstRecord + ")..."); continue; } // stop parsing if we've loaded more than loadLimit records. if (loadLimit == 0) { log.debug("Load limit reached, stopping..."); break; } // check seq has not already been added to DB if (recordAlreadySaved(r)) { log.debug("record already exists in DB, skipping..."); continue; } // otherwise process records as usual. // skip records with unparseable sequences GlycanSequence gs = null; try { gs = r.getGlycanSequence(); if (gs == null) throw new RuntimeException("GlycanSequence returned null"); if (gs.getSequenceCt() == null) throw new RuntimeException("GlycanSequence returned a null Glycoct sequence"); } catch (Exception ex) { logErrorRecord(r, ex, "Sequence unparseable"); continue; } // get references for entry JournalReference jref = r.getJournalReference(); if (jref == null) { logErrorRecord(r, null, "Couldn't get a valid JournalReference"); continue; } Reference ref = r.getEntryReference(); assert ref != null; // heavyweight biological context lookup List<BiologicalContext> bcs = r.getContexts(); // everythings ok so far, set associations between // objects and then save them all jref.setContributor(c); ref.setContributor(c); gs.setContributor(c); gs.addReference(ref); gs.addReference(jref); for (BiologicalContext bc : bcs) { bc.addContributor(c, ""); gs.addBiologicalContext(bc); } // save the whole object graph log.debug("attempting to save carbbank record..."); try { // update object with new information getEntityManager().update(jref); getEntityManager().update(ref); em.update(gs); log.debug("record was saved successfully"); } catch (Exception ex) { log.warn("record not saved: " + ex.getMessage()); logErrorRecord(r, ex, "caught exception while trying to save"); } loadLimit--; if ((count % save_after) == 0) { periodicSaveProgress(); } } // end while if (log.isInfoEnabled()) { log.info("Parsed " + count + " records, " + records_with_errors + " load error(s)"); } return count; } protected void periodicSaveProgress() { log.info("saving progress..."); getEntityManager().endUnitOfWork(); getEntityManager().beginUnitOfWork(); } /** Returns true if given CarbbankRecord already exists in the DB. */ protected boolean recordAlreadySaved(CarbbankRecord r) { assert r != null; int id = r.getCarbbankId(); assert id > 0; GlycanSequence existing = GlycanSequence.lookupByExternalRef("Carbbank", id); return existing != null; } /** * Records that the given Carbbank record had a problem and/or threw * an error during import. * @param ex can be null */ protected void logErrorRecord(CarbbankRecord r, Exception ex, String msg) { assert r != null; PrintStream out = this.getOutputStreamErrorSequences(); out.println(";~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"); out.println("; carbbank id " + r.getCarbbankId()); out.println("; eurocarb reason for failure: " + msg); if (ex != null) { out.println("; exception was: " + ex.getClass().getSimpleName() + " - " + ex.getMessage()); } out.println(r.getRawEntry()); out.println(); out.println(); records_with_errors++; if (records_with_errors > ERROR_TOLERANCE) throw new DataAccessException("Aborting load, too many errors"); } /** * Exports a freshly parsed & loaded Carbbank as CSV to the * {@link OutputStream} given by {@link #getOutputStreamErrorSequences}. */ public int exportCarbbank() { assert false : "TODO"; //PrintWriter out = getOutputWriter(); OutputStream out = this.getOutputStreamErrorSequences(); assert out != null; Session s = null; EntityManager em = Eurocarb.getEntityManager(); if (em instanceof HibernateEntityManager) { s = ((HibernateEntityManager) em).getHibernateSession(); } else { throw new RuntimeException( "Only Hibernate-backed EntityManagers " + "support bulk exporting Carbbank data"); } assert s != null; Session dom4j = s.getSession(EntityMode.DOM4J); String contrib_name = getCarbbankContributor().getContributorName(); log.debug("query for all carbbank structures..."); /* List structures = dom4j.getNamedQuery( QUERY_GET_ALL_CARBBANK_STRUCTURES ) .setParameter("name", contrib_name ) .list(); if ( log.isDebugEnabled() ) log.debug( "found " + structures.size() + " carbbank structures..." ); Element e = (Element) structures.get(0); */ Element e = (Element) dom4j.load(Disease.class, 9538); try { log.debug("generating XML..."); OutputFormat format = OutputFormat.createPrettyPrint(); XMLWriter writer = new XMLWriter(out, format); writer.write(e); } catch (IOException ioex) { log.warn("Caught " + ioex.getClass().getName() + " while generating export XML", ioex); throw new RuntimeException(ioex); } return 1; } /** * Loads previously-parsed Carbbank structures and associated * data into the current data store. This method requires a pre-parsed * version of the raw data, which is created when loading Carbbank * with the {@link parseAndLoadCarbbank} method. If this pre-parsed * data does not exist when this method is called a * {@link UnsupportedOperationException} is thrown. * * @return a string indicating success/failure. * @throws UnsupportedOperationException * if pre-parsed Carbbank data does not exist at time of calling. * @see EntityManager */ public int loadCarbank() { return 0; } /** * Unloads (deletes!) Carbbank structures and associated data from * the current data store. * @return a string indicating success/failure. */ public int unloadCarbbank() { Contributor c = this.getCarbbankContributor(); //TODO: getEntityManager().delete( c ); this.carbbankContributor = null; return 0; } /** * Saves the passed CarbbankRecord to the current data store. */ protected void storeCarbbankRecord(CarbbankRecord r) { assert r != null; } public static class CLI { public static void main(String[] args) throws IOException { getEntityManager().beginUnitOfWork(); CarbbankManager cm = new CarbbankManager(); int parsed = cm.parseAndLoadCarbbank(); getEntityManager().endUnitOfWork(); } } } // end class