Java tutorial
/* * LingPipe v. 2.0 * Copyright (C) 2003-5 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://www.alias-i.com/lingpipe/licenseV1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */ package com.aliasi.lingmed.genelinkage; import com.aliasi.chunk.Chunk; import com.aliasi.cluster.CompleteLinkClusterer; import com.aliasi.cluster.HierarchicalClusterer; import com.aliasi.cluster.SingleLinkClusterer; import com.aliasi.cluster.Dendrogram; import com.aliasi.lingmed.dao.DaoException; import com.aliasi.lingmed.entrezgene.EntrezGene; import com.aliasi.lingmed.entrezgene.EntrezGeneCodec; import com.aliasi.lingmed.entrezgene.EntrezGeneSearcher; import com.aliasi.lingmed.entrezgene.EntrezGeneSearcherImpl; import com.aliasi.lingmed.lingblast.Constants; import com.aliasi.lingmed.medline.MedlineCodec; import com.aliasi.lingmed.medline.MedlineSearcher; import com.aliasi.lingmed.medline.MedlineSearcherImpl; import com.aliasi.lingmed.server.SearchClient; import com.aliasi.lingmed.utils.FileUtils; import com.aliasi.lingmed.utils.Logging; import com.aliasi.matrix.ProximityMatrix; import com.aliasi.util.AbstractCommand; import com.aliasi.util.Counter; import com.aliasi.util.Distance; import com.aliasi.util.Files; import com.aliasi.util.ObjectToCounterMap; import com.aliasi.util.Pair; import com.aliasi.util.ScoredObject; import com.aliasi.util.Strings; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; import java.io.PrintStream; import java.sql.SQLException; import java.util.HashSet; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import javax.naming.InitialContext; import javax.naming.NamingException; import javax.naming.Reference; import javax.naming.StringRefAddr; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Searcher; import org.apache.log4j.Logger; /** * <P>The <code>ClusterArticles</code> command * reads in a file containing geneIds, 1 per line * and clusters these genes according to co-occurances * in MEDLINE citations * * <P>The following arguments are required: * * <dl> * <dt><code>-geneIds</code></dt> * <dd>Name of input file containing geneIds. * </dd> * * <dt><code>-host</code></dt> * <dd>Name of Lucene search server. * If value is "localhost" then search * the local Lucene indexes, * else search remote Lucene indexes (via RMI). * </dd> * * <dt><code>-dbUserName</code></dt> * <dd>Name of database user. * </dd> * * <dt><code>-dbUserPassword</code></dt> * <dd>Password for database user. * </dd> * </dl> * * <P>The following arguments are optional: * * <dl> * <dt><code>-htmlDir</code></dt> * <dd> * Directory for html pages. * Defaults to "html". * </dd> * * <dt><code>-entrezgene</code></dt> * <dd>Name of remote entrezgene search service, or path to * local Lucene entrezgene index dir. * Defaults to "entrezgene". * </dd> * * <dt><code>-medline</code></dt> * <dd>Name of remote medline search service, or path to * local Lucene medline index dir. * Defaults to "medline". * </dd> * * <dt><code>-dbUrl</code></dt> * <dd>Url of database to connect to (for jdbc). * Defaults to "jdbc:mysql://localhost:3306/gene_linkage" * </dd> * * <dt><code>-dbName</code></dt> * <dd>Database name. * Defaults to "gene_linkage". * </dd> * * <dt><code>-maxArticles</code></dt> * <dd>Maximum number of articles to return. * Defaults to 100. * </dd> * * </dl> * * @author Mitzi Morris * @version 1.0 * @since LingMed1.3 */ public class ClusterArticles extends AbstractCommand { private final Logger mLogger = Logger.getLogger(NBestArticles.class); private File mGeneIdFile; private String mGeneIdFilePath; private File mHtmlDir; private String mHtmlDirPath; private int mMaxArticles; private String mSearchHost; private String mEntrezService; private EntrezGeneSearcher mEntrezGeneSearcher; private String mMedlineService; private MedlineSearcher mMedlineSearcher; private String mDbUrl; private String mDbName; private String mDbUserName; private String mDbUserPassword; private GeneLinkageDao mGeneLinkageDao; private GeneLinkageSearcher mGeneLinkageSearcher; private final static String GENE_IDS = "geneIds"; private final static String MAX_ARTICLES = "maxArticles"; private final static String HTML_DIR = "htmlDir"; private final static String SEARCH_HOST = "host"; private final static String MEDLINE_SERVICE = "medline"; private final static String ENTREZGENE_SERVICE = "entrezgene"; private final static String DB_URL = "dbUrl"; private final static String DB_NAME = "dbName"; private final static String DB_USERNAME = "dbUserName"; private final static String DB_USERPASSWORD = "dbUserPassword"; private final static Properties DEFAULT_PARAMS = new Properties(); static { DEFAULT_PARAMS.setProperty(DB_URL, "jdbc:mysql://localhost:3306/gene_linkage"); DEFAULT_PARAMS.setProperty(DB_NAME, "gene_linkage"); DEFAULT_PARAMS.setProperty(HTML_DIR, "html"); DEFAULT_PARAMS.setProperty(MAX_ARTICLES, "100"); DEFAULT_PARAMS.setProperty(MEDLINE_SERVICE, Constants.MEDLINE_SERVICE); DEFAULT_PARAMS.setProperty(ENTREZGENE_SERVICE, Constants.ENTREZ_SERVICE); } private ClusterArticles(String[] args) throws Exception { super(args, DEFAULT_PARAMS); mGeneIdFilePath = getExistingArgument(GENE_IDS); mHtmlDirPath = getExistingArgument(HTML_DIR); mMaxArticles = getArgumentInt(MAX_ARTICLES); mSearchHost = getExistingArgument(SEARCH_HOST); mMedlineService = getExistingArgument(MEDLINE_SERVICE); mEntrezService = getExistingArgument(ENTREZGENE_SERVICE); mDbUrl = getExistingArgument(DB_URL); mDbName = getExistingArgument(DB_NAME); mDbUserName = getExistingArgument(DB_USERNAME); mDbUserPassword = getExistingArgument(DB_USERPASSWORD); reportParameters(); processParameters(); } public static void main(String[] args) throws Exception { ClusterArticles generator = new ClusterArticles(args); generator.run(); } private String[] getGeneIds(File inFile) throws IOException { HashSet<String> geneIds = new HashSet<String>(); String line = null; LineNumberReader in = new LineNumberReader(new FileReader(inFile)); while ((line = in.readLine()) != null) { if (line.trim().length() < 1) continue; geneIds.add(line.trim()); } in.close(); String[] result = new String[geneIds.size()]; result = geneIds.toArray(result); return result; } public void run() { mLogger.info("find co-occurances of genes in articles"); try { HashSet<String> allArticles = new HashSet<String>(); String[] seedGeneIds = getGeneIds(mGeneIdFile); HashSet<ObjectToCounterMap> geneCooccuranceSet = new HashSet<ObjectToCounterMap>(); for (int i = 0; i < seedGeneIds.length; i++) { String geneId = seedGeneIds[i]; if (mLogger.isDebugEnabled()) { mLogger.debug("geneId: " + geneId); } ArticleMention[] mentions = mGeneLinkageSearcher.findTopMentions(geneId, mMaxArticles); if (mLogger.isDebugEnabled()) { mLogger.debug("total article mentions: " + mentions.length); } ObjectToCounterMap geneObjectToCounterMap = new ObjectToCounterMap(); for (ArticleMention mention : mentions) { allArticles.add(mention.pubmedId()); if (mLogger.isDebugEnabled()) { mLogger.debug("mentioned in article: " + mention.pubmedId()); } try { int pmid = Integer.parseInt(mention.pubmedId()); Pair<Double, Set<Chunk>> geneMentions = mGeneLinkageDao.getGeneMentionsForPubmedId(pmid); Set<Chunk> gMentions = geneMentions.b(); if (mLogger.isDebugEnabled()) { mLogger.debug("total genes mentioned: " + gMentions.size()); } // for each mention, grab the gene id and increment its count for (Chunk gMention : gMentions) { geneObjectToCounterMap.increment(gMention.type()); } } catch (NumberFormatException nfe) { } } geneCooccuranceSet.add(geneObjectToCounterMap); } mLogger.info("seed genes: " + seedGeneIds.length); mLogger.info("total articles: " + allArticles.size()); ObjectToCounterMap[] geneCooccurance = new ObjectToCounterMap[geneCooccuranceSet.size()]; geneCooccurance = geneCooccuranceSet.toArray(geneCooccurance); ProximityMatrix prox = new ProximityMatrix(seedGeneIds.length); for (int i = 0; i < seedGeneIds.length; ++i) { for (int j = i + 1; j < seedGeneIds.length; ++j) { double cosine = COSINE_DISTANCE.distance(geneCooccurance[i], geneCooccurance[j]); prox.setValue(i, j, cosine); if (mLogger.isDebugEnabled()) { mLogger.debug("gene_i : " + seedGeneIds[i] + " gene_j : " + seedGeneIds[j] + " cosine : " + cosine); } } } HierarchicalClusterer<ObjectToCounterMap> clClusterer = new CompleteLinkClusterer<ObjectToCounterMap>( 1.0, COSINE_DISTANCE); Dendrogram<ObjectToCounterMap> completeLinkDendrogram = clClusterer .hierarchicalCluster(geneCooccuranceSet); System.out.println(completeLinkDendrogram.prettyPrint()); } catch (Exception e) { mLogger.warn("Unexpected Exception: " + e.getMessage()); mLogger.warn("stack trace: " + Logging.logStackTrace(e)); IllegalStateException e2 = new IllegalStateException(e.getMessage()); e2.setStackTrace(e.getStackTrace()); throw e2; } } private void processParameters() throws Exception { mGeneIdFile = FileUtils.checkInputFile(mGeneIdFilePath); if (mSearchHost.equals("localhost")) { FileUtils.checkIndex(mMedlineService, false); Searcher medlineLocalSearcher = new IndexSearcher(mMedlineService); mMedlineSearcher = new MedlineSearcherImpl(new MedlineCodec(), medlineLocalSearcher); FileUtils.checkIndex(mEntrezService, false); Searcher egLocalSearcher = new IndexSearcher(mEntrezService); mEntrezGeneSearcher = new EntrezGeneSearcherImpl(new EntrezGeneCodec(), egLocalSearcher); } else { SearchClient medlineClient = new SearchClient(mMedlineService, mSearchHost, 1099); Searcher medlineRemoteSearcher = medlineClient.getSearcher(); mMedlineSearcher = new MedlineSearcherImpl(new MedlineCodec(), medlineRemoteSearcher); SearchClient egClient = new SearchClient(mEntrezService, mSearchHost, 1099); Searcher egRemoteSearcher = egClient.getSearcher(); mEntrezGeneSearcher = new EntrezGeneSearcherImpl(new EntrezGeneCodec(), egRemoteSearcher); } mLogger.info("instantiated lucene searchers"); InitialContext ic = new InitialContext(); // Construct Jndi object reference: arg1: classname, arg2: factory name, arg3:URL (can be null) Reference ref = new Reference("com.mysql.jdbc.jdbc2.optional.MysqlConnectionPoolDataSource", "com.mysql.jdbc.jdbc2.optional.MysqlDataSourceFactory", null); ref.add(new StringRefAddr("driverClassName", "com.mysql.jdbc.Driver")); ref.add(new StringRefAddr("url", mDbUrl)); ref.add(new StringRefAddr("databaseName", mDbName)); ref.add(new StringRefAddr("username", mDbUserName)); ref.add(new StringRefAddr("password", mDbUserPassword)); ic.rebind("jdbc/mysql", ref); mGeneLinkageDao = GeneLinkageDaoImpl.getInstance(ic, "jdbc/mysql", mDbUserName, mDbUserPassword); mLogger.info("instantiated mysql dao"); mLogger.info("mGeneLinkageDao: " + mGeneLinkageDao); mGeneLinkageSearcher = new GeneLinkageSearcher(mEntrezGeneSearcher, mMedlineSearcher, mGeneLinkageDao); mHtmlDir = new File(mHtmlDirPath); FileUtils.ensureDirExists(mHtmlDir); if (mMaxArticles < 1) { String msg = "Max Articles must be > 0, value found: " + mMaxArticles; throw new IllegalArgumentException(msg); } } static Distance<ObjectToCounterMap> COSINE_DISTANCE = new Distance<ObjectToCounterMap>() { public double distance(ObjectToCounterMap doc1, ObjectToCounterMap doc2) { return dotProduct(doc1, doc2) / (length(doc1) * length(doc2)); } }; static double dotProduct(ObjectToCounterMap doc1, ObjectToCounterMap doc2) { double product = 0.0; Iterator it = doc1.keySet().iterator(); while (it.hasNext()) { Object key = it.next(); // System.out.println("key " + key + " doc1 " + doc1.getCount(key) + " doc2 " + doc2.getCount(key)); product += doc1.getCount(key) * doc2.getCount(key); } return product == 0.0 ? Double.POSITIVE_INFINITY : product; } static double length(ObjectToCounterMap doc1) { double sumOfSquares = 0.0; Iterator it = doc1.values().iterator(); while (it.hasNext()) { Counter counter = (Counter) it.next(); double counterVal = counter.doubleValue(); sumOfSquares += counterVal * counterVal; } return Math.sqrt(sumOfSquares); } private void reportParameters() { mLogger.info("NBestArticles " + "\n\tgeneIds list (inputs)=" + mGeneIdFilePath + "\n\tHTML dir (outputs)=" + mHtmlDirPath + "\n\tmax articles for gene=" + mMaxArticles + "\n\tlucene host=" + mSearchHost + "\n\tmysql host=" + mDbUrl + "\n\tdb name=" + mDbName + "\n\tdb user name=" + mDbUserName); } }