com.aliasi.lingmed.entrezgene.CitationCompiler.java Source code

Introduction

Here is the source code for com.aliasi.lingmed.entrezgene.CitationCompiler.java
Source

/*
 * LingPipe v. 2.0
 * Copyright (C) 2003-5 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://www.alias-i.com/lingpipe/licenseV1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lingmed.entrezgene;

import com.aliasi.lingmed.dao.*;
import com.aliasi.lingmed.lucene.Fields;
import com.aliasi.lingmed.medline.*;
import com.aliasi.lingmed.server.*;
import com.aliasi.lingmed.utils.FileUtils;
import com.aliasi.lingmed.utils.Logging;

import com.aliasi.lingmed.medline.parser.Abstract;
import com.aliasi.lingmed.medline.parser.Article;
import com.aliasi.lingmed.medline.parser.MedlineCitation;
import com.aliasi.lingmed.medline.parser.MedlineHandler;

import com.aliasi.util.AbstractCommand;
import com.aliasi.util.Arrays;
import com.aliasi.util.Strings;

import java.io.*;

import java.util.HashSet;
import java.util.Properties;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;

import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;

import org.apache.log4j.Logger;

/**
 * <P>The <code>CitationCompiler</code> command builds
 * a corpus of all PubMed citations that are
 * referenced in an EntrezGene entry.
 *
 * <P>The following arguments are required:
 *
 * <dl>
 * <dt><code>-citationDir</code></dt>
 * <dd>Name of directory in which to store the corpus,
 * written as html versions of PubMed citations.
 * </dd>
 *
 * <dt><code>-entrezgene</code></dt>
 * <dd>Path to Lucene entrezgene index dir.
 * </dd>
 *
 * <dt><code>-medline</code></dt>
 * <dd>Path to Lucene medline index dir.
 * </dd>
 * </dl>
 *
 * <P>The following arguments are optional:
 *
 * <dl>
 * <dt><code>-citationIndex</code></dt>
 * <dd>Name of dedicated Lucene index for curated abstracts.
 * </dd>
 *
 * <dt><code>-maxGeneHits</code></dt>
 * <dd>Maximum number of genes mentioned in an article.
 * Used to exclude texts which are too general.
 * Defaults to 100.
 * </dd>
 *
 * </dl>
 *
 * @author Mitzi Morris
 * @version 1.0
 * @since   LingMed1.3
 */

public class CitationCompiler extends AbstractCommand {
    private final Logger mLogger = Logger.getLogger(CitationCompiler.class);

    // path to the Lucene EntrezGene index and the searcher opened over it
    private String mEntrezService;
    private EntrezGeneSearcher mEntrezGeneSearcher;

    // path to the Lucene MEDLINE index and the searcher opened over it
    private String mMedlineService;
    private Searcher mMedlineIndexSearcher;

    // output directory that receives one <pmid>.html file per citation
    private File mCitationDir;
    private String mCitationDirPath;

    // optional dedicated index of the compiled citations;
    // mkIndex is true only when a citationIndex argument was supplied
    private boolean mkIndex;
    private File mCitationIndex;
    private String mCitationIndexName;
    private IndexWriter mIndexWriter;
    private MedlineCodec mCodec = new SearchableMedlineCodec();

    private final static double RAM_BUF_SIZE = 1000d; // size of in-memory index buffer, in MB
    private final static int MERGE_FACTOR_HI = 100; // higher number = fewer merges

    // a pubmed id referenced by more than this many genes is excluded
    private int mMaxGeneHits;

    // command-line argument names
    private final static String MEDLINE_SERVICE = "medline";
    private final static String ENTREZGENE_SERVICE = "entrezgene";
    private final static String CITATION_DIR = "citationDir";
    private final static String CITATION_INDEX = "citationIndex";
    private final static String MAX_GENE_HITS = "maxGeneHits";

    private final static Properties DEFAULT_PARAMS = new Properties();
    static {
        DEFAULT_PARAMS.setProperty(MEDLINE_SERVICE, MEDLINE_SERVICE);
        DEFAULT_PARAMS.setProperty(ENTREZGENE_SERVICE, ENTREZGENE_SERVICE);
        DEFAULT_PARAMS.setProperty(MAX_GENE_HITS, "100");
    }

    /**
     * Parses the command-line arguments, opens searchers over the
     * MEDLINE and EntrezGene indexes, ensures the citation output
     * directory exists and, if a citation index name was given,
     * opens an index writer on it.
     *
     * @param args command-line arguments
     * @throws Exception if an argument is missing or an index or
     * directory cannot be opened
     */
    private CitationCompiler(String[] args) throws Exception {
        super(args, DEFAULT_PARAMS);
        mMedlineService = getExistingArgument(MEDLINE_SERVICE);
        mEntrezService = getExistingArgument(ENTREZGENE_SERVICE);
        mCitationDirPath = getExistingArgument(CITATION_DIR);
        mCitationIndexName = getArgument(CITATION_INDEX);
        mMaxGeneHits = getArgumentInt(MAX_GENE_HITS);

        reportParameters();

        FileUtils.checkIndex(mMedlineService, false);
        mMedlineIndexSearcher = new IndexSearcher(mMedlineService);

        FileUtils.checkIndex(mEntrezService, false);
        Searcher egLocalSearcher = new IndexSearcher(mEntrezService);
        mEntrezGeneSearcher = new EntrezGeneSearcherImpl(new EntrezGeneCodec(), egLocalSearcher);
        mLogger.info("instantiated lucene searchers");

        mCitationDir = new File(mCitationDirPath);
        FileUtils.ensureDirExists(mCitationDir);

        if (mCitationIndexName != null) {
            mkIndex = true;
            mCitationIndex = new File(mCitationIndexName);
            FileUtils.ensureDirExists(mCitationIndex);
            mIndexWriter = new IndexWriter(FSDirectory.getDirectory(mCitationIndex), mCodec.getAnalyzer(),
                    new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
            mIndexWriter.setRAMBufferSizeMB(RAM_BUF_SIZE);
            mIndexWriter.setMergeFactor(MERGE_FACTOR_HI);
            mLogger.info("opened writer for curated lucene index");
        }
    }

    /**
     * Logs the output directory, the gene-hit threshold and the
     * name of the optional citation index.
     */
    private void reportParameters() {
        mLogger.info("CitationCompiler " + "\n\tCitation Directory (output dir)=" + mCitationDirPath
                + "\n\tmax gene hits per pubmed article=" + mMaxGeneHits + "\n\tcurated citation index="
                + mCitationIndexName);
    }

    /**
     * Collects the PubMed ids referenced by the EntrezGene entries,
     * keeping only ids referenced by at most <code>maxGeneHits</code>
     * genes, then looks each id up in the MEDLINE index, writes the
     * citation as an html file and, if requested, adds it to the
     * citation index.
     *
     * <P>If a citation index writer was opened it is closed before
     * this method returns, even when processing fails, so that the
     * index is not left with its writer open.
     *
     * @throws IllegalStateException if any exception is raised
     * during processing; the original exception is the cause
     */
    public void run() {
        mLogger.info("Begin");
        // nothing to close when no citation index was requested
        boolean writerClosed = !mkIndex;
        try {
            Set<String> allPmids = new HashSet<String>();
            // ids already evaluated (accepted or rejected); many genes cite the
            // same article, so this avoids repeating the same index lookup
            Set<String> checkedPmids = new HashSet<String>();
            for (EntrezGene entrezGene : mEntrezGeneSearcher) {
                mLogger.debug("processing EntrezGene Id: " + entrezGene.getGeneId());
                String[] pubMedIds = entrezGene.getUniquePubMedRefs();
                for (String pmid : pubMedIds) {
                    if (!checkedPmids.add(pmid)) {
                        continue;
                    }
                    SearchResults<EntrezGene> hits = mEntrezGeneSearcher.getGenesForPubmedId(pmid);
                    if (mLogger.isDebugEnabled()) {
                        mLogger.debug("pubmed id: " + pmid + "\t hits: " + hits.size());
                    }
                    if (hits.size() <= mMaxGeneHits) {
                        allPmids.add(pmid);
                    }
                }
            }
            mLogger.info("total unique pubmed references: " + allPmids.size());
            for (String pmid : allPmids) {
                // retrieve medline citation as doc (not MedlineCitation object)
                Term term = new Term(Fields.ID_FIELD, pmid);
                Query query = new TermQuery(term);
                TopDocs results = mMedlineIndexSearcher.search(query, 1);
                if (results.totalHits > 0) {
                    Document doc = mMedlineIndexSearcher.doc(results.scoreDocs[0].doc);
                    MedlineCitation citation = mCodec.toRecodableObject(doc);
                    outputCitation(citation);
                    if (mkIndex) {
                        mIndexWriter.addDocument(mCodec.toDocument(citation));
                    }
                } else {
                    if (mLogger.isDebugEnabled()) {
                        mLogger.debug("pubmed id: " + pmid + " not found in index");
                    }
                }
            }
            if (mkIndex) {
                mLogger.info("commit Lucene index");
                mIndexWriter.optimize();
                mIndexWriter.close();
                writerClosed = true;
            }
            mLogger.info("Processing complete.");

        } catch (Exception e) {
            mLogger.warn("Unexpected Exception: " + e.getMessage());
            mLogger.warn("stack trace: " + Logging.logStackTrace(e));
            // chain the original exception so its type and trace are not lost
            throw new IllegalStateException(e.getMessage(), e);
        } finally {
            if (!writerClosed) {
                closeIndexWriterAfterFailure();
            }
        }
    }

    /**
     * Closes the citation index writer after processing failed.
     * A failure to close is logged rather than thrown so that it
     * does not replace the exception that ended processing.
     */
    private void closeIndexWriterAfterFailure() {
        try {
            mIndexWriter.close();
        } catch (Exception e) {
            mLogger.warn("unable to close citation index writer: " + e.getMessage());
        }
    }

    /**
     * Writes the citation to <code>&lt;pmid&gt;.html</code> in the
     * citation directory: the PubMed id, the article title and,
     * when present, the abstract text.
     *
     * @param citation citation to write
     * @throws FileNotFoundException if the output file cannot be opened
     */
    private void outputCitation(MedlineCitation citation) throws FileNotFoundException {
        String pmid = citation.pmid();
        File citationFile = new File(mCitationDir, pmid + ".html");
        PrintStream citationOut = new PrintStream(new FileOutputStream(citationFile));
        try {
            citationOut.println("<HTML><BODY>");
            citationOut.println("<H4>PubMed ID: " + citation.pmid() + "</H4>");
            citationOut.println("<H4>" + citation.article().articleTitleText() + "</H4>");
            if (citation.article().abstrct() != null) {
                citationOut.println("<P>");
                citationOut.println(citation.article().abstrct().textWithoutTruncationMarker());
                citationOut.println("</P>");
            }
            citationOut.println("</BODY></HTML>");
            // PrintStream never throws on write failure; it only records an error flag
            if (citationOut.checkError()) {
                mLogger.warn("error writing citation file: " + citationFile);
            }
        } finally {
            // release the file handle even if a citation accessor throws
            citationOut.close();
        }
    }

    /**
     * Command-line entry point: builds the compiler from the
     * arguments and runs it.
     *
     * @param args command-line arguments
     * @throws Exception if the compiler cannot be constructed
     */
    public static void main(String[] args) throws Exception {
        CitationCompiler compiler = new CitationCompiler(args);
        compiler.run();
    }

}