edu.cmu.lti.oaqa.bioasq.passage.PmcContentSetter.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.lti.oaqa.bioasq.passage.PmcContentSetter.java

Source

/*
 * Open Advancement Question Answering (OAQA) Project Copyright 2016 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations
 * under the License.
 */

package edu.cmu.lti.oaqa.bioasq.passage;

import com.google.common.io.CharStreams;
import com.google.gson.Gson;
import com.google.gson.JsonIOException;
import com.google.gson.JsonSyntaxException;
import edu.cmu.lti.oaqa.baseqa.util.UimaContextHelper;
import edu.cmu.lti.oaqa.type.retrieval.Document;
import edu.cmu.lti.oaqa.util.TypeUtil;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.stream.IntStream;

import static java.util.stream.Collectors.toList;

/**
 * This class should be used after a document retrieval step if the document's text is not returned
 * by the {@link Document} (e.g.
 * {@link edu.cmu.lti.oaqa.bioasq.document.retrieval.GoPubMedDocumentRetrievalExecutor}), and before
 * a passage retrieval step that requires the document text.
 *
 * This class fetches a <a href="https://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> document
 * using a HTTP service for JSON document, which can be generated using the official document
 * processing tool from <a href="http://bioasq.org/">http://bioasq.org/</a>, and then split into
 * individual JSON objects.
 *
 * @author <a href="mailto:ziy@cs.cmu.edu">Zi Yang</a> created on 10/19/14
 */
public class PmcContentSetter extends JCasAnnotator_ImplBase {

    private static Gson gson = new Gson();

    private String urlFormat;

    private static final Logger LOG = LoggerFactory.getLogger(PmcContentSetter.class);

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        urlFormat = UimaContextHelper.getConfigParameterStringValue(context, "url-format");
    }

    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {
        int count = 0;
        for (Document doc : TypeUtil.getRankedDocuments(jcas)) {
            URL url;
            try {
                url = new URL(String.format(urlFormat, doc.getDocId()));
            } catch (MalformedURLException e) {
                throw new AnalysisEngineProcessException(e);
            }
            String json;
            try {
                json = CharStreams.toString(new InputStreamReader(url.openStream()));
            } catch (IOException e) {
                LOG.error("Error access {}", url.toString());
                throw new AnalysisEngineProcessException(e);
            }
            if (json.isEmpty()) {
                continue;
            }
            List<String> sections;
            try {
                sections = gson.fromJson(json, PmcDocument.class).getSections();
            } catch (JsonSyntaxException | JsonIOException e) {
                throw new AnalysisEngineProcessException(e);
            }
            doc.setSections((StringArray) FSCollectionFactory.createStringArray(jcas, sections));
            List<String> sectionLabels = IntStream.range(0, sections.size()).mapToObj(i -> "sections." + i)
                    .collect(toList());
            doc.setSectionLabels((StringArray) FSCollectionFactory.createStringArray(jcas, sectionLabels));
            count++;
        }
        LOG.info("Total pmc documents with content: {}", count);
    }

    public static final class PmcDocument {

        private String pmid;

        private String title;

        private List<String> sections;

        public PmcDocument(String pmid, String title, List<String> sections) {
            super();
            this.pmid = pmid;
            this.title = title;
            this.sections = sections;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((pmid == null) ? 0 : pmid.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            PmcDocument other = (PmcDocument) obj;
            if (pmid == null) {
                if (other.pmid != null)
                    return false;
            } else if (!pmid.equals(other.pmid))
                return false;
            return true;
        }

        public String getPmid() {
            return pmid;
        }

        public void setPmid(String pmid) {
            this.pmid = pmid;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public List<String> getSections() {
            return sections;
        }

        public void setSections(List<String> sections) {
            this.sections = sections;
        }

    }

}