org.ambraproject.rhino.content.xml.ArticleXml.java Source code

Introduction

Here is the source code for org.ambraproject.rhino.content.xml.ArticleXml.java
Source

/*
 * Copyright (c) 2017 Public Library of Science
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package org.ambraproject.rhino.content.xml;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import org.ambraproject.rhino.identity.ArticleIdentifier;
import org.ambraproject.rhino.identity.Doi;
import org.ambraproject.rhino.model.article.ArticleMetadata;
import org.ambraproject.rhino.model.article.AssetMetadata;
import org.ambraproject.rhino.model.article.RelatedArticleLink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

import java.time.DateTimeException;
import java.time.LocalDate;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * An NLM-format XML document that can be "ingested" to build an {@link ArticleMetadata} object.
 */
public class ArticleXml extends AbstractArticleXml<ArticleMetadata> {

    private static final Logger log = LoggerFactory.getLogger(ArticleXml.class);

    /**
     * @param xml the article document to read from
     */
    public ArticleXml(Document xml) {
        super(xml);
    }

    /**
     * @return the inherited {@code xml} node, cast to {@link Document}
     */
    public Document getDocument() {
        return (Document) xml;
    }

    /**
     * Read the article's DOI from the {@code article-id} element whose {@code pub-id-type} attribute is "doi".
     *
     * @return the article's DOI
     * @throws XmlContentException if the DOI is not present
     */
    public Doi readDoi() throws XmlContentException {
        String doi = readString("/article/front/article-meta/article-id[@pub-id-type=\"doi\"]");
        if (doi == null) {
            throw new XmlContentException("DOI not found");
        }
        return Doi.create(doi);
    }

    /**
     * Find each node within this object's XML whose name is expected to be associated with an asset entity.
     *
     * @return the asset nodes, grouped by asset DOI
     */
    public AssetNodesByDoi findAllAssetNodes() {
        // Find all nodes of an asset type and map them by DOI
        List<Node> rawNodes = readNodeList(ASSET_EXPRESSION);
        ListMultimap<Doi, Node> nodeMap = LinkedListMultimap.create(rawNodes.size());
        for (Node node : rawNodes) {
            Doi assetDoi = getAssetDoi(node);
            if (assetDoi != null) {
                nodeMap.put(assetDoi, node);
            } else {
                // No DOI directly on this node; look for one at a known nested position
                findNestedDoi(node, nodeMap);
            }
        }

        // Replace <graphic> nodes without changing keys or iteration order
        for (Map.Entry<Doi, Node> entry : nodeMap.entries()) {
            Node node = entry.getValue();
            if (node.getNodeName().equals(GRAPHIC)) {
                entry.setValue(replaceGraphicNode(node));
            }
        }

        return new AssetNodesByDoi(nodeMap);
    }

    /**
     * Return the node that should represent the asset for a "graphic" node.
     * <p>
     * If the parent's name is one of {@code GRAPHIC_NODE_PARENTS}, the parent is used. If the parent is an
     * {@code ALTERNATIVES} node, the grandparent is used. Otherwise the graphic node represents itself.
     *
     * @param node a node such that {@code node.getNodeName().equals(GRAPHIC)}
     * @return the corresponding asset node
     */
    private static Node replaceGraphicNode(Node node) {
        Node parent = node.getParentNode();
        if (GRAPHIC_NODE_PARENTS.contains(parent.getNodeName())) {
            return parent;
        }
        if (parent.getNodeName().equals(ALTERNATIVES)) {
            return parent.getParentNode();
        }
        return node;
    }

    /**
     * Try to find a DOI at a special position within an XML node and, if it's found, insert it into the node map at that
     * key.
     *
     * @param outerNode an asset node such that {@code getAssetDoi(outerNode) == null}
     * @param nodeMap   the map to modify if a nested DOI is found
     */
    private void findNestedDoi(Node outerNode, Multimap<Doi, Node> nodeMap) {
        Preconditions.checkNotNull(nodeMap);

        // Currently, the only special case handled here is
        //   <table-wrap> ... <graphic xlink:href="..." /> ... </table-wrap>
        // See case pone.0012008 (asset 10.1371/journal.pone.0012008.t002) in the test suite.
        if (TABLE_WRAP.equals(outerNode.getNodeName())) {
            Node graphicNode = readNode("descendant::" + GRAPHIC, outerNode);
            if (graphicNode != null) {
                Doi doi = getAssetDoi(graphicNode);
                if (doi != null) {
                    // The outer <table-wrap> is stored as the asset node, keyed by the nested graphic's DOI
                    nodeMap.put(doi, outerNode);
                }
            }
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public ArticleMetadata build() throws XmlContentException {
        ArticleMetadata.Builder article = ArticleMetadata.builder();
        setConstants(article);
        setFromXml(article);
        return article.build();
    }

    /**
     * Set constant values.
     *
     * @param article the article to modify
     */
    private static void setConstants(ArticleMetadata.Builder article) {
        // These are constants because they are implied by how we get the input
        article.setFormat("text/xml");
    }

    /**
     * Populate the builder with every value that is read from the article XML.
     *
     * @param article the article builder to modify
     * @throws XmlContentException if required content (such as the DOI, the issn tag, the license statement or the
     *                             publication date) is missing or malformed
     */
    private void setFromXml(final ArticleMetadata.Builder article) throws XmlContentException {
        Doi doi = readDoi();
        article.setDoi(doi.getName());

        article.setTitle(getXmlFromNode(readNode("/article/front/article-meta/title-group/article-title")));

        // isEissnTagPresent() throws if the tag is absent, so this branch is taken whenever it returns
        if (isEissnTagPresent()) {
            String eissn = readString("/article/front/journal-meta/issn[@pub-type=\"epub\"]");
            if (Strings.isNullOrEmpty(eissn)) {
                // Fall back to any issn element, regardless of its pub-type
                eissn = readString("/article/front/journal-meta/issn");
            }
            article.seteIssn(eissn);
        }
        article.setJournalName(readString("/article/front/journal-meta/journal-title-group/journal-title"));
        article.setDescription(getXmlFromNode(findAbstractNode()));
        article.setAbstractText(getXmlFromNode(findFirstAbstractNode()));

        // Prefer an explicit copyright statement; otherwise compose one from the holder and the license text
        String rights = readString("/article/front/article-meta/permissions/copyright-statement");
        if (rights == null) {
            rights = buildRights(readString("/article/front/article-meta/permissions/copyright-holder"),
                    readString("/article/front/article-meta/permissions/license/license-p"));
        }
        article.setRights(rights);

        article.setPageCount(parsePageCount(readString("/article/front/article-meta/counts/page-count/@count")));
        article.seteLocationId(readString("/article/front/article-meta/elocation-id"));
        article.setVolume(readString("/article/front/article-meta/volume"));
        article.setIssue(readString("/article/front/article-meta/issue"));
        article.setPublisherName(readString("/article/front/journal-meta/publisher/publisher-name"));
        article.setPublisherLocation(readString("/article/front/journal-meta/publisher/publisher-loc"));
        article.setLanguage(parseLanguage(readString("/article/@xml:lang")));

        // Two attribute conventions are accepted for marking the electronic publication date
        Node dateNode = readNode("/article/front/article-meta/pub-date[@pub-type=\"epub\"]");
        if (dateNode == null) {
            dateNode = readNode("/article/front/article-meta/pub-date[@publication-format=\"electronic\"]");
        }
        article.setPublicationDate(parseDate(dateNode));

        article.setNlmArticleType(readString("/article/@article-type"));
        article.setArticleType(parseArticleHeading());

        article.setEditors(readPersons(
                readNodeList("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"editor\"]/name")));

        article.setUrl(buildUrl(doi));

        article.setRelatedArticles(parseRelatedArticles());

        article.setAssets(parseAssets());
    }

    /**
     * Queries for where the article abstract is found, ordered by priority. If more than one matches to a node, the first
     * one should be stored as the article abstract.
     */
    private static final ImmutableList<String> QUERIES_FOR_ABSTRACT = ImmutableList.of(
            "/article/front/article-meta/abstract[@abstract-type=\"toc\"]",
            "/article/front/article-meta/abstract[@abstract-type=\"summary\"]",
            "/article/front/article-meta/abstract");

    /**
     * Find the abstract node by trying each of {@link #QUERIES_FOR_ABSTRACT} in priority order.
     *
     * @return the node matched by the highest-priority query, or {@code null} if no query matches
     */
    private Node findAbstractNode() {
        for (String query : QUERIES_FOR_ABSTRACT) {
            Node node = readNode(query);
            if (node != null) {
                return node;
            }
        }
        return null;
    }

    /**
     * Find the abstract node without regard to its {@code abstract-type} attribute.
     *
     * @return the result of reading a single node for the plain {@code abstract} query
     */
    private Node findFirstAbstractNode() {
        return readNode("/article/front/article-meta/abstract");
    }

    /**
     * Check that the journal metadata contains an issn tag.
     *
     * @return {@code true} whenever this method returns normally
     * @throws XmlContentException if the required issn tag is omitted. The value can be blank.
     */
    private boolean isEissnTagPresent() throws XmlContentException {
        final Node issnNode = readNode("/article/front/journal-meta/issn");
        if (issnNode == null) {
            throw new XmlContentException("Required eIssn tag is omitted");
        }
        return true;
    }

    /**
     * @param holder  the copyright holder, or {@code null} if none was found
     * @param license the license statement
     * @return the appropriate value for the rights property of {@link ArticleMetadata}, based on the article XML.
     * @throws XmlContentException if {@code license} is {@code null}
     */
    private static String buildRights(String holder, String license) throws XmlContentException {
        if (license == null) {
            throw new XmlContentException("Required license statement is omitted");
        }

        // pmc2obj-v3.xslt lines 179-183
        String rights = (holder == null) ? license : (holder + ". " + license);
        return rights.trim();
    }

    /**
     * @param pageCount the raw page count text, possibly {@code null} or empty
     * @return the appropriate value for the pages property of {@link ArticleMetadata}, based on the article XML, or
     * {@code null} if {@code pageCount} is {@code null} or empty
     * @throws XmlContentException if {@code pageCount} is non-empty but is not an integer
     */
    private static Integer parsePageCount(String pageCount) {
        if (Strings.isNullOrEmpty(pageCount)) {
            return null;
        }
        try {
            return Integer.parseInt(pageCount);
        } catch (NumberFormatException e) {
            throw new XmlContentException("Expected number for page count", e);
        }
    }

    /**
     * @return a valid URL to the article (based on the DOI)
     */
    private static String buildUrl(Doi doi) {
        return doi.asUri(Doi.UriStyle.HTTPS_DOI_RESOLVER).toString();
    }

    /**
     * Read the article's heading from the subject group whose {@code subj-group-type} is "heading".
     *
     * @return the single heading subject, or {@code null} if there is none
     * @throws XmlContentException if more than one heading subject is found
     */
    private String parseArticleHeading() {
        List<String> headings = readTextList("/article/front/article-meta/article-categories/"
                + "subj-group[@subj-group-type = 'heading']/subject");
        if (headings.size() > 1) {
            throw new XmlContentException(
                    "Must not contain more than one subject group with subj-group-type=\"heading\"");
        }
        return headings.isEmpty() ? null : headings.get(0);
    }

    /**
     * Normalize the article's language code.
     *
     * @param language the raw language value, or {@code null} if the XML did not specify one
     * @return the lower-cased language code, or "en" if {@code language} is {@code null}
     */
    private static String parseLanguage(String language) {
        if (language == null) {
            log.debug("Language not specified in article XML; defaulting to English");
            return "en"; // Formerly hard-coded for all articles, so it's the most sensible default
        }
        // Locale.ROOT keeps the result independent of the JVM's default locale
        // (e.g. under a Turkish locale, "I" would otherwise lower-case to a dotless i).
        return language.toLowerCase(Locale.ROOT);
    }

    /**
     * Parse a date from a node's {@code year}, {@code month} and {@code day} children.
     *
     * @param dateNode the node holding the date fields; {@link #setFromXml} passes the publication date node here
     * @return the parsed date
     * @throws XmlContentException if {@code dateNode} is {@code null}, if any field is not an integer, or if the fields
     *                             do not form a valid calendar date
     */
    private LocalDate parseDate(Node dateNode) throws XmlContentException {
        if (dateNode == null) {
            // Fail with an explicit message rather than letting the field reads below fail obscurely
            throw new XmlContentException("Publication date not found");
        }

        int year, month, day;
        try {
            year = Integer.parseInt(readString("child::year", dateNode));
            month = Integer.parseInt(readString("child::month", dateNode));
            day = Integer.parseInt(readString("child::day", dateNode));
        } catch (NumberFormatException e) {
            throw new XmlContentException("Expected numbers for date fields", e);
        }

        try {
            return LocalDate.of(year, month, day);
        } catch (DateTimeException e) {
            // Numeric but out-of-range fields (e.g. month 13, or February 30) are a content error too
            throw new XmlContentException(
                    String.format("Invalid date: year=%d, month=%d, day=%d", year, month, day), e);
        }
    }

    /**
     * Parse {@link RelatedArticleLink} objects from XML.
     * <p>
     * These are returned from here, rather than set in the {@link ArticleMetadata} object during normal parsing, so they
     * can get special handling.
     *
     * @return the article relationships defined by the XML
     * @throws XmlContentException if a {@code related-article} node has no href (DOI) attribute
     */
    public List<RelatedArticleLink> parseRelatedArticles() {
        List<Node> relatedArticleNodes = readNodeList("//related-article");
        List<RelatedArticleLink> relatedArticles = Lists.newArrayListWithCapacity(relatedArticleNodes.size());
        for (Node relatedArticleNode : relatedArticleNodes) {
            String type = readString("attribute::related-article-type", relatedArticleNode);
            String doi = readHrefAttribute(relatedArticleNode);
            if (doi != null) {
                RelatedArticleLink relatedArticle = new RelatedArticleLink(type, ArticleIdentifier.create(doi));
                relatedArticles.add(relatedArticle);
            } else {
                throw new XmlContentException("Related article has no doi. node "
                        + relatedArticleNode.getAttributes().getNamedItem("id"));
            }
        }
        return relatedArticles;
    }

    /**
     * Build one {@link AssetMetadata} object for each asset DOI found by {@link #findAllAssetNodes}.
     * <p>
     * Every node mapped to a DOI is built into metadata and duplicates are dropped. If more than one distinct result
     * remains for a DOI, {@link #disambiguateAssetNodes} chooses among them.
     *
     * @return the metadata for each asset DOI
     */
    private List<AssetMetadata> parseAssets() {
        AssetNodesByDoi nodeMap = findAllAssetNodes();
        return nodeMap.getDois().stream().map((Doi assetDoi) -> {
            ImmutableList<Node> nodes = nodeMap.getNodes(assetDoi);
            List<AssetMetadata> assetMetadataList = nodes.stream()
                    .map(assetNode -> new AssetXml(assetNode, assetDoi).build()).distinct()
                    .collect(Collectors.toList());
            if (assetMetadataList.size() > 1) {
                return disambiguateAssetNodes(assetMetadataList);
            }
            return assetMetadataList.get(0);
        }).collect(Collectors.toList());
    }

    /**
     * Orders asset metadata so that the preferred candidate is the minimum: a non-empty title sorts before an empty
     * one ({@code false} before {@code true}), with ties broken the same way on the description.
     */
    private static final Comparator<AssetMetadata> ASSET_NODE_PREFERENCE = Comparator
            .<AssetMetadata, Boolean>comparing(node -> node.getTitle().isEmpty())
            .thenComparing(Comparator.comparing(node -> node.getDescription().isEmpty()));

    /**
     * Choose a single metadata object from several that were built for the same DOI.
     *
     * @param assetNodesMetadata metadata for at least two asset nodes with the same DOI and unequal content
     * @return the metadata preferred by {@link #ASSET_NODE_PREFERENCE} (non-empty title first, then non-empty
     * description)
     * @throws IllegalArgumentException if {@code assetNodesMetadata} is empty
     * @throws XmlContentException      if two or more asset nodes have non-empty, inconsistent title or description
     */
    @VisibleForTesting
    static AssetMetadata disambiguateAssetNodes(Collection<AssetMetadata> assetNodesMetadata) {
        // Find the node with the most substantial content
        AssetMetadata bestNode = assetNodesMetadata.stream().min(ASSET_NODE_PREFERENCE)
                .orElseThrow(() -> new IllegalArgumentException("Argument list must not be empty"));

        // If any other nodes have non-empty fields that are inconsistent with bestNode, complain about ambiguity
        Collection<AssetMetadata> ambiguous = assetNodesMetadata.stream().filter(node -> {
            String title = node.getTitle();
            boolean ambiguousTitle = !title.isEmpty() && !title.equals(bestNode.getTitle());

            String description = node.getDescription();
            boolean ambiguousDescription = !description.isEmpty() && !description.equals(bestNode.getDescription());

            return ambiguousTitle || ambiguousDescription;
        }).collect(Collectors.toList());
        if (!ambiguous.isEmpty()) {
            throw new XmlContentException(String.format("Ambiguous asset nodes: %s, %s", bestNode, ambiguous));
        }

        return bestNode;
    }

}