com.zimbra.common.soap.W3cDomUtil.java Source code

Introduction

Here is the source code for com.zimbra.common.soap.W3cDomUtil.java
Source

/*
 * ***** BEGIN LICENSE BLOCK *****
 * Zimbra Collaboration Suite Server
 * Copyright (C) 2012, 2013, 2014, 2016 Synacor, Inc.
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software Foundation,
 * version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program.
 * If not, see <https://www.gnu.org/licenses/>.
 * ***** END LICENSE BLOCK *****
 */

package com.zimbra.common.soap;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.dom4j.DocumentFactory;
import org.dom4j.io.DOMReader;
import org.dom4j.io.SAXReader;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;

import com.google.common.base.Strings;
import com.google.common.io.Closeables;
import com.zimbra.common.service.ServiceException;
import com.zimbra.common.soap.Element.ElementFactory;
import com.zimbra.common.util.Log;
import com.zimbra.common.util.ZimbraLog;

public class W3cDomUtil {
    private static final Log LOG = ZimbraLog.misc;

    private W3cDomUtil() {
    }

    /** Cache one DocumentBuilder per thread to avoid unnecessarily recreating them for every XML parse. */
    private static final ThreadLocal<DocumentBuilder> w3DomBuilderTL = new ThreadLocal<DocumentBuilder>() {
        @Override
        protected javax.xml.parsers.DocumentBuilder initialValue() {
            try {
                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
                dbf.setNamespaceAware(true);
                dbf.setIgnoringComments(true);
                // Prevent external entity reference attack.
                dbf.setExpandEntityReferences(false);
                dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
                // protect against recursive entity expansion DOS attack and perhaps other things
                dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
                try {
                    dbf.setAttribute("http://apache.org/xml/features/disallow-doctype-decl", true);
                } catch (IllegalArgumentException iae) {
                    ZimbraLog.misc.debug("Disabling doctype-decl not supported", iae);
                }
                return dbf.newDocumentBuilder();
            } catch (javax.xml.parsers.ParserConfigurationException pce) {
                ZimbraLog.misc.error("Problem setting up w3c DOM builder", pce);
                return null;
            }
        }
    };

    public static DocumentBuilder getBuilder() {
        return w3DomBuilderTL.get();
    }

    public static SAXParser getDom4jSAXParserWhichUsesSecureProcessing() throws XmlParseException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        factory.setXIncludeAware(false);
        factory.setValidating(false);
        try {
            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        } catch (SAXNotRecognizedException | SAXNotSupportedException | ParserConfigurationException ex) {
            ZimbraLog.misc.error("Problem setting up SAXParser which supports secure XML processing", ex);
            throw XmlParseException.PARSE_ERROR();
        }
        try {
            return factory.newSAXParser();
        } catch (ParserConfigurationException | SAXException e) {
            ZimbraLog.misc.error("Problem setting up SAXParser", e);
            throw XmlParseException.PARSE_ERROR();
        }
    };

    public static SAXReader getDom4jSAXReaderWhichUsesSecureProcessing() throws XmlParseException, SAXException {
        return getDom4jSAXReaderWhichUsesSecureProcessing(null);
    }

    public static SAXReader getDom4jSAXReaderWhichUsesSecureProcessing(DocumentFactory fact)
            throws XmlParseException, SAXException {
        SAXReader dom4jSAXReader = new SAXReader(getDom4jSAXParserWhichUsesSecureProcessing().getXMLReader());
        if (null != fact) {
            dom4jSAXReader.setDocumentFactory(fact);
        }
        return dom4jSAXReader;
    }

    /** Cache one Transformer per thread to avoid unnecessarily recreating them for every XML parse. */
    private static final ThreadLocal<Transformer> transformerTL = new ThreadLocal<Transformer>() {
        @Override
        protected Transformer initialValue() {
            try {
                TransformerFactory transformerFactory = TransformerFactory.newInstance();
                Transformer transformer = transformerFactory.newTransformer();
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
                return transformer;
            } catch (TransformerFactoryConfigurationError factoryError) {
                LOG.error("Error creating TransformerFactory", factoryError);
            } catch (TransformerException transformerError) {
                LOG.error("Error creating Transformer", transformerError);
            }
            return null;
        }
    };

    /**
     * Return a pretty view of the XML fragment represented by {@code node}
     */
    public static String asXML(Node node) {
        return asXML(node, true, true);
    }

    public static String asXML(Node node, boolean indent) {
        return asXML(node, indent, true);
    }

    public static String asXML(Node node, boolean indent, boolean omitXmlDecl) {
        try {
            Source xmlSource = new DOMSource(node);
            StreamResult result = new StreamResult(new ByteArrayOutputStream());
            transformerTL.get().setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDecl ? "yes" : "no");
            transformerTL.get().setOutputProperty(OutputKeys.INDENT, indent ? "yes" : "no"); //Java XML Indent
            transformerTL.get().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            transformerTL.get().transform(xmlSource, result);
            return result.getOutputStream().toString();
        } catch (TransformerException transformerError) {
            LOG.error("Error transforming node", transformerError);
        }
        return null;
    }

    public static void asXML(OutputStream xmlStream, Node node, boolean indent, boolean omitXmlDecl) {
        try {
            Source xmlSource = new DOMSource(node);
            StreamResult result = new StreamResult(xmlStream);
            transformerTL.get().setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDecl ? "yes" : "no");
            transformerTL.get().setOutputProperty(OutputKeys.INDENT, indent ? "yes" : "no"); //Java XML Indent
            transformerTL.get().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            transformerTL.get().transform(xmlSource, result);
        } catch (TransformerException transformerError) {
            LOG.error("Error transforming node", transformerError);
        }
    }

    /**
     * Use JAXP to parse XML into an {@link Element} tree.
     * Note: DOCTYPE is disallowed for security reasons
     */
    public static Element parseXML(File file) throws ServiceException, FileNotFoundException {
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            return parseXML(new FileInputStream(file));
        } finally {
            Closeables.closeQuietly(fis);
        }
    }

    /**
     * Use JAXP to parse XML into an {@link Element} tree.
     * Note: DOCTYPE is disallowed for security reasons
     */
    public static Element parseXML(InputStream is) throws XmlParseException {
        return parseXML(is, Element.XMLElement.mFactory);
    }

    /**
     * Use JAXP to parse XML into an {@link Element} tree.
     * This is faster and uses less resources than using dom4j
     * Note: DOCTYPE is disallowed for security reasons
     */
    public static Element parseXML(InputStream is, ElementFactory factory) throws XmlParseException {
        Document doc = parseXMLToDoc(is);
        return nodeToElement(doc, factory);
    }

    /**
     * Use JAXP to parse XML into an {@link Element} tree.
     * Note: DOCTYPE is disallowed for security reasons
     */
    public static Element parseXML(String xml) throws XmlParseException {
        return parseXML(xml, Element.XMLElement.mFactory);
    }

    /**
     * Use JAXP to parse XML into an {@link Element} tree.
     * This is faster and uses less resources than using dom4j
     * Note: DOCTYPE is disallowed for security reasons
     */
    public static Element parseXML(String xml, ElementFactory factory) throws XmlParseException {
        Document doc = parseXMLToDoc(xml);
        return nodeToElement(doc, factory);
    }

    public static Document parseXMLToDoc(String xml) throws XmlParseException {
        javax.xml.parsers.DocumentBuilder jaxbBuilder = getBuilder();
        jaxbBuilder.reset();
        jaxbBuilder.setErrorHandler(new JAXPErrorHandler());
        try {
            org.xml.sax.InputSource inStream = new org.xml.sax.InputSource();
            inStream.setCharacterStream(new java.io.StringReader(xml));
            return jaxbBuilder.parse(inStream);
        } catch (SAXException | IOException e) {
            /* Bug 93816 log actual problem but throw generic one to avoid information disclosure */
            logParseProblem(e);
            throw XmlParseException.PARSE_ERROR();
        }
    }

    public static Document parseXMLToDoc(InputStream is) throws XmlParseException {
        javax.xml.parsers.DocumentBuilder jaxbBuilder = getBuilder();
        jaxbBuilder.reset();
        jaxbBuilder.setErrorHandler(new JAXPErrorHandler());
        try {
            return jaxbBuilder.parse(is);
        } catch (SAXException | IOException e) {
            /* Bug 93816 log actual problem but throw generic one to avoid information disclosure */
            logParseProblem(e);
            throw XmlParseException.PARSE_ERROR();
        }
    }

    /**
     * Note: DOCTYPE is disallowed for reasons of security and protection against denial of service
     * @throws XmlParseException
     */
    public static org.dom4j.Document parseXMLToDom4jDocUsingSecureProcessing(InputStream is)
            throws XmlParseException {
        org.w3c.dom.Document w3cDoc = W3cDomUtil.parseXMLToDoc(is);
        DOMReader reader = new DOMReader();
        return reader.read(w3cDoc);
    }

    private static void logParseProblem(Exception e) {
        if (LOG.isDebugEnabled()) {
            LOG.warn("Problem parsing XML", e);
        } else {
            LOG.warn("Problem parsing XML - %s", e.getMessage());
        }
    }

    /**
     * Test whether we want to treat node as a single Element with text content or as a hierarchy.
     * There are 2 situations where we prefer text content:
     *    1 This is an XHTML node
     *    2 Contains a mixture of text and elements which isn't allowed in an Element hierarchy
     */
    private static boolean needToFlattenElementContent(Node node) {
        if (Element.XMLElement.XHTML_NS_URI.equalsIgnoreCase(node.getNamespaceURI())) {
            return true;
        }
        boolean hasElems = false;
        boolean hasText = false;
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            int childNodeType = child.getNodeType();
            switch (childNodeType) {
            case Node.ELEMENT_NODE:
                if (hasText) {
                    return true;
                }
                hasElems = true;
                break;
            case Node.TEXT_NODE:
                String content = child.getNodeValue();
                if (content != null && !content.trim().equals("")) {
                    if (hasElems) {
                        return true;
                    }
                    hasText = true;
                }
                break;
            case Node.CDATA_SECTION_NODE:
                CDATASection cdata = (CDATASection) child;
                String cdataContent = cdata.getData();
                if (cdataContent != null && !cdataContent.trim().equals("")) {
                    if (hasElems) {
                        return true;
                    }
                    hasText = true;
                }
                break;
            default:
            }
        }
        return false;
    }

    private static final String XMLNS_COLON = Element.XMLElement.A_NAMESPACE + ":";

    public static Element nodeToElement(Node node, ElementFactory factory) {
        int nodeType = node.getNodeType();
        switch (nodeType) {
        case Node.DOCUMENT_NODE:
            Document doc = (Document) node;
            return nodeToElement(doc.getDocumentElement(), factory);
        case Node.ELEMENT_NODE:
            if (needToFlattenElementContent(node)) {
                return toFlattened(node, factory);
            } else {
                return toHierarchy(node, factory);
            }
        default:
            ZimbraLog.misc.debug("Unexpected nodeType %s in convertW3cDOM", Integer.toString(nodeType));
            return null;
        }
    }

    private static void makeAttributes(Element elt, Node node) {
        NamedNodeMap attrs = node.getAttributes();
        for (int ndx = 0; ndx < attrs.getLength(); ndx++) {
            Node attrNode = attrs.item(ndx);
            String nodeName = attrNode.getNodeName();
            String qualifiedName;
            String prefix = attrNode.getPrefix();
            if (nodeName.contains(":")) {
                qualifiedName = nodeName;
            } else {
                qualifiedName = prefix == null ? nodeName : String.format("%s:%s", prefix, nodeName);
            }
            if (!(Element.XMLElement.A_NAMESPACE.equals(qualifiedName) || qualifiedName.startsWith(XMLNS_COLON))) {
                elt.addAttribute(qualifiedName, attrNode.getNodeValue());
                String nsURI = attrNode.getNamespaceURI();
                if (!Strings.isNullOrEmpty(nsURI)) {
                    /* The approach to namespaces is to ALWAYS store them on elements that use them (for either the
                     * element's name or in one of its attributes names) but ignore them where they are not used.
                     * This means that unused namespace definitions may be dropped - but that shouldn't matter.
                     * It also means that namespaces won't be dropped by mistake from detached elements because the
                     * namespace is only stored in a parent element where it was defined.
                     */
                    elt.setNamespace(prefix, nsURI);
                }
            }
        }
    }

    private static Element toHierarchy(Node node, ElementFactory factory) {
        Element elt = factory.createElement(dom4jQNameForNode(node));
        makeAttributes(elt, node);
        StringBuilder content = new StringBuilder();
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            switch (child.getNodeType()) {
            case Node.ELEMENT_NODE:
                elt.addElement(nodeToElement(child, factory));
                break;
            case Node.TEXT_NODE:
                content.append(child.getNodeValue());
                break;
            case Node.CDATA_SECTION_NODE:
                CDATASection cdata = (CDATASection) child;
                content.append(cdata.getData());
                break;
            }
        }
        String textContent = content.toString();
        if (!textContent.trim().equals("")) {
            elt.setText(textContent);
        }
        return elt;
    }

    private static Element toFlattened(Node node, ElementFactory factory) {
        Element elt = factory.createElement(dom4jQNameForNode(node));
        makeAttributes(elt, node);
        StringBuilder content = new StringBuilder();
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            switch (child.getNodeType()) {
            case Node.ELEMENT_NODE:
                content.append(W3cDomUtil.asXML(child, false)); // i.e. add textual representation of the XML
                break;
            case Node.TEXT_NODE:
                content.append(child.getNodeValue());
                break;
            case Node.CDATA_SECTION_NODE:
                CDATASection cdata = (CDATASection) child;
                content.append(cdata.getData());
                break;
            }
        }
        return elt.setText(content.toString());
    }

    private static org.dom4j.QName dom4jQNameForNode(Node node) {
        org.dom4j.Namespace ns = node.getNamespaceURI() == null ? null
                : new org.dom4j.Namespace(node.getPrefix(), node.getNamespaceURI());
        String localName = node.getNodeName();
        if (localName.contains(":")) {
            localName = localName.substring(localName.indexOf(':') + 1);
        }
        return new org.dom4j.QName(localName, ns);
    }

    // Error handler to report errors and warnings
    public static class JAXPErrorHandler implements ErrorHandler {
        JAXPErrorHandler() {
        }

        /**
         * Returns a string describing parse exception details
         */
        private String getParseExceptionInfo(String category, SAXParseException spe) {
            return String.format("%s: Problem on line %d of document : %s", category, spe.getLineNumber(),
                    spe.getMessage());
        }

        @Override
        public void warning(SAXParseException spe) throws SAXException {
            ZimbraLog.misc.warn(getParseExceptionInfo("Warning", spe));
        }

        @Override
        public void error(SAXParseException spe) throws SAXException {
            throw new SAXException(getParseExceptionInfo("Error", spe));
        }

        @Override
        public void fatalError(SAXParseException spe) throws SAXException {
            throw new SAXException(getParseExceptionInfo("Fatal Error", spe));
        }
    }

}