org.htmlcleaner.XWikiDOMSerializer.java Source code

Introduction

Here is the source code for org.htmlcleaner.XWikiDOMSerializer.java
Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.htmlcleaner;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;

/**
 * Generate a W3C Document from a SF's HTML Cleaner TagNode.
 *
 * Some code has been copy-pasted from SF's HTML Cleaner code (which is under a BDS license, see
 * http://htmlcleaner.sourceforge.net/license.php). Our goal is to remove this class completely if we can get SF's HTML
 * Cleaner to support the CDATA-related use cases that force us to have this class.
 *
 * Remove when the following issues have been fixed:
 * <ul>
 *     <li>https://sourceforge.net/p/htmlcleaner/bugs/169/</li>
 * </ul>
 *
 * Note: Even though in a public package this code is not meant to be a public API. We've had to put in under the {@code
 * org.htmlcleaner} package because of https://sourceforge.net/p/htmlcleaner/bugs/167/.
 *
 * @version $Id: 124eceb29fd098c392e9dcffa5c21bfef5cecb8e $
 * @since 1.8.2
 */
public class XWikiDOMSerializer {
    /**
     * The marker that opens a CDATA section.
     */
    private static final String CDATA_START = "<![CDATA[";

    /**
     * The marker that closes a CDATA section.
     */
    private static final String CDATA_END = "]]>";

    /**
     * The Regex Pattern to recognize a CDATA block. The block starts at a CDATA opening marker and, because the
     * quantifier is greedy, extends up to the last CDATA closing marker or the last (nested/unterminated) CDATA
     * opening marker found in the input. Group 1 captures that terminating marker.
     */
    private static final Pattern CDATA_PATTERN = Pattern.compile("<!\\[CDATA\\[.*(\\]\\]>|<!\\[CDATA\\[)",
            Pattern.DOTALL);

    private static final String CSS_COMMENT_START = "/*";

    private static final String CSS_COMMENT_END = "*/";

    private static final String JS_COMMENT = "//";

    private static final String NEW_LINE = "\n";

    private static final String SCRIPT_TAG_NAME = "script";

    private static final String STYLE_TAG_NAME = "style";

    private static final String HTML_TAG_NAME = "html";

    private static final String ID_ATTRIBUTE_NAME = "id";

    /**
     * The HTML Cleaner properties set by the user to control the HTML cleaning.
     */
    private final CleanerProperties props;

    /**
     * Whether XML entities should be escaped or not.
     */
    private final boolean escapeXml;

    /**
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     */
    public XWikiDOMSerializer(CleanerProperties props, boolean escapeXml) {
        this.props = props;
        this.escapeXml = escapeXml;
    }

    /**
     * Build a W3C Document out of an HTML Cleaner node tree.
     *
     * @param documentDocumentBuilder the {@link DocumentBuilder} instance to use. A DocumentBuilder is not guaranteed
     * to be thread safe, so the same instance should only be used from a single thread
     * @param rootNode the HTML Cleaner root node to serialize
     * @return the W3C Document object
     * @throws ParserConfigurationException if there's an error during serialization
     */
    public Document createDOM(DocumentBuilder documentDocumentBuilder, TagNode rootNode)
            throws ParserConfigurationException {
        DOMImplementation impl = documentDocumentBuilder.getDOMImplementation();

        // Copied from the source code of HTML Cleaner.

        Document document;

        //
        // Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
        //
        // Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
        // common patterns for errors with the older HTML4 doctypes.
        //
        if (rootNode.getDocType() != null) {
            String qualifiedName = rootNode.getDocType().getPart1();
            String publicId = rootNode.getDocType().getPublicId();
            String systemId = rootNode.getDocType().getSystemId();

            //
            // If there is no qualified name, set it to html. See bug #153.
            //
            if (qualifiedName == null) {
                qualifiedName = HTML_TAG_NAME;
            }

            DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);

            //
            // While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html".
            // See bug #116
            //
            if ("HTML".equals(qualifiedName)) {
                qualifiedName = HTML_TAG_NAME;
            }
            document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
        } else {
            // No DOCTYPE: create an empty document and name its root element after the root node.
            document = documentDocumentBuilder.newDocument();
            document.appendChild(document.createElement(rootNode.getName()));
        }

        //
        // Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
        //
        Element rootElement = document.getDocumentElement();
        for (Map.Entry<String, String> entry : rootNode.getAttributes().entrySet()) {
            String attrName = entry.getKey();
            rootElement.setAttribute(attrName, escapeAttributeValue(entry.getValue()));

            //
            // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
            //
            if (ID_ATTRIBUTE_NAME.equalsIgnoreCase(attrName)) {
                rootElement.setIdAttribute(attrName, true);
            }
        }

        createSubnodes(document, rootElement, rootNode.getAllChildren());

        return document;
    }

    /**
     * @param value the raw attribute value
     * @return the value with XML entities escaped if escaping has been requested, the unmodified value otherwise
     */
    private String escapeAttributeValue(String value) {
        return this.escapeXml ? Utils.escapeXml(value, this.props, true) : value;
    }

    /**
     * Append the buffered text content to the passed element and empty the buffer, performing CDATA transformations
     * if the user has specified to use CDATA inside scripts and style elements. Nothing is done when the buffer is
     * empty or when the current node is itself a content node (its text gets buffered with the rest so that
     * consecutive content nodes are flushed together).
     *
     * @param document the W3C Document to use for creating new DOM elements
     * @param element the W3C element to which we'll add the text content to
     * @param bufferedContent the buffered text content on which we need to perform the CDATA transformations
     * @param item the current HTML Cleaner node being processed
     */
    private void flushContent(Document document, Element element, StringBuilder bufferedContent, Object item) {
        if (bufferedContent.length() == 0 || item instanceof ContentNode) {
            return;
        }

        boolean specialCase = this.props.isUseCdataForScriptAndStyle() && isScriptOrStyle(element);
        String content = bufferedContent.toString();

        if (specialCase) {
            // The content ends up inside a CDATA section: no escaping, but existing CDATA markers must be removed.
            content = processCDATABlocks(content);
        } else if (this.escapeXml) {
            content = Utils.escapeXml(content, this.props, true);
        }

        if (!specialCase) {
            element.appendChild(document.createTextNode(content));
        } else if (SCRIPT_TAG_NAME.equalsIgnoreCase(element.getNodeName())) {
            // Generate a javascript comment in front of the CDATA block so that it works in IE and when
            // serving XHTML under a mimetype of HTML.
            element.appendChild(document.createTextNode(JS_COMMENT));
            element.appendChild(document.createCDATASection(NEW_LINE + content + NEW_LINE + JS_COMMENT));
        } else {
            // Same idea for CSS, using CSS comments to hide the CDATA markers.
            element.appendChild(document.createTextNode(CSS_COMMENT_START));
            element.appendChild(document.createCDATASection(
                    CSS_COMMENT_END + StringUtils.chomp(content) + NEW_LINE + CSS_COMMENT_START));
            element.appendChild(document.createTextNode(CSS_COMMENT_END));
        }

        bufferedContent.setLength(0);
    }

    /**
     * Remove any existing CDATA section and unencode HTML entities that are not inside a CDATA block.
     *
     * @param content the text input to transform
     * @return the transformed content that will be wrapped inside a CDATA block
     */
    private String processCDATABlocks(String content) {
        StringBuilder result = new StringBuilder();
        Matcher matcher = CDATA_PATTERN.matcher(content);
        int cursor = 0;
        while (matcher.find()) {
            // Text located before the CDATA block: entities must be decoded.
            result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor, matcher.start())));
            // Text located between the opening marker and the terminating marker (group 1): copied verbatim.
            result.append(content, matcher.start() + CDATA_START.length(), matcher.start(1));
            // Resume right after the terminating marker, whatever its length is: it can be either the closing
            // marker or a nested opening marker, and stopping in the middle of the latter would leak a part of
            // the marker into the result.
            cursor = matcher.end();
        }
        // Copy the remaining text data in the result buffer
        if (cursor < content.length()) {
            result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor)));
        }
        // Ensure there's no invalid <![CDATA[ or ]]> remaining.
        return result.toString().replace(CDATA_START, "").replace(CDATA_END, "");
    }

    /**
     * @param element the element to check
     * @return true if the passed element is a script or style element
     */
    protected boolean isScriptOrStyle(Element element) {
        String tagName = element.getNodeName();
        return SCRIPT_TAG_NAME.equalsIgnoreCase(tagName) || STYLE_TAG_NAME.equalsIgnoreCase(tagName);
    }

    /**
     * Serialize a given SF HTML Cleaner node.
     *
     * @param document the W3C Document to use for creating new DOM elements
     * @param element the W3C element to which we'll add the subnodes to
     * @param tagChildren the SF HTML Cleaner nodes to serialize for that node, nothing is done when null
     */
    private void createSubnodes(Document document, Element element, List<? extends BaseToken> tagChildren) {
        // We've modified the original implementation based in SF's HTML Cleaner to better handle CDATA.
        // More specifically we want to handle the following 3 use cases:
        //
        // Use case 1: useCdata = true && input is:
        // <script>...<![CDATA[...]]>...</script>
        // In this case we must make sure to have only one CDATA block.
        //
        // Use case 2: useCdata = true && input is:
        // <script>...entities not encoded (e.g. "<")...</script>
        // We must generate a CDATA block around the whole content (the HTML Tokenizer split
        // ContentToken on "<" character so we need to join them before creating the CDATA block.
        // We must also unencode any entities (i.e. transform "&lt;" into "<") since we'll be
        // wrapping them in a CDATA section.
        //
        // Use case 3: useCData = false
        // Simply group all ContentToken together.

        if (tagChildren == null) {
            return;
        }

        StringBuilder bufferedContent = new StringBuilder();

        for (Object item : tagChildren) {
            // Flush content tokens (a no-op while we keep receiving content nodes)
            flushContent(document, element, bufferedContent, item);

            if (item instanceof CommentNode) {
                CommentNode commentToken = (CommentNode) item;
                Comment comment = document.createComment(commentToken.getContent());
                element.appendChild(comment);
            } else if (item instanceof ContentNode) {
                ContentNode contentToken = (ContentNode) item;
                bufferedContent.append(contentToken.getContent());
            } else if (item instanceof TagNode) {
                TagNode subTagNode = (TagNode) item;
                Element subelement = document.createElement(subTagNode.getName());
                for (Map.Entry<String, String> entry : subTagNode.getAttributes().entrySet()) {
                    subelement.setAttribute(entry.getKey(), escapeAttributeValue(entry.getValue()));
                }

                // recursively create subnodes
                createSubnodes(document, subelement, subTagNode.getAllChildren());

                element.appendChild(subelement);
            } else if (item instanceof List<?>) {
                // A nested list of tokens is serialized in place, under the same parent element.
                @SuppressWarnings("unchecked")
                List<BaseToken> sublist = (List<BaseToken>) item;
                createSubnodes(document, element, sublist);
            }
        }

        // Flush whatever text is still buffered after the last child.
        flushContent(document, element, bufferedContent, null);
    }
}