com.seajas.search.utilities.web.WebPages.java Source code

Introduction

Here is the source code for com.seajas.search.utilities.web.WebPages.java
Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.utilities.web;

import nu.validator.htmlparser.dom.HtmlDocumentBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.xml.SimpleNamespaceContext;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.Reader;
import java.io.StringReader;

import static javax.xml.xpath.XPathConstants.NODESET;

/**
 * HTML utilities: text extraction from DOM nodes, HTML parsing and XPath lookup.
 *
 * <p>The text helpers ({@link #getText(Node)} and the {@code joinText} overloads) are static.
 * Parsing and XPath evaluation are instance methods because they lazily create and then reuse
 * a parser and an XPath engine. Those are cached in plain instance fields without any
 * synchronization, so sharing one instance between threads presumably needs external
 * coordination (NOTE(review): confirm against the callers).
 *
 * @author Pascal S. de Kloe
 */
public class WebPages {

    private static final Logger logger = LoggerFactory.getLogger(WebPages.class);

    /** Created on the first {@link #parse(String)} call and reused afterwards. */
    private DocumentBuilder htmlParser;

    /** Created on the first {@link #findNodes(Node, String)} call and reused afterwards. */
    private XPath xPathEngine;

    /**
     * Gets the textual content.
     *
     * <p>For an element node, the result is the concatenation, in document order, of the text of
     * all its descendants. For any other node type the result is the node value, or the empty
     * string when the node has no value.
     *
     * <p>NOTE(review): per the DOM {@code Node.getNodeValue()} contract this presumably also
     * picks up the content of comment nodes, and yields the empty string for a document node
     * because only {@code ELEMENT_NODE} is descended into. Confirm that this is intended.
     *
     * @param html the markup fragment; must not be {@code null}.
     * @return the text, never {@code null}.
     */
    public static String getText(Node html) {
        StringBuilder text = new StringBuilder();
        appendText(html, text);
        return text.toString();
    }

    /**
     * Appends the textual content of {@code node} to {@code text}.
     *
     * <p>One builder is shared by the whole traversal so that nested text is not copied again
     * at every ancestor level.
     */
    private static void appendText(Node node, StringBuilder text) {
        if (node.getNodeType() != Node.ELEMENT_NODE) {
            String value = node.getNodeValue();
            if (value != null) {
                text.append(value);
            }
            return;
        }

        logger.trace("Traveling element node");
        NodeList children = node.getChildNodes();
        int count = children.getLength();
        // Parameterized logging: the message is only assembled when debug is enabled, which
        // matters because this runs once per element of the tree.
        logger.debug("Searching for text in {} elements", count);
        for (int i = 0; i < count; ++i) {
            appendText(children.item(i), text);
        }
    }

    /**
     * Concatenates the {@link #getText(org.w3c.dom.Node) textual content}.
     *
     * <p>Nodes without any text are skipped, so no separator is emitted for them.
     *
     * @param html      the markup fragments.
     * @param separator the optional join characters; {@code null} joins without a separator.
     * @return the joined text, never {@code null}.
     */
    public static String joinText(NodeList html, String separator) {
        return joinText(convertToArray(html), separator);
    }

    /**
     * Concatenates the {@link #getText(org.w3c.dom.Node) textual content}.
     *
     * <p>Nodes without any text are skipped, so no separator is emitted for them.
     *
     * @param html      the markup fragments.
     * @param separator the optional join characters; {@code null} joins without a separator.
     * @return the joined text, never {@code null}.
     */
    public static String joinText(Node[] html, String separator) {
        int count = html.length;
        logger.debug("Joining {} nodes with '{}'", count, separator);

        StringBuilder text = new StringBuilder();
        for (int i = 0; i < count; ++i) {
            String addition = getText(html[i]);
            if (addition.isEmpty()) {
                continue;
            }
            // The separator goes between two non-empty contributions only.
            if (text.length() != 0 && separator != null) {
                text.append(separator);
            }
            text.append(addition);
        }
        return text.toString();
    }

    /**
     * Copies the entries of a node list into a new array, preserving their order.
     */
    private static Node[] convertToArray(NodeList list) {
        Node[] copy = new Node[list.getLength()];
        for (int i = 0; i < copy.length; i++) {
            copy[i] = list.item(i);
        }
        return copy;
    }

    /**
     * Creates a selector for the given data.
     *
     * <p>A {@link Node} is wrapped directly, a {@link String} is first run through
     * {@link #parse(String)}, and anything else (including {@code null}) gets a
     * {@code NullSelector}.
     *
     * <p>NOTE(review): when {@link #parse(String)} fails it returns {@code null}, which is
     * handed to {@code CssSelector} as is. Confirm that {@code CssSelector} copes with that.
     *
     * @param data a DOM node, serialized markup, or anything else.
     * @return a selector, never {@code null}.
     */
    public Selector selector(Object data) {
        if (data instanceof Node) {
            return new CssSelector((Node) data);
        } else if (data instanceof String) {
            return new CssSelector(parse((String) data));
        } else {
            return new NullSelector();
        }
    }

    /**
     * Builds a DOM.
     *
     * <p>The parser is created on first use and kept for later calls.
     *
     * @param html the serialized markup.
     * @return the document, or {@code null} when parsing failed; the failure is logged
     *         at error level and not propagated.
     */
    public Document parse(String html) {
        DocumentBuilder parser = htmlParser;
        if (parser == null) {
            parser = new HtmlDocumentBuilder();
            htmlParser = parser;
        }

        try {
            Reader reader = new StringReader(html);
            return parser.parse(new InputSource(reader));
        } catch (Exception e) {
            // Deliberately broad: any failure (including a null argument) is reported as
            // "no document" rather than as an exception.
            logger.error("Can't parse HTML", e);
            return null;
        }
    }

    /**
     * Evaluates an XPath.
     *
     * <p>The prefix {@code ht} is bound to the XHTML namespace
     * {@code http://www.w3.org/1999/xhtml} and can be used in the expression.
     *
     * @param html  the markup root.
     * @param xPath the expression.
     * @return the matching nodes.
     * @throws XPathExpressionException when the expression can not be evaluated.
     */
    public NodeList findNodes(Node html, String xPath) throws XPathExpressionException {
        XPath engine = getXPathEngine();
        NodeList result = (NodeList) engine.evaluate(xPath, html, NODESET);
        if (logger.isDebugEnabled()) {
            logger.debug("XPath {} gave {} nodes", xPath, result.getLength());
        }
        return result;
    }

    /**
     * Returns the XPath engine, creating and configuring it on first use.
     */
    private XPath getXPathEngine() {
        XPath engine = xPathEngine;
        if (engine == null) {
            XPathFactory factory = XPathFactory.newInstance();
            engine = factory.newXPath();
            SimpleNamespaceContext namespaces = new SimpleNamespaceContext();
            namespaces.bindNamespaceUri("ht", "http://www.w3.org/1999/xhtml");
            engine.setNamespaceContext(namespaces);
            xPathEngine = engine;
        }
        return engine;
    }

}