Example usage for org.jsoup.parser Parser htmlParser

List of usage examples for org.jsoup.parser Parser htmlParser

Introduction

On this page you can find example usages of org.jsoup.parser.Parser.htmlParser().

Prototype

public static Parser htmlParser() 

Source Link

Document

Create a new HTML parser.

Usage

From source file:dslab.crawler.pack.CrawlerPack.java

/**
 * Parses an HTML string into a jsoup document using jsoup's HTML parser.
 *
 * <p>The second argument of {@code Jsoup.parse(String, String, Parser)} is the base URI used to
 * resolve relative links, not a character set. The input is already a decoded {@code String}, so
 * an empty base URI is passed instead of the charset name.
 *
 * @param html HTML markup to parse
 * @return the parsed document, after {@code charset(StandardCharsets.UTF_8)} has been applied to it
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html) {

    // No base URI is known for a bare HTML string, so pass the empty string.
    Document jsoupDoc = Jsoup.parse(html, "", Parser.htmlParser());
    jsoupDoc.charset(StandardCharsets.UTF_8);

    return jsoupDoc;
}

From source file:com.github.abola.crawler.CrawlerPack.java

/**
 *  HTML  Jsoup Document /* www. j a  v a 2 s .  co m*/
 *
 * HTMLJsoup HTML Parser
 *
 * @param html Html document
 * @return org.jsoup.nodes.Document
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html) {

    //  html(html/html5)  jsoup Document 
    Document jsoupDoc = Jsoup.parse(html, "UTF-8", Parser.htmlParser());
    jsoupDoc.charset(StandardCharsets.UTF_8);

    return jsoupDoc;
}

From source file:org.b3log.solo.plugin.list.ListHandler.java

/**
 * Prepends a heading list (table of contents) to the content of the article carried by the event.
 *
 * <p>The article content is parsed as HTML. Every {@code h1}-{@code h5} element gets an empty
 * {@code <span>} anchor inserted before it, and a {@code <ul class='b3-solo-list'>} linking to
 * those anchors is built. The stylesheet link, the list and the rewritten body HTML are then
 * stored back into the article under {@code Article.ARTICLE_CONTENT}.
 *
 * <p>NOTE(review): the article object is dereferenced without a null check, so the event data
 * is assumed to always contain {@code Article.ARTICLE} - confirm against the event publisher.
 *
 * @param event the event whose data holds the article to process
 * @throws EventException declared by the overridden method; not thrown directly in this body
 */
@Override
public void action(final Event<JSONObject> event) throws EventException {
    final JSONObject data = event.getData();
    final JSONObject article = data.optJSONObject(Article.ARTICLE);

    String content = article.optString(Article.ARTICLE_CONTENT);

    final Document doc = Jsoup.parse(content, StringUtils.EMPTY, Parser.htmlParser());
    // Keep the body markup as written instead of letting jsoup re-indent it.
    doc.outputSettings().prettyPrint(false);

    final StringBuilder listBuilder = new StringBuilder();

    listBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"")
            .append(Latkes.getStaticServePath())
            .append("/plugins/list/style.css\" />");

    final Elements hs = doc.select("h1, h2, h3, h4, h5");

    listBuilder.append("<ul class='b3-solo-list'>");
    for (int i = 0; i < hs.size(); i++) {
        final Element element = hs.get(i);
        final String tagName = element.tagName().toLowerCase();
        // text() yields decoded text, so it must be re-escaped before being written as HTML;
        // otherwise a heading such as "List<String>" breaks the list or injects markup.
        final String text = escapeHeadingText(element.text());
        final String id = "b3_solo_" + tagName + "_" + i;

        // Anchor target the list entry links to.
        element.before("<span id='" + id + "'></span>");

        listBuilder.append("<li class='b3-solo-list-").append(tagName).append("'><a href='#").append(id)
                .append("'>").append(text).append("</a></li>");
    }
    listBuilder.append("</ul>");

    final Element body = doc.getElementsByTag("body").get(0);

    content = listBuilder.toString() + body.html();

    article.put(Article.ARTICLE_CONTENT, content);
}

/**
 * Escapes the characters that are significant in HTML element content.
 *
 * @param text plain text to escape
 * @return the text with {@code &}, {@code <} and {@code >} replaced by character references
 */
private static String escapeHeadingText(final String text) {
    final StringBuilder escaped = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
        final char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}

From source file:org.b3log.symphony.util.Markdowns.java

/**
 * Gets the safe HTML content of the specified content.
 *
 * <p>The content is first passed through {@code Jsoup.clean} with a relaxed whitelist extended by
 * extra tags and attributes, then re-parsed and post-processed:
 * <ul>
 * <li>{@code style} attributes are removed from {@code p} elements;</li>
 * <li>{@code iframe}, {@code object} and {@code embed} elements whose URL attribute starts with
 * {@code javascript} or {@code data:} (ignoring case and whitespace) are removed, as are
 * {@code object} elements whose {@code type} contains {@code script};</li>
 * <li>every {@code a} gets {@code rel="nofollow"}, and {@code target="_blank"} unless its
 * {@code href} starts with {@code Latkes.getServePath()};</li>
 * <li>{@code audio} and {@code video} elements get {@code preload="none"}.</li>
 * </ul>
 *
 * @param content the specified content
 * @param baseURI the specified base URI, the relative path value of href will starts with this URL
 * @return safe HTML content
 */
public static String clean(final String content, final String baseURI) {
    final Document.OutputSettings settings = new Document.OutputSettings();
    settings.prettyPrint(false);

    final Whitelist whitelist = Whitelist.relaxed()
            .addAttributes(":all", "id", "target", "class")
            .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u")
            .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight")
            .addAttributes("audio", "controls", "src")
            .addAttributes("video", "controls", "src", "width", "height")
            .addAttributes("source", "src", "media", "type")
            .addAttributes("object", "width", "height", "data", "type")
            .addAttributes("param", "name", "value")
            .addAttributes("input", "type", "disabled", "checked")
            .addAttributes("embed", "src", "type", "width", "height", "wmode", "allowNetworking");

    final String sanitized = Jsoup.clean(content, baseURI, whitelist, settings);
    final Document doc = Jsoup.parse(sanitized, baseURI, Parser.htmlParser());

    // Inline styles on paragraphs are dropped.
    for (final Element paragraph : doc.getElementsByTag("p")) {
        paragraph.removeAttr("style");
    }

    // Frames pointing at script or data URLs are removed entirely.
    for (final Element frame : doc.getElementsByTag("iframe")) {
        final String frameSrc = StringUtils.deleteWhitespace(frame.attr("src"));
        final boolean scriptUrl = StringUtils.startsWithIgnoreCase(frameSrc, "javascript");
        final boolean dataUrl = StringUtils.startsWithIgnoreCase(frameSrc, "data:");
        if (scriptUrl || dataUrl) {
            frame.remove();
        }
    }

    // Objects are removed for a script/data URL or for a script-like type.
    for (final Element object : doc.getElementsByTag("object")) {
        final String objectData = StringUtils.deleteWhitespace(object.attr("data"));
        final boolean unsafeData = StringUtils.startsWithIgnoreCase(objectData, "data:")
                || StringUtils.startsWithIgnoreCase(objectData, "javascript");
        if (unsafeData) {
            object.remove();
        } else {
            final String objectType = StringUtils.deleteWhitespace(object.attr("type"));
            if (StringUtils.containsIgnoreCase(objectType, "script")) {
                object.remove();
            }
        }
    }

    // Same URL check for embeds, applied to their src attribute.
    for (final Element embedded : doc.getElementsByTag("embed")) {
        final String embedSrc = StringUtils.deleteWhitespace(embedded.attr("src"));
        final boolean unsafeSrc = StringUtils.startsWithIgnoreCase(embedSrc, "data:")
                || StringUtils.startsWithIgnoreCase(embedSrc, "javascript");
        if (unsafeSrc) {
            embedded.remove();
        }
    }

    // All links are nofollow; links not starting with the serve path also get target="_blank".
    for (final Element link : doc.getElementsByTag("a")) {
        link.attr("rel", "nofollow");

        final String target = link.attr("href");
        if (!target.startsWith(Latkes.getServePath())) {
            link.attr("target", "_blank");
        }
    }

    for (final Element audio : doc.getElementsByTag("audio")) {
        audio.attr("preload", "none");
    }

    for (final Element video : doc.getElementsByTag("video")) {
        video.attr("preload", "none");
    }

    // Collapse each run of <br> variants into a single <br> (patch for Jsoup issue).
    final String bodyHtml = doc.body().html();
    return bodyHtml.replaceAll("(</?br\\s*/?>\\s*)+", "<br>");
}