List of usage examples for org.jsoup.parser.Parser.htmlParser()
public static Parser htmlParser()
From source file:dslab.crawler.pack.CrawlerPack.java
/**
 * Parses an HTML string with Jsoup's HTML parser and sets the resulting
 * document's charset to UTF-8.
 *
 * @param html HTML markup to parse
 * @return the parsed document
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html) {
    // The second argument of Jsoup.parse(String, String, Parser) is the base URI used to
    // resolve relative links, not a charset name. The input is already a decoded String,
    // so an empty base URI is passed instead of the misleading "UTF-8".
    Document jsoupDoc = Jsoup.parse(html, "", Parser.htmlParser());
    jsoupDoc.charset(StandardCharsets.UTF_8);
    return jsoupDoc;
}
From source file:com.github.abola.crawler.CrawlerPack.java
/** * HTML Jsoup Document /* www. j a v a 2 s . co m*/ * * HTMLJsoup HTML Parser * * @param html Html document * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document htmlToJsoupDoc(String html) { // html(html/html5) jsoup Document Document jsoupDoc = Jsoup.parse(html, "UTF-8", Parser.htmlParser()); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; }
From source file:org.b3log.solo.plugin.list.ListHandler.java
@Override public void action(final Event<JSONObject> event) throws EventException { final JSONObject data = event.getData(); final JSONObject article = data.optJSONObject(Article.ARTICLE); String content = article.optString(Article.ARTICLE_CONTENT); final Document doc = Jsoup.parse(content, StringUtils.EMPTY, Parser.htmlParser()); doc.outputSettings().prettyPrint(false); final StringBuilder listBuilder = new StringBuilder(); listBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + Latkes.getStaticServePath() + "/plugins/list/style.css\" />"); final Elements hs = doc.select("h1, h2, h3, h4, h5"); listBuilder.append("<ul class='b3-solo-list'>"); for (int i = 0; i < hs.size(); i++) { final Element element = hs.get(i); final String tagName = element.tagName().toLowerCase(); final String text = element.text(); final String id = "b3_solo_" + tagName + "_" + i; element.before("<span id='" + id + "'></span>"); listBuilder.append("<li class='b3-solo-list-").append(tagName).append("'><a href='#").append(id) .append("'>").append(text).append("</a></li>"); }//from w w w.j a va 2 s. c o m listBuilder.append("</ul>"); final Element body = doc.getElementsByTag("body").get(0); content = listBuilder.toString() + body.html(); article.put(Article.ARTICLE_CONTENT, content); }
From source file:org.b3log.symphony.util.Markdowns.java
/** * Gets the safe HTML content of the specified content. * * @param content the specified content// w w w . j a v a 2 s .com * @param baseURI the specified base URI, the relative path value of href will starts with this URL * @return safe HTML content */ public static String clean(final String content, final String baseURI) { final Document.OutputSettings outputSettings = new Document.OutputSettings(); outputSettings.prettyPrint(false); final String tmp = Jsoup.clean(content, baseURI, Whitelist.relaxed().addAttributes(":all", "id", "target", "class") .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u") .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight") .addAttributes("audio", "controls", "src") .addAttributes("video", "controls", "src", "width", "height") .addAttributes("source", "src", "media", "type") .addAttributes("object", "width", "height", "data", "type") .addAttributes("param", "name", "value") .addAttributes("input", "type", "disabled", "checked").addAttributes("embed", "src", "type", "width", "height", "wmode", "allowNetworking"), outputSettings); final Document doc = Jsoup.parse(tmp, baseURI, Parser.htmlParser()); final Elements ps = doc.getElementsByTag("p"); for (final Element p : ps) { p.removeAttr("style"); } final Elements iframes = doc.getElementsByTag("iframe"); for (final Element iframe : iframes) { final String src = StringUtils.deleteWhitespace(iframe.attr("src")); if (StringUtils.startsWithIgnoreCase(src, "javascript") || StringUtils.startsWithIgnoreCase(src, "data:")) { iframe.remove(); } } final Elements objs = doc.getElementsByTag("object"); for (final Element obj : objs) { final String data = StringUtils.deleteWhitespace(obj.attr("data")); if (StringUtils.startsWithIgnoreCase(data, "data:") || StringUtils.startsWithIgnoreCase(data, "javascript")) { obj.remove(); continue; } final String type = StringUtils.deleteWhitespace(obj.attr("type")); if 
(StringUtils.containsIgnoreCase(type, "script")) { obj.remove(); } } final Elements embeds = doc.getElementsByTag("embed"); for (final Element embed : embeds) { final String data = StringUtils.deleteWhitespace(embed.attr("src")); if (StringUtils.startsWithIgnoreCase(data, "data:") || StringUtils.startsWithIgnoreCase(data, "javascript")) { embed.remove(); continue; } } final Elements as = doc.getElementsByTag("a"); for (final Element a : as) { a.attr("rel", "nofollow"); final String href = a.attr("href"); if (href.startsWith(Latkes.getServePath())) { continue; } a.attr("target", "_blank"); } final Elements audios = doc.getElementsByTag("audio"); for (final Element audio : audios) { audio.attr("preload", "none"); } final Elements videos = doc.getElementsByTag("video"); for (final Element video : videos) { video.attr("preload", "none"); } String ret = doc.body().html(); ret = ret.replaceAll("(</?br\\s*/?>\\s*)+", "<br>"); // patch for Jsoup issue return ret; }