List of usage examples for org.jsoup.nodes.Document#body()
public Element body()
From source file:com.near.chimerarevo.fragments.PostFragment.java
private void parseHTML(String html) { Document doc = Jsoup.parse(html); Elements el = doc.body().children(); for (Element e : el) { if (e.getElementsByTag("h1").size() > 0) parseTitles(e.getElementsByTag("h1"), 1); if (e.getElementsByTag("h2").size() > 0) parseTitles(e.getElementsByTag("h2"), 2); if (e.getElementsByTag("h3").size() > 0) parseTitles(e.getElementsByTag("h3"), 3); if (e.getElementsByTag("h4").size() > 0) parseTitles(e.getElementsByTag("h4"), 4); if (e.getElementsByTag("h5").size() > 0) parseTitles(e.getElementsByTag("h5"), 5); if (e.getElementsByTag("p").size() > 0) parseParagraphs(e.getElementsByTag("p")); if (e.getElementsByTag("img").size() > 0) parseNormalImages(e.getElementsByTag("img")); if (e.getElementsByTag("a").size() > 0) parseLinkedImages(e.getElementsByTag("a")); if (e.getElementsByTag("iframe").size() > 0) parseYoutubeVideos(e.getElementsByTag("iframe")); if (e.getElementsByTag("ul").size() > 0) parseBulletedLists(e.getElementsByTag("ul")); if (e.getElementsByTag("ol").size() > 0) parseOrderedLists(e.getElementsByTag("ol")); if (e.getElementsByTag("pre").size() > 0) parseCodeText(e.getElementsByTag("pre")); if (e.getElementsByTag("tr").size() > 0) parseTables(e.getElementsByTag("tr")); }// w w w. j a v a 2 s . c o m ((PostContainerActivity) getActivity()).setIsLoading(false); }
From source file:nl.ivonet.epub.metadata.BigBookSearch.java
/**
 * Collects candidate cover pictures for the given search phrase.
 *
 * <p>Result pages are requested one at a time, starting at page 1. Paging
 * stops as soon as a page's body text equals {@code NO_RESULTS} or after the
 * tenth page has been processed. No page beyond the limit is requested.
 *
 * @param search the phrase to search covers for
 * @return a map from each image's {@code alt} attribute to its {@code src}
 *         attribute; a later image with the same {@code alt} replaces an
 *         earlier one
 */
Map<String, String> retrievePossibles(final String search) {
    final int maxPages = 10;
    final String tokens = tokenize(search);
    final Map<String, String> pictures = new HashMap<>();

    for (int page = 1; page <= maxPages; page++) {
        final Document document = webPage.get(bigBookSearchUrl(tokens, page));
        if (NO_RESULTS.equals(document.body().text())) {
            break;
        }
        LOG.debug("Searching cover for [{}] on page [{}]", search, page);
        document.body().select("img")
                .forEach(element -> pictures.put(element.attr("alt"), element.attr("src")));
    }
    return pictures;
}
From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java
@Override public Document runFilter(Document document) { final Document clean = Document.createShell(document.baseUri()); if (document.body() != null) // frameset documents won't have a body. the clean doc will have empty body. copySafeNodes(document.body(), clean.body()); return clean; }
From source file:org.apache.archiva.web.docs.RestDocsServlet.java
@Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { logger.debug("docs request to path: {}", req.getPathInfo()); String path = StringUtils.removeStart(req.getPathInfo(), "/"); InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(path); if (StringUtils.endsWith(path, ".xsd")) { StringEscapeUtils.escapeXml(resp.getWriter(), IOUtils.toString(is)); //IOUtils.copy( is, resp.getOutputStream() ); return;/*from w w w. j av a2s .co m*/ } String startPath = StringUtils.substringBefore(path, "/"); // replace all links !! Document document = Jsoup.parse(is, "UTF-8", ""); Element body = document.body().child(0); Elements links = body.select("a[href]"); for (Element link : links) { link.attr("href", "#" + startPath + "/" + link.attr("href")); } Elements datalinks = body.select("[data-href]"); for (Element link : datalinks) { link.attr("data-href", "#" + startPath + "/" + link.attr("data-href")); } Elements codes = body.select("code"); for (Element code : codes) { code.attr("class", code.attr("class") + " nice-code"); } //default generated enunciate use h1/h2/h3 which is quite big so transform to h3/h4/h5 Elements headers = body.select("h1"); for (Element header : headers) { header.tagName("h3"); } headers = body.select("h2"); for (Element header : headers) { header.tagName("h4"); } headers = body.select("h3"); for (Element header : headers) { header.tagName("h5"); } Document res = new Document(""); res.appendChild(body.select("div[id=main]").first()); Elements scripts = body.select("script"); for (Element script : scripts) { res.appendChild(script); } resp.getOutputStream().write(res.outerHtml().getBytes()); }
From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java
@Override public String toPlainText(String html) { try {//www.j ava 2s . c om Document document = Jsoup.parse(html); Element body = Optional.ofNullable(document.body()).orElse(document); return flatten(body, INITIAL_LIST_NESTED_LEVEL).map(this::convertNodeToText).reduce("", (s1, s2) -> s1 + s2); } catch (Exception e) { LOGGER.warn("Failed extracting text from html", e); return html; } }
From source file:org.apdplat.superword.tools.PrefixSuffixOptimizer.java
/**
 * Reads the source HTML from the classpath, applies
 * {@code PrefixSuffixOptimizer::replace} to every paragraph inside a table
 * cell, writes the resulting body HTML to {@code DST_HTML}, and writes the
 * sorted contents of {@code WORDS} as a numbered, tab-separated list to
 * {@code DST_WORD}.
 *
 * @param args unused
 * @throws Exception if the source resource is missing or reading/writing fails
 */
public static void main(String[] args) throws Exception {
    Document document;
    // Close the classpath stream as soon as parsing is done.
    try (InputStream in = PrefixSuffixOptimizer.class.getResourceAsStream(SRC_HTML)) {
        if (in == null) {
            throw new IllegalStateException("Classpath resource not found: " + SRC_HTML);
        }
        document = Jsoup.parse(in, "utf-8", "");
    }

    document.select("table tbody tr td p").forEach(PrefixSuffixOptimizer::replace);
    Files.write(Paths.get(DST_HTML), document.body().html().getBytes("utf-8"));

    AtomicInteger i = new AtomicInteger();
    StringBuilder text = new StringBuilder();
    // forEachOrdered keeps the numbering aligned with the sorted order.
    WORDS.stream().sorted()
            .forEachOrdered(word -> text.append(i.incrementAndGet()).append("\t").append(word).append("\n"));
    Files.write(Paths.get(DST_WORD), text.toString().getBytes("utf-8"));
}
From source file:org.b3log.symphony.util.Markdowns.java
/** * Gets the safe HTML content of the specified content. * * @param content the specified content//from w ww .ja va2 s . c om * @param baseURI the specified base URI, the relative path value of href will starts with this URL * @return safe HTML content */ public static String clean(final String content, final String baseURI) { final Document.OutputSettings outputSettings = new Document.OutputSettings(); outputSettings.prettyPrint(false); final String tmp = Jsoup.clean(content, baseURI, Whitelist.relaxed().addAttributes(":all", "id", "target", "class") .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u") .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight") .addAttributes("audio", "controls", "src") .addAttributes("video", "controls", "src", "width", "height") .addAttributes("source", "src", "media", "type") .addAttributes("object", "width", "height", "data", "type") .addAttributes("param", "name", "value") .addAttributes("input", "type", "disabled", "checked").addAttributes("embed", "src", "type", "width", "height", "wmode", "allowNetworking"), outputSettings); final Document doc = Jsoup.parse(tmp, baseURI, Parser.htmlParser()); final Elements ps = doc.getElementsByTag("p"); for (final Element p : ps) { p.removeAttr("style"); } final Elements iframes = doc.getElementsByTag("iframe"); for (final Element iframe : iframes) { final String src = StringUtils.deleteWhitespace(iframe.attr("src")); if (StringUtils.startsWithIgnoreCase(src, "javascript") || StringUtils.startsWithIgnoreCase(src, "data:")) { iframe.remove(); } } final Elements objs = doc.getElementsByTag("object"); for (final Element obj : objs) { final String data = StringUtils.deleteWhitespace(obj.attr("data")); if (StringUtils.startsWithIgnoreCase(data, "data:") || StringUtils.startsWithIgnoreCase(data, "javascript")) { obj.remove(); continue; } final String type = StringUtils.deleteWhitespace(obj.attr("type")); if 
(StringUtils.containsIgnoreCase(type, "script")) { obj.remove(); } } final Elements embeds = doc.getElementsByTag("embed"); for (final Element embed : embeds) { final String data = StringUtils.deleteWhitespace(embed.attr("src")); if (StringUtils.startsWithIgnoreCase(data, "data:") || StringUtils.startsWithIgnoreCase(data, "javascript")) { embed.remove(); continue; } } final Elements as = doc.getElementsByTag("a"); for (final Element a : as) { a.attr("rel", "nofollow"); final String href = a.attr("href"); if (href.startsWith(Latkes.getServePath())) { continue; } a.attr("target", "_blank"); } final Elements audios = doc.getElementsByTag("audio"); for (final Element audio : audios) { audio.attr("preload", "none"); } final Elements videos = doc.getElementsByTag("video"); for (final Element video : videos) { video.attr("preload", "none"); } String ret = doc.body().html(); ret = ret.replaceAll("(</?br\\s*/?>\\s*)+", "<br>"); // patch for Jsoup issue return ret; }
From source file:org.eclipse.skalli.commons.HtmlUtils.java
/** * Filters untrusted tags and attributes from the given HTML fragment by using * a whitelist of allowed tags and attributes. * * @param html the HTML fragment to clean. * @param whitelist whitelist of allowed tags and attributes, or <code>null</code> * if the {@link #getWhiteList() default whitelist} should be used. * @param baseUri base URL to resolve relative URLs against, or <code>null</code>. * @param escapeMode determines how XML/HTML entities are to be escaped, * or <code>null</code>. The default escape mode is {@link EscapeMode.xhtml}, * i.e. only the XML entities <tt>&quot</tt>, <tt>&amp</tt>, <tt>&apos</tt>, * <tt>&lt</tt>, and <tt>&gt</tt> are recognized. * * @return the cleaned input string./*from w w w .j a v a 2s. co m*/ */ public static String clean(String html, Whitelist whitelist, String baseUri, EscapeMode escapeMode) { if (StringUtils.isBlank(html)) { return html; } if (whitelist == null) { whitelist = getWhiteList(); } String cleaned = Jsoup.clean(html, baseUri != null ? baseUri : "", whitelist); //$NON-NLS-1$ Document cleanedDocument = Jsoup.parse(cleaned); cleanedDocument.outputSettings().escapeMode(escapeMode != null ? escapeMode : EscapeMode.xhtml); return cleanedDocument.body().html(); }
From source file:org.finra.herd.core.HerdStringUtils.java
/** * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist * * @param fragment the specified String//from w w w. j av a 2 s. c o m * @param whitelistTags the specified whitelist tags * * @return cleaned String with allowed tags */ public static String stripHtml(String fragment, String... whitelistTags) { // Parse out html tags except those from a given list of whitelist tags Document dirty = Jsoup.parseBodyFragment(fragment); Whitelist whitelist = new Whitelist(); for (String whitelistTag : whitelistTags) { // Get the actual tag name from the whitelist tag // this is vulnerable in general to complex tags but will suffice for our simple needs whitelistTag = StringUtils.removePattern(whitelistTag, "[^\\{IsAlphabetic}]"); // Add all specified tags to the whitelist while preserving inline css whitelist.addTags(whitelistTag).addAttributes(whitelistTag, "class"); } Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); // Set character encoding to UTF-8 and make sure no line-breaks are added clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8) .prettyPrint(false); // return 'cleaned' html body return clean.body().html(); }
From source file:org.jboss.tools.windup.ui.internal.issues.IssueDetailsView.java
public static void addPrism(Document doc) { try {/* w w w .j a v a 2 s. co m*/ Bundle bundle = WindupUIPlugin.getDefault().getBundle(); Elements codeElements = doc.getElementsByTag("code"); codeElements.forEach(element -> { Set<String> classNames = element.classNames(); Set<String> newNames = Sets.newHashSet(); classNames.forEach(className -> { // prismjs requires prefix, i'm not sure about another/easier workaround. newNames.add("language-" + className); }); element.classNames(newNames); }); DocumentType type = new DocumentType("html", "", "", ""); doc.insertChildren(0, Lists.newArrayList(type)); Element head = doc.head(); Element css = doc.createElement("link"); URL fileURL = FileLocator.find(bundle, new Path("html/prism.css"), null); String srcPath = FileLocator.resolve(fileURL).getPath(); css.attr("href", srcPath); css.attr("rel", "stylesheet"); head.appendChild(css); Element body = doc.body(); Element script = doc.createElement("script"); fileURL = FileLocator.find(bundle, new Path("html/prism.js"), null); srcPath = FileLocator.resolve(fileURL).getPath(); script.attr("src", srcPath); body.appendChild(script); } catch (Exception e) { WindupUIPlugin.log(e); } }