Example usage for org.jsoup.nodes.Document.body()

List of usage examples for org.jsoup.nodes.Document.body()

Introduction

On this page you can find example usages of the org.jsoup.nodes.Document.body() method.

Prototype

public Element body() 

Source Link

Document

Accessor to the document's body element.

Usage

From source file:com.near.chimerarevo.fragments.PostFragment.java

/**
 * Parses the given HTML and, for every direct child of the document body,
 * hands each supported kind of element found there to its dedicated parse
 * routine. Once all children are processed, the loading flag of the hosting
 * {@code PostContainerActivity} is cleared.
 *
 * @param html the HTML markup to parse
 */
private void parseHTML(String html) {
    final Elements topLevel = Jsoup.parse(html).body().children();

    for (Element child : topLevel) {
        // Headings h1..h5 are handled first; the heading level is passed along.
        for (int level = 1; level <= 5; level++) {
            final Elements headings = child.getElementsByTag("h" + level);
            if (!headings.isEmpty()) {
                parseTitles(headings, level);
            }
        }

        final Elements paragraphs = child.getElementsByTag("p");
        if (!paragraphs.isEmpty()) {
            parseParagraphs(paragraphs);
        }

        final Elements images = child.getElementsByTag("img");
        if (!images.isEmpty()) {
            parseNormalImages(images);
        }

        final Elements anchors = child.getElementsByTag("a");
        if (!anchors.isEmpty()) {
            parseLinkedImages(anchors);
        }

        final Elements frames = child.getElementsByTag("iframe");
        if (!frames.isEmpty()) {
            parseYoutubeVideos(frames);
        }

        final Elements bulleted = child.getElementsByTag("ul");
        if (!bulleted.isEmpty()) {
            parseBulletedLists(bulleted);
        }

        final Elements ordered = child.getElementsByTag("ol");
        if (!ordered.isEmpty()) {
            parseOrderedLists(ordered);
        }

        final Elements preformatted = child.getElementsByTag("pre");
        if (!preformatted.isEmpty()) {
            parseCodeText(preformatted);
        }

        final Elements rows = child.getElementsByTag("tr");
        if (!rows.isEmpty()) {
            parseTables(rows);
        }
    }

    ((PostContainerActivity) getActivity()).setIsLoading(false);
}

From source file:nl.ivonet.epub.metadata.BigBookSearch.java

/**
 * Collects candidate cover pictures for a search term.
 *
 * <p>Result pages are fetched one after another, starting at page 1. For every
 * page whose body text is not {@code NO_RESULTS}, each {@code img} element
 * contributes an entry mapping its {@code alt} attribute to its {@code src}
 * attribute. Collection stops at the first page reporting no results, or once
 * ten pages have been processed.
 *
 * @param search the search term
 * @return map of image {@code alt} text to image {@code src}; empty when nothing was found
 */
Map<String, String> retrievePossibles(final String search) {
    final String tokens = tokenize(search);
    final Map<String, String> pictures = new HashMap<>();

    for (int page = 1; ; page++) {
        final Document resultPage = webPage.get(bigBookSearchUrl(tokens, page));
        // Same stop conditions, in the same evaluation order, as a
        // "while (hasResults && page <= 10)" loop would use.
        if (NO_RESULTS.equals(resultPage.body().text()) || page > 10) {
            break;
        }
        LOG.debug("Searching cover for [{}] on page [{}]", search, page);
        resultPage.body().select("img")
                .forEach(image -> pictures.put(image.attr("alt"), image.attr("src")));
    }

    return pictures;
}

From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java

/**
 * Builds a fresh shell document (same base URI as the input) and copies the
 * safe nodes of the input body into the shell's body.
 *
 * @param document the document to filter
 * @return the newly created document; its body stays empty when the input has no body
 */
@Override
public Document runFilter(Document document) {
    final Document clean = Document.createShell(document.baseUri());

    // Frameset documents won't have a body; in that case the shell is returned untouched.
    if (document.body() == null) {
        return clean;
    }

    copySafeNodes(document.body(), clean.body());
    return clean;
}

From source file:org.apache.archiva.web.docs.RestDocsServlet.java

/**
 * Serves a documentation resource loaded from the context class loader.
 *
 * <p>Resources ending in {@code .xsd} are written XML-escaped to the response
 * writer. Any other resource is parsed as HTML and rewritten before being sent:
 * <ul>
 *   <li>{@code href} and {@code data-href} values are prefixed with
 *       {@code "#" + <first path segment> + "/"};</li>
 *   <li>every {@code code} element gets the extra class {@code nice-code};</li>
 *   <li>headers are demoted: h1 to h3, h2 to h4, h3 to h5;</li>
 *   <li>only the {@code div} with id {@code main} plus the remaining
 *       {@code script} elements are written to the response.</li>
 * </ul>
 *
 * <p>If the request has no path info or the resource cannot be found, a 404 is sent.
 *
 * @param req  the servlet request; its path info names the resource
 * @param resp the servlet response
 * @throws ServletException declared by the servlet contract
 * @throws IOException if reading the resource or writing the response fails
 */
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {

    logger.debug("docs request to path: {}", req.getPathInfo());

    String path = StringUtils.removeStart(req.getPathInfo(), "/");
    if (path == null) {
        resp.sendError(HttpServletResponse.SC_NOT_FOUND);
        return;
    }

    // NOTE(review): the path comes straight from the request and is resolved against the
    // whole classpath; consider restricting it to the documentation directory.
    try (InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(path)) {
        if (is == null) {
            // Unknown resource: answer 404 instead of failing later with a NullPointerException.
            resp.sendError(HttpServletResponse.SC_NOT_FOUND);
            return;
        }

        if (StringUtils.endsWith(path, ".xsd")) {
            StringEscapeUtils.escapeXml(resp.getWriter(), IOUtils.toString(is));
            return;
        }

        String startPath = StringUtils.substringBefore(path, "/");

        // replace all links !!
        Document document = Jsoup.parse(is, "UTF-8", "");

        Element body = document.body().child(0);

        for (Element link : body.select("a[href]")) {
            link.attr("href", "#" + startPath + "/" + link.attr("href"));
        }

        for (Element link : body.select("[data-href]")) {
            link.attr("data-href", "#" + startPath + "/" + link.attr("data-href"));
        }

        for (Element code : body.select("code")) {
            code.attr("class", code.attr("class") + " nice-code");
        }

        // Default generated enunciate uses h1/h2/h3 which is quite big, so transform to h3/h4/h5.
        // The deepest level is renamed first: renaming h1 -> h3 before handling h3 would
        // demote the former h1 elements a second time (down to h5).
        for (Element header : body.select("h3")) {
            header.tagName("h5");
        }

        for (Element header : body.select("h2")) {
            header.tagName("h4");
        }

        for (Element header : body.select("h1")) {
            header.tagName("h3");
        }

        Document res = new Document("");
        res.appendChild(body.select("div[id=main]").first());

        // Scripts are selected after the main div has been moved into the result document.
        for (Element script : body.select("script")) {
            res.appendChild(script);
        }

        // The source was parsed as UTF-8; encode explicitly rather than with the platform charset.
        resp.getOutputStream().write(res.outerHtml().getBytes("UTF-8"));
    }
}

From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java

/**
 * Converts HTML to plain text by flattening the parsed document (the body when
 * present, otherwise the whole document), converting each resulting node to
 * text and concatenating the pieces.
 *
 * @param html the HTML to convert
 * @return the extracted text, or {@code html} unchanged when extraction fails
 */
@Override
public String toPlainText(String html) {
    try {
        final Document parsed = Jsoup.parse(html);

        // Fall back to the document itself when there is no body element.
        Element root = parsed.body();
        if (root == null) {
            root = parsed;
        }

        return flatten(root, INITIAL_LIST_NESTED_LEVEL)
                .map(node -> convertNodeToText(node))
                .reduce("", (left, right) -> left + right);
    } catch (Exception e) {
        LOGGER.warn("Failed extracting text from html", e);
        return html;
    }
}

From source file:org.apdplat.superword.tools.PrefixSuffixOptimizer.java

/**
 * Rewrites the source HTML and dumps the collected words.
 *
 * <p>The resource {@code SRC_HTML} is parsed, every element matching
 * {@code table tbody tr td p} is passed to {@code replace}, and the resulting
 * body HTML is written to {@code DST_HTML}. Afterwards the entries of
 * {@code WORDS} are written to {@code DST_WORD} in sorted order, one per line,
 * each prefixed with a running number and a tab.
 *
 * @param args unused
 * @throws Exception if the resource is missing or reading/writing fails
 */
public static void main(String[] args) throws Exception {
    final Document document;
    // Close the resource stream once parsing is done.
    try (InputStream in = PrefixSuffixOptimizer.class.getResourceAsStream(SRC_HTML)) {
        if (in == null) {
            throw new IllegalStateException("Resource not found: " + SRC_HTML);
        }
        document = Jsoup.parse(in, "utf-8", "");
    }
    document.select("table tbody tr td p").forEach(PrefixSuffixOptimizer::replace);
    Files.write(Paths.get(DST_HTML), document.body().html().getBytes("utf-8"));

    AtomicInteger i = new AtomicInteger();
    StringBuilder text = new StringBuilder();
    // forEachOrdered guarantees the numbering follows the sorted order.
    WORDS.stream().sorted()
            .forEachOrdered(word -> text.append(i.incrementAndGet()).append("\t").append(word).append("\n"));
    Files.write(Paths.get(DST_WORD), text.toString().getBytes("utf-8"));
}

From source file:org.b3log.symphony.util.Markdowns.java

/**
 * Produces safe HTML from the specified content.
 *
 * <p>The content is first cleaned against a relaxed whitelist extended with a
 * number of extra tags and attributes, then post-processed:
 * <ul>
 *   <li>{@code style} attributes are removed from {@code p} elements;</li>
 *   <li>{@code iframe}, {@code object} and {@code embed} elements whose source
 *       starts with {@code javascript} or {@code data:} are removed, as are
 *       {@code object} elements whose {@code type} contains {@code script};</li>
 *   <li>every link gets {@code rel="nofollow"}; links whose {@code href} does
 *       not start with the serve path additionally get {@code target="_blank"};</li>
 *   <li>{@code audio} and {@code video} elements get {@code preload="none"};</li>
 *   <li>runs of {@code br} tags in the output are collapsed into a single {@code <br>}.</li>
 * </ul>
 *
 * @param content the specified content
 * @param baseURI the specified base URI, passed to the cleaner and the parser
 * @return safe HTML content
 */
public static String clean(final String content, final String baseURI) {
    final Document.OutputSettings settings = new Document.OutputSettings().prettyPrint(false);

    final Whitelist whitelist = Whitelist.relaxed()
            .addAttributes(":all", "id", "target", "class")
            .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u")
            .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight")
            .addAttributes("audio", "controls", "src")
            .addAttributes("video", "controls", "src", "width", "height")
            .addAttributes("source", "src", "media", "type")
            .addAttributes("object", "width", "height", "data", "type")
            .addAttributes("param", "name", "value")
            .addAttributes("input", "type", "disabled", "checked")
            .addAttributes("embed", "src", "type", "width", "height", "wmode", "allowNetworking");

    final String whitelisted = Jsoup.clean(content, baseURI, whitelist, settings);
    final Document doc = Jsoup.parse(whitelisted, baseURI, Parser.htmlParser());

    doc.getElementsByTag("p").removeAttr("style");

    for (final Element frame : doc.getElementsByTag("iframe")) {
        final String source = StringUtils.deleteWhitespace(frame.attr("src"));
        final boolean scripted = StringUtils.startsWithIgnoreCase(source, "javascript");
        if (scripted || StringUtils.startsWithIgnoreCase(source, "data:")) {
            frame.remove();
        }
    }

    for (final Element object : doc.getElementsByTag("object")) {
        final String payload = StringUtils.deleteWhitespace(object.attr("data"));
        final boolean unsafePayload = StringUtils.startsWithIgnoreCase(payload, "data:")
                || StringUtils.startsWithIgnoreCase(payload, "javascript");
        // The type attribute is only inspected when the data attribute looked harmless.
        if (unsafePayload
                || StringUtils.containsIgnoreCase(StringUtils.deleteWhitespace(object.attr("type")), "script")) {
            object.remove();
        }
    }

    for (final Element embedded : doc.getElementsByTag("embed")) {
        final String source = StringUtils.deleteWhitespace(embedded.attr("src"));
        if (StringUtils.startsWithIgnoreCase(source, "data:")
                || StringUtils.startsWithIgnoreCase(source, "javascript")) {
            embedded.remove();
        }
    }

    for (final Element anchor : doc.getElementsByTag("a")) {
        anchor.attr("rel", "nofollow");

        // Links that do not point at this site open in a new window.
        if (!anchor.attr("href").startsWith(Latkes.getServePath())) {
            anchor.attr("target", "_blank");
        }
    }

    doc.getElementsByTag("audio").attr("preload", "none");
    doc.getElementsByTag("video").attr("preload", "none");

    // patch for Jsoup issue
    return doc.body().html().replaceAll("(</?br\\s*/?>\\s*)+", "<br>");
}

From source file:org.eclipse.skalli.commons.HtmlUtils.java

/**
 * Filters untrusted tags and attributes from the given HTML fragment by using
 * a whitelist of allowed tags and attributes.
 *
 * @param html  the HTML fragment to clean.
 * @param whitelist  whitelist of allowed tags and attributes, or <code>null</code>
 * if the {@link #getWhiteList() default whitelist} should be used.
 * @param baseUri  base URL to resolve relative URLs against, or <code>null</code>.
 * @param escapeMode  determines how XML/HTML entities are to be escaped,
 * or <code>null</code>. The default escape mode is {@link EscapeMode.xhtml},
 * i.e. only the XML entities <tt>&amp;quot</tt>, <tt>&amp;amp</tt>, <tt>&amp;apos</tt>,
 * <tt>&amp;lt</tt>, and <tt>&amp;gt</tt> are recognized.
 *
 * @return the cleaned input string./*from w  w w .j a  v  a 2s.  co m*/
 */
public static String clean(String html, Whitelist whitelist, String baseUri, EscapeMode escapeMode) {
    if (StringUtils.isBlank(html)) {
        return html;
    }
    if (whitelist == null) {
        whitelist = getWhiteList();
    }
    String cleaned = Jsoup.clean(html, baseUri != null ? baseUri : "", whitelist); //$NON-NLS-1$
    Document cleanedDocument = Jsoup.parse(cleaned);
    cleanedDocument.outputSettings().escapeMode(escapeMode != null ? escapeMode : EscapeMode.xhtml);
    return cleanedDocument.body().html();
}

From source file:org.finra.herd.core.HerdStringUtils.java

/**
 * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist.
 *
 * <p>Each whitelist entry is reduced to its letters and digits (so {@code "<b>"} becomes
 * {@code "b"} and {@code "<h1>"} becomes {@code "h1"}) and that tag is allowed together
 * with its {@code class} attribute. Everything else is removed by the jsoup cleaner.
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags
 *
 * @return cleaned String with allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags) {

    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(fragment);

    Whitelist whitelist = new Whitelist();

    for (String whitelistTag : whitelistTags) {
        // Get the actual tag name from the whitelist tag.
        // This is vulnerable in general to complex tags but will suffice for our simple needs.
        // The property class needs the "\p" prefix: without it the expression is a plain
        // character set that strips most letters (e.g. "<div>" was reduced to "i").
        final String tagName = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}\\p{IsDigit}]");

        // Add all specified tags to the whitelist while preserving their class attribute
        whitelist.addTags(tagName).addAttributes(tagName, "class");
    }

    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);
    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8)
            .prettyPrint(false);

    // return 'cleaned' html body
    return clean.body().html();
}

From source file:org.jboss.tools.windup.ui.internal.issues.IssueDetailsView.java

/**
 * Prepares a document for syntax highlighting with prism.
 *
 * <p>Every class name on each {@code code} element is replaced by the same
 * name prefixed with {@code language-}, an {@code html} doctype node is
 * inserted as the first child of the document, a stylesheet link to the
 * bundled {@code html/prism.css} is appended to the head and a script
 * reference to the bundled {@code html/prism.js} is appended to the body.
 * Any exception is logged through {@code WindupUIPlugin.log} and not rethrown.
 *
 * @param doc the document to modify in place
 */
public static void addPrism(Document doc) {
    try {
        Bundle bundle = WindupUIPlugin.getDefault().getBundle();

        for (Element codeElement : doc.getElementsByTag("code")) {
            Set<String> prefixed = Sets.newHashSet();
            for (String className : codeElement.classNames()) {
                // prismjs requires prefix, i'm not sure about another/easier workaround.
                prefixed.add("language-" + className);
            }
            codeElement.classNames(prefixed);
        }

        doc.insertChildren(0, Lists.newArrayList(new DocumentType("html", "", "", "")));

        Element head = doc.head();
        Element stylesheet = doc.createElement("link");
        URL cssUrl = FileLocator.find(bundle, new Path("html/prism.css"), null);
        String cssPath = FileLocator.resolve(cssUrl).getPath();
        stylesheet.attr("href", cssPath).attr("rel", "stylesheet");
        head.appendChild(stylesheet);

        Element body = doc.body();
        Element prismScript = doc.createElement("script");
        URL scriptUrl = FileLocator.find(bundle, new Path("html/prism.js"), null);
        String scriptPath = FileLocator.resolve(scriptUrl).getPath();
        prismScript.attr("src", scriptPath);
        body.appendChild(prismScript);
    } catch (Exception e) {
        WindupUIPlugin.log(e);
    }
}