Java HTML Parse Jsoup clean(String html, Whitelist whitelist)

Here you can find the source of clean(String html, Whitelist whitelist)

Description

Cleans the specified HTML with the specified white list.

License

Open Source License

Parameter

Parameter Description
html The HTML code to clean.
whitelist The whitelist.

Return

The cleaned HTML.

Declaration

public static String clean(String html, Whitelist whitelist) 

Method Source Code


//package com.java2s;
/* from www.java2s.com
 * Copyright (C) 2012 Klaus Reimer <k@ailis.de>
 * See LICENSE.md for licensing information.
 */

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import org.jsoup.nodes.TextNode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;

public class Main {
    /** Entity text that is hidden from jsoup before parsing. */
    private static final String BRACE_ENTITY = "&#125;";

    /** Placeholder substituted for {@link #BRACE_ENTITY} while jsoup works. */
    private static final String BRACE_PLACEHOLDER = "@jasdoc.unicode#125;";

    /** Leading part of the placeholder that is turned back into an ampersand. */
    private static final String PLACEHOLDER_PREFIX = "@jasdoc.unicode";

    /** Text that replaces {@link #PLACEHOLDER_PREFIX} when unmasking. */
    private static final String AMPERSAND = "&";

    /**
     * Sanitizes an HTML fragment against the given white list.
     * <p>
     * The input is masked, parsed as a body fragment and run through a jsoup
     * {@link Cleaner}. Pretty printing is switched off on the cleaned
     * document, its top-level text nodes are whitespace-normalized, and the
     * serialized body is finally unmasked again.
     *
     * @param html
     *            The HTML code to clean.
     * @param whitelist
     *            The white list handed to the jsoup cleaner.
     * @return The cleaned HTML (inner HTML of the resulting body).
     */
    public static String clean(String html, Whitelist whitelist) {
        Document parsed = parse(protect(html));
        Document sanitized = new Cleaner(whitelist).clean(parsed);
        sanitized.outputSettings().prettyPrint(false);
        collapseWhitespace(sanitized);
        String bodyHtml = sanitized.body().html();
        return restore(bodyHtml);
    }

    /**
     * Sanitizes an HTML fragment using {@code Whitelist.basic()}.
     *
     * @param html
     *            The HTML to clean.
     * @return The cleaned HTML.
     */
    public static String clean(final String html) {
        Whitelist basic = Whitelist.basic();
        return clean(html, basic);
    }

    /**
     * Parses HTML as a body fragment and turns pretty printing off on the
     * resulting document.
     *
     * @param html
     *            The HTML code to parse.
     * @return The parsed document with pretty printing disabled.
     */
    public static Document parse(final String html) {
        Document fragment = Jsoup.parseBodyFragment(html);
        fragment.outputSettings().prettyPrint(false);
        return fragment;
    }

    /**
     * Hides text that jsoup would not handle the way this class needs by
     * swapping it for a placeholder. The result of the jsoup processing must
     * be passed through {@link #restore(String)} to get the original text
     * back.
     *
     * @param html
     *            The HTML text to mask.
     * @return The HTML text with every {@code BRACE_ENTITY} occurrence
     *         replaced by {@code BRACE_PLACEHOLDER}.
     */
    private static String protect(final String html) {
        return html.replace(BRACE_ENTITY, BRACE_PLACEHOLDER);
    }

    /**
     * Reverses {@link #protect(String)} by turning every placeholder prefix
     * back into an ampersand, which re-creates the masked entity text.
     *
     * @param html
     *            The previously masked HTML text.
     * @return The unmasked HTML text.
     */
    private static String restore(final String html) {
        return html.replace(PLACEHOLDER_PREFIX, AMPERSAND);
    }

    /**
     * Normalizes whitespace in the text nodes returned by
     * {@code doc.body().textNodes()}, modifying the document in place. This
     * is done by hand because pretty printing, which would otherwise take
     * care of it, is disabled for its unreliable indentation.
     * <p>
     * NOTE(review): jsoup documents {@code textNodes()} as returning direct
     * child text nodes only, so text nested inside other elements is
     * presumably left untouched. Confirm whether that is intended.
     *
     * @param doc
     *            The document whose body text nodes are normalized.
     */
    private static void collapseWhitespace(Document doc) {
        for (TextNode textNode : doc.body().textNodes()) {
            // text() is expected to return the whitespace-normalized form;
            // writing it back makes the node hold that form.
            String normalized = textNode.text();
            textNode.text(normalized);
        }
    }
}

Related

  1. br2nl(String html)
  2. clean(String html)
  3. cleanHTML(final String html)
  4. cleanHtmlCode(String html)
  5. cleanHtmlFromString(String stringToClean)
  6. cleanHTMLTags(String str)