Example usage for org.jsoup.nodes Document outputSettings

List of usage examples for org.jsoup.nodes Document outputSettings

Introduction

On this page you can find example usages of org.jsoup.nodes Document outputSettings.

Prototype

OutputSettings outputSettings

To view the source code for org.jsoup.nodes Document outputSettings, click Source Link.

Usage

From source file:Main.java

/**
 * Converts an HTML string into markup serialized with jsoup's XML output syntax.
 *
 * <p>Script elements are replaced with a placeholder comment before parsing; the
 * remaining markup is parsed by jsoup and written back out with
 * {@code Document.OutputSettings.Syntax.xml}.
 *
 * @param html the HTML source to convert; must not be {@code null}
 * @return the serialized document
 */
public static String toXHTML(String html) {
    // (?i) also matches <SCRIPT>; [^>]* tolerates attributes such as type="text/javascript";
    // (?s) lets '.' span the line breaks inside a script body.
    // The placeholder is closed with the standard "-->" terminator ("--!>" is a parse error).
    html = html.replaceAll("(?is)<script\\b[^>]*>.*?</script\\s*>", "<!-- removed scripts -->");
    final Document document = Jsoup.parse(html);
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    return document.html();
}

From source file:com.fpt.xml.hth.crawler.utils.JsoupConnect.java

/**
 * Fetches the page at the given URL with jsoup and returns the parsed document.
 *
 * <p>The connection is configured with a timeout value of 10000, and the output
 * charset of the returned document is set to UTF-8.
 *
 * @param url address of the page to download
 * @return the parsed document, with its output settings charset set to UTF-8
 * @throws IOException if the request performed by {@code get()} fails
 */
public static Document getHTML(String url) throws IOException {
    final Document page = Jsoup.connect(url).timeout(10000).get();
    page.outputSettings().charset("UTF-8");
    return page;
}

From source file:Main.java

/**
 * Cleans the given markup with {@code Whitelist.none()} and returns the result
 * serialized with the XHTML escape mode.
 *
 * @param str the markup to clean
 * @return the cleaned output produced by {@code Jsoup.clean}
 */
public static String cleanHtml(String str) {
    return Jsoup.clean(str, "", Whitelist.none(),
            new Document.OutputSettings().escapeMode(Entities.EscapeMode.xhtml));
}

From source file:automation.Launcher.java

/**
 * Converts HTML to text while turning paragraph, div and br boundaries into line breaks.
 *
 * <p>A literal backslash-n marker is inserted at each boundary, the document is
 * serialized without pretty-printing, the markers are replaced by real newlines,
 * and the result is cleaned with {@code Whitelist.none()}.
 *
 * @param html the HTML to convert; may be {@code null}
 * @return the converted text, or {@code null} when {@code html} is {@code null}
 */
public static String br2nl(String html) {
    if (html == null) {
        return null;
    }

    final Document parsed = Jsoup.parse(html);
    // Disabling pretty-print makes html() preserve linebreaks and spacing.
    parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

    // Markers are a literal backslash followed by 'n', not real newlines.
    parsed.select("p").prepend("\\n\\n");
    parsed.select("div").prepend("\\n");
    parsed.select("br").append("\\n");

    final String withNewlines = parsed.html().replaceAll("\\\\n", "\n");
    final Document.OutputSettings rawOutput = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(withNewlines, "", Whitelist.none(), rawOutput);
}

From source file:com.slidespeech.server.service.TextToSpeechService.java

/**
 * Writes the speaker notes to {@code fileName} and reports which voices they use.
 *
 * <p>The notes are parsed with jsoup. Every {@code voice} element without a
 * non-empty {@code name} attribute gets one assigned: {@code "isabella"} by default,
 * {@code "william"} for an English male voice, and {@code "alex"} for German
 * (language defaults to {@code "en"}, gender to {@code "female"}). The body markup
 * is then written to the file, and the three known voice names are mapped to their
 * voice-file paths under {@code /opt/cereproc}.
 *
 * <p>If anything in that process throws, the raw {@code speakernotes} string is
 * written to the file instead and {@code "ssml parsing failed"} is appended to the
 * voice list (after any voice names collected before the failure).
 *
 * @param fileName path of the file to write
 * @param speakernotes markup containing the text to be spoken
 * @return the collected voice entries joined with {@code ","}
 * @throws IOException if writing the fallback file fails
 */
private static String createXML4Cereproc(String fileName, String speakernotes) throws IOException {
    List<String> voices = new ArrayList<String>();

    try {
        Document doc = Jsoup.parse(speakernotes, "");
        doc.outputSettings().prettyPrint(false);
        Elements voiceNodes = doc.select("voice");

        for (Element voiceNode : voiceNodes) {
            String lang = (voiceNode.hasAttr("xml:lang") && !voiceNode.attr("xml:lang").equals(""))
                    ? voiceNode.attr("xml:lang")
                    : "en";
            String gender = (voiceNode.hasAttr("gender") && !voiceNode.attr("gender").equals(""))
                    ? voiceNode.attr("gender")
                    : "female";
            String voiceName = (voiceNode.hasAttr("name") && !voiceNode.attr("name").equals(""))
                    ? voiceNode.attr("name")
                    : "";

            // Voice name not set by user -> choose one depending on language and gender.
            if (voiceName.equals("")) {
                voiceName = "isabella"; // default, also covers English female
                if (lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("male")) {
                    voiceName = "william";
                }
                // German overrides the gender-based choice.
                if (lang.equalsIgnoreCase("de")) {
                    voiceName = "alex";
                }

                voiceNode.attr("name", voiceName);
            }
            if (!voices.contains(voiceName)) {
                voices.add(voiceName);
            }
        }

        // Close the writer even when write() throws, so the file handle is not leaked
        // before the fallback below reopens the same file.
        BufferedWriter out = new BufferedWriter(new FileWriter(fileName));
        try {
            out.write(doc.select("body").first().html());
        } finally {
            out.close();
        }

        // Replace the known voice names with the paths of their voice files;
        // names that are not recognised are left unchanged.
        for (int i = 0; i < voices.size(); i++) {
            String voice = voices.get(i);
            if (voice.equals("william")) {
                voices.set(i, "/opt/cereproc/cerevoice_william_3.0.5_22k.voice");
            } else if (voice.equals("isabella")) {
                voices.set(i, "/opt/cereproc/cerevoice_isabella_3.0.3_22k.voice");
            } else if (voice.equals("alex")) {
                voices.set(i, "/opt/cereproc/cerevoice_alex_3.0.0_beta_22k.voice");
            }
        }
    } catch (Exception e) {
        // Fallback if ssml parsing fails: write the notes unmodified (best effort).
        Writer out = new OutputStreamWriter(new FileOutputStream(fileName));
        try {
            out.write(speakernotes);
        } finally {
            out.close();
        }
        voices.add("ssml parsing failed");
    }

    return StringUtils.join(voices, ",");
}

From source file:elaborate.util.XmlUtil.java

/**
 * Parses possibly malformed markup with jsoup and returns the body re-serialized as XHTML.
 *
 * <p>Output is produced without pretty-printing or indentation, with the XHTML
 * escape mode and the UTF-8 charset. Afterwards the space before {@code />} is
 * removed and non-breaking spaces (U+00A0) are written as {@code &#160;}.
 *
 * @param badxml the markup to repair
 * @return the serialized inner HTML of the parsed document's body
 */
public static String fixXhtml(String badxml) {
    final Document parsed = Jsoup.parse(badxml);

    final Document.OutputSettings settings = parsed.outputSettings();
    settings.indentAmount(0);
    settings.prettyPrint(false);
    settings.escapeMode(Entities.EscapeMode.xhtml);
    settings.charset("UTF-8");

    final String bodyHtml = parsed.body().html();
    final String compactTags = bodyHtml.replaceAll(" />", "/>");
    return compactTags.replace("\u00A0", "&#160;");
}

From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java

public static void extractCSS(final URI url, File targetdir) throws Exception {
    /*/* w ww.  j a v  a 2 s  .  co m*/
     * For each test URL: 1) extract <style> tag contents 2) extract from
     * <link rel="stylesheet"> files 3) extract inline style attributes from
     * all elements and wrap the result in .style {}
     */

    Document doc = Jsoup.connect(url.toString()).timeout(20000).get();

    List<String> tests = new ArrayList<String>();

    for (Element e : doc.select("style[type=text/css]")) {
        tests.add(e.data());
    }

    for (Element e : doc.select("link[rel=stylesheet][href][type=text/css]")) {
        URI cssUri = new URI(e.attr("href"));
        if (!cssUri.isAbsolute()) {
            cssUri = url.resolve(cssUri);
        }
        String encoding = doc.outputSettings().charset().name();
        tests.add(IOUtils.toString(cssUri, encoding));
    }

    for (Element e : doc.select("*[style]")) {
        tests.add(String.format(".style { %s }", e.attr("style")));
    }

    for (final String test : tests) {
        targetdir.mkdirs();
        String logfile = String.format("%s.%d.scss", FilenameUtils.getBaseName(url.toString()),
                tests.indexOf(test));
        PrintStream dataLogger = new PrintStream(new File(targetdir, logfile));

        dataLogger.println("/* Source: " + url + " */");
        dataLogger.println(test);

    }
}

From source file:com.screenslicer.common.CommonUtil.java

/**
 * Sets the document's output charset and rewrites every non-empty text node with
 * the result of {@code HtmlCoder.decode} applied to its string form.
 *
 * <p>Failures while processing a single node are logged through
 * {@code Log.exception} and do not stop the traversal.
 *
 * @param doc the document to modify in place
 * @param ascii {@code true} to use the {@code "ascii"} output charset,
 *        {@code false} to use {@code "utf-8"}
 * @return the same {@code doc} instance
 */
private static Element sanitize(Document doc, final boolean ascii) {
    doc.outputSettings().charset(ascii ? "ascii" : "utf-8");

    final NodeVisitor textDecoder = new NodeVisitor() {
        @Override
        public void head(Node n, int d) {
            try {
                // Only text nodes with non-empty markup are rewritten.
                if (!n.nodeName().equals("#text") || CommonUtil.isEmpty(n.outerHtml())) {
                    return;
                }
                final TextNode textNode = (TextNode) n;
                textNode.text(HtmlCoder.decode(n.toString()));
            } catch (Throwable t) {
                Log.exception(t);
            }
        }

        @Override
        public void tail(Node n, int d) {
            // Nothing to do when leaving a node.
        }
    };
    doc.traverse(textDecoder);

    return doc;
}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Parses XHTML with jsoup's XML parser and configures the document's output:
 * no pretty-printing, the {@code xhtml} escape mode and the UTF-8 charset.
 *
 * @param inputXhtml the XHTML source to parse
 * @return the parsed document with the output settings applied
 */
private static Document parseXhtml(final String inputXhtml) {
    // NOTE(review): in Jsoup.parse(String, String, Parser) the second argument is the
    // base URI, not a charset - confirm that "utf-8" is really intended here.
    final Document parsed = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser());
    parsed.outputSettings()
            .prettyPrint(false)
            .escapeMode(xhtml)
            .charset("UTF-8");
    return parsed;
}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Rewrites XHTML for Confluence: replaces mapped links with their Confluence
 * markup and applies the heading, page-break and spacing reformatting helpers.
 *
 * <p>For every {@code a} element whose {@code href} has an entry in
 * {@code confluenceLinkMap}, the Confluence link markup is inserted before the
 * element, the element's content is cleared and the element itself is removed.
 * Links without a mapping are left untouched and logged at debug level.
 *
 * @param inputXhtml the XHTML source to reformat
 * @param confluenceLinkMap Confluence link definitions keyed by original {@code href}
 * @return the serialized, reformatted document
 */
private static String reformatXHtml(final String inputXhtml,
        final Map<String, ConfluenceLink> confluenceLinkMap) {
    // NOTE(review): in Jsoup.parse(String, String, Parser) the second argument is the
    // base URI, not a charset - confirm that "utf-8" is really intended here.
    final Document document = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser());
    document.outputSettings()
            .prettyPrint(false)
            .escapeMode(xhtml)
            .charset("UTF-8");

    for (final Element anchor : document.select("a")) {
        final String originalHref = anchor.attr("href");
        final ConfluenceLink mappedLink = confluenceLinkMap.get(originalHref);

        if (mappedLink != null) {
            final String markup = mappedLink.getConfluenceLinkMarkup();

            LOG.debug("LINK CONVERSION: {} -> {}", originalHref, markup);

            // Order matters: insert the replacement first, then empty the anchor so
            // that unwrap() removes it without leaving its old content behind.
            anchor.before(markup);
            anchor.html("");
            anchor.unwrap();
        } else {
            LOG.debug("NO LINK MAPPING FOUND TO COVERT LINK: {}", originalHref);
        }
    }

    reformatXHtmlHeadings(document, "h2");
    reformatXHtmlHeadings(document, "h3");
    reformatXHtmlHeadings(document, "#toctitle");

    final SwaggerConfluenceConfig config = SWAGGER_CONFLUENCE_CONFIG.get();
    final boolean singlePage = config.getPaginationMode() == PaginationMode.SINGLE_PAGE;

    if (singlePage) {
        // The table-of-contents break is only added when the TOC is included on the single page.
        if (config.isIncludeTableOfContentsOnSinglePage()) {
            reformatXHtmlBreakAfterElements(document, "#toc");
        }

        reformatXHtmlBreakAfterElements(document, ".sect1");
    }

    reformatXHtmlSpacing(document.select(".sect2"));
    reformatXHtmlSpacing(document.select(".sect3"));

    return document.html();
}