List of usage examples for org.jsoup.nodes Document outputSettings
OutputSettings outputSettings
To view the source code for org.jsoup.nodes Document outputSettings, click Source Link.
From source file:Main.java
/**
 * Converts an HTML string to markup serialized with XML syntax.
 *
 * Script elements are replaced by a placeholder comment before parsing.
 * The match is case-insensitive and tolerates attributes on the opening
 * tag, so {@code <script type="text/javascript">} and {@code <SCRIPT>}
 * are removed as well as a bare {@code <script>}.
 *
 * @param html the HTML to convert; must not be null
 * @return the parsed document serialized with the XML output syntax
 */
public static String toXHTML(String html) {
    // (?i) ignore case, (?s) let '.' span line breaks inside the script body.
    // The placeholder uses the well-formed "-->" comment terminator.
    html = html.replaceAll("(?is)<script\\b[^>]*>.*?</script\\s*>", "<!-- removed scripts -->");
    final Document document = Jsoup.parse(html);
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    return document.html();
}
From source file:com.fpt.xml.hth.crawler.utils.JsoupConnect.java
/**
 * Fetches the page at the given URL with jsoup and returns the parsed document.
 *
 * The connection is configured with a timeout value of 10000, and the
 * returned document's output charset is set to UTF-8.
 *
 * @param url address of the page to fetch
 * @return the parsed document
 * @throws IOException if the request fails
 */
public static Document getHTML(String url) throws IOException {
    final Document page = Jsoup.connect(url).timeout(10000).get();
    page.outputSettings().charset("UTF-8");
    return page;
}
From source file:Main.java
/**
 * Cleans the input with jsoup using {@code Whitelist.none()} and an empty
 * base URI, serializing the result with the XHTML escape mode.
 *
 * @param str the markup to clean
 * @return the cleaned text
 */
public static String cleanHtml(String str) {
    return Jsoup.clean(
            str,
            "",
            Whitelist.none(),
            new Document.OutputSettings().escapeMode(Entities.EscapeMode.xhtml));
}
From source file:automation.Launcher.java
public static String br2nl(String html) { if (html == null) { return html; }//ww w.j a v a2 s .c o m Document document = Jsoup.parse(html); document.outputSettings(new Document.OutputSettings().prettyPrint(false));//makes html() preserve linebreaks and spacing document.select("p").prepend("\\n\\n"); document.select("div").prepend("\\n"); // System.out.println(document.html()); document.select("br").append("\\n"); // System.out.println(document.html()); String s = document.html().replaceAll("\\\\n", "\n"); // System.out.println(s); return Jsoup.clean(s, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); }
From source file:com.slidespeech.server.service.TextToSpeechService.java
private static String createXML4Cereproc(String fileName, String speakernotes) throws IOException { List<String> voices = new ArrayList<String>(); try {/*w ww . j av a2 s . c om*/ Document doc = Jsoup.parse(speakernotes, ""); doc.outputSettings().prettyPrint(false); Elements voiceNodes = doc.select("voice"); for (Element voiceNode : voiceNodes) { String lang = (voiceNode.hasAttr("xml:lang") && !voiceNode.attr("xml:lang").equals("")) ? voiceNode.attr("xml:lang") : "en"; String gender = (voiceNode.hasAttr("gender") && !voiceNode.attr("gender").equals("")) ? voiceNode.attr("gender") : "female"; String voiceName = (voiceNode.hasAttr("name") && !voiceNode.attr("name").equals("")) ? voiceNode.attr("name") : ""; //voice name not set by user -> choose one depending on language and gender if (voiceName.equals("")) { voiceName = "isabella";//default //if(lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("female")) voiceName = "isabella"; if (lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("male")) voiceName = "william"; if (lang.equalsIgnoreCase("de")) voiceName = "alex"; voiceNode.attr("name", voiceName); } if (!voices.contains(voiceName)) { voices.add(voiceName); } } BufferedWriter out = new BufferedWriter(new FileWriter(fileName)); out.write(doc.select("body").first().html()); //out.write(doc.select("body").first().html()); out.close(); for (int i = 0; i < voices.size(); i++) { if (voices.get(i).equals("william")) voices.set(i, "/opt/cereproc/cerevoice_william_3.0.5_22k.voice"); if (voices.get(i).equals("isabella")) voices.set(i, "/opt/cereproc/cerevoice_isabella_3.0.3_22k.voice"); if (voices.get(i).equals("alex")) voices.set(i, "/opt/cereproc/cerevoice_alex_3.0.0_beta_22k.voice"); } } catch (Exception e) { //Fallback if ssml parsing fails Writer out = new OutputStreamWriter(new FileOutputStream(fileName)); try { out.write(speakernotes); } finally { out.close(); } voices.add("ssml parsing failed"); } return StringUtils.join(voices, ","); }
From source file:elaborate.util.XmlUtil.java
public static String fixXhtml(String badxml) { Document doc = Jsoup.parse(badxml); doc.outputSettings().indentAmount(0).prettyPrint(false).escapeMode(Entities.EscapeMode.xhtml) .charset("UTF-8"); return doc.body().html().replaceAll(" />", "/>").replace("\u00A0", " "); // return Jsoup.clean(badxml, Whitelist.relaxed()); }
From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java
public static void extractCSS(final URI url, File targetdir) throws Exception { /*/* w ww. j a v a 2 s . co m*/ * For each test URL: 1) extract <style> tag contents 2) extract from * <link rel="stylesheet"> files 3) extract inline style attributes from * all elements and wrap the result in .style {} */ Document doc = Jsoup.connect(url.toString()).timeout(20000).get(); List<String> tests = new ArrayList<String>(); for (Element e : doc.select("style[type=text/css]")) { tests.add(e.data()); } for (Element e : doc.select("link[rel=stylesheet][href][type=text/css]")) { URI cssUri = new URI(e.attr("href")); if (!cssUri.isAbsolute()) { cssUri = url.resolve(cssUri); } String encoding = doc.outputSettings().charset().name(); tests.add(IOUtils.toString(cssUri, encoding)); } for (Element e : doc.select("*[style]")) { tests.add(String.format(".style { %s }", e.attr("style"))); } for (final String test : tests) { targetdir.mkdirs(); String logfile = String.format("%s.%d.scss", FilenameUtils.getBaseName(url.toString()), tests.indexOf(test)); PrintStream dataLogger = new PrintStream(new File(targetdir, logfile)); dataLogger.println("/* Source: " + url + " */"); dataLogger.println(test); } }
From source file:com.screenslicer.common.CommonUtil.java
private static Element sanitize(Document doc, final boolean ascii) { if (ascii) {//from ww w . j a va 2 s . c o m doc.outputSettings().charset("ascii"); } else { doc.outputSettings().charset("utf-8"); } doc.traverse(new NodeVisitor() { @Override public void tail(Node n, int d) { } @Override public void head(Node n, int d) { try { if (n.nodeName().equals("#text") && !CommonUtil.isEmpty(n.outerHtml())) { ((TextNode) n).text(HtmlCoder.decode(n.toString())); } } catch (Throwable t) { Log.exception(t); } } }); return doc; }
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
/**
 * Parses the input with jsoup's XML parser and configures the document's
 * output: no pretty-printing, XHTML escape mode, UTF-8 charset.
 *
 * NOTE(review): "utf-8" is passed as the second argument of
 * {@code Jsoup.parse}; in jsoup's (String, String, Parser) overload that
 * position is the base URI - confirm this is intended.
 *
 * @param inputXhtml the XHTML to parse
 * @return the parsed and configured document
 */
private static Document parseXhtml(final String inputXhtml) {
    final Document parsed = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser());
    parsed.outputSettings()
            .prettyPrint(false)
            .escapeMode(xhtml)
            .charset("UTF-8");
    return parsed;
}
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
private static String reformatXHtml(final String inputXhtml, final Map<String, ConfluenceLink> confluenceLinkMap) { final Document document = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser()); document.outputSettings().prettyPrint(false); document.outputSettings().escapeMode(xhtml); document.outputSettings().charset("UTF-8"); final Elements linkElements = document.select("a"); for (final Element linkElement : linkElements) { final String originalHref = linkElement.attr("href"); final ConfluenceLink confluenceLink = confluenceLinkMap.get(originalHref); if (confluenceLink == null) { LOG.debug("NO LINK MAPPING FOUND TO COVERT LINK: {}", originalHref); continue; }//from www . j av a2 s . co m final String confluenceLinkMarkup = confluenceLink.getConfluenceLinkMarkup(); LOG.debug("LINK CONVERSION: {} -> {}", originalHref, confluenceLinkMarkup); linkElement.before(confluenceLinkMarkup); linkElement.html(""); linkElement.unwrap(); } reformatXHtmlHeadings(document, "h2"); reformatXHtmlHeadings(document, "h3"); reformatXHtmlHeadings(document, "#toctitle"); final SwaggerConfluenceConfig swaggerConfluenceConfig = SWAGGER_CONFLUENCE_CONFIG.get(); if (swaggerConfluenceConfig.getPaginationMode() == PaginationMode.SINGLE_PAGE) { if (swaggerConfluenceConfig.isIncludeTableOfContentsOnSinglePage()) { reformatXHtmlBreakAfterElements(document, "#toc"); } reformatXHtmlBreakAfterElements(document, ".sect1"); } reformatXHtmlSpacing(document.select(".sect2")); reformatXHtmlSpacing(document.select(".sect3")); return document.html(); }