List of usage examples for org.jsoup.nodes Document outputSettings
OutputSettings outputSettings
To view the source code for org.jsoup.nodes Document outputSettings,
click the Source Link.
From source file:org.eclipse.skalli.commons.HtmlUtils.java
/** * Filters untrusted tags and attributes from the given HTML fragment by using * a whitelist of allowed tags and attributes. * * @param html the HTML fragment to clean. * @param whitelist whitelist of allowed tags and attributes, or <code>null</code> * if the {@link #getWhiteList() default whitelist} should be used. * @param baseUri base URL to resolve relative URLs against, or <code>null</code>. * @param escapeMode determines how XML/HTML entities are to be escaped, * or <code>null</code>. The default escape mode is {@link EscapeMode.xhtml}, * i.e. only the XML entities <tt>&quot</tt>, <tt>&amp</tt>, <tt>&apos</tt>, * <tt>&lt</tt>, and <tt>&gt</tt> are recognized. * * @return the cleaned input string.//from w ww . j av a 2s. co m */ public static String clean(String html, Whitelist whitelist, String baseUri, EscapeMode escapeMode) { if (StringUtils.isBlank(html)) { return html; } if (whitelist == null) { whitelist = getWhiteList(); } String cleaned = Jsoup.clean(html, baseUri != null ? baseUri : "", whitelist); //$NON-NLS-1$ Document cleanedDocument = Jsoup.parse(cleaned); cleanedDocument.outputSettings().escapeMode(escapeMode != null ? escapeMode : EscapeMode.xhtml); return cleanedDocument.body().html(); }
From source file:org.finra.herd.core.HerdStringUtils.java
/** * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist * * @param fragment the specified String// w w w. j a va 2s. co m * @param whitelistTags the specified whitelist tags * * @return cleaned String with allowed tags */ public static String stripHtml(String fragment, String... whitelistTags) { // Parse out html tags except those from a given list of whitelist tags Document dirty = Jsoup.parseBodyFragment(fragment); Whitelist whitelist = new Whitelist(); for (String whitelistTag : whitelistTags) { // Get the actual tag name from the whitelist tag // this is vulnerable in general to complex tags but will suffice for our simple needs whitelistTag = StringUtils.removePattern(whitelistTag, "[^\\{IsAlphabetic}]"); // Add all specified tags to the whitelist while preserving inline css whitelist.addTags(whitelistTag).addAttributes(whitelistTag, "class"); } Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); // Set character encoding to UTF-8 and make sure no line-breaks are added clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8) .prettyPrint(false); // return 'cleaned' html body return clean.body().html(); }
From source file:org.mar9000.space2latex.WikiPage.java
public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException { String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); Document document = Jsoup.parseBodyFragment(page.storage); document.outputSettings().prettyPrint(false); Elements images = document.select("ac|image"); if (images.size() > 0) LOGGER.info(" Download images:"); for (Element element : images) { String downloadURL = null; String imageKey = null;/*from w w w. j a v a2s. co m*/ // Attachment? Elements refs = element.select("ri|attachment"); WikiImage image = new WikiImage(); image.pageId = page.id; image.acImage = element.outerHtml(); // if (refs.size() > 0) { // Attachment. Element riAttachment = refs.get(0); imageKey = riAttachment.attr("ri:filename"); Elements riPages = riAttachment.select("ri|page"); // Thumbnails are not found with "child/attachment" URL schema. boolean isThumbnail = "true".equals(element.attr("ac:thumbnail")); String queryURL = null; if (!isThumbnail) { queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey); } else { // For thumbnail we construct directly the downloadURL without queryURL. /* Some pages have thumbnail images for better online reading. * Here we download always the attached file to embed readable imagesinto the pdf. downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey); */ downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/" + page.id + "/" + URLEncoder.encode(imageKey); } if (riPages.size() > 0) { // The attachment is related with another page. 
Element riPage = riPages.get(0); String space = riPage.attr("ri:space-key"); String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20"); String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle + "&spaceKey=" + space; JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL); if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0) throw new RuntimeException( "Page \"" + contentTitle + "\" in space " + space + " not found."); JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0); image.pageId = jsonNewPage.getString(JSON_ID_ATTR); // Overwrite queryURL. String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey); } if (!isThumbnail) downloadURL = getAttachmentDownloadURL(queryURL); } else { refs = element.select("ri|url"); if (refs.size() > 0) { // URL. downloadURL = refs.get(0).attr("ri:value"); URL tempURL = new URL(downloadURL); String urlPath = tempURL.getPath(); imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1); } else { throw new RuntimeException("Image format unknown: " + element.toString()); } } // Download the image data. image.filename = imageKey.replace(' ', '_'); // Space are not handled by LaTeX. if (downloadURL != null) { LOGGER.info(" about to download image {}/{}", new Object[] { image.pageId, image.filename }); image.data = IOUtils.getImageFromURL(downloadURL); } else { LOGGER.info(" NULL download URL for page/image: {}/{}", new Object[] { image.pageId, image.filename }); } page.images.put(imageKey, image); } }
From source file:org.mar9000.space2latex.WikiPage.java
public static WikiPage loadForFormat(File file) throws IOException { String fileContent = IOUtils.readFileAsString(file); Document doc = Jsoup.parseBodyFragment(fileContent); // Maintain input string. doc.outputSettings().prettyPrint(false); Element body = doc.body();/*from w w w .jav a 2 s . c om*/ Element pageElement = body.select("page").first(); String title = pageElement.attr("title"); String id = pageElement.attr("id"); Element pageContent = pageElement.select("content").first(); WikiPage page = new WikiPage(null, title, id, pageContent.html()); page.pageContent = pageContent; // Images. Elements images = body.select("wikiimages").first().select("wikiimage"); for (Element imageElement : images) { WikiImage image = new WikiImage(); String acKey = imageElement.select("ac|image").first().outerHtml(); image.filename = imageElement.attr("pageid") + "/" + imageElement.attr("filename"); page.images.put(acKey, image); } return page; }
From source file:org.opennms.protocols.http.collector.HttpCollectionHandler.java
/** * Gets the JSoup document./*from w w w. ja va2s .c o m*/ * * @param urlString the URL string * @param request the request * @return the JSoup document * @throws Exception the exception */ protected Document getJsoupDocument(String urlString, Request request) throws Exception { InputStream is = null; URLConnection c = null; try { URL url = UrlFactory.getUrl(urlString, request); c = url.openConnection(); is = c.getInputStream(); final Document doc = Jsoup.parse(is, "ISO-8859-9", "/"); doc.outputSettings().escapeMode(EscapeMode.xhtml); return doc; } finally { IOUtils.closeQuietly(is); UrlFactory.disconnect(c); } }
From source file:sk.svec.jan.acb.extraction.DiscussionFinder.java
public void findData(String path) throws Exception { dateCount = 0;// w w w.ja v a 2 s . c o m maxDepth = 0; foundDateStringSwitch = false; foundDate = false; File input = new File(path); Date todayDate = new Date(input.lastModified()); SimpleDateFormat dateFormat = new SimpleDateFormat("dd. MM. yyyy"); today = dateFormat.format(todayDate); Date yesterdayDate = new Date(todayDate.getTime() - 1 * 24 * 3600 * 1000); yesterday = dateFormat.format(yesterdayDate); Document doc = Jsoup.parse(input, "UTF-8"); Node node = doc; //Using EscapeMode.xhtml will give you output without entities. //sprvne kdovanie doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); traversePage(node); String filePath = path.substring(0, path.lastIndexOf("/") + 1); String outputPath = filePath.replace("extracted", "results"); //create folder for comments String fileName = path.substring(path.lastIndexOf("/") + 1, path.lastIndexOf(".")); // String commentFolderPath = outputPath + fileName + "_comments/"; // new File(commentFolderPath).mkdirs(); //initialize allLevels = new ArrayList<HashMap<String, Integer>>(); for (int i = 0; i <= maxDepth; i++) { allLevels.add(new HashMap<String, Integer>()); } boolean findDocumentParts = findDocumentParts(node); if (findDocumentParts) { // System.out.println(documentPart); Elements documentParts = doc.select(documentPartNode); int i = 0; for (Element documentPart : documentParts) { // System.out.println(documentPart.toString()+"\n"); DocumentPartFinder dpf = new DocumentPartFinder(documentPart.toString(), today, yesterday); // System.out.println("celly komentar "+dpf.getDoc().text()); for (Node nod : dpf.getNodesToRemove()) { // System.out.println(nod); dpf.removeNodes(dpf.getNode(), nod); } // System.out.println("XXXXXXXX"); String text = dpf.getDoc().text(); if (text.trim().length() == 0) { text = "null"; } //ak nenajdeme text alebo autora tak nevypiseme nic // if (text.trim().length() != 0 && dpf.getAuthor() != null) { String name; if (dpf.getAuthor() 
== null) { name = "null"; nullAuthor++; } else { name = dpf.getAuthor().trim(); } String date; if (dpf.getDate() == null) { date = "null"; } else { date = dpf.getDate().trim(); } String title = "diskusia"; //remove html tags title = html2text(title); name = html2text(name); date = html2text(date); //odstrani autor: xxx, datum: xxx atd // if (name.indexOf(":") != -1) { // name = name.substring(name.indexOf(":") + 1); // } date = findDateRegex(date); //nacitanie linku z exkterneho suboru String linkPath = filePath.replace("extracted", "links"); linkPath = linkPath + fileName + ".link"; String link = new Scanner(new File(linkPath)).useDelimiter("\\A").next(); String xmlPath = (outputPath + fileName + "_comment" + i + ".xml"); linkAndPath.add("<a href=\"" + link + "\">" + link + "</a> - <a href=\"/WebStructureDetection-web/getfile?name=" + xmlPath + "\"> " + xmlPath + "</a>"); WriteXMLFile wxmlf = new WriteXMLFile(); wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlPath); //cesty pre autora, ak nenaslo, ulozi do specialneho suboru String xmlFileName; if (name.compareTo("null") == 0) { xmlFileName = "deletedLinksLog.xml"; name = ""; date = ""; text = ""; title = xmlPath; } else { xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes())) + ".xml"; } // String xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes())) + ".xml"; StringTokenizer st = new StringTokenizer(outputPath, "/"); //cesta k suboru output/sk/cas/ napriklad String outputPath2 = ""; for (int j = 0; j < 3; j++) { outputPath2 += st.nextToken() + "/"; // System.out.println(st.nextToken()); } String xmlAuthorPath = outputPath2 + "author/" + xmlFileName; new File(outputPath2 + "author/").mkdirs(); //ulozenie autora if (text.compareTo("null") != 0) { File f = new File(xmlAuthorPath); if (f.isFile()) { wxmlf.addToXmlFile(link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath); } else { 
wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath); } } System.out.println("username: " + name); System.out.println("date: " + date); System.out.println("text: " + text); System.out.println("comment " + i + "extracted succesfully\n"); // } i++; } } }
From source file:utils.AutoLinkRenderer.java
private AutoLinkRenderer parse(Pattern pattern, ToLink toLink) { Document doc = Jsoup.parse(body); Document.OutputSettings settings = doc.outputSettings(); settings.prettyPrint(false);//from w w w. j av a 2 s .c o m Elements elements = doc.getElementsMatchingOwnText(pattern); for (Element el : elements) { if (isIgnoreElement(el)) { continue; } List<TextNode> textNodeList = el.textNodes(); for (TextNode node : textNodeList) { String result = convertLink(node.toString(), pattern, toLink); node.text(StringUtils.EMPTY); node.after(result); } } this.body = doc.body().html(); return this; }