Example usage for org.jsoup.nodes Document outputSettings

List of usage examples for org.jsoup.nodes Document outputSettings

Introduction

In this page you can find the example usage for org.jsoup.nodes Document outputSettings.

Prototype

public Document.OutputSettings outputSettings()

To view the source code for org.jsoup.nodes Document outputSettings:

Click Source Link

Usage

From source file:org.eclipse.skalli.commons.HtmlUtils.java

/**
 * Sanitizes an HTML fragment by removing every tag and attribute that is not
 * permitted by a whitelist.
 *
 * @param html  the HTML fragment to sanitize. Blank input is returned unchanged.
 * @param whitelist  the tags and attributes to keep, or <code>null</code>
 * to fall back to the {@link #getWhiteList() default whitelist}.
 * @param baseUri  base URL to resolve relative URLs against, or <code>null</code>
 * (in which case the empty string is used).
 * @param escapeMode  determines how XML/HTML entities are escaped in the result,
 * or <code>null</code> to use the default {@link EscapeMode#xhtml},
 * i.e. only the XML entities <tt>&amp;quot</tt>, <tt>&amp;amp</tt>, <tt>&amp;apos</tt>,
 * <tt>&amp;lt</tt>, and <tt>&amp;gt</tt> are recognized.
 *
 * @return the sanitized fragment, serialized from the body of the re-parsed result.
 */
public static String clean(String html, Whitelist whitelist, String baseUri, EscapeMode escapeMode) {
    if (StringUtils.isBlank(html)) {
        return html;
    }

    // Resolve the optional arguments to their effective values up front.
    Whitelist effectiveWhitelist = whitelist;
    if (effectiveWhitelist == null) {
        effectiveWhitelist = getWhiteList();
    }
    String effectiveBaseUri = (baseUri == null) ? "" : baseUri; //$NON-NLS-1$
    EscapeMode effectiveEscapeMode = (escapeMode == null) ? EscapeMode.xhtml : escapeMode;

    // Sanitize first, then re-parse so that the requested escape mode is applied on output.
    String sanitized = Jsoup.clean(html, effectiveBaseUri, effectiveWhitelist);
    Document sanitizedDocument = Jsoup.parse(sanitized);
    sanitizedDocument.outputSettings().escapeMode(effectiveEscapeMode);
    return sanitizedDocument.body().html();
}

From source file:org.finra.herd.core.HerdStringUtils.java

/**
 * Strips HTML tags from a given input String, retaining only the tags named in a whitelist.
 * <p>
 * Every whitelist entry is reduced to its alphabetic characters before it is registered, so an entry may be passed
 * either as a bare name or in tag form (for example <code>&lt;b&gt;</code>). Known limitation: digits are removed
 * as well, so an entry such as <code>h1</code> is registered as <code>h</code>.
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags
 *
 * @return the body HTML of the cleaned fragment, containing only the allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags) {

    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(fragment);

    Whitelist whitelist = new Whitelist();

    for (String whitelistTag : whitelistTags) {
        // Get the actual tag name from the whitelist tag by dropping every non-alphabetic character.
        // This is vulnerable in general to complex tags but will suffice for our simple needs.
        // The Unicode property must be written as \p{IsAlphabetic}; without the "p" the expression is a negated
        // class of the literal characters "{IsAlphabetic}" and would keep only those letters.
        String tagName = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}]");

        // Add the tag to the whitelist and allow the "class" attribute on it
        whitelist.addTags(tagName).addAttributes(tagName, "class");
    }

    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);
    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8)
            .prettyPrint(false);

    // return 'cleaned' html body
    return clean.body().html();
}

From source file:org.mar9000.space2latex.WikiPage.java

/**
 * Downloads the data of every <code>ac:image</code> element found in the storage markup of the given page
 * and registers the resulting {@link WikiImage} objects in <code>page.images</code>.
 * <p>
 * An image is referenced either through an <code>ri:attachment</code> element (optionally pointing to another
 * page through a nested <code>ri:page</code>) or through an <code>ri:url</code> element.
 *
 * @param page the page whose <code>storage</code> markup is scanned and whose <code>images</code> map is filled.
 * @throws MalformedURLException if the <code>ri:value</code> of an <code>ri:url</code> reference is not a valid URL.
 * @throws RuntimeException if an image has neither an attachment nor a URL reference, or if the page
 *         referenced by an attachment cannot be found.
 */
public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException {
    String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
    Document document = Jsoup.parseBodyFragment(page.storage);
    // Pretty printing is switched off before element markup is captured with outerHtml() below.
    document.outputSettings().prettyPrint(false);
    Elements images = document.select("ac|image");
    if (images.size() > 0)
        LOGGER.info("  Download images:");
    for (Element element : images) {
        String downloadURL = null;
        String imageKey = null;
        // Attachment?
        Elements refs = element.select("ri|attachment");
        WikiImage image = new WikiImage();
        // Defaults to the current page; overwritten below when the attachment belongs to another page.
        image.pageId = page.id;
        image.acImage = element.outerHtml();
        //
        if (refs.size() > 0) { // Attachment.
            Element riAttachment = refs.get(0);
            imageKey = riAttachment.attr("ri:filename");
            Elements riPages = riAttachment.select("ri|page");
            // Thumbnails are not found with "child/attachment" URL schema.
            boolean isThumbnail = "true".equals(element.attr("ac:thumbnail"));
            String queryURL = null;
            if (!isThumbnail) {
                queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            } else {
                // For thumbnail we construct directly the downloadURL without queryURL.
                /* Some pages have thumbnail images for better online reading.
                 * Here we download always the attached file to embed readable images into the pdf.
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api"))
                      + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey);
                */
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/"
                        + page.id + "/" + URLEncoder.encode(imageKey);
            }
            if (riPages.size() > 0) {
                // The attachment is related with another page.
                Element riPage = riPages.get(0);
                String space = riPage.attr("ri:space-key");
                String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20");
                String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                // Look the other page up by title and space key, relative to the parent of this page's own URL.
                String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle
                        + "&spaceKey=" + space;
                JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL);
                if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0)
                    throw new RuntimeException(
                            "Page \"" + contentTitle + "\" in space " + space + " not found.");
                JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0);
                image.pageId = jsonNewPage.getString(JSON_ID_ATTR);
                // Overwrite queryURL.
                // NOTE(review): for thumbnails the downloadURL built above still uses page.id of the
                // current page and this queryURL is never used - confirm that this is intended.
                String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            }
            // Non-thumbnail attachments get their download URL from the (possibly overwritten) query URL.
            if (!isThumbnail)
                downloadURL = getAttachmentDownloadURL(queryURL);
        } else {
            refs = element.select("ri|url");
            if (refs.size() > 0) { // URL.
                downloadURL = refs.get(0).attr("ri:value");
                URL tempURL = new URL(downloadURL);
                String urlPath = tempURL.getPath();
                // The image key is the last segment of the URL path.
                imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1);
            } else {
                throw new RuntimeException("Image format unknown: " + element.toString());
            }
        }
        // Download the image data.
        image.filename = imageKey.replace(' ', '_'); // Space are not handled by LaTeX.
        if (downloadURL != null) {
            LOGGER.info("    about to download image {}/{}", new Object[] { image.pageId, image.filename });
            image.data = IOUtils.getImageFromURL(downloadURL);
        } else {
            LOGGER.info("    NULL download URL for page/image: {}/{}",
                    new Object[] { image.pageId, image.filename });
        }
        // The map key is the original image key (spaces included), not the sanitized file name.
        page.images.put(imageKey, image);
    }
}

From source file:org.mar9000.space2latex.WikiPage.java

/**
 * Rebuilds a {@link WikiPage} from a file that contains a <code>page</code> element (with <code>title</code>
 * and <code>id</code> attributes and a nested <code>content</code> element) and a <code>wikiimages</code>
 * element listing the page's <code>wikiimage</code> entries.
 *
 * @param file the file to read.
 * @return the page, with its content element and its images registered under the outer HTML of their
 *         <code>ac:image</code> element.
 * @throws IOException declared for reading the file content.
 */
public static WikiPage loadForFormat(File file) throws IOException {
    Document parsed = Jsoup.parseBodyFragment(IOUtils.readFileAsString(file));
    // Pretty printing is disabled so that the input string is maintained.
    parsed.outputSettings().prettyPrint(false);
    Element root = parsed.body();

    // Page header and content.
    Element pageNode = root.select("page").first();
    Element contentNode = pageNode.select("content").first();
    WikiPage result = new WikiPage(null, pageNode.attr("title"), pageNode.attr("id"), contentNode.html());
    result.pageContent = contentNode;

    // Images: only the file name is restored, in the form "<pageid>/<filename>".
    Element imageList = root.select("wikiimages").first();
    for (Element imageNode : imageList.select("wikiimage")) {
        WikiImage entry = new WikiImage();
        String key = imageNode.select("ac|image").first().outerHtml();
        entry.filename = imageNode.attr("pageid") + "/" + imageNode.attr("filename");
        result.images.put(key, entry);
    }
    return result;
}

From source file:org.opennms.protocols.http.collector.HttpCollectionHandler.java

/**
 * Gets the JSoup document./*from  w w w. ja va2s .c  o  m*/
 *
 * @param urlString the URL string
 * @param request the request
 * @return the JSoup document
 * @throws Exception the exception
 */
protected Document getJsoupDocument(String urlString, Request request) throws Exception {
    InputStream is = null;
    URLConnection c = null;
    try {
        URL url = UrlFactory.getUrl(urlString, request);
        c = url.openConnection();
        is = c.getInputStream();
        final Document doc = Jsoup.parse(is, "ISO-8859-9", "/");
        doc.outputSettings().escapeMode(EscapeMode.xhtml);
        return doc;
    } finally {
        IOUtils.closeQuietly(is);
        UrlFactory.disconnect(c);
    }
}

From source file:sk.svec.jan.acb.extraction.DiscussionFinder.java

/**
 * Extracts the discussion comments from a saved HTML page and writes them out as XML files.
 * <p>
 * The page is parsed as UTF-8 and traversed. If document parts are found, every element matching
 * <code>documentPartNode</code> is handed to a {@link DocumentPartFinder} to obtain author, date and text.
 * Each comment is written to its own <code>&lt;fileName&gt;_comment&lt;i&gt;.xml</code> file in the
 * "results" counterpart of the input directory, and is additionally created in or appended to a
 * per-author file below the first three segments of that output path.
 * The link of the page is read from the matching <code>.link</code> file in the "links" counterpart of
 * the input directory.
 *
 * @param path path of the extracted HTML file; expected to use "/" as separator and to contain a "."
 * @throws Exception if the page or the link file cannot be read, or if writing the XML output fails
 */
public void findData(String path) throws Exception {
    dateCount = 0;
    maxDepth = 0;
    foundDateStringSwitch = false;
    foundDate = false;
    File input = new File(path);

    // "Today" is derived from the last-modified time of the input file, not from the current time.
    Date todayDate = new Date(input.lastModified());
    SimpleDateFormat dateFormat = new SimpleDateFormat("dd. MM. yyyy");
    today = dateFormat.format(todayDate);
    Date yesterdayDate = new Date(todayDate.getTime() - 1 * 24 * 3600 * 1000);
    yesterday = dateFormat.format(yesterdayDate);

    Document doc = Jsoup.parse(input, "UTF-8");
    Node node = doc;
    // Using EscapeMode.xhtml will give you output without entities (correct encoding).
    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    // NOTE(review): presumably updates maxDepth, which sizes allLevels below - confirm.
    traversePage(node);

    String filePath = path.substring(0, path.lastIndexOf("/") + 1);
    String outputPath = filePath.replace("extracted", "results");

    // File name without directory and extension; used to name the link file and the comment files.
    String fileName = path.substring(path.lastIndexOf("/") + 1, path.lastIndexOf("."));

    // Initialize one map per depth level.
    allLevels = new ArrayList<HashMap<String, Integer>>();
    for (int i = 0; i <= maxDepth; i++) {
        allLevels.add(new HashMap<String, Integer>());
    }

    boolean findDocumentParts = findDocumentParts(node);
    if (findDocumentParts) {
        Elements documentParts = doc.select(documentPartNode);
        int i = 0;
        for (Element documentPart : documentParts) {
            DocumentPartFinder dpf = new DocumentPartFinder(documentPart.toString(), today, yesterday);
            for (Node nod : dpf.getNodesToRemove()) {
                dpf.removeNodes(dpf.getNode(), nod);
            }
            String text = dpf.getDoc().text();

            // Missing values are represented by the literal string "null".
            if (text.trim().length() == 0) {
                text = "null";
            }

            String name;
            if (dpf.getAuthor() == null) {
                name = "null";
                nullAuthor++;
            } else {
                name = dpf.getAuthor().trim();
            }

            String date;
            if (dpf.getDate() == null) {
                date = "null";
            } else {
                date = dpf.getDate().trim();
            }

            String title = "diskusia";

            // Remove html tags.
            title = html2text(title);
            name = html2text(name);
            date = html2text(date);

            date = findDateRegex(date);

            // Load the link from the external file. The scanner is closed explicitly so that the
            // file handle is not leaked on every loop iteration.
            String linkPath = filePath.replace("extracted", "links");
            linkPath = linkPath + fileName + ".link";
            String link;
            Scanner linkScanner = new Scanner(new File(linkPath));
            try {
                // "\\A" matches only the start of input, so next() returns the whole file content.
                link = linkScanner.useDelimiter("\\A").next();
            } finally {
                linkScanner.close();
            }

            String xmlPath = (outputPath + fileName + "_comment" + i + ".xml");

            linkAndPath.add("<a href=\"" + link + "\">" + link
                    + "</a> - <a href=\"/WebStructureDetection-web/getfile?name=" + xmlPath + "\"> " + xmlPath
                    + "</a>");

            WriteXMLFile wxmlf = new WriteXMLFile();
            wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlPath);

            // Paths for the author; if no author was found, the entry goes to a special log file.
            String xmlFileName;
            if (name.compareTo("null") == 0) {
                xmlFileName = "deletedLinksLog.xml";
                name = "";
                date = "";
                text = "";
                title = xmlPath;
            } else {
                // The author file is named after the MD5 hash of the author name.
                xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes()))
                        + ".xml";
            }
            StringTokenizer st = new StringTokenizer(outputPath, "/");
            // Path to the author directory: the first three segments of outputPath, e.g. output/sk/cas/
            String outputPath2 = "";
            for (int j = 0; j < 3; j++) {
                outputPath2 += st.nextToken() + "/";
            }
            String xmlAuthorPath = outputPath2 + "author/" + xmlFileName;
            new File(outputPath2 + "author/").mkdirs();

            // Save the author entry, unless the comment has no text.
            if (text.compareTo("null") != 0) {
                File f = new File(xmlAuthorPath);
                if (f.isFile()) {
                    wxmlf.addToXmlFile(link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath);
                } else {
                    wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(),
                            xmlAuthorPath);
                }

            }
            System.out.println("username: " + name);
            System.out.println("date: " + date);
            System.out.println("text: " + text);

            System.out.println("comment " + i + "extracted succesfully\n");
            i++;
        }
    }

}

From source file:utils.AutoLinkRenderer.java

/**
 * Converts the text nodes of every element whose own text matches the given pattern and stores the
 * resulting markup back into <code>body</code>. Elements rejected by <code>isIgnoreElement</code> are
 * left untouched.
 *
 * @param pattern the pattern used to find the elements and passed on to <code>convertLink</code>
 * @param toLink the link converter passed on to <code>convertLink</code>
 * @return this renderer
 */
private AutoLinkRenderer parse(Pattern pattern, ToLink toLink) {
    Document document = Jsoup.parse(body);
    // Pretty printing is turned off for the serialization at the end.
    document.outputSettings().prettyPrint(false);

    for (Element candidate : document.getElementsMatchingOwnText(pattern)) {
        if (!isIgnoreElement(candidate)) {
            for (TextNode textNode : candidate.textNodes()) {
                // Empty the original text node and insert the converted markup right after it.
                String converted = convertLink(textNode.toString(), pattern, toLink);
                textNode.text(StringUtils.EMPTY);
                textNode.after(converted);
            }
        }
    }

    this.body = document.body().html();
    return this;
}