Example usage for org.jsoup.nodes Document charset

List of usage examples for org.jsoup.nodes Document charset

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.charset(Charset).

Prototype

public void charset(Charset charset) 

Source Link

Document

Sets the charset used in this document.

Usage

From source file:dslab.crawler.pack.CrawlerPack.java

/**
 * Parses an HTML string with jsoup's HTML parser and sets the resulting
 * document's charset to UTF-8.
 *
 * @param html HTML markup to parse
 * @return the parsed document with its charset set to UTF-8
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html) {
    // The second argument of Jsoup.parse(String, String, Parser) is the base URI used to
    // resolve relative links, not an encoding name. The input is already a decoded String
    // and no source URL is known here, so an empty base URI is passed.
    Document jsoupDoc = Jsoup.parse(html, "", Parser.htmlParser());

    // The charset is set on the document explicitly after parsing.
    jsoupDoc.charset(StandardCharsets.UTF_8);

    return jsoupDoc;
}

From source file:dslab.crawler.pack.CrawlerPack.java

/**
 * Parses an XML string into a jsoup Document after inserting the lower-cased
 * {@code prefix} in front of tag names whose first character is not an ASCII letter.
 *
 * @param xml XML text to parse; may be null
 * @return the parsed document with its charset set to UTF-8, or null when {@code xml} is null
 */
public org.jsoup.nodes.Document xmlToJsoupDoc(String xml) {
    if (xml == null) {
        return null;
    }

    final String tagPrefix = prefix.toLowerCase();

    // Opening tags: the name starts with something other than a letter, '/', '!' or a space,
    // and the rest of the tag contains no '/' or '>'.
    String prefixed = xml.replaceAll("<([^A-Za-z\\/! ][^\\/>]*)>", "<" + tagPrefix + "$1>");
    // Closing tags of the same shape.
    prefixed = prefixed.replaceAll("<\\/([^A-Za-z\\/ ][^\\/>]*)>", "</" + tagPrefix + "$1>");

    // NOTE(review): PrefixXmlTreeBuilder presumably undoes the prefix while building the tree;
    // confirm against its implementation.
    final Parser prefixAwareParser = new Parser(new PrefixXmlTreeBuilder(tagPrefix));
    final Document parsed = Jsoup.parse(prefixed, "", prefixAwareParser);
    parsed.charset(StandardCharsets.UTF_8);
    return parsed;
}

From source file:com.github.abola.crawler.CrawlerPack.java

/**
 *  XML  Jsoup Document /*  w  w  w.  ja va 2  s  . c  o m*/
 *
 * Jsoup 1.9.1+ supported non-ascii tag
 * -----
 * Tag ??? a-zA-Z jsoup ?
 *  prefix
 * ?xmlParse prefix
 *
 * @param xml XML format string
 * @return org.jsoup.nodes.Document
 */
public org.jsoup.nodes.Document xmlToJsoupDoc(String xml) {

    // Tag ? a-zA-Z ?
    //xml = xml.replaceAll("<([^A-Za-z\\/! ][^\\/>]*)>", "<"+prefix.toLowerCase()+"$1>")
    //         .replaceAll("<\\/([^A-Za-z\\/ ][^\\/>]*)>", "</"+prefix.toLowerCase()+"$1>");

    //  xml  jsoup Document 
    //Document jsoupDoc = Jsoup.parse(xml, "", new Parser( new PrefixXmlTreeBuilder(prefix.toLowerCase()) ) );

    Document jsoupDoc = Jsoup.parse(xml, "", Parser.xmlParser());
    jsoupDoc.charset(StandardCharsets.UTF_8);

    return jsoupDoc;
}

From source file:com.github.abola.crawler.CrawlerPack.java

/**
 *  HTML  Jsoup Document /*from  w  ww . ja  v  a 2  s . co  m*/
 *
 * HTMLJsoup HTML Parser
 *
 * @param html Html document
 * @return org.jsoup.nodes.Document
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html) {

    //  html(html/html5)  jsoup Document 
    Document jsoupDoc = Jsoup.parse(html, "UTF-8", Parser.htmlParser());
    jsoupDoc.charset(StandardCharsets.UTF_8);

    return jsoupDoc;
}