Java Utility Methods HTML Parse Jsoup

List of utility methods for parsing HTML with Jsoup

Description

The methods for parsing HTML with Jsoup are organized into topic(s).

Method

String extractRssUrl(String html, URI base)
extract Rss Url
Document d = Jsoup.parse(html);
Elements links = d.getElementsByTag(LINK);
for (Element link : links) {
    if (ALTERNATE.equalsIgnoreCase(link.attr(REL))) {
        String type = link.attr(TYPE);
        if (RSS.equalsIgnoreCase(type) || ATOM.equalsIgnoreCase(type)) {
            String href = link.attr(HREF);
            String title = link.attr(TITLE);
...
String filter(String html)
filter
// Blank input short-circuits to an empty string, so Jsoup.clean is never called for it.
// content_filter is declared outside this snippet (presumably the cleaning rule set; confirm).
if (StringUtil.isBlank(html)) {
    return "";
}
return Jsoup.clean(html, content_filter);
String fixHtml(String htmlContent, String outputFile, String contentFile)
fix Html
Document doc = Jsoup.parseBodyFragment(htmlContent);
doc.outputSettings().charset("ASCII");
String relPrefixPath = computeRelPath(outputFile, contentFile);
if (!relPrefixPath.isEmpty()) {
    Elements imgElements = doc.getElementsByTag("img");
    for (Element e : imgElements) {
        String src = e.attr("src");
        if (src != null) {
...
String getContentFromHTML(String html)
get Content From HTML
Document doc = Jsoup.parse(html);
return doc.body().text();
List<String> getDistinctImageUrls(String htmlContent)
Finds all image inclusions (looks for img tags).
// Collect the SRC_ATTR value of every element matched by IMG_SELECTOR, keeping
// only the first occurrence of each value and preserving document order.
Document doc = Jsoup.parse(htmlContent);
Elements els = doc.select(IMG_SELECTOR);
// An insertion-ordered set gives the same "first seen wins" result as the
// previous List.contains check, without rescanning the list for every element.
java.util.Set<String> distinct = new java.util.LinkedHashSet<>();
for (Element e : els) {
    distinct.add(e.attr(SRC_ATTR));
}
// Return a fresh mutable list, as before.
return new ArrayList<>(distinct);
Document getDoc(Connection conn)
get Doc
Document doc = null;
int retry = 0;
IOException e1 = null;
while (retry++ < 3) {
    try {
        doc = conn.get();
        return doc;
    } catch (IOException e) {
...
Document getDoc(File file)
get Doc
// Parse the given file as UTF-8 HTML. This is best-effort: any failure is
// reported on stderr and the caller receives null instead of an exception.
try {
    return Jsoup.parse(file, "UTF-8");
} catch (Exception e) {
    // Deliberately broad catch, kept from the original so callers that rely on
    // a null result for any failure keep working.
    e.printStackTrace();
    return null;
}
Document getDoc(String path)
get Doc
// Load the file through readFile (passing UTF-8), then substitute LINE_START for
// every <br ...> tag (matched case-insensitively) and for every newline before
// handing the text to Jsoup.
String raw = readFile(path, StandardCharsets.UTF_8);
String withoutBreakTags = raw.replaceAll("(?i)<br[^>]*>", LINE_START);
String normalized = withoutBreakTags.replaceAll("\n", LINE_START);
return Jsoup.parse(normalized);
Document getDoc(String url)
get Doc
try {
    Document document = Jsoup.connect(url).timeout(10000).get();
    if (document == null) {
        document = Jsoup.connect(url).timeout(10000).get();
    return document;
} catch (IOException e) {
    System.out.println("get document error," + e.getMessage());
...
String getDoctypeName(InputStream s)
get Doctype Name
final org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(s, "us-ascii", "",
        org.jsoup.parser.Parser.xmlParser());
List<org.jsoup.nodes.Node> nods = doc.childNodes();
for (org.jsoup.nodes.Node node : nods)
    if (node instanceof org.jsoup.nodes.DocumentType) {
        org.jsoup.nodes.DocumentType documentType = (org.jsoup.nodes.DocumentType) node;
        final String res = documentType.attr("name");
        if (res != null)
...