Java Utility Methods HTML Parse Jsoup

List of utility methods for parsing HTML with Jsoup

Description

The methods for parsing HTML with Jsoup are organized into topic(s).

Method

String getErrorMessage(String htmlStr)
We need to show the custom error message returned from the configuration management system.
String errorMsg = html2text(htmlStr);
errorMsg = errorMsg.substring(
        errorMsg.indexOf(HTTP_ERROR_MSG_START_OFFSET) + HTTP_ERROR_MSG_START_OFFSET.length(),
        errorMsg.indexOf(HTTP_ERROR_MSG_END_OFFSET));
return errorMsg;
String getExplanation(String html)
get Explanation
String text = Jsoup.parse(html).text();
int indexof_explanation = -1;
int indexof_tomorrowspic = -1;
int indexof_wekeepanarchive = -1;
indexof_explanation = text.indexOf(EXPLANATION);
indexof_tomorrowspic = text.indexOf(TOMORROWS_PIC);
indexof_wekeepanarchive = text.indexOf(WE_KEEP_AN_ARCHIVE);
if (indexof_explanation == -1 || (indexof_tomorrowspic == -1 && indexof_wekeepanarchive == -1)) {
...
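String getFirstImageSrc(String html)
Returns the src attribute of the first img element in the HTML fragment, or null if the input is null or contains no img element.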
if (html == null)
    return null;
Elements es = Jsoup.parseBodyFragment(html).select("img");
if (es != null && es.size() > 0)
    return es.first().attr("src");
return null;
String getFirstSentence(final String html)
Returns the first sentence of the specified HTML text.
final Document newDoc = Document.createShell("");
final Element newBody = newDoc.body();
final Document document = parse(html);
final Element body = document.body();
for (final Node node : body.childNodes()) {
    if (node instanceof TextNode) {
        final String text = ((TextNode) node).text();
        final String[] parts = text.split("\\.(\\s+|$)", 2);
...
List<List<String>> getFirstTableFromHTML(String result)
Converts the first table in an HTML snippet to a list of lists of strings.
Document doc = parse(result);
Element table = doc.select("table").get(0);
List<List<String>> rowList = table.select("tr").stream()
        .map(tr -> tr.select("td").stream().map(element -> element.text()).collect(toList()))
        .collect(toList());
return rowList;
Document getHtml(String url, String ruta_fich)
Gets the html.
Document doc = Jsoup.connect(url).timeout(0).get();
return doc;
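String getHtmlBodyContent(String html)
Parses the input as an HTML body fragment and returns the inner HTML of the resulting body, or null if the input is null.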
if (html == null)
    return null;
Document doc = Jsoup.parseBodyFragment(html);
if (doc != null) {
    return doc.body().html();
return null;
Elements getHtmlInTag(String html, String tag)
Gets the elements with the given tag from the HTML; the result includes the tag itself.
 input (html): <code>Hello world!</code>
input (tag): code, output: <code>Hello world</code>
The enclosing tag can then be removed by using #removeTag(String).
return parse(html).child(0).getElementsByTag(tag);
String getImageCredit(String html)
get Image Credit
String text = Jsoup.parse(html).text();
indices = new ArrayList<>();
int credit_index = -1;
int credit_length = -1;
for (String credit : CREDIT_STRINGS) {
    indices.add(new Integer(credit_index = text.indexOf(credit)));
    if (credit_index != -1) {
        credit_length = credit.length() + 1;
...
List<String> getJSFileLinks(String html)
Returns the absolute src URLs of all script elements in the HTML whose URL ends with ".js".
Document doc = Jsoup.parse(html);
Elements scriptSrc = doc.select("script[src]");
List<String> jsFileLinks = new ArrayList<String>();
for (Element script : scriptSrc) {
    if (script.attr("abs:src").endsWith(".js")) {
        jsFileLinks.add(script.attr("abs:src"));
return jsFileLinks;