Example usage for org.jsoup.nodes Attributes hasKey

List of usage examples for org.jsoup.nodes Attributes hasKey

Introduction

On this page you can find example usage for org.jsoup.nodes.Attributes.hasKey.

Prototype

public boolean hasKey(String key) 

Source Link

Document

Tests if these attributes contain an attribute with this key.

Usage

From source file: com.romeikat.datamessie.core.base.service.download.ContentDownloader.java

/**
 * Downloads the HTML content behind {@code url}, following at most one redirection.
 *
 * <p>Redirections are looked for in this order, and the first one accepted by
 * {@code isValidRedirection} wins:
 * <ol>
 * <li>server-side redirection (response URL of the connection),</li>
 * <li>{@code <link rel="canonical" href="..."/>},</li>
 * <li>{@code <meta http-equiv="refresh" content="...=..."/>},</li>
 * <li>{@code <meta property="og:url" content="..."/>}.</li>
 * </ol>
 * When a redirection found in the document itself is accepted, the already parsed document is
 * discarded and the content is downloaded again from the redirected URL. If that download fails,
 * the content is downloaded from the original URL instead and the redirection is dropped.
 *
 * <p>Failures are logged and not propagated; in that case the result is built from whatever
 * could be determined so far.
 *
 * @param url the URL to download from
 * @return a {@code DownloadResult} built from the original URL ({@code null} unless a redirection
 *         was followed), the URL finally used, the processed content, the timestamp taken at the
 *         start of the download, and the HTTP status code (only set when an
 *         {@code HttpStatusException} was caught)
 */
public DownloadResult downloadContent(String url) {
    LOG.debug("Downloading content from {}", url);
    // In case of a new redirection for that source, use redirected URL
    URLConnection urlConnection = null;
    String originalUrl = null;
    org.jsoup.nodes.Document jsoupDocument = null;
    Integer statusCode = null;
    final LocalDateTime downloaded = LocalDateTime.now();
    try {
        urlConnection = getConnection(url);
        // Server-side redirection
        final String responseUrl = getResponseUrl(urlConnection);
        if (responseUrl != null) {
            final String redirectedUrl = getRedirectedUrl(url, responseUrl);
            if (isValidRedirection(url, redirectedUrl)) {
                originalUrl = url;
                url = redirectedUrl;
                closeUrlConnection(urlConnection);
                urlConnection = getConnection(url);
                LOG.debug("Redirection (server): {} -> {}", originalUrl, url);
            }
        }
        // Download content for further redirects
        final InputStream urlInputStream = asInputStream(urlConnection, true, false);
        final Charset charset = getCharset(urlConnection);
        jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url);
        final Elements metaTagsHtmlHeadLink;
        Elements metaTagsHtmlHeadMeta = null;
        // Meta redirection (<link rel="canonical" .../>)
        if (originalUrl == null) {
            metaTagsHtmlHeadLink = jsoupDocument.select("html head link");
            for (final Element metaTag : metaTagsHtmlHeadLink) {
                final Attributes metaTagAttributes = metaTag.attributes();
                if (metaTagAttributes.hasKey("rel")
                        && metaTagAttributes.get("rel").equalsIgnoreCase("canonical")
                        && metaTagAttributes.hasKey("href")) {
                    final String redirectedUrl = metaTagAttributes.get("href").trim();
                    if (isValidRedirection(url, redirectedUrl)) {
                        originalUrl = url;
                        url = redirectedUrl;
                        // Discard the document so that it is downloaded again from the new URL
                        jsoupDocument = null;
                        LOG.debug("Redirection (<link rel=\"canonical\" .../>): {} -> {}", originalUrl, url);
                        break;
                    }
                }
            }
        }
        // Meta redirection (<meta http-equiv="refresh" .../>)
        if (originalUrl == null) {
            metaTagsHtmlHeadMeta = jsoupDocument.select("html head meta");
            for (final Element metaTag : metaTagsHtmlHeadMeta) {
                final Attributes metaTagAttributes = metaTag.attributes();
                if (metaTagAttributes.hasKey("http-equiv")
                        && metaTagAttributes.get("http-equiv").equalsIgnoreCase("refresh")
                        && metaTagAttributes.hasKey("content")) {
                    // Everything after the first '=' (blanks removed) is taken as the target URL
                    final String[] parts = metaTagAttributes.get("content").replace(" ", "").split("=", 2);
                    if (parts.length > 1) {
                        final String redirectedUrl = parts[1];
                        if (isValidRedirection(url, redirectedUrl)) {
                            originalUrl = url;
                            url = redirectedUrl;
                            jsoupDocument = null;
                            LOG.debug("Redirection (<meta http-equiv=\"refresh\" .../>): {} -> {}", originalUrl,
                                    url);
                            break;
                        }
                    }
                }
            }
        }
        // Meta redirection (<meta property="og:url" .../>)
        // (originalUrl still being null implies the meta tags were selected in the previous step)
        if (originalUrl == null) {
            for (final Element metaTag : metaTagsHtmlHeadMeta) {
                final Attributes metaTagAttributes = metaTag.attributes();
                if (metaTagAttributes.hasKey("property")
                        && metaTagAttributes.get("property").equalsIgnoreCase("og:url")
                        && metaTagAttributes.hasKey("content")) {
                    final String redirectedUrl = metaTagAttributes.get("content").trim();
                    if (isValidRedirection(url, redirectedUrl)) {
                        originalUrl = url;
                        url = redirectedUrl;
                        jsoupDocument = null;
                        LOG.debug("Redirection (<meta property=\"og:url\" .../>): {} -> {}", originalUrl, url);
                        break;
                    }
                }
            }
        }
    } catch (final Exception e) {
        if (e instanceof HttpStatusException) {
            statusCode = ((HttpStatusException) e).getStatusCode();
        }
        LOG.warn("Could not determine redirected URL for " + url, e);
    } finally {
        closeUrlConnection(urlConnection);
    }
    // Download content (if not yet done)
    String content = null;
    try {
        if (jsoupDocument == null) {
            LOG.debug("Downloading content from {}", url);
            urlConnection = getConnection(url);
            final InputStream urlInputStream = asInputStream(urlConnection, true, false);
            final Charset charset = getCharset(urlConnection);
            jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url);
        }
    } catch (final Exception e) {
        if (e instanceof HttpStatusException) {
            statusCode = ((HttpStatusException) e).getStatusCode();
        }
        // Without a redirection, there is no other URL to fall back to
        if (originalUrl == null) {
            LOG.warn("Could not download content from " + url, e);
        }
        // If the redirected URL does not exist and an original URL is available, use the
        // original URL instead
        else {
            try {
                LOG.debug(
                        "Could not download content from redirected URL {}, downloading content from original URL {} instead",
                        url, originalUrl);
                urlConnection = getConnection(originalUrl);
                final InputStream urlInputStream = asInputStream(urlConnection, true, false);
                final Charset charset = getCharset(urlConnection);
                // The document comes from the original URL, so that is its base URI
                jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), originalUrl);
                // Fallback succeeded: forget the redirection and the status code of the failed attempt
                url = originalUrl;
                originalUrl = null;
                statusCode = null;
            } catch (final Exception e2) {
                // Report the URL and the exception of the fallback attempt, not of the first attempt
                LOG.warn("Could not download content from original URL " + originalUrl, e2);
            }
        }
    } finally {
        closeUrlConnection(urlConnection);
    }
    if (jsoupDocument != null) {
        content = jsoupDocument.html();
    }
    // Strip non-valid characters as specified by the XML 1.0 standard
    final String validContent = xmlUtil.stripNonValidXMLCharacters(content);
    // Unescape HTML characters
    final String unescapedContent = StringEscapeUtils.unescapeHtml4(validContent);
    // Done
    final DownloadResult downloadResult = new DownloadResult(originalUrl, url, unescapedContent, downloaded,
            statusCode);
    return downloadResult;
}