List of usage examples for org.jsoup.nodes.Attributes.hasKey
public boolean hasKey(String key)
From source file: com.romeikat.datamessie.core.base.service.download.ContentDownloader.java
public DownloadResult downloadContent(String url) { LOG.debug("Downloading content from {}", url); // In case of a new redirection for that source, use redirected URL URLConnection urlConnection = null; String originalUrl = null;/*w w w . java2 s .c om*/ org.jsoup.nodes.Document jsoupDocument = null; Integer statusCode = null; final LocalDateTime downloaded = LocalDateTime.now(); try { urlConnection = getConnection(url); // Server-side redirection final String responseUrl = getResponseUrl(urlConnection); if (responseUrl != null) { final String redirectedUrl = getRedirectedUrl(url, responseUrl); if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; closeUrlConnection(urlConnection); urlConnection = getConnection(url); LOG.debug("Redirection (server): {} -> {}", originalUrl, url); } } // Download content for further redirects final InputStream urlInputStream = asInputStream(urlConnection, true, false); final Charset charset = getCharset(urlConnection); jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url); final Elements metaTagsHtmlHeadLink; Elements metaTagsHtmlHeadMeta = null; // Meta redirection (<link rel="canonical" .../>) if (originalUrl == null) { metaTagsHtmlHeadLink = jsoupDocument.select("html head link"); for (final Element metaTag : metaTagsHtmlHeadLink) { final Attributes metaTagAttributes = metaTag.attributes(); if (metaTagAttributes.hasKey("rel") && metaTagAttributes.get("rel").equalsIgnoreCase("canonical") && metaTagAttributes.hasKey("href")) { final String redirectedUrl = metaTagAttributes.get("href").trim(); if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; jsoupDocument = null; LOG.debug("Redirection (<link rel=\"canonical\" .../>): {} -> {}", originalUrl, url); break; } } } } // Meta redirection (<meta http-equiv="refresh" .../>) if (originalUrl == null) { metaTagsHtmlHeadMeta = jsoupDocument.select("html head meta"); for (final Element metaTag : metaTagsHtmlHeadMeta) 
{ final Attributes metaTagAttributes = metaTag.attributes(); if (metaTagAttributes.hasKey("http-equiv") && metaTagAttributes.get("http-equiv").equalsIgnoreCase("refresh") && metaTagAttributes.hasKey("content")) { final String[] parts = metaTagAttributes.get("content").replace(" ", "").split("=", 2); if (parts.length > 1) { final String redirectedUrl = parts[1]; if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; jsoupDocument = null; LOG.debug("Redirection (<meta http-equiv=\"refresh\" .../>): {} -> {}", originalUrl, url); break; } } } } } // Meta redirection (<meta property="og:url" .../>) if (originalUrl == null) { for (final Element metaTag : metaTagsHtmlHeadMeta) { final Attributes metaTagAttributes = metaTag.attributes(); if (metaTagAttributes.hasKey("property") && metaTagAttributes.get("property").equalsIgnoreCase("og:url") && metaTagAttributes.hasKey("content")) { final String redirectedUrl = metaTagAttributes.get("content").trim(); if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; jsoupDocument = null; LOG.debug("Redirection (<meta property=\"og:url\" .../>): {} -> {}", originalUrl, url); break; } } } } } catch (final Exception e) { if (e instanceof HttpStatusException) { statusCode = ((HttpStatusException) e).getStatusCode(); } LOG.warn("Could not determine redirected URL for " + url, e); } finally { closeUrlConnection(urlConnection); } // Download content (if not yet done) String content = null; try { if (jsoupDocument == null) { LOG.debug("Downloading content from {}", url); urlConnection = getConnection(url); final InputStream urlInputStream = asInputStream(urlConnection, true, false); final Charset charset = getCharset(urlConnection); jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url); } } catch (final Exception e) { if (e instanceof HttpStatusException) { statusCode = ((HttpStatusException) e).getStatusCode(); } // If the redirected URL does not exist, use the original 
URL instead if (originalUrl == null) { LOG.warn("Could not download content from " + url, e); } // If the redirected URL does not exist and a original URL is available, use the // original URL instead else { try { LOG.debug( "Could not download content from redirected URL {}, downloading content from original URL {} instead", url, originalUrl); urlConnection = getConnection(originalUrl); final InputStream urlInputStream = asInputStream(urlConnection, true, false); final Charset charset = getCharset(urlConnection); jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url); url = originalUrl; originalUrl = null; statusCode = null; } catch (final Exception e2) { LOG.warn("Could not download content from original URL " + url, e); } } } finally { closeUrlConnection(urlConnection); } if (jsoupDocument != null) { content = jsoupDocument.html(); } // Strip non-valid characters as specified by the XML 1.0 standard final String validContent = xmlUtil.stripNonValidXMLCharacters(content); // Unescape HTML characters final String unescapedContent = StringEscapeUtils.unescapeHtml4(validContent); // Done final DownloadResult downloadResult = new DownloadResult(originalUrl, url, unescapedContent, downloaded, statusCode); return downloadResult; }