de.jetwick.snacktory.HtmlFetcher.java Source code

Introduction

Here is the source code for de.jetwick.snacktory.HtmlFetcher.java
Source

/*
 * Copyright 2011 Peter Karich
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations
 * under the License.
 */
package de.jetwick.snacktory;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

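/**
 * Class to fetch articles. This class is intended to be thread safe for fetching; note, however, that the
 * configuration setters (user agent, referrer, cache, proxy, ...) are plain unsynchronized field writes, so an
 * instance should be fully configured before it is shared between threads.
 *
 * @author Peter Karich
 */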
public class HtmlFetcher {

    // Executed once when the class is loaded, before any instance exists: invokes three
    // global switches on SHelper.
    // NOTE(review): going by their names these enable cookie management, user-agent
    // overriding and acceptance of any SSL certificate for the whole JVM -- confirm in
    // SHelper, in particular the security implications of enableAnySSL().
    static {
        SHelper.enableCookieMgmt();
        SHelper.enableUserAgentOverwrite();
        SHelper.enableAnySSL();
    }
    private static final Logger logger = LoggerFactory.getLogger(HtmlFetcher.class);

    /**
     * Small command line helper: reads {@code urls.txt} from the working directory, takes the first
     * double-quoted string of every line as a URL, downloads it and stores the HTML in
     * {@code <domain>.html} (or {@code <domain>2.html} if the domain was already seen).
     *
     * @param args ignored
     * @throws Exception if reading the url list, fetching a page or writing a file fails
     */
    public static void main(String[] args) throws Exception {
        // one fetcher is enough, it keeps no per-request state for fetchAsString
        HtmlFetcher fetcher = new HtmlFetcher();
        Set<String> existing = new LinkedHashSet<String>();

        // try-with-resources: the reader is closed even if a fetch or a write fails
        try (BufferedReader reader = new BufferedReader(new FileReader("urls.txt"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int index1 = line.indexOf('"');
                int index2 = line.indexOf('"', index1 + 1);
                if (index1 < 0 || index2 < 0) {
                    // no complete "..." pair on this line; substring() would throw
                    logger.warn("Skipping line without quoted url: " + line);
                    continue;
                }

                String url = line.substring(index1 + 1, index2);
                String domainStr = SHelper.extractDomain(url, true);
                String counterStr = "";
                // TODO more similarities
                if (existing.contains(domainStr))
                    counterStr = "2";
                else
                    existing.add(domainStr);

                String html = fetcher.fetchAsString(url, 20000);
                String outFile = domainStr + counterStr + ".html";
                try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
                    writer.write(html);
                }
            }
        }
    }

    // Sent as the "Referer" request header by createUrlConnection (goose options only).
    private String referrer = "https://app.appgree.com";
    // Sent as the "User-Agent" request header. Built once, at field initialisation, from the
    // default referrer above; a later setReferrer() call does not change this string.
    private String userAgent = "Mozilla/5.0 (compatible; Snacktory; " + referrer + ")";
    // Sent as the "Cache-Control" request header (goose options only).
    private String cacheControl = "max-age=0";
    // Sent as the "Accept-Language" request header (goose options only).
    private String language = "en-us";
    // Sent as the "Accept" request header on every request.
    private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
    // Sent as the "content-charset" request header (goose options only).
    private String charset = "UTF-8";
    // Optional result cache; null means caching is disabled.
    private SCache cache;
    // Optional proxy; null means none was configured (getProxy() then reports Proxy.NO_PROXY).
    private Proxy proxy = null;
    // Number of cache hits; incremented by getFromCache, reset by clearCacheCounter.
    private AtomicInteger cacheCounter = new AtomicInteger(0);
    // Maximum length of the extracted text; a negative value disables truncation (see lessText).
    private int maxTextLength = -1;
    // Extracts the article content from the fetched HTML.
    private ArticleTextExtractor extractor = new ArticleTextExtractor();
    // Domains of URL shorteners: when a redirect target points to one of these domains,
    // getResolvedUrl resolves that target once more. The set is an anonymous LinkedHashSet
    // subclass filled by an instance initializer, hence the "serial" warning suppression.
    @SuppressWarnings("serial")
    private Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
        {
            add("bit.ly");
            add("cli.gs");
            add("deck.ly");
            add("fb.me");
            add("feedproxy.google.com");
            add("flic.kr");
            add("fur.ly");
            add("goo.gl");
            add("is.gd");
            add("ink.co");
            add("j.mp");
            add("lnkd.in");
            add("on.fb.me");
            add("ow.ly");
            add("plurl.us");
            add("sns.mx");
            add("snurl.com");
            add("su.pr");
            add("t.co");
            add("tcrn.ch");
            add("tl.gd");
            add("tiny.cc");
            add("tinyurl.com");
            add("tmi.me");
            add("tr.im");
            add("twurl.nl");
            add("kcy.me");
        }
    };

    /**
     * Creates a fetcher with the default header values, no cache, no proxy and no text length limit
     * (all defaults come from the field initializers).
     */
    public HtmlFetcher() {
    }

    /**
     * Replaces the extractor used by {@link #fetchAndExtract(String, int, boolean)} to pull the article
     * content out of the fetched HTML.
     *
     * @param extractor the extractor to use from now on
     */
    public void setExtractor(ArticleTextExtractor extractor) {
        this.extractor = extractor;
    }

    /**
     * @return the extractor currently used for content extraction
     */
    public ArticleTextExtractor getExtractor() {
        return extractor;
    }

    /**
     * Sets the cache consulted and filled by {@link #fetchAndExtract(String, int, boolean)}.
     *
     * @param cache the cache to use, or null to disable caching
     * @return this fetcher, for call chaining
     */
    public HtmlFetcher setCache(SCache cache) {
        this.cache = cache;
        return this;
    }

    /**
     * @return the configured cache, or null if none was set
     */
    public SCache getCache() {
        return cache;
    }

    /**
     * @return the number of cache hits counted since creation or since the last
     *         {@link #clearCacheCounter()}
     */
    public int getCacheCounter() {
        return cacheCounter.get();
    }

    /**
     * Resets the cache hit counter to zero.
     *
     * @return this fetcher, for call chaining
     */
    public HtmlFetcher clearCacheCounter() {
        cacheCounter.set(0);
        return this;
    }

    /**
     * Sets the maximum number of characters kept by {@link #lessText(String)}.
     *
     * @param maxTextLength the limit; a negative value (the default is -1) disables truncation
     * @return this fetcher, for call chaining
     */
    public HtmlFetcher setMaxTextLength(int maxTextLength) {
        this.maxTextLength = maxTextLength;
        return this;
    }

    /**
     * @return the configured text length limit; negative means unlimited
     */
    public int getMaxTextLength() {
        return maxTextLength;
    }

    /**
     * @param accept value sent as the "Accept" request header
     */
    public void setAccept(String accept) {
        this.accept = accept;
    }

    /**
     * @param charset value sent as the "content-charset" request header
     */
    public void setCharset(String charset) {
        this.charset = charset;
    }

    /**
     * @param cacheControl value sent as the "Cache-Control" request header
     */
    public void setCacheControl(String cacheControl) {
        this.cacheControl = cacheControl;
    }

    /**
     * @return the value sent as the "Accept-Language" request header
     */
    public String getLanguage() {
        return language;
    }

    /**
     * @param language value sent as the "Accept-Language" request header
     */
    public void setLanguage(String language) {
        this.language = language;
    }

    /**
     * @return the value sent as the "Referer" request header
     */
    public String getReferrer() {
        return referrer;
    }

    /**
     * Sets the value sent as the "Referer" request header. The user agent string, which embedded the
     * default referrer when it was initialised, is not updated by this call.
     *
     * @param referrer the new referrer
     * @return this fetcher, for call chaining
     */
    public HtmlFetcher setReferrer(String referrer) {
        this.referrer = referrer;
        return this;
    }

    /**
     * @return the value sent as the "User-Agent" request header
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * @param userAgent value sent as the "User-Agent" request header
     */
    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    /**
     * @return the value sent as the "Accept" request header
     */
    public String getAccept() {
        return accept;
    }

    /**
     * @return the value sent as the "Cache-Control" request header
     */
    public String getCacheControl() {
        return cacheControl;
    }

    /**
     * @return the value sent as the "content-charset" request header
     */
    public String getCharset() {
        return charset;
    }

    /**
     * @param proxy the proxy to remember, or null to clear a previously set proxy
     */
    public void setProxy(Proxy proxy) {
        this.proxy = proxy;
    }

    /**
     * @return the configured proxy; never null, {@link Proxy#NO_PROXY} is returned when none was set
     */
    public Proxy getProxy() {
        return (proxy != null ? proxy : Proxy.NO_PROXY);
    }

    /**
     * Tells whether a real proxy has been configured.
     * <p>
     * {@link #getProxy()} substitutes {@link Proxy#NO_PROXY} for an unset proxy and therefore never
     * returns null, so a plain null check would always report true; the direct-connection
     * placeholder has to be excluded explicitly.
     *
     * @return true if a proxy other than {@link Proxy#NO_PROXY} is configured
     */
    public boolean isProxySet() {
        Proxy current = getProxy();
        return current != null && !Proxy.NO_PROXY.equals(current);
    }

    /**
     * Fetches the given url and extracts the article content from it.
     * <p>
     * The url is first normalised (hashbang removed, Google/Facebook redirect wrappers unwrapped) and,
     * if {@code resolve} is true, resolved via {@link #getResolvedUrl(String, int)}. Results are looked
     * up in and stored into the configured cache, if any. Documents, apps and packages are skipped,
     * media urls are only recorded, everything else is downloaded and handed to the extractor.
     * <p>
     * This method is best effort: any exception is logged and the result built so far (possibly empty
     * or only partially filled) is returned. The {@code throws} clause is kept for compatibility.
     *
     * @param url the url to fetch
     * @param timeout timeout in milliseconds used for every request
     * @param resolve whether the url should be resolved (follows shortener redirects) before fetching
     * @return the extraction result, never null
     * @throws Exception declared for backward compatibility
     */
    public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {

        JResult result = new JResult();

        try {

            String originalUrl = url;
            url = SHelper.removeHashbang(url);
            String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url);
            if (gUrl != null)
                url = gUrl;
            else {
                gUrl = SHelper.getUrlFromUglyFacebookRedirect(url);
                if (gUrl != null)
                    url = gUrl;
            }

            if (resolve) {
                // check if we can avoid resolving the URL (which hits the website!)
                JResult res = getFromCache(url, originalUrl);
                if (res != null)
                    return res;

                String resUrl = getResolvedUrl(url, timeout);
                // null check first: an overriding resolver may return null instead of ""
                if (resUrl == null || resUrl.isEmpty()) {
                    // getResolvedUrl already warns about the failure itself, so debug level is enough here
                    if (logger.isDebugEnabled())
                        logger.debug("resolved url is empty. Url is: " + url);

                    result = new JResult();
                    if (cache != null)
                        cache.put(url, result);
                    return result.setUrl(url);
                }

                // if resolved url is longer then use it!
                if (resUrl.trim().length() > url.length()) {
                    // this is necessary e.g. for some homebaken url resolvers which return
                    // the resolved url relative to url!
                    url = SHelper.useDomainOfFirstArg4Second(url, resUrl);
                }
            }

            // check if we have the (resolved) URL in cache
            JResult res = getFromCache(url, originalUrl);
            if (res != null)
                return res;

            result = new JResult();
            // or should we use? <link rel="canonical" href="http://www.N24.de/news/newsitem_6797232.html"/>
            result.setUrl(url);
            result.setOriginalUrl(originalUrl);
            result.setDate(SHelper.estimateDate(url));

            // Immediately put the url into the cache as extracting content takes time.
            if (cache != null) {
                cache.put(originalUrl, result);
                cache.put(url, result);
            }

            try {
                // Locale.ROOT: the default locale must not influence file extension matching
                // (e.g. the Turkish dotless i would break ".GIF" -> ".gif")
                String lowerUrl = url.toLowerCase(Locale.ROOT);
                if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) {
                    // skip
                } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) {
                    result.setVideoUrl(url);
                } else if (SHelper.isImage(lowerUrl)) {
                    result.setImageUrl(url);
                } else {
                    extractor.extractContent(url, result, fetchAsString(url, timeout));
                    if (result.getFaviconUrl().isEmpty())
                        result.setFaviconUrl(SHelper.getDefaultFavicon(url));

                    // some links are relative to root and do not include the domain of the url :(
                    result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
                    result.setImageUrl(fixUrl(url, result.getImageUrl()));
                    result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
                    result.setRssUrl(fixUrl(url, result.getRssUrl()));
                }
                result.setText(lessText(result.getText()));
            } finally {
                // The placeholder is already visible through the cache; presumably other threads
                // wait on it until it is filled. Notify them on failure as well so that nobody
                // keeps waiting for a result that will never be completed.
                synchronized (result) {
                    result.notifyAll();
                }
            }

            return result;

        } catch (Exception e) {
            // best effort: hand back whatever we have, but do not hide the reason
            logger.warn("fetchAndExtract failed for url:" + url + " Error:" + e.getMessage(), e);
            return result;
        }
    }

    /**
     * Cuts the given text down to the configured maximum text length.
     *
     * @param text the text to shorten, may be null
     * @return an empty string for null input; the first {@code maxTextLength} characters if a
     *         non-negative limit is configured and exceeded; otherwise the text unchanged
     */
    public String lessText(String text) {
        if (text == null) {
            return "";
        }

        boolean limitActive = maxTextLength >= 0;
        boolean tooLong = text.length() > maxTextLength;
        return (limitActive && tooLong) ? text.substring(0, maxTextLength) : text;
    }

    /**
     * Shorthand for {@code SHelper.useDomainOfFirstArg4Second(url, urlOrPath)}; used to complete links
     * that are relative to the root of the page they were found on.
     *
     * @param url the url of the fetched page
     * @param urlOrPath a url or path found in that page
     * @return whatever the helper returns for the two arguments
     */
    private static String fixUrl(String url, String urlOrPath) {
        return SHelper.useDomainOfFirstArg4Second(url, urlOrPath);
    }

    /**
     * Downloads the given url as text with the additional "goose" request headers enabled; delegates
     * to {@link #fetchAsString(String, int, boolean)} with {@code true} as last argument.
     *
     * @param urlAsString the url to download
     * @param timeout timeout in milliseconds
     * @return the response body as string
     * @throws MalformedURLException propagated from the three argument variant
     * @throws IOException propagated from the three argument variant
     */
    public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException, IOException {
        return fetchAsString(urlAsString, timeout, true);
    }

    /**
     * Downloads the given url with a GET request and returns the body as string.
     * <p>
     * An {@code https://} scheme is downgraded to {@code http://}; only the scheme is touched, other
     * occurrences of "https" in the url are left alone. The character set is derived from the
     * Content-Type header via {@code Converter.extractEncoding}.
     *
     * @param urlAsString the url to download
     * @param timeout timeout in milliseconds
     * @param includeSomeGooseOptions whether the additional language/charset/referrer/cache-control
     *        request headers are sent
     * @return the response body, or an empty string if the response carries no entity
     * @throws MalformedURLException if the status code is 400 or above, or the Content-Type does not
     *         start with "text"
     * @throws IOException if the request or reading the body fails
     */
    public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
            throws MalformedURLException, IOException {
        final String httpsPrefix = "https://";
        if (urlAsString.regionMatches(true, 0, httpsPrefix, 0, httpsPrefix.length()))
            urlAsString = "http://" + urlAsString.substring(httpsPrefix.length());

        CloseableHttpResponse response = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions, false);
        // everything below runs inside try/finally so the response is released on every path,
        // including the status and content type rejections
        try {
            if (response.getStatusLine().getStatusCode() > 399) {
                throw new MalformedURLException(response.getStatusLine().toString());
            }

            Header contentTypeHeader = response.getFirstHeader("Content-Type");
            String contentType;
            if (contentTypeHeader == null) {
                contentType = "utf-8";
            } else {
                contentType = contentTypeHeader.getValue();
                if (contentType == null || !contentType.startsWith("text")) {
                    throw new MalformedURLException("Not an HTML content!");
                }
            }

            final HttpEntity body = response.getEntity();
            if (body == null) {
                // e.g. "204 No Content"
                return "";
            }

            // Compression is announced in Content-Encoding, not in Content-Type. A client with
            // transparent decompression hands out an entity without content encoding, so the
            // stream is only unwrapped here if it is still compressed.
            Header contentEncoding = body.getContentEncoding();
            String compression = contentEncoding == null ? null : contentEncoding.getValue();
            InputStream is;
            if ("gzip".equalsIgnoreCase(compression)) {
                is = new GZIPInputStream(body.getContent());
            } else if ("deflate".equalsIgnoreCase(compression)) {
                is = new InflaterInputStream(body.getContent(), new Inflater(true));
            } else {
                is = body.getContent();
            }

            String enc = Converter.extractEncoding(contentType);
            String res = createConverter(urlAsString).streamToString(is, enc);
            EntityUtils.consume(body);

            if (logger.isDebugEnabled())
                logger.debug(res.length() + " FetchAsString:" + urlAsString);
            return res;
        } finally {
            response.close();
        }
    }

    /**
     * Factory for the converter that turns the response stream into a string; a new instance is
     * created for every call.
     *
     * @param url the url being fetched, passed to the converter's constructor
     * @return a new converter
     */
    public Converter createConverter(String url) {
        return new Converter(url);
    }

    /** Upper bound for chained shortener redirects followed by {@link #getResolvedUrl(String, int)}. */
    private static final int MAX_SHORTENER_HOPS = 10;

    /**
     * Resolves a (possibly shortened) url with a HEAD request. On some devices we have to hack:
     * http://developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html
     * <p>
     * An {@code https://} scheme is downgraded to {@code http://} before the request. If the redirect
     * target belongs to another known shortener it is resolved again, at most
     * {@value #MAX_SHORTENER_HOPS} times so that shorteners pointing at each other cannot loop forever.
     *
     * @param urlAsString the url to resolve
     * @param timeout Sets a specified timeout value, in milliseconds
     * @return the redirect target for a 3xx response with a Location header; the requested url (with
     *         http scheme) for a 200 or any other response; an empty string if the url could not be
     *         resolved because of an error (e.g. within the specified time)
     */
    public String getResolvedUrl(String urlAsString, int timeout) {
        int responseCode = -1;
        String newUrl = null;
        try {
            final String httpsPrefix = "https://";
            for (int hop = 0; hop < MAX_SHORTENER_HOPS; hop++) {
                // only the scheme is downgraded; "https" elsewhere in the url stays untouched
                if (urlAsString.regionMatches(true, 0, httpsPrefix, 0, httpsPrefix.length()))
                    urlAsString = "http://" + urlAsString.substring(httpsPrefix.length());

                Header location;
                CloseableHttpResponse response = createUrlConnection(urlAsString, timeout, true, true);
                try {
                    responseCode = response.getStatusLine().getStatusCode();
                    location = response.getLastHeader("Location");
                } finally {
                    // release the connection; only status and header are needed from a HEAD response
                    response.close();
                }

                if (responseCode == HttpStatus.SC_OK)
                    return urlAsString;

                if (responseCode / 100 != 3 || location == null)
                    return urlAsString;

                newUrl = location.getValue().replace(' ', '+');
                // some services use (none-standard) utf8 in their location header
                if (urlAsString.startsWith("http://bit.ly") || urlAsString.startsWith("http://is.gd"))
                    newUrl = encodeUriFromHeader(newUrl);

                // fix problems if shortened twice, as it is often the case after twitter's t.co
                if (!furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
                    return newUrl;

                urlAsString = newUrl;
            }

            logger.warn("getResolvedUrl: giving up after " + MAX_SHORTENER_HOPS + " shortener hops. Url is: " + urlAsString);
            return newUrl;

        } catch (Exception ex) {
            logger.warn("getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage(), ex);
            return "";
        } finally {
            if (logger.isDebugEnabled())
                logger.debug(responseCode + " url:" + urlAsString + " resolved:" + newUrl);
        }
    }

    /**
     * Takes a URI that was decoded as ISO-8859-1 and applies percent-encoding to non-ASCII characters. Workaround for broken origin servers that send
     * UTF-8 in the Location: header.
     * <p>
     * Characters in the range U+0080..U+00FF are taken as the raw octets of the header (this is ONLY valid if the uri was decoded using ISO-8859-1) and
     * become a single {@code %XX} escape. Code points above U+00FF cannot stem from an ISO-8859-1 decoded header; they are percent-encoded as their
     * UTF-8 byte sequence so that the result is always a well-formed escape sequence.
     *
     * @param badLocation the Location header value, must not be null
     * @return the value with every non-ASCII character percent-encoded
     */
    static String encodeUriFromHeader(String badLocation) {
        StringBuilder sb = new StringBuilder(badLocation.length() + 16);

        for (int i = 0; i < badLocation.length();) {
            int codePoint = badLocation.codePointAt(i);
            i += Character.charCount(codePoint);

            if (codePoint < 0x80) {
                sb.append((char) codePoint);
            } else if (codePoint <= 0xFF) {
                appendPercentEncoded(sb, codePoint);
            } else {
                byte[] utf8 = new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8);
                for (byte octet : utf8) {
                    appendPercentEncoded(sb, octet & 0xFF);
                }
            }
        }

        return sb.toString();
    }

    /** Appends {@code %XX} (two upper case hex digits) for the given octet (0..255). */
    private static void appendPercentEncoded(StringBuilder sb, int octet) {
        sb.append('%');
        sb.append(Character.toUpperCase(Character.forDigit((octet >> 4) & 0xF, 16)));
        sb.append(Character.toUpperCase(Character.forDigit(octet & 0xF, 16)));
    }

    /**
     * Builds and executes a HEAD or GET request for the given url and returns the open response. The
     * caller is responsible for closing the response.
     * <p>
     * A configured HTTP proxy (see {@link #setProxy(Proxy)}) is applied to the request. Other proxy
     * types (e.g. SOCKS) cannot be expressed through the request configuration and are ignored.
     * <p>
     * NOTE(review): a new client is created per call and is not closed on the success path because
     * the returned response still depends on it; consider sharing a single client instance.
     *
     * @param urlAsStr the url to request
     * @param timeout connect, socket and connection-request timeout in milliseconds
     * @param includeSomeGooseOptions whether the Accept-Language, content-charset, Referer and
     *        Cache-Control headers are sent in addition to User-Agent and Accept
     * @param isHead true for a HEAD request, false for a GET request
     * @return the open response
     * @throws MalformedURLException declared for backward compatibility
     * @throws IOException if executing the request fails
     */
    protected CloseableHttpResponse createUrlConnection(String urlAsStr, int timeout,
            boolean includeSomeGooseOptions, boolean isHead) throws MalformedURLException, IOException {
        // build the request first: an invalid url then fails before any client is allocated
        HttpRequestBase request = isHead ? new HttpHead(urlAsStr) : new HttpGet(urlAsStr);

        RequestConfig.Builder configBuilder = RequestConfig.custom().setConnectionRequestTimeout(timeout)
                .setConnectTimeout(timeout).setSocketTimeout(timeout).setCookieSpec(CookieSpecs.STANDARD);

        Proxy configuredProxy = getProxy();
        if (configuredProxy != null && configuredProxy.type() == Proxy.Type.HTTP
                && configuredProxy.address() instanceof InetSocketAddress) {
            InetSocketAddress proxyAddress = (InetSocketAddress) configuredProxy.address();
            configBuilder.setProxy(new HttpHost(proxyAddress.getHostString(), proxyAddress.getPort()));
        }

        request.setHeader("User-Agent", userAgent);
        request.setHeader("Accept", accept);

        if (includeSomeGooseOptions) {
            request.setHeader("Accept-Language", language);
            request.setHeader("content-charset", charset);
            request.setHeader("Referer", referrer);
            // avoid the cache for testing purposes only?
            request.setHeader("Cache-Control", cacheControl);
        }

        // suggest respond to be gzipped or deflated (which is just another compression)
        // http://stackoverflow.com/q/3932117
        request.setHeader("Accept-Encoding", "gzip, deflate");
        request.setConfig(configBuilder.build());

        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            return httpclient.execute(request);
        } catch (IOException | RuntimeException e) {
            // no response will reach the caller, so nobody else could release the client
            try {
                httpclient.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Looks the url up in the cache, if one is configured.
     * <p>
     * On a hit the cached result is updated to carry the given url and original url (the cache may
     * have answered for a url the stored result does not contain, or may hold a shortened url as
     * original url) and the cache hit counter is incremented.
     *
     * @param url the (possibly resolved) url used as cache key
     * @param originalUrl the url as originally requested
     * @return the cached result, or null if there is no cache or no entry for the url
     */
    private JResult getFromCache(String url, String originalUrl) throws Exception {
        if (cache == null)
            return null;

        JResult cached = cache.get(url);
        if (cached == null)
            return null;

        // make the entry reflect the current request before handing it out
        cached.setUrl(url);
        cached.setOriginalUrl(originalUrl);
        cacheCounter.incrementAndGet();
        return cached;
    }
}