Example usage for org.apache.http.impl.client DefaultRedirectHandler getLocationURI

List of usage examples for org.apache.http.impl.client DefaultRedirectHandler getLocationURI

Introduction

On this page you can find example usage of the getLocationURI method of org.apache.http.impl.client.DefaultRedirectHandler.

Prototype

public URI getLocationURI(final HttpResponse response, final HttpContext context) throws ProtocolException 

Source Link

Usage

From source file:org.openpipeline.pipeline.connector.webcrawler.Fetcher.java

/**
 * Fetches the data associated with the next URL.
 * <p>
 * The URL is first checked against the robots allow/disallow and visit-time
 * directives. It is then requested with GET, observing the robots crawl
 * delay before every request, including requests issued to follow a
 * redirect. Temporary redirects are always followed. A permanent redirect
 * is followed only when the URL filter reports that the target differs from
 * the current URL merely by canonical formatting; otherwise the target is
 * stored in {@code redirectUrl} and the loop stops. For a non-redirect
 * response that carries an entity, the body is exposed through
 * {@code inputStream}.
 * <p>
 * Side effects: updates {@code redirectUrl}, {@code lastModified},
 * {@code lastFetchTimeThisDomain} and {@code inputStream}.
 *
 * @param nextUrlItem
 *            containing the next URL to crawl
 *
 * @return action based on the status code of the HttpClient, robots safety
 *         check, max number of redirects, max number of fetch attempts.
 *         {@code ACTION_DELETE} is returned for a null item, a robots
 *         denial, an exhausted redirect or fetch-attempt budget, and any
 *         failure other than a connect timeout. {@code ACTION_SKIP} is
 *         returned when the visit time is not allowed, when connecting
 *         times out, or when the thread is interrupted while waiting for
 *         the crawl delay.
 * @throws RuntimeException
 *             if the fetcher has not been initialized (no client)
 */
public int fetch(LinkDBRecord nextUrlItem) {

    if (nextUrlItem == null) {
        return HttpResultMapper.ACTION_DELETE;
    }

    if (client == null) {
        throw new RuntimeException("Fetcher not initialized.");
    }

    String nextUrl = nextUrlItem.getNextUrl();
    redirectUrl = nextUrl;
    HttpGet get = null;
    lastModified = 0;

    try {
        /*
         * Check the compliance with allow/disallow directives.
         */
        UrlItem item = new UrlItem(nextUrl);

        boolean robotsSafe = robotsDirectives.allowed(item);
        if (!robotsSafe) {
            if (debug) {
                logger.debug("Robots denied, next URL: " + nextUrl);
            }
            return HttpResultMapper.ACTION_DELETE;
        }
        /*
         * Check the compliance with visit-time directives.
         */
        boolean visitTimeSafe = robotsDirectives.visitTimeOK();
        if (!visitTimeSafe) {
            if (debug) {
                logger.debug("Robots visit time denied, next URL: " + nextUrl);
            }
            return HttpResultMapper.ACTION_SKIP;
        }

        int status = -1;
        int numRedirects = 0;

        while (true) {

            get = new HttpGet();

            /* Set uri for the next execution of get */
            URI uri = new URI(nextUrl);
            get.setURI(uri);

            /*
             * Check crawl delay. If the fetcher follows the redirect URL it
             * will also observe the crawl delay
             */
            long waitTime = robotsDirectives.crawlDelayTime(lastFetchTimeThisDomain);

            if (waitTime > 0) {
                try {
                    Thread.sleep(waitTime);
                } catch (InterruptedException e) {
                    logger.error("Exception in fetcher in thread.sleep, next URL: " + nextUrl + ". Message: "
                            + e.getMessage());
                    /*
                     * Restore the interrupt flag for the caller and give up
                     * on this fetch: the crawl delay has not elapsed, so
                     * issuing the request now would violate it. The item is
                     * skipped rather than deleted because nothing is wrong
                     * with the URL itself.
                     */
                    Thread.currentThread().interrupt();
                    return HttpResultMapper.ACTION_SKIP;
                }
            }

            /* Execute get method */
            DefaultRedirectHandler redirectHandler = new DefaultRedirectHandler();
            client.setRedirectHandler(redirectHandler);

            HttpContext localContext = new BasicHttpContext();

            HttpResponse httpResponse = client.execute(get, localContext);
            if (httpResponse == null) {
                break;
            }

            Header lastModHeader = httpResponse.getFirstHeader("last-modified");
            if (lastModHeader != null) {
                String lastModifiedDate = lastModHeader.getValue();
                if (lastModifiedDate != null) {
                    try {
                        Date date = format.parse(lastModifiedDate);
                        lastModified = date.getTime();
                    } catch (java.text.ParseException e) {
                        /*
                         * A malformed Last-Modified header must not abort
                         * the fetch; keep the previous lastModified value.
                         */
                        if (debug) {
                            logger.debug("Fetcher: unparseable Last-Modified header: " + lastModifiedDate
                                    + ", next URL: " + nextUrl);
                        }
                    }
                }
            }

            StatusLine statusLine = httpResponse.getStatusLine();
            /* A missing status line should not happen after execute */
            status = (statusLine == null) ? -1 : statusLine.getStatusCode();

            lastFetchTimeThisDomain = System.currentTimeMillis();

            HttpEntity entity = httpResponse.getEntity();

            if (HttpResultMapper.permanentRedirect(status) || HttpResultMapper.temporaryRedirect(status)) {
                /*
                 * The fetcher follows a redirect until the maximum number
                 * of redirects is reached.
                 */
                if (numRedirects == maxNumberOfRedirects) {
                    break;
                }

                /* Update the URL to be fetched */
                URI redirectURI = redirectHandler.getLocationURI(httpResponse, localContext);

                String newUrl = redirectURI.toString();
                numRedirects++;

                /*
                 * In case of a permanent redirect, the fetcher asks the URL
                 * filter whether to follow it or not. The fetcher follows
                 * all temporary redirects.
                 */
                if (HttpResultMapper.permanentRedirect(status)) {

                    boolean redirectUrlOK = urlFilter.checkCanonicalForm(newUrl, nextUrl);

                    /*
                     * Only follows the permanent redirects which are
                     * different because of the formatting to the canonical
                     * form such as removing the trailing slash
                     */
                    if (!redirectUrlOK) {
                        /* Permanent redirect, keep the redirect URL */
                        redirectUrl = newUrl;
                        break;
                    }
                }
                /*
                 * If the permanent redirect URL differs just in formatting,
                 * or if temporary redirect follow it.
                 *
                 * The redirect URL becomes nextURL for the next iteration
                 * of the while loop.
                 */
                nextUrl = newUrl;

                if (debug) {
                    logger.debug("Fetcher: had a redirect, redirect URL: " + nextUrl + ", status: " + status);
                }
            } else {
                /*
                 * The response entity contains data on success and is null
                 * otherwise
                 */
                // TODO retry if
                // exception?

                if (entity != null) {

                    long inputSize = entity.getContentLength();

                    if (inputSize > 0 && inputSize >= maxFileSize) {
                        throw new RuntimeException("Fetcher exception: data exceeds the max file size.");
                    }
                    /* Often the data length is not known */
                    inputStream = getData(entity);
                }
                break;
            }
            /*
             * Need to release the current connection, otherwise client does
             * not work. A redirect response may carry no entity at all, in
             * which case there is nothing to consume.
             */
            if (entity != null) {
                entity.consumeContent();
            }
        }

        /*
         * Decide on action after the while loop is done, possibly done with
         * redirects
         */
        int action = HttpResultMapper.getHttpCodeResult(status);

        int fetchAttempts = nextUrlItem.getFetchAttempts();
        if (action != HttpResultMapper.ACTION_FINALIZE && fetchAttempts == maxNumberOfFetchAttempts) {
            /*
             * Remove items which have too many fetch attempts: redirects,
             * skip etc
             */
            action = HttpResultMapper.ACTION_DELETE;
        } else if (numRedirects == maxNumberOfRedirects) {
            /*
             * Avoid following too many redirects
             */
            action = HttpResultMapper.ACTION_DELETE;
        }

        return action;

    } catch (Throwable e) {
        /*
         * Currently, no re-tries are implemented. The HttpClient
         * automatically tries to recover from safe exceptions.
         */

        if (e instanceof org.apache.http.conn.ConnectTimeoutException) {
            if (debug) {
                logger.debug("Fetcher: connect timeout, next URL: " + nextUrl);
            }
            return HttpResultMapper.ACTION_SKIP;
        }
        /* Record why the item is being dropped instead of failing silently */
        logger.error("Exception in fetcher, next URL: " + nextUrl + ". Cause: " + e);
        return HttpResultMapper.ACTION_DELETE;
    } finally {
        /* Always release the connection held by the last request */
        if (get != null) {
            get.abort();
        }
    }
}