Usage examples for org.apache.http.impl.client.DefaultRedirectHandler.getLocationURI
public URI getLocationURI(final HttpResponse response, final HttpContext context) throws ProtocolException
From source file: org.openpipeline.pipeline.connector.webcrawler.Fetcher.java
/** * Fetches the data associated with the next URL. * //from w w w . j av a 2 s.co m * @param nextUrlItem * containing the next URL to crawl * * @return action based on the status code of the HttpClient, robots safety * check, max number of redirects, max number of fetch attempts */ public int fetch(LinkDBRecord nextUrlItem) { if (nextUrlItem == null) { return HttpResultMapper.ACTION_DELETE; } if (client == null) { throw new RuntimeException("Fetcher not initialized."); } String nextUrl = nextUrlItem.getNextUrl(); redirectUrl = nextUrl; HttpResponse httpResponse = null; HttpGet get = null; lastModified = 0; try { /* * Check the compliance with allow/disallow directives. */ UrlItem item = new UrlItem(nextUrlItem.getNextUrl()); boolean robotsSafe = robotsDirectives.allowed(item); if (!robotsSafe) { if (debug) { logger.debug("Robots denied, next URL: " + nextUrl); } return HttpResultMapper.ACTION_DELETE; } /* * Check the compliance with visit-time directives. */ boolean visitTimeSafe = robotsDirectives.visitTimeOK(); if (!visitTimeSafe) { if (debug) { logger.debug("Robots visit time denied, next URL: " + nextUrl); } return HttpResultMapper.ACTION_SKIP; } int status = -1; int numRedirects = 0; while (true) { get = new HttpGet(); /* Set uri for the next execution of get */ URI uri = new URI(nextUrl); get.setURI(uri); /* * Check crawl delay. If the fetcher follows the redirect URL it * will also observe the crawl delay */ long waitTime = robotsDirectives.crawlDelayTime(lastFetchTimeThisDomain); if (waitTime > 0) { try { Thread.sleep(waitTime); } catch (InterruptedException e) { logger.error("Exception in fetcher in thread.sleep, next URL: " + nextUrl + ". 
Message: " + e.getMessage()); } } /* Execute get method */ DefaultRedirectHandler redirectHandler = new DefaultRedirectHandler(); client.setRedirectHandler(redirectHandler); HttpContext localContext = new BasicHttpContext(); httpResponse = client.execute(get, localContext); if (httpResponse == null) { break; } Header lastModHeader = httpResponse.getFirstHeader("last-modified"); if (lastModHeader != null) { String lastModifiedDate = lastModHeader.getValue(); Date date = format.parse(lastModifiedDate); lastModified = date.getTime(); } StatusLine statusLine = httpResponse.getStatusLine(); if (statusLine == null) { /* Should not happen after execute */ status = -1; } else { status = httpResponse.getStatusLine().getStatusCode(); } lastFetchTimeThisDomain = System.currentTimeMillis(); HttpEntity entity = httpResponse.getEntity(); if (HttpResultMapper.permanentRedirect(status) || HttpResultMapper.temporaryRedirect(status)) { /* * The fetcher follows a redirect until the maximum number * of redirects is reached. */ if (numRedirects == maxNumberOfRedirects) { break; } /* Update the URL to be fetched */ URI redirectURI = redirectHandler.getLocationURI(httpResponse, localContext); String newUrl = redirectURI.toString(); numRedirects++; /* * In case of a permanent redirect, the fetcher asks the URL * filter whether to follow it or not. The fetcher follows * all temporary redirects. */ if (HttpResultMapper.permanentRedirect(status)) { boolean redirectUrlOK = urlFilter.checkCanonicalForm(newUrl, nextUrl); /* * Only follows the permanent redirects which are * different because of the formatting to the canonical * form such as removing the trailing slash */ if (!redirectUrlOK) { /* Permanent redirect, keep the redirect URL */ redirectUrl = newUrl; break; } } /* * If the permanent redirect URL differs just in formatting, * or if temporary redirect follow it. * * The redirect URL becomes nextURL for the next iteration * of the while loop. 
*/ nextUrl = newUrl; if (debug) { logger.debug("Fetcher: had a redirect, redirect URL: " + nextUrl + ", status: " + status); } } else { /* * get's responseBody contains data if success and is null * otherwise */ // TODO retry if // exception? if (entity != null) { long inputSize = entity.getContentLength(); if (inputSize > 0 && inputSize >= maxFileSize) { throw new RuntimeException("Fetcher exception: data exceeds the max file size."); } /* Often the data length is not known */ inputStream = getData(entity); } break; } /* * Need to release the current connection, otherwise client does * not work */ entity.consumeContent(); } /* * Decide on action after the while loop is done, possibly done with * redirects */ int action = HttpResultMapper.getHttpCodeResult(status); int fetchAttempts = nextUrlItem.getFetchAttempts(); if (action != HttpResultMapper.ACTION_FINALIZE && fetchAttempts == maxNumberOfFetchAttempts) { /* * Remove items which have too many fetch attempts: redirects, * skip etc */ action = HttpResultMapper.ACTION_DELETE; } else if (numRedirects == maxNumberOfRedirects) { /* * Avoid following too many redirects */ action = HttpResultMapper.ACTION_DELETE; } return action; } catch (Throwable e) { /* * Currently, no re-tries are implemented. The HttpClient * automatically tries to recover from safe exceptions. */ if (e instanceof org.apache.http.conn.ConnectTimeoutException) { return HttpResultMapper.ACTION_SKIP; } return HttpResultMapper.ACTION_DELETE; } finally { if (get != null) { get.abort(); } } }