List of usage examples for org.apache.http.impl.conn.tsccm ThreadSafeClientConnManager getConnectionsInPool
public int getConnectionsInPool()
From source file:com.heaptrip.util.http.bixo.fetcher.SimpleHttpFetcher.java
private FetchedResult doRequest(HttpRequestBase request, String url, List<TupleTwo<?, ?>> data, List<TupleTwo<?, ?>> headers) throws BaseFetchException { LOGGER.trace("Fetching " + url); HttpResponse response;/*from w w w . jav a 2 s. com*/ long readStartTime; HttpHeaders headerMap = new HttpHeaders(); String redirectedUrl = null; String newBaseUrl = null; int numRedirects = 0; boolean needAbort = true; String contentType = ""; String hostAddress = null; // Create a local instance of cookie store, and bind to local context // Without this we get killed w/lots of threads, due to sync() on single // cookie store. HttpContext localContext = new BasicHttpContext(); CookieStore cookieStore = new BasicCookieStore(); localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); try { URI uri = new URI(url); request.setURI(uri); request.setHeader("Host", uri.getHost()); if (headers != null) { for (TupleTwo<?, ?> t : headers) { request.setHeader(t.getKey().toString(), t.getValue().toString()); } } //collect post data if available if (request instanceof HttpPost && data != null) { List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(1); for (TupleTwo<?, ?> e : data) { nameValuePairs.add(new BasicNameValuePair(URLEncoder.encode(e.getKey().toString(), "utf-8"), URLEncoder.encode(e.getValue().toString(), "utf-8"))); } ((HttpPost) (request)).setEntity(new UrlEncodedFormEntity(nameValuePairs)); } readStartTime = System.currentTimeMillis(); response = _httpClient.execute(request, localContext); Header[] responseHeaders = response.getAllHeaders(); for (Header header : responseHeaders) { headerMap.add(header.getName(), header.getValue()); } int httpStatus = response.getStatusLine().getStatusCode(); if ((httpStatus < 200) || (httpStatus >= 300)) { // We can't just check against SC_OK, as some wackos return 201, 202, // etc throw new HttpFetchException(url, "Error fetching " + url + " due to http status code " + httpStatus, httpStatus, headerMap); } redirectedUrl = extractRedirectedUrl(url, localContext); URI permRedirectUri = (URI) localContext.getAttribute(PERM_REDIRECT_CONTEXT_KEY); if (permRedirectUri != null) { newBaseUrl = permRedirectUri.toURL().toExternalForm(); } Integer redirects = (Integer) localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY); if (redirects != null) { numRedirects = redirects.intValue(); } hostAddress = (String) (localContext.getAttribute(HOST_ADDRESS)); if (hostAddress == null) { throw new UrlFetchException(url, "Host address not saved in context"); } Header cth = response.getFirstHeader(HttpHeaderNames.CONTENT_TYPE); if (cth != null) { contentType = cth.getValue(); } needAbort = false; } catch (ClientProtocolException e) { // Oleg guarantees that no abort is needed in the case of an IOException // (which is is a subclass of) needAbort = false; // If the root case was a "too many redirects" error, we want to map this // to a specific // exception that contains the final redirect. if (e.getCause() instanceof MyRedirectException) { MyRedirectException mre = (MyRedirectException) e.getCause(); String redirectUrl = url; try { redirectUrl = mre.getUri().toURL().toExternalForm(); } catch (MalformedURLException e2) { LOGGER.warn("Invalid URI saved during redirect handling: " + mre.getUri()); } throw new RedirectFetchException(url, redirectUrl, mre.getReason()); } else if (e.getCause() instanceof RedirectException) { throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS); } else { throw new IOFetchException(url, e); } } catch (IOException e) { // Oleg guarantees that no abort is needed in the case of an IOException needAbort = false; if (e instanceof ConnectionPoolTimeoutException) { // Should never happen, so let's dump some info about the connection // pool. ThreadSafeClientConnManager cm = (ThreadSafeClientConnManager) _httpClient.getConnectionManager(); int numConnections = cm.getConnectionsInPool(); cm.closeIdleConnections(0, TimeUnit.MILLISECONDS); LOGGER.error(String.format( "Got ConnectionPoolTimeoutException: %d connections before, %d after idle close", numConnections, cm.getConnectionsInPool())); } throw new IOFetchException(url, e); } catch (URISyntaxException e) { throw new UrlFetchException(url, e.getMessage()); } catch (IllegalStateException e) { throw new UrlFetchException(url, e.getMessage()); } catch (BaseFetchException e) { throw e; } catch (Exception e) { // Map anything else to a generic IOFetchException // TODO KKr - create generic fetch exception throw new IOFetchException(url, new IOException(e)); } finally { safeAbort(needAbort, request); } // Figure out how much data we want to try to fetch. int targetLength = _fetcherPolicy.getMaxContentSize(); boolean truncated = false; String contentLengthStr = headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH); if (contentLengthStr != null) { try { int contentLength = Integer.parseInt(contentLengthStr); if (contentLength > targetLength) { truncated = true; } else { targetLength = contentLength; } } catch (NumberFormatException e) { // Ignore (and log) invalid content length values. LOGGER.warn("Invalid content length in header: " + contentLengthStr); } } // Now finally read in response body, up to targetLength bytes. // Note that entity might be null, for zero length responses. byte[] content = new byte[0]; long readRate = 0; HttpEntity entity = response.getEntity(); needAbort = true; if (entity != null) { InputStream in = null; try { in = entity.getContent(); byte[] buffer = new byte[BUFFER_SIZE]; int bytesRead = 0; int totalRead = 0; ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTEARRAY_SIZE); int readRequests = 0; int minResponseRate = _fetcherPolicy.getMinResponseRate(); // TODO KKr - we need to monitor the rate while reading a // single block. Look at HttpClient // metrics support for how to do this. Once we fix this, fix // the test to read a smaller (< 20K) // chuck of data. while ((totalRead < targetLength) && ((bytesRead = in.read(buffer, 0, Math.min(buffer.length, targetLength - totalRead))) != -1)) { readRequests += 1; totalRead += bytesRead; out.write(buffer, 0, bytesRead); // Assume read time is at least one millisecond, to avoid DBZ // exception. long totalReadTime = Math.max(1, System.currentTimeMillis() - readStartTime); readRate = (totalRead * 1000L) / totalReadTime; // Don't bail on the first read cycle, as we can get a hiccup starting // out. // Also don't bail if we've read everything we need. if ((readRequests > 1) && (totalRead < targetLength) && (readRate < minResponseRate)) { throw new AbortedFetchException(url, "Slow response rate of " + readRate + " bytes/sec", AbortedFetchReason.SLOW_RESPONSE_RATE); } // Check to see if we got interrupted. if (Thread.interrupted()) { throw new AbortedFetchException(url, AbortedFetchReason.INTERRUPTED); } } content = out.toByteArray(); needAbort = truncated || (in.available() > 0); } catch (IOException e) { // We don't need to abort if there's an IOException throw new IOFetchException(url, e); } finally { safeAbort(needAbort, request); safeClose(in); } } return new FetchedResult(url, redirectedUrl, System.currentTimeMillis(), headerMap, content, contentType, (int) readRate, newBaseUrl, numRedirects, hostAddress); }
From source file:bixo.fetcher.SimpleHttpFetcher.java
private FetchedResult doRequest(HttpRequestBase request, String url, List<Tuple2<?, ?>> data, List<Tuple2<?, ?>> headers) throws BaseFetchException { LOGGER.trace("Fetching " + url); HttpResponse response;//from w ww .j av a 2s. com long readStartTime; HttpHeaders headerMap = new HttpHeaders(); String redirectedUrl = null; String newBaseUrl = null; int numRedirects = 0; boolean needAbort = true; String contentType = ""; String hostAddress = null; // Create a local instance of cookie store, and bind to local context // Without this we get killed w/lots of threads, due to sync() on single // cookie store. HttpContext localContext = new BasicHttpContext(); CookieStore cookieStore = new BasicCookieStore(); localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); try { URI uri = new URI(url); request.setURI(uri); request.setHeader("Host", uri.getHost()); if (headers != null) { for (Tuple2<?, ?> t : headers) { request.setHeader(t.getKey().toString(), t.getValue().toString()); } } //collect post data if available if (request instanceof HttpPost && data != null) { List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(1); for (Tuple2<?, ?> e : data) { nameValuePairs.add(new BasicNameValuePair(URLEncoder.encode(e.getKey().toString(), "utf-8"), URLEncoder.encode(e.getValue().toString(), "utf-8"))); } ((HttpPost) (request)).setEntity(new UrlEncodedFormEntity(nameValuePairs)); } readStartTime = System.currentTimeMillis(); response = _httpClient.execute(request, localContext); Header[] responseHeaders = response.getAllHeaders(); for (Header header : responseHeaders) { headerMap.add(header.getName(), header.getValue()); } int httpStatus = response.getStatusLine().getStatusCode(); if ((httpStatus < 200) || (httpStatus >= 300)) { // We can't just check against SC_OK, as some wackos return 201, 202, // etc throw new HttpFetchException(url, "Error fetching " + url + " due to http status code " + httpStatus, httpStatus, headerMap); } redirectedUrl = extractRedirectedUrl(url, localContext); URI permRedirectUri = (URI) localContext.getAttribute(PERM_REDIRECT_CONTEXT_KEY); if (permRedirectUri != null) { newBaseUrl = permRedirectUri.toURL().toExternalForm(); } Integer redirects = (Integer) localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY); if (redirects != null) { numRedirects = redirects.intValue(); } hostAddress = (String) (localContext.getAttribute(HOST_ADDRESS)); if (hostAddress == null) { throw new UrlFetchException(url, "Host address not saved in context"); } Header cth = response.getFirstHeader(HttpHeaderNames.CONTENT_TYPE); if (cth != null) { contentType = cth.getValue(); } needAbort = false; } catch (ClientProtocolException e) { // Oleg guarantees that no abort is needed in the case of an IOException // (which is is a subclass of) needAbort = false; // If the root case was a "too many redirects" error, we want to map this // to a specific // exception that contains the final redirect. if (e.getCause() instanceof MyRedirectException) { MyRedirectException mre = (MyRedirectException) e.getCause(); String redirectUrl = url; try { redirectUrl = mre.getUri().toURL().toExternalForm(); } catch (MalformedURLException e2) { LOGGER.warn("Invalid URI saved during redirect handling: " + mre.getUri()); } throw new RedirectFetchException(url, redirectUrl, mre.getReason()); } else if (e.getCause() instanceof RedirectException) { throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS); } else { throw new IOFetchException(url, e); } } catch (IOException e) { // Oleg guarantees that no abort is needed in the case of an IOException needAbort = false; if (e instanceof ConnectionPoolTimeoutException) { // Should never happen, so let's dump some info about the connection // pool. ThreadSafeClientConnManager cm = (ThreadSafeClientConnManager) _httpClient.getConnectionManager(); int numConnections = cm.getConnectionsInPool(); cm.closeIdleConnections(0, TimeUnit.MILLISECONDS); LOGGER.error(String.format( "Got ConnectionPoolTimeoutException: %d connections before, %d after idle close", numConnections, cm.getConnectionsInPool())); } throw new IOFetchException(url, e); } catch (URISyntaxException e) { throw new UrlFetchException(url, e.getMessage()); } catch (IllegalStateException e) { throw new UrlFetchException(url, e.getMessage()); } catch (BaseFetchException e) { throw e; } catch (Exception e) { // Map anything else to a generic IOFetchException // TODO KKr - create generic fetch exception throw new IOFetchException(url, new IOException(e)); } finally { safeAbort(needAbort, request); } // Figure out how much data we want to try to fetch. int targetLength = _fetcherPolicy.getMaxContentSize(); boolean truncated = false; String contentLengthStr = headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH); if (contentLengthStr != null) { try { int contentLength = Integer.parseInt(contentLengthStr); if (contentLength > targetLength) { truncated = true; } else { targetLength = contentLength; } } catch (NumberFormatException e) { // Ignore (and log) invalid content length values. LOGGER.warn("Invalid content length in header: " + contentLengthStr); } } // Now finally read in response body, up to targetLength bytes. // Note that entity might be null, for zero length responses. byte[] content = new byte[0]; long readRate = 0; HttpEntity entity = response.getEntity(); needAbort = true; if (entity != null) { InputStream in = null; try { in = entity.getContent(); byte[] buffer = new byte[BUFFER_SIZE]; int bytesRead = 0; int totalRead = 0; ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTEARRAY_SIZE); int readRequests = 0; int minResponseRate = _fetcherPolicy.getMinResponseRate(); // TODO KKr - we need to monitor the rate while reading a // single block. Look at HttpClient // metrics support for how to do this. Once we fix this, fix // the test to read a smaller (< 20K) // chuck of data. while ((totalRead < targetLength) && ((bytesRead = in.read(buffer, 0, Math.min(buffer.length, targetLength - totalRead))) != -1)) { readRequests += 1; totalRead += bytesRead; out.write(buffer, 0, bytesRead); // Assume read time is at least one millisecond, to avoid DBZ // exception. long totalReadTime = Math.max(1, System.currentTimeMillis() - readStartTime); readRate = (totalRead * 1000L) / totalReadTime; // Don't bail on the first read cycle, as we can get a hiccup starting // out. // Also don't bail if we've read everything we need. if ((readRequests > 1) && (totalRead < targetLength) && (readRate < minResponseRate)) { throw new AbortedFetchException(url, "Slow response rate of " + readRate + " bytes/sec", AbortedFetchReason.SLOW_RESPONSE_RATE); } // Check to see if we got interrupted. if (Thread.interrupted()) { throw new AbortedFetchException(url, AbortedFetchReason.INTERRUPTED); } } content = out.toByteArray(); needAbort = truncated || (in.available() > 0); } catch (IOException e) { // We don't need to abort if there's an IOException throw new IOFetchException(url, e); } finally { safeAbort(needAbort, request); safeClose(in); } } return new FetchedResult(url, redirectedUrl, System.currentTimeMillis(), headerMap, content, contentType, (int) readRate, newBaseUrl, numRedirects, hostAddress); }
From source file:crawlercommons.fetcher.SimpleHttpFetcher.java
private FetchedResult doRequest(HttpRequestBase request, String url, Payload payload) throws BaseFetchException { LOGGER.trace("Fetching " + url); HttpResponse response;//ww w . ja v a2s . c o m long readStartTime; Metadata headerMap = new Metadata(); String redirectedUrl = null; String newBaseUrl = null; int numRedirects = 0; boolean needAbort = true; String contentType = ""; String mimeType = ""; String hostAddress = null; // Create a local instance of cookie store, and bind to local context // Without this we get killed w/lots of threads, due to sync() on single cookie store. HttpContext localContext = new BasicHttpContext(); CookieStore cookieStore = new BasicCookieStore(); localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); StringBuilder fetchTrace = null; if (LOGGER.isTraceEnabled()) { fetchTrace = new StringBuilder("Fetched url: " + url); } try { request.setURI(new URI(url)); readStartTime = System.currentTimeMillis(); response = _httpClient.execute(request, localContext); Header[] headers = response.getAllHeaders(); for (Header header : headers) { headerMap.add(header.getName(), header.getValue()); } int httpStatus = response.getStatusLine().getStatusCode(); if (LOGGER.isTraceEnabled()) { fetchTrace.append("; status code: " + httpStatus); if (headerMap.get(HttpHeaders.CONTENT_LENGTH) != null) { fetchTrace.append("; Content-Length: " + headerMap.get(HttpHeaders.CONTENT_LENGTH)); } if (headerMap.get(HttpHeaders.LOCATION) != null) { fetchTrace.append("; Location: " + headerMap.get(HttpHeaders.LOCATION)); } } if ((httpStatus < 200) || (httpStatus >= 300)) { // We can't just check against SC_OK, as some wackos return 201, 202, etc throw new HttpFetchException(url, "Error fetching " + url, httpStatus, headerMap); } redirectedUrl = extractRedirectedUrl(url, localContext); URI permRedirectUri = (URI) localContext.getAttribute(PERM_REDIRECT_CONTEXT_KEY); if (permRedirectUri != null) { newBaseUrl = permRedirectUri.toURL().toExternalForm(); } Integer redirects = (Integer) localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY); if (redirects != null) { numRedirects = redirects.intValue(); } hostAddress = (String) (localContext.getAttribute(HOST_ADDRESS)); if (hostAddress == null) { throw new UrlFetchException(url, "Host address not saved in context"); } Header cth = response.getFirstHeader(HttpHeaders.CONTENT_TYPE); if (cth != null) { contentType = cth.getValue(); } // Check if we should abort due to mime-type filtering. Note that this will fail if the server // doesn't report a mime-type, but that's how we want it as this configuration is typically // used when only a subset of parsers are installed/enabled, so we don't want the auto-detect // code in Tika to get triggered & try to process an unsupported type. If you want unknown // mime-types from the server to be processed, set "" as one of the valid mime-types in FetcherPolicy. mimeType = getMimeTypeFromContentType(contentType); Set<String> mimeTypes = getValidMimeTypes(); if ((mimeTypes != null) && (mimeTypes.size() > 0)) { if (!mimeTypes.contains(mimeType)) { throw new AbortedFetchException(url, "Invalid mime-type: " + mimeType, AbortedFetchReason.INVALID_MIMETYPE); } } needAbort = false; } catch (ClientProtocolException e) { // Oleg guarantees that no abort is needed in the case of an IOException (which is is a subclass of) needAbort = false; // If the root case was a "too many redirects" error, we want to map this to a specific // exception that contains the final redirect. if (e.getCause() instanceof MyRedirectException) { MyRedirectException mre = (MyRedirectException) e.getCause(); String redirectUrl = url; try { redirectUrl = mre.getUri().toURL().toExternalForm(); } catch (MalformedURLException e2) { LOGGER.warn("Invalid URI saved during redirect handling: " + mre.getUri()); } throw new RedirectFetchException(url, redirectUrl, mre.getReason()); } else if (e.getCause() instanceof RedirectException) { throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS); } else { throw new IOFetchException(url, e); } } catch (IOException e) { // Oleg guarantees that no abort is needed in the case of an IOException needAbort = false; if (e instanceof ConnectionPoolTimeoutException) { // Should never happen, so let's dump some info about the connection pool. ThreadSafeClientConnManager cm = (ThreadSafeClientConnManager) _httpClient.getConnectionManager(); int numConnections = cm.getConnectionsInPool(); cm.closeIdleConnections(0, TimeUnit.MILLISECONDS); LOGGER.error(String.format( "Got ConnectionPoolTimeoutException: %d connections before, %d after idle close", numConnections, cm.getConnectionsInPool())); } throw new IOFetchException(url, e); } catch (URISyntaxException e) { throw new UrlFetchException(url, e.getMessage()); } catch (IllegalStateException e) { throw new UrlFetchException(url, e.getMessage()); } catch (BaseFetchException e) { throw e; } catch (Exception e) { // Map anything else to a generic IOFetchException // TODO KKr - create generic fetch exception throw new IOFetchException(url, new IOException(e)); } finally { safeAbort(needAbort, request); } // Figure out how much data we want to try to fetch. int maxContentSize = getMaxContentSize(mimeType); int targetLength = maxContentSize; boolean truncated = false; String contentLengthStr = headerMap.get(HttpHeaders.CONTENT_LENGTH); if (contentLengthStr != null) { try { int contentLength = Integer.parseInt(contentLengthStr); if (contentLength > targetLength) { truncated = true; } else { targetLength = contentLength; } } catch (NumberFormatException e) { // Ignore (and log) invalid content length values. LOGGER.warn("Invalid content length in header: " + contentLengthStr); } } // Now finally read in response body, up to targetLength bytes. // Note that entity might be null, for zero length responses. byte[] content = new byte[0]; long readRate = 0; HttpEntity entity = response.getEntity(); needAbort = true; if (entity != null) { InputStream in = null; try { in = entity.getContent(); byte[] buffer = new byte[BUFFER_SIZE]; int bytesRead = 0; int totalRead = 0; ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTEARRAY_SIZE); int readRequests = 0; int minResponseRate = getMinResponseRate(); // TODO KKr - we need to monitor the rate while reading a // single block. Look at HttpClient // metrics support for how to do this. Once we fix this, fix // the test to read a smaller (< 20K) // chuck of data. while ((totalRead < targetLength) && ((bytesRead = in.read(buffer, 0, Math.min(buffer.length, targetLength - totalRead))) != -1)) { readRequests += 1; totalRead += bytesRead; out.write(buffer, 0, bytesRead); // Assume read time is at least one millisecond, to avoid DBZ exception. long totalReadTime = Math.max(1, System.currentTimeMillis() - readStartTime); readRate = (totalRead * 1000L) / totalReadTime; // Don't bail on the first read cycle, as we can get a hiccup starting out. // Also don't bail if we've read everything we need. if ((readRequests > 1) && (totalRead < targetLength) && (readRate < minResponseRate)) { throw new AbortedFetchException(url, "Slow response rate of " + readRate + " bytes/sec", AbortedFetchReason.SLOW_RESPONSE_RATE); } // Check to see if we got interrupted. if (Thread.interrupted()) { throw new AbortedFetchException(url, AbortedFetchReason.INTERRUPTED); } } content = out.toByteArray(); needAbort = truncated || (in.available() > 0); } catch (IOException e) { // We don't need to abort if there's an IOException throw new IOFetchException(url, e); } finally { safeAbort(needAbort, request); safeClose(in); } } // Toss truncated image content. if ((truncated) && (!isTextMimeType(mimeType))) { throw new AbortedFetchException(url, "Truncated image", AbortedFetchReason.CONTENT_SIZE); } // Now see if we need to uncompress the content. String contentEncoding = headerMap.get(HttpHeaders.CONTENT_ENCODING); if (contentEncoding != null) { if (LOGGER.isTraceEnabled()) { fetchTrace.append("; Content-Encoding: " + contentEncoding); } // TODO KKr We might want to just decompress a truncated gzip // containing text (since we have a max content size to save us // from any gzip corruption). We might want to break the following // out into a separate method, by the way (if not refactor this // entire monolithic method). // try { if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { if (truncated) { throw new AbortedFetchException(url, "Truncated compressed data", AbortedFetchReason.CONTENT_SIZE); } else { ExpandedResult expandedResult = EncodingUtils.processGzipEncoded(content, maxContentSize); truncated = expandedResult.isTruncated(); if ((truncated) && (!isTextMimeType(mimeType))) { throw new AbortedFetchException(url, "Truncated decompressed image", AbortedFetchReason.CONTENT_SIZE); } else { content = expandedResult.getExpanded(); if (LOGGER.isTraceEnabled()) { fetchTrace.append("; unzipped to " + content.length + " bytes"); } } // } else if ("deflate".equals(contentEncoding)) { // content = EncodingUtils.processDeflateEncoded(content); // if (LOGGER.isTraceEnabled()) { // fetchTrace.append("; inflated to " + content.length + " bytes"); // } } } } catch (IOException e) { throw new IOFetchException(url, e); } } // Finally dump out the trace msg we've been building. if (LOGGER.isTraceEnabled()) { LOGGER.trace(fetchTrace.toString()); } // TODO KKr - Save truncated flag in FetchedResult/FetchedDatum. return new FetchedResult(url, redirectedUrl, System.currentTimeMillis(), headerMap, content, contentType, (int) readRate, payload, newBaseUrl, numRedirects, hostAddress); }