List of usage examples for org.apache.http.params.HttpProtocolParamBean.setUserAgent
public void setUserAgent(String str)
From source file: org.berlin.crawl.net.RobotsConnector.java
/** * Connect to robots.txt file./*from w w w.j av a 2s . co m*/ * * On error, close inputstream, return empty document. * * @param builder * @return * @throws Exception */ protected synchronized String connect(final URIBuilder builder) throws Exception { this.lastURIBuilder = builder; InputStream instream = null; try { logger.info("Attempting request : " + builder.toString()); final HttpParams params = new BasicHttpParams(); final HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setUserAgent(OctaneCrawlerConstants.USER_AGENT); // Set this to false, or else you'll get an // Expectation Failed: error paramsBean.setUseExpectContinue(false); final URI uri = builder.build(); final HttpClient httpclient = new DefaultHttpClient(); final HttpGet httpget = new HttpGet(uri); httpget.setParams(params); // Connect // final HttpResponse response = httpclient.execute(httpget); final HttpEntity entity = response.getEntity(); this.response = response; if (response != null) { if (response.getStatusLine() != null) { if (response.getStatusLine().getStatusCode() != 200) { // Log the error line logger.error("Invalid status code - " + response.getStatusLine().getStatusCode()); throw new CrawlerError("Invalid status code - " + response.getStatusLine().getStatusCode()); } } } if (entity != null) { instream = entity.getContent(); if (instream != null) { final StringBuffer document = new StringBuffer(); final BufferedReader reader = new BufferedReader(new InputStreamReader(instream)); String line = ""; while ((line = reader.readLine()) != null) { document.append(line); document.append(NL); } // End of the while // return document.toString(); } // End of - instream /// } // End of the if / Thread.sleep(100); } catch (final Exception e) { logger.error("Error at robots connect", e); throw new CrawlerError("Error at connect", e); } finally { try { if (instream != null) { instream.close(); } } catch (IOException e) { e.printStackTrace(); } } // End of the try - 
catch block // return null; }
From source file: org.berlin.crawl.net.WebConnector.java
public synchronized String connect(final BotLink blink, final URIBuilder builder) throws Exception { InputStream instream = null;/*from w w w . ja va 2s . c om*/ try { logger.info("!* Attempting download and connect request : " + builder.toString()); final HttpParams params = new BasicHttpParams(); final HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setUserAgent(USER_AGENT); // Set this to false, or else you'll get an // Expectation Failed: error paramsBean.setUseExpectContinue(false); final URI uri = builder.build(); final HttpClient httpclient = new DefaultHttpClient(); final HttpGet httpget = new HttpGet(uri); httpget.setParams(params); // Connect // final HttpResponse response = httpclient.execute(httpget); final HttpEntity entity = response.getEntity(); this.response = response; if (response != null) { if (response.getStatusLine() != null) { if (response.getStatusLine().getStatusCode() != 200) { // Log the error line logger.error("Invalid status code - " + response.getStatusLine().getStatusCode()); throw new CrawlerError("Invalid status code - " + response.getStatusLine().getStatusCode()); } } } if (entity != null) { blink.setStatusline(String.valueOf(response.getStatusLine())); blink.setCode(response.getStatusLine().getStatusCode()); instream = entity.getContent(); if (instream != null) { final StringBuffer document = new StringBuffer(); final BufferedReader reader = new BufferedReader(new InputStreamReader(instream)); String line = ""; while ((line = reader.readLine()) != null) { document.append(line); document.append(NL); } // End of the while // db.proc(blink); Thread.sleep(LINK_PROCESS_DELAY); return document.toString(); } // End of - instream /// } // End of the if / } catch (final Throwable e) { logger.error("Error at connect to LINK", e); throw new CrawlerError("Error at connect to LINK", e); } finally { try { if (instream != null) { instream.close(); } } catch (IOException e) { e.printStackTrace(); } } // End of the 
try - catch block // return null; }