List of usage examples for org.apache.http.params HttpProtocolParamBean setUseExpectContinue
public void setUseExpectContinue(boolean z)
From source file:org.apache.droids.examples.cli.SimpleRuntime.java
public static void main(String[] args) throws Exception { if (args.length < 1) { System.out.println("Please specify a URL to crawl"); System.exit(-1);/* w w w . ja v a2s. c o m*/ } String targetURL = args[0]; // Create parser factory. Support basic HTML markup only ParserFactory parserFactory = new ParserFactory(); TikaDocumentParser tikaParser = new TikaDocumentParser(); parserFactory.getMap().put("text/html", tikaParser); // Create protocol factory. Support HTTP/S only. ProtocolFactory protocolFactory = new ProtocolFactory(); // Create and configure HTTP client HttpParams params = new BasicHttpParams(); HttpProtocolParamBean hppb = new HttpProtocolParamBean(params); HttpConnectionParamBean hcpb = new HttpConnectionParamBean(params); ConnManagerParamBean cmpb = new ConnManagerParamBean(params); // Set protocol parametes hppb.setVersion(HttpVersion.HTTP_1_1); hppb.setContentCharset(HTTP.ISO_8859_1); hppb.setUseExpectContinue(true); // Set connection parameters hcpb.setStaleCheckingEnabled(false); // Set connection manager parameters ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean(); connPerRouteBean.setDefaultMaxPerRoute(2); cmpb.setConnectionsPerRoute(connPerRouteBean); DroidsHttpClient httpclient = new DroidsHttpClient(params); HttpProtocol httpProtocol = new HttpProtocol(httpclient); protocolFactory.getMap().put("http", httpProtocol); protocolFactory.getMap().put("https", httpProtocol); // Create URL filter factory. URLFiltersFactory filtersFactory = new URLFiltersFactory(); RegexURLFilter defaultURLFilter = new RegexURLFilter(); defaultURLFilter.setFile("classpath:/regex-urlfilter.txt"); filtersFactory.getMap().put("default", defaultURLFilter); // Create handler factory. Provide sysout handler only. HandlerFactory handlerFactory = new HandlerFactory(); SysoutHandler defaultHandler = new SysoutHandler(); handlerFactory.getMap().put("default", defaultHandler); // Create droid factory. Leave it empty for now. 
DroidFactory<Link> droidFactory = new DroidFactory<Link>(); // Create default droid SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer(); simpleDelayTimer.setDelayMillis(100); Queue<Link> simpleQueue = new LinkedList<Link>(); SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>(); taskMaster.setDelayTimer(simpleDelayTimer); taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler()); CrawlingDroid helloCrawler = new SysoutCrawlingDroid(simpleQueue, taskMaster); helloCrawler.setFiltersFactory(filtersFactory); helloCrawler.setParserFactory(parserFactory); helloCrawler.setProtocolFactory(protocolFactory); Collection<String> initialLocations = new ArrayList<String>(); initialLocations.add(targetURL); helloCrawler.setInitialLocations(initialLocations); // Initialize and start the crawler helloCrawler.init(); helloCrawler.start(); // Await termination helloCrawler.getTaskMaster().awaitTermination(0, TimeUnit.MILLISECONDS); // Shut down the HTTP connection manager httpclient.getConnectionManager().shutdown(); }
From source file:eu.sisob.uma.api.crawler4j.crawler.PageFetcher.java
/**
 * Lazily builds the shared HTTP client, its connection manager and the idle-connection
 * monitor thread, then starts the monitor.
 *
 * <p>All work happens only while {@code connectionMonitorThread} is still {@code null}.
 * The monitor is started in the same branch that creates it, so repeated calls are no-ops
 * instead of attempting to start the same thread a second time (which
 * {@link Thread#start()} rejects with {@link IllegalThreadStateException}).
 */
public static synchronized void startConnectionMonitorThread() {
    if (connectionMonitorThread != null) {
        // Already initialised and started by an earlier call.
        return;
    }

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    // Each setting below is read from Configurations, with the literal as the fallback value.
    params.setParameter("http.useragent", Configurations.getStringProperty("fetcher.user_agent",
            "crawler4j (http://code.google.com/p/crawler4j/)"));
    params.setIntParameter("http.socket.timeout",
            Configurations.getIntProperty("fetcher.socket_timeout", 20000));
    params.setIntParameter("http.connection.timeout",
            Configurations.getIntProperty("fetcher.connection_timeout", 30000));
    params.setBooleanParameter("http.protocol.handle-redirects", false);

    ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean();
    connPerRouteBean.setDefaultMaxPerRoute(
            Configurations.getIntProperty("fetcher.max_connections_per_host", 100));
    ConnManagerParams.setMaxConnectionsPerRoute(params, connPerRouteBean);
    ConnManagerParams.setMaxTotalConnections(params,
            Configurations.getIntProperty("fetcher.max_total_connections", 100));

    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
    // HTTPS is registered only when explicitly enabled in the configuration.
    if (Configurations.getBooleanProperty("fetcher.crawl_https", false)) {
        schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
    }

    connectionManager = new ThreadSafeClientConnManager(params, schemeRegistry);
    ProjectLogger.LOGGER.setLevel(Level.INFO);
    httpclient = new DefaultHttpClient(connectionManager, params);

    connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    connectionMonitorThread.start();
}
From source file:org.berlin.crawl.net.WebConnector.java
public synchronized String connect(final BotLink blink, final URIBuilder builder) throws Exception { InputStream instream = null;/*from www.j a v a 2s . c om*/ try { logger.info("!* Attempting download and connect request : " + builder.toString()); final HttpParams params = new BasicHttpParams(); final HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setUserAgent(USER_AGENT); // Set this to false, or else you'll get an // Expectation Failed: error paramsBean.setUseExpectContinue(false); final URI uri = builder.build(); final HttpClient httpclient = new DefaultHttpClient(); final HttpGet httpget = new HttpGet(uri); httpget.setParams(params); // Connect // final HttpResponse response = httpclient.execute(httpget); final HttpEntity entity = response.getEntity(); this.response = response; if (response != null) { if (response.getStatusLine() != null) { if (response.getStatusLine().getStatusCode() != 200) { // Log the error line logger.error("Invalid status code - " + response.getStatusLine().getStatusCode()); throw new CrawlerError("Invalid status code - " + response.getStatusLine().getStatusCode()); } } } if (entity != null) { blink.setStatusline(String.valueOf(response.getStatusLine())); blink.setCode(response.getStatusLine().getStatusCode()); instream = entity.getContent(); if (instream != null) { final StringBuffer document = new StringBuffer(); final BufferedReader reader = new BufferedReader(new InputStreamReader(instream)); String line = ""; while ((line = reader.readLine()) != null) { document.append(line); document.append(NL); } // End of the while // db.proc(blink); Thread.sleep(LINK_PROCESS_DELAY); return document.toString(); } // End of - instream /// } // End of the if / } catch (final Throwable e) { logger.error("Error at connect to LINK", e); throw new CrawlerError("Error at connect to LINK", e); } finally { try { if (instream != null) { instream.close(); } } catch (IOException e) { e.printStackTrace(); } } // End of the 
try - catch block // return null; }
From source file:org.berlin.crawl.net.RobotsConnector.java
/** * Connect to robots.txt file./* w w w.java 2 s. c om*/ * * On error, close inputstream, return empty document. * * @param builder * @return * @throws Exception */ protected synchronized String connect(final URIBuilder builder) throws Exception { this.lastURIBuilder = builder; InputStream instream = null; try { logger.info("Attempting request : " + builder.toString()); final HttpParams params = new BasicHttpParams(); final HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setUserAgent(OctaneCrawlerConstants.USER_AGENT); // Set this to false, or else you'll get an // Expectation Failed: error paramsBean.setUseExpectContinue(false); final URI uri = builder.build(); final HttpClient httpclient = new DefaultHttpClient(); final HttpGet httpget = new HttpGet(uri); httpget.setParams(params); // Connect // final HttpResponse response = httpclient.execute(httpget); final HttpEntity entity = response.getEntity(); this.response = response; if (response != null) { if (response.getStatusLine() != null) { if (response.getStatusLine().getStatusCode() != 200) { // Log the error line logger.error("Invalid status code - " + response.getStatusLine().getStatusCode()); throw new CrawlerError("Invalid status code - " + response.getStatusLine().getStatusCode()); } } } if (entity != null) { instream = entity.getContent(); if (instream != null) { final StringBuffer document = new StringBuffer(); final BufferedReader reader = new BufferedReader(new InputStreamReader(instream)); String line = ""; while ((line = reader.readLine()) != null) { document.append(line); document.append(NL); } // End of the while // return document.toString(); } // End of - instream /// } // End of the if / Thread.sleep(100); } catch (final Exception e) { logger.error("Error at robots connect", e); throw new CrawlerError("Error at connect", e); } finally { try { if (instream != null) { instream.close(); } } catch (IOException e) { e.printStackTrace(); } } // End of the try - catch 
block // return null; }
From source file:com.nanocrawler.fetcher.PageFetcher.java
/**
 * Creates a fetcher whose HTTP client is configured from the given crawl configuration.
 *
 * <p>Protocol settings: HTTP/1.1, UTF-8 content charset, Expect-Continue disabled,
 * browser-compatibility cookie policy, and {@code http.protocol.handle-redirects} set to
 * {@code false}. User agent, timeouts and pool limits are taken from {@code config}.
 *
 * @param config crawl configuration supplying user agent, timeouts, HTTPS flag and
 *               connection limits
 */
public PageFetcher(CrawlConfig config) {
    this.config = config;

    // Protocol-level settings go through the bean; everything else is set by parameter name.
    HttpParams httpParams = new BasicHttpParams();
    HttpProtocolParamBean protocolBean = new HttpProtocolParamBean(httpParams);
    protocolBean.setVersion(HttpVersion.HTTP_1_1);
    protocolBean.setContentCharset("UTF-8");
    protocolBean.setUseExpectContinue(false);

    httpParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
    httpParams.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    httpParams.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    httpParams.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    httpParams.setBooleanParameter("http.protocol.handle-redirects", false);

    // Plain HTTP is always available; HTTPS only when the configuration asks for it.
    SchemeRegistry schemes = new SchemeRegistry();
    Scheme plainHttp = new Scheme("http", 80, PlainSocketFactory.getSocketFactory());
    schemes.register(plainHttp);
    if (config.isIncludeHttpsPages()) {
        Scheme secureHttp = new Scheme("https", 443, SSLSocketFactory.getSocketFactory());
        schemes.register(secureHttp);
    }

    connectionManager = new PoolingClientConnectionManager(schemes);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());

    httpClient = new DefaultHttpClient(connectionManager, httpParams);
}
From source file:cn.jachohx.crawler.fetcher.PageFetcher.java
/**
 * Creates a fetcher whose HTTP client is configured from the given crawl configuration.
 *
 * <p>Protocol settings: HTTP/1.1, UTF-8 content charset, Expect-Continue disabled and
 * {@code http.protocol.handle-redirects} set to {@code false}. When a proxy host is
 * configured it becomes the default proxy, with credentials if a user name is given.
 * A response interceptor wraps gzip-encoded entities in {@link GzipDecompressingEntity}.
 *
 * @param config crawl configuration supplying user agent, timeouts, HTTPS flag,
 *               connection limits and optional proxy settings
 */
public PageFetcher(CrawlConfig config) {
    super(config);

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    params.setBooleanParameter("http.protocol.handle-redirects", false);

    // Plain HTTP is always registered; HTTPS only when the configuration asks for it.
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    if (config.isIncludeHttpsPages()) {
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
    httpClient = new DefaultHttpClient(connectionManager, params);

    if (config.getProxyHost() != null) {
        // Credentials are registered only when a proxy user name is configured.
        if (config.getProxyUsername() != null) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(config.getProxyHost(), config.getProxyPort()),
                    new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
        }
        HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
        httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }

    httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
        public void process(final HttpResponse response, final HttpContext context)
                throws HttpException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Responses without a body have nothing to decompress.
                return;
            }
            Header contentEncoding = entity.getContentEncoding();
            if (contentEncoding != null) {
                HeaderElement[] codecs = contentEncoding.getElements();
                for (HeaderElement codec : codecs) {
                    if (codec.getName().equalsIgnoreCase("gzip")) {
                        response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                        return;
                    }
                }
            }
        }
    });
}
From source file:yin.autoflowcontrol.fetcher.PageFetcher.java
/**
 * Creates a fetcher whose HTTP client is configured from the given crawl configuration,
 * then starts the idle-connection monitor thread.
 *
 * <p>Protocol settings: HTTP/1.1, UTF-8 content charset, Expect-Continue disabled and
 * {@code http.protocol.handle-redirects} set to {@code false}. When a proxy host is
 * configured it becomes the default proxy, with credentials if a user name is given.
 * A response interceptor wraps gzip-encoded entities in {@link GzipDecompressingEntity}.
 *
 * @param config crawl configuration supplying user agent, timeouts, HTTPS flag,
 *               connection limits and optional proxy settings
 */
public PageFetcher(CrawlConfig config) {
    super(config);

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    params.setBooleanParameter("http.protocol.handle-redirects", false);

    // Plain HTTP is always registered; HTTPS only when the configuration asks for it.
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    if (config.isIncludeHttpsPages()) {
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
    httpClient = new DefaultHttpClient(connectionManager, params);

    if (config.getProxyHost() != null) {
        // Credentials are registered only when a proxy user name is configured.
        if (config.getProxyUsername() != null) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(config.getProxyHost(), config.getProxyPort()),
                    new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
        }
        HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
        httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }

    httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
        @Override
        public void process(final HttpResponse response, final HttpContext context)
                throws HttpException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Responses without a body have nothing to decompress.
                return;
            }
            Header contentEncoding = entity.getContentEncoding();
            if (contentEncoding != null) {
                HeaderElement[] codecs = contentEncoding.getElements();
                for (HeaderElement codec : codecs) {
                    if (codec.getName().equalsIgnoreCase("gzip")) {
                        response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                        return;
                    }
                }
            }
        }
    });

    // NOTE(review): start() runs even when the monitor already existed; if the field can be
    // shared between fetchers this would attempt to start the same thread twice - confirm.
    if (connectionMonitorThread == null) {
        connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();
}
From source file:edu.hust.grid.crawl.fetcher.PageFetcher.java
/**
 * Creates a fetcher whose HTTP client is configured from the given crawl configuration,
 * then starts the idle-connection monitor thread.
 *
 * <p>Protocol settings: HTTP/1.1, UTF-8 content charset, Expect-Continue disabled and
 * {@code http.protocol.handle-redirects} set to {@code false}. When a proxy host is
 * configured it becomes the default proxy (with credentials if a user name is given) and
 * the client's user agent is replaced by a fixed Firefox string. A response interceptor
 * wraps gzip-encoded entities in {@link GzipDecompressingEntity}.
 *
 * @param config crawl configuration supplying user agent, timeouts, HTTPS flag,
 *               connection limits and optional proxy settings
 */
public PageFetcher(CrawlConfig config) {
    super(config);

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    params.setBooleanParameter("http.protocol.handle-redirects", false);
    // NOTE(review): unclear whether anything reads "http.language.Accept-Language" - confirm
    // that this actually results in an Accept-Language request header.
    params.setParameter("http.language.Accept-Language", "en-us");
    // NOTE(review): looks redundant with setContentCharset("UTF-8") above - confirm.
    params.setParameter("http.protocol.content-charset", "UTF-8");

    // Plain HTTP is always registered; HTTPS only when the configuration asks for it.
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    if (config.isIncludeHttpsPages()) {
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
    httpClient = new DefaultHttpClient(connectionManager, params);

    if (config.getProxyHost() != null) {
        // Credentials are registered only when a proxy user name is configured.
        if (config.getProxyUsername() != null) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(config.getProxyHost(), config.getProxyPort()),
                    new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
        }
        HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
        httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        // NOTE(review): this overrides the configured user agent, but only on the proxy path -
        // confirm that is intended.
        httpClient.getParams().setParameter(CoreProtocolPNames.USER_AGENT,
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:16.0) Gecko/20100101 Firefox/16.0");
    }

    httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
        @Override
        public void process(final HttpResponse response, final HttpContext context)
                throws HttpException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Responses without a body have nothing to decompress.
                return;
            }
            Header contentEncoding = entity.getContentEncoding();
            if (contentEncoding != null) {
                HeaderElement[] codecs = contentEncoding.getElements();
                for (HeaderElement codec : codecs) {
                    if (codec.getName().equalsIgnoreCase("gzip")) {
                        response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                        return;
                    }
                }
            }
        }
    });

    // NOTE(review): start() runs even when the monitor already existed; if the field can be
    // shared between fetchers this would attempt to start the same thread twice - confirm.
    if (connectionMonitorThread == null) {
        connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();
}
From source file:com.autonomousturk.crawler.fetcher.PageFetcher.java
/**
 * Creates a fetcher whose HTTP client is configured from the given crawl configuration,
 * then starts the idle-connection monitor thread.
 *
 * <p>Protocol settings: HTTP/1.1, UTF-8 content charset, Expect-Continue disabled,
 * browser-compatibility cookie policy, and {@code http.protocol.handle-redirects} set to
 * {@code false}. When a proxy host is configured it becomes the default proxy, with
 * credentials if a user name is given. A response interceptor wraps gzip-encoded entities
 * in {@link GzipDecompressingEntity}.
 *
 * @param config crawl configuration supplying user agent, timeouts, HTTPS flag,
 *               connection limits and optional proxy settings
 */
public PageFetcher(CrawlConfig config) {
    super(config);

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    params.setBooleanParameter("http.protocol.handle-redirects", false);

    // Plain HTTP is always registered; HTTPS only when the configuration asks for it.
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    if (config.isIncludeHttpsPages()) {
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    connectionManager = new PoolingClientConnectionManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
    httpClient = new DefaultHttpClient(connectionManager, params);

    if (config.getProxyHost() != null) {
        // Credentials are registered only when a proxy user name is configured.
        if (config.getProxyUsername() != null) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(config.getProxyHost(), config.getProxyPort()),
                    new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
        }
        HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
        httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }

    httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
        @Override
        public void process(final HttpResponse response, final HttpContext context)
                throws HttpException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Responses without a body have nothing to decompress.
                return;
            }
            Header contentEncoding = entity.getContentEncoding();
            if (contentEncoding != null) {
                HeaderElement[] codecs = contentEncoding.getElements();
                for (HeaderElement codec : codecs) {
                    if (codec.getName().equalsIgnoreCase("gzip")) {
                        response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                        return;
                    }
                }
            }
        }
    });

    // NOTE(review): start() runs even when the monitor already existed; if the field can be
    // shared between fetchers this would attempt to start the same thread twice - confirm.
    if (connectionMonitorThread == null) {
        connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();
}
From source file:org.sbs.goodcrawler.fetcher.PageFetcher.java
/**
 * Creates a fetcher whose HTTP client is configured from the given fetch configuration,
 * then starts the idle-connection monitor thread.
 *
 * <p>Protocol settings: HTTP/1.1, UTF-8 content charset, Expect-Continue disabled,
 * browser-compatibility cookie policy, and {@code http.protocol.handle-redirects} set to
 * {@code false}. When a proxy host is configured it becomes the default proxy, with
 * credentials if a user name is given. A response interceptor wraps gzip-encoded entities
 * in {@link GzipDecompressingEntity}.
 *
 * @param config fetch configuration supplying agent string, timeouts, HTTPS flag,
 *               connection limits and optional proxy settings
 */
public PageFetcher(FetchConfig config) {
    super(config);

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getAgent());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeoutMilliseconds());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    params.setBooleanParameter("http.protocol.handle-redirects", false);

    // Plain HTTP is always registered; HTTPS only when the configuration asks for it.
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    if (config.isHttps()) {
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    connectionManager = new PoolingClientConnectionManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
    httpClient = new DefaultHttpClient(connectionManager, params);

    if (config.getProxyHost() != null) {
        // Credentials are registered only when a proxy user name is configured.
        if (config.getProxyUsername() != null) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(config.getProxyHost(), config.getProxyPort()),
                    new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
        }
        HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
        httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }

    httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
        @Override
        public void process(final HttpResponse response, final HttpContext context)
                throws HttpException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Responses without a body have nothing to decompress.
                return;
            }
            Header contentEncoding = entity.getContentEncoding();
            if (contentEncoding != null) {
                HeaderElement[] codecs = contentEncoding.getElements();
                for (HeaderElement codec : codecs) {
                    if (codec.getName().equalsIgnoreCase("gzip")) {
                        response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                        return;
                    }
                }
            }
        }
    });

    // NOTE(review): start() runs even when the monitor already existed; if the field can be
    // shared between fetchers this would attempt to start the same thread twice - confirm.
    if (connectionMonitorThread == null) {
        connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();
}