List of usage examples for org.apache.http.params HttpProtocolParamBean setContentCharset
public void setContentCharset(String str)
From source file:org.apache.droids.examples.cli.SimpleRuntime.java
public static void main(String[] args) throws Exception { if (args.length < 1) { System.out.println("Please specify a URL to crawl"); System.exit(-1);/* w w w .java 2 s. co m*/ } String targetURL = args[0]; // Create parser factory. Support basic HTML markup only ParserFactory parserFactory = new ParserFactory(); TikaDocumentParser tikaParser = new TikaDocumentParser(); parserFactory.getMap().put("text/html", tikaParser); // Create protocol factory. Support HTTP/S only. ProtocolFactory protocolFactory = new ProtocolFactory(); // Create and configure HTTP client HttpParams params = new BasicHttpParams(); HttpProtocolParamBean hppb = new HttpProtocolParamBean(params); HttpConnectionParamBean hcpb = new HttpConnectionParamBean(params); ConnManagerParamBean cmpb = new ConnManagerParamBean(params); // Set protocol parametes hppb.setVersion(HttpVersion.HTTP_1_1); hppb.setContentCharset(HTTP.ISO_8859_1); hppb.setUseExpectContinue(true); // Set connection parameters hcpb.setStaleCheckingEnabled(false); // Set connection manager parameters ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean(); connPerRouteBean.setDefaultMaxPerRoute(2); cmpb.setConnectionsPerRoute(connPerRouteBean); DroidsHttpClient httpclient = new DroidsHttpClient(params); HttpProtocol httpProtocol = new HttpProtocol(httpclient); protocolFactory.getMap().put("http", httpProtocol); protocolFactory.getMap().put("https", httpProtocol); // Create URL filter factory. URLFiltersFactory filtersFactory = new URLFiltersFactory(); RegexURLFilter defaultURLFilter = new RegexURLFilter(); defaultURLFilter.setFile("classpath:/regex-urlfilter.txt"); filtersFactory.getMap().put("default", defaultURLFilter); // Create handler factory. Provide sysout handler only. HandlerFactory handlerFactory = new HandlerFactory(); SysoutHandler defaultHandler = new SysoutHandler(); handlerFactory.getMap().put("default", defaultHandler); // Create droid factory. Leave it empty for now. DroidFactory<Link> droidFactory = new DroidFactory<Link>(); // Create default droid SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer(); simpleDelayTimer.setDelayMillis(100); Queue<Link> simpleQueue = new LinkedList<Link>(); SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>(); taskMaster.setDelayTimer(simpleDelayTimer); taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler()); CrawlingDroid helloCrawler = new SysoutCrawlingDroid(simpleQueue, taskMaster); helloCrawler.setFiltersFactory(filtersFactory); helloCrawler.setParserFactory(parserFactory); helloCrawler.setProtocolFactory(protocolFactory); Collection<String> initialLocations = new ArrayList<String>(); initialLocations.add(targetURL); helloCrawler.setInitialLocations(initialLocations); // Initialize and start the crawler helloCrawler.init(); helloCrawler.start(); // Await termination helloCrawler.getTaskMaster().awaitTermination(0, TimeUnit.MILLISECONDS); // Shut down the HTTP connection manager httpclient.getConnectionManager().shutdown(); }
From source file:eu.sisob.uma.api.crawler4j.crawler.PageFetcher.java
public synchronized static void startConnectionMonitorThread() { if (connectionMonitorThread == null) { HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter("http.useragent", Configurations.getStringProperty("fetcher.user_agent", "crawler4j (http://code.google.com/p/crawler4j/)")); params.setIntParameter("http.socket.timeout", Configurations.getIntProperty("fetcher.socket_timeout", 20000)); params.setIntParameter("http.connection.timeout", Configurations.getIntProperty("fetcher.connection_timeout", 30000)); params.setBooleanParameter("http.protocol.handle-redirects", false); ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean(); connPerRouteBean/*from ww w. j av a2 s.c o m*/ .setDefaultMaxPerRoute(Configurations.getIntProperty("fetcher.max_connections_per_host", 100)); ConnManagerParams.setMaxConnectionsPerRoute(params, connPerRouteBean); ConnManagerParams.setMaxTotalConnections(params, Configurations.getIntProperty("fetcher.max_total_connections", 100)); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80)); if (Configurations.getBooleanProperty("fetcher.crawl_https", false)) { schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443)); } connectionManager = new ThreadSafeClientConnManager(params, schemeRegistry); ProjectLogger.LOGGER.setLevel(Level.INFO); httpclient = new DefaultHttpClient(connectionManager, params); connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }
From source file:com.nanocrawler.fetcher.PageFetcher.java
public PageFetcher(CrawlConfig config) { this.config = config; HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); }/* w ww. j a v a2s. c om*/ connectionManager = new PoolingClientConnectionManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); }
From source file:cn.jachohx.crawler.fetcher.PageFetcher.java
public PageFetcher(CrawlConfig config) { super(config); HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); }//ww w . j av a 2 s .co m connectionManager = new ThreadSafeClientConnManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } httpClient.addResponseInterceptor(new HttpResponseInterceptor() { public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); }
From source file:yin.autoflowcontrol.fetcher.PageFetcher.java
public PageFetcher(CrawlConfig config) { super(config); HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); }//from w ww . j a v a2 s . c o m connectionManager = new ThreadSafeClientConnManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } httpClient.addResponseInterceptor(new HttpResponseInterceptor() { @Override public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (connectionMonitorThread == null) { connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }
From source file:edu.hust.grid.crawl.fetcher.PageFetcher.java
public PageFetcher(CrawlConfig config) { super(config); HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); params.setParameter("http.language.Accept-Language", "en-us"); params.setParameter("http.protocol.content-charset", "UTF-8"); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); }/* w w w . j a v a2s . co m*/ connectionManager = new ThreadSafeClientConnManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); httpClient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:16.0) Gecko/20100101 Firefox/16.0"); } httpClient.addResponseInterceptor(new HttpResponseInterceptor() { @Override public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (connectionMonitorThread == null) { connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }
From source file:com.autonomousturk.crawler.fetcher.PageFetcher.java
public PageFetcher(CrawlConfig config) { super(config); HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); }/*from w w w . ja va 2s .com*/ connectionManager = new PoolingClientConnectionManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } httpClient.addResponseInterceptor(new HttpResponseInterceptor() { @Override public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (connectionMonitorThread == null) { connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }
From source file:org.sbs.goodcrawler.fetcher.PageFetcher.java
public PageFetcher(FetchConfig config) { super(config); HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getAgent()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeoutMilliseconds()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isHttps()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); }//from w w w . j a va 2 s. co m connectionManager = new PoolingClientConnectionManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } httpClient.addResponseInterceptor(new HttpResponseInterceptor() { @Override public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (connectionMonitorThread == null) { connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }
From source file:org.sbs.goodcrawler.fetcher.Fetcher.java
public void init(File fetchConfFile) { FetchConfig fetchConfig = new FetchConfig(); Document document;// w ww .j a v a 2 s. co m try { document = Jsoup.parse(fetchConfFile, "utf-8"); Fetcher.config = fetchConfig.loadConfig(document); } catch (IOException e) { e.printStackTrace(); } catch (ConfigurationException e) { e.printStackTrace(); } HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getAgent()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeoutMilliseconds()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter("http.protocol.handle-redirects", false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isHttps()) { schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); } connectionManager = new PoolingClientConnectionManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } httpClient.addResponseInterceptor(new HttpResponseInterceptor() { @Override public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (connectionMonitorThread == null) { connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }