Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga" http://iaia.lcc.uma.es | http://www.uma.es This file is part of SISOB Data Extractor. SISOB Data Extractor is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. SISOB Data Extractor is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>. 
*/
package eu.sisob.uma.api.crawler4j.crawler;

import java.io.IOException;
import java.util.Date;
import java.util.Locale;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import eu.sisob.uma.api.crawler4j.frontier.DocIDServer;
import eu.sisob.uma.api.crawler4j.url.URLCanonicalizer;
import eu.sisob.uma.api.crawler4j.url.WebURL;

/**
 * Static HTTP fetcher shared by all crawler threads.
 *
 * <p>{@link #startConnectionMonitorThread()} builds the shared HTTP client and must be
 * called before {@link #fetch(Page, boolean, DocIDServer)} or either {@code setProxy}
 * overload, because those methods use the client it creates. All fetches are throttled
 * through a single lock so that consecutive requests are at least
 * {@link #getPolitenessDelay()} milliseconds apart.
 *
 * @author Yasser Ganjisaffar <yganjisa at uci dot edu>
 */
public final class PageFetcher {

    private static ThreadSafeClientConnManager connectionManager;

    private static DefaultHttpClient httpclient;

    /** Lock guarding the politeness bookkeeping below. */
    private static final Object mutex = new Object();

    /** Number of fetches started in the current 10-second window (bookkeeping only). */
    private static int processedCount = 0;

    /** Start (epoch millis) of the current 10-second bookkeeping window. */
    private static long startOfPeriod = 0;

    /** Time (epoch millis) at which the most recent fetch was allowed to proceed. */
    private static long lastFetchTime = 0;

    /** Minimum number of milliseconds between two consecutive fetches. */
    private static long politenessDelay = Configurations.getIntProperty("fetcher.default_politeness_delay", 200);

    /** Responses whose declared length exceeds this many bytes are not downloaded. */
    public static final int MAX_DOWNLOAD_SIZE = Configurations.getIntProperty("fetcher.max_download_size", 1048576);

    private static final boolean show404Pages = Configurations.getBooleanProperty("logging.show_404_pages", true);

    private static IdleConnectionMonitorThread connectionMonitorThread = null;

    /**
     * @return the minimum delay, in milliseconds, enforced between two fetches
     */
    public static long getPolitenessDelay() {
        return politenessDelay;
    }

    /**
     * @param politenessDelay the minimum delay, in milliseconds, to enforce between two fetches
     */
    public static void setPolitenessDelay(long politenessDelay) {
        PageFetcher.politenessDelay = politenessDelay;
    }

    /**
     * Builds the shared connection manager and HTTP client from the crawler
     * configuration and starts the idle-connection monitor.
     *
     * <p>Calling this method while the monitor is already running is a no-op.
     */
    public static synchronized void startConnectionMonitorThread() {
        if (connectionMonitorThread != null) {
            // Already initialised and started. The monitor must not be started a second
            // time (Thread.start() may only be invoked once per thread instance).
            return;
        }

        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);

        params.setParameter("http.useragent", Configurations.getStringProperty("fetcher.user_agent",
                "crawler4j (http://code.google.com/p/crawler4j/)"));
        params.setIntParameter("http.socket.timeout",
                Configurations.getIntProperty("fetcher.socket_timeout", 20000));
        params.setIntParameter("http.connection.timeout",
                Configurations.getIntProperty("fetcher.connection_timeout", 30000));
        // Redirects are reported to the caller (PageFetchStatus.Moved) instead of being followed.
        params.setBooleanParameter("http.protocol.handle-redirects", false);

        ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean();
        connPerRouteBean.setDefaultMaxPerRoute(
                Configurations.getIntProperty("fetcher.max_connections_per_host", 100));
        ConnManagerParams.setMaxConnectionsPerRoute(params, connPerRouteBean);
        ConnManagerParams.setMaxTotalConnections(params,
                Configurations.getIntProperty("fetcher.max_total_connections", 100));

        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
        if (Configurations.getBooleanProperty("fetcher.crawl_https", false)) {
            schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
        }

        connectionManager = new ThreadSafeClientConnManager(params, schemeRegistry);
        ProjectLogger.LOGGER.setLevel(Level.INFO);
        httpclient = new DefaultHttpClient(connectionManager, params);

        connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
        connectionMonitorThread.start();
    }

    /**
     * Shuts down the connection manager and the idle-connection monitor, if running.
     * A later call to {@link #startConnectionMonitorThread()} builds fresh instances.
     */
    public static synchronized void stopConnectionMonitorThread() {
        if (connectionMonitorThread != null) {
            connectionManager.shutdown();
            connectionMonitorThread.shutdown();
            connectionMonitorThread = null;
            connectionManager = null;
        }
    }

    /**
     * Downloads the URL held by {@code page} and loads the response body into it.
     *
     * <p>The call blocks until the politeness delay since the previous fetch has elapsed.
     * On a 301/302 response the page's URL is replaced by the redirect target (or
     * {@code null} when no {@code Location} header is present) and nothing is downloaded.
     *
     * @param page           the page to fetch; its {@link WebURL} supplies the address
     * @param ignoreIfBinary when {@code true}, image/audio/video responses are skipped
     * @param refdocIDServer document-id registry consulted when the request URI differs
     *                       from the URL that was asked for
     * @return one of the {@code PageFetchStatus} codes, or the raw HTTP status code for a
     *         non-200 response that is not a 301/302 redirect
     */
    public static int fetch(Page page, boolean ignoreIfBinary, DocIDServer refdocIDServer) {
        String toFetchURL = page.getWebURL().getURL();
        HttpGet get = null;
        HttpEntity entity = null;
        try {
            get = new HttpGet(toFetchURL);
            waitForPolitenessDelay();

            get.addHeader("Accept",
                    "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5");
            HttpResponse response = httpclient.execute(get);
            entity = response.getEntity();

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                    Header location = response.getFirstHeader("Location");
                    if (location != null) {
                        page.getWebURL().setURL(resolveRedirectTarget(get, location.getValue()));
                    } else {
                        page.getWebURL().setURL(null);
                    }
                    return PageFetchStatus.Moved;
                }
                if (statusCode != HttpStatus.SC_NOT_FOUND) {
                    ProjectLogger.LOGGER.info(
                            "Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
                } else if (show404Pages) {
                    ProjectLogger.LOGGER.info("Not Found: " + toFetchURL + " (Link found in doc#: "
                            + page.getWebURL().getParentDocid() + ")");
                }
                return statusCode;
            }

            // If the request ended up at a different (non-canonically-equal) URI, either
            // report it as already seen or re-register the page under the new address.
            String uri = get.getURI().toString();
            if (!uri.equals(toFetchURL) && !URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
                int newdocid = refdocIDServer.getDocID(uri);
                if (newdocid != -1) {
                    if (newdocid > 0) {
                        return PageFetchStatus.RedirectedPageIsSeen;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(uri);
                    webURL.setDocid(refdocIDServer.getNewDocID(uri));
                    page.setWebURL(webURL);
                }
            }

            if (entity == null) {
                get.abort();
                return PageFetchStatus.UnknownError;
            }

            long size = resolveContentLength(entity, response);
            if (size > MAX_DOWNLOAD_SIZE) {
                entity.consumeContent();
                return PageFetchStatus.PageTooBig;
            }

            boolean isBinary = isBinaryContentType(entity.getContentType());
            if (isBinary && ignoreIfBinary) {
                return PageFetchStatus.PageIsBinary;
            }

            // size is at most MAX_DOWNLOAD_SIZE (an int) here, so the narrowing cast is safe.
            if (page.load(entity.getContent(), (int) size, isBinary)) {
                return PageFetchStatus.OK;
            }
            return PageFetchStatus.PageLoadError;
        } catch (InterruptedException e) {
            // Preserve the interrupt so the owning crawler thread can observe it and stop.
            Thread.currentThread().interrupt();
            ProjectLogger.LOGGER.error(e.getMessage() + " while fetching " + toFetchURL);
        } catch (IOException e) {
            ProjectLogger.LOGGER.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
                    + " (link found in doc #" + page.getWebURL().getParentDocid() + ")");
            return PageFetchStatus.FatalTransportError;
        } catch (IllegalStateException ignored) {
            // Deliberately ignored: raised for URLs whose scheme is not registered
            // (for example https when fetcher.crawl_https is off).
        } catch (Exception e) {
            if (e.getMessage() == null) {
                ProjectLogger.LOGGER.error("Error while fetching " + page.getWebURL().getURL());
            } else {
                ProjectLogger.LOGGER.error(e.getMessage() + " while fetching " + page.getWebURL().getURL());
            }
        } finally {
            // Always release the connection: drain the entity if there is one,
            // otherwise abort the request.
            try {
                if (entity != null) {
                    entity.consumeContent();
                } else if (get != null) {
                    get.abort();
                }
            } catch (Exception e) {
                ProjectLogger.LOGGER.error("Error while releasing connection for " + toFetchURL + ": " + e);
            }
        }
        return PageFetchStatus.UnknownError;
    }

    /**
     * Blocks until at least the politeness delay has passed since the previous fetch,
     * then records the current time as the new "last fetch" time. The sleep happens
     * while holding the lock so that concurrent callers are spaced out one by one.
     */
    private static void waitForPolitenessDelay() throws InterruptedException {
        synchronized (mutex) {
            long now = System.currentTimeMillis();
            if (now - startOfPeriod > 10000) {
                processedCount = 0;
                startOfPeriod = now;
            }
            processedCount++;

            // Read the delay once: a concurrent setPolitenessDelay() between the comparison
            // and the subtraction could otherwise produce a negative sleep time.
            long delay = politenessDelay;
            long sinceLastFetch = now - lastFetchTime;
            if (sinceLastFetch < delay) {
                Thread.sleep(delay - sinceLastFetch);
            }
            lastFetchTime = System.currentTimeMillis();
        }
    }

    /**
     * Turns a {@code Location} header value into an absolute URL by resolving it against
     * the URI of the request that produced the redirect. Absolute targets (http or https)
     * are returned unchanged; relative ones inherit scheme, host and port of the request.
     */
    private static String resolveRedirectTarget(HttpGet request, String location) {
        if (location == null) {
            return null;
        }
        String target = location.trim();
        try {
            return request.getURI().resolve(target).toString();
        } catch (IllegalArgumentException e) {
            // Not a syntactically valid URI reference: fall back to plain prefixing.
            String lower = target.toLowerCase(Locale.ENGLISH);
            if (lower.startsWith("http://") || lower.startsWith("https://")) {
                return target;
            }
            return request.getURI().getScheme() + "://" + request.getURI().getHost() + target;
        }
    }

    /**
     * Returns the declared body length in bytes, or -1 when it is unknown. Falls back to
     * the Content-Length response header when the entity does not report a length; a
     * malformed header value is treated as unknown.
     */
    private static long resolveContentLength(HttpEntity entity, HttpResponse response) {
        long size = entity.getContentLength();
        if (size != -1) {
            return size;
        }
        Header length = response.getLastHeader("Content-Length");
        if (length == null) {
            length = response.getLastHeader("Content-length");
        }
        if (length == null || length.getValue() == null) {
            return -1;
        }
        try {
            // Parsed as long so that lengths above Integer.MAX_VALUE are reported as too big.
            return Long.parseLong(length.getValue().trim());
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Reports whether the given Content-Type header denotes image, audio or video data.
     */
    private static boolean isBinaryContentType(Header type) {
        if (type == null || type.getValue() == null) {
            return false;
        }
        // Fixed locale: the default locale's case rules (e.g. Turkish dotless i)
        // must not affect matching of ASCII media type names.
        String typeStr = type.getValue().toLowerCase(Locale.ENGLISH);
        return typeStr.contains("image") || typeStr.contains("audio") || typeStr.contains("video");
    }

    /**
     * Routes all subsequent requests through the given proxy. Uses the shared HTTP client,
     * so {@link #startConnectionMonitorThread()} must have been called first.
     *
     * @param proxyHost proxy host name or address
     * @param proxyPort proxy port
     */
    public static void setProxy(String proxyHost, int proxyPort) {
        HttpHost proxy = new HttpHost(proxyHost, proxyPort);
        httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }

    /**
     * Routes all subsequent requests through the given proxy, authenticating with the
     * supplied credentials. Uses the shared HTTP client, so
     * {@link #startConnectionMonitorThread()} must have been called first.
     *
     * @param proxyHost proxy host name or address
     * @param proxyPort proxy port
     * @param username  proxy user name
     * @param password  proxy password
     */
    public static void setProxy(String proxyHost, int proxyPort, String username, String password) {
        httpclient.getCredentialsProvider().setCredentials(new AuthScope(proxyHost, proxyPort),
                new UsernamePasswordCredentials(username, password));
        setProxy(proxyHost, proxyPort);
    }
}