Java tutorial: a multi-threaded web crawler with Lucene indexing
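The complete WebCrawler.java listing follows. The class wires together a fixed thread pool, parses command-line switches in main(), optionally seeds the crawl from a sitemap, and writes a full-text index with Apache Lucene. It relies on two collaborators that are not shown in this tutorial, a Spider runnable and a WebCrawlerConfigurator singleton; a hypothetical sketch of Spider and a note on the configurator follow the listing.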
package crawler;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;

import toolbox.web.sitemap.SAXSitemapParser;
import toolbox.web.sitemap.WebPage;

/**
 * A multi-threaded, database-assisted web crawler.
 *
 * @author billy
 */
public class WebCrawler {
    /**
     * The version of the crawler.
     */
    private static final String VERSION = "0.7";

    /**
     * The author of the crawler.
     */
    private static final String strAppAuthor =
            "Vassilis S. Moustakas (vsmoustakas[at]gmail[dot]com)";

    /**
     * The application name.
     */
    private static final String strAppName = "Web Crawler v." + VERSION;

    /**
     * The basic application usage.
     */
    private static final String strAppUsage =
            strAppName + "\n"
            + "Usage: java WebCrawler [OPTIONS] <url>\n"
            + "Use -h for more help";

    /**
     * The application header.
     */
    private static final String strAppHeader =
            strAppName + "\n"
            + "\n"
            + "This program is distributed in the hope that it will be useful,\n"
            + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
            + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
            + "GNU General Public License for more details.\n"
            + "\n"
            + "Authored by " + strAppAuthor + ".";

    /**
     * The application help.
     */
    private static final String strAppHelp =
            strAppName + "\n"
            + "A simple multi-threaded, database-assisted network content retriever.\n"
            + "Author: " + strAppAuthor + "\n"
            + "Usage: java WebCrawler [OPTIONS] <url>\n"
            + "\n"
            + "OPTIONS\n"
            + "\n"
            + "-a<NAME>\n\tSet the NAME with which the crawler will introduce itself to web servers. Can alternatively be handled by setting the \"agent\" configuration property.\n"
            // + "-A\n\tComma-separated list of accepted MIME types.\n"
            + "-d<LEVELS>\n\tSpecify LEVELS as the maximum recursion depth. Can alternatively be handled by setting the \"depth\" configuration property.\n"
            + "-h\n\tPrint this help.\n"
            + "-H\n\tPrint the application's header information.\n"
            + "-i<PATH>\n\tSave the index under the PATH directory. Can alternatively be handled by setting the \"indexPath\" configuration property.\n"
            + "-m<NUMBER>\n\tSpecify NUMBER as the maximum number of threads in the pool. Can alternatively be handled by setting the \"threadNumber\" configuration property.\n"
            + "-n<NUMBER>\n\tLimit the download to at most NUMBER files. Can alternatively be handled by setting the \"maximumFileNumber\" configuration property.\n"
            + "-o<PATHTOFILE>\n\tLog messages to the file denoted by PATHTOFILE. If no -o and/or PATHTOFILE is defined, logging is directed to standard out. Can alternatively be handled by setting the \"logFilePath\" configuration property.\n"
            + "-p<PATH>\n\tSave retrieved files under the PATH directory. Can alternatively be handled by setting the \"storagePath\" configuration property.\n"
            // + "-R\n\tComma-separated list of rejected MIME types.\n"
            + "-s\n\tTreat the given URL as a sitemap with which to seed the crawl. Can alternatively be handled by setting the \"sitemapAssisted\" configuration property.\n"
            + "-t<SECONDS>\n\tSet SECONDS for HTTP connection time-outs. Can alternatively be handled by setting the \"timeout\" configuration property.\n"
            + "-v\n\tBe verbose. Can alternatively be handled by setting the \"verbose\" configuration property.\n"
            + "-x\n\tDo not follow image links. Can alternatively be handled by setting the \"followImgLinks\" configuration property.\n";
    /**
     * The number of threads in the pool.
     */
    private final int threadNumber;

    /**
     * Set whether you want logging output during the crawl.
     */
    private final boolean verbose;

    /**
     * Set whether logging output should be redirected to a file.
     */
    private final boolean redirect;

    /**
     * The path to the log file.
     */
    private final String logFilePath;

    /**
     * The file-system path under which the downloaded files will be stored.
     */
    private final String storagePath;

    /**
     * The file-system path under which the index will be stored.
     */
    private final String indexPath;

    /**
     * The maximum number of files allowed to be downloaded. Set it to a value
     * less than or equal to zero for unlimited (highly discouraged).
     */
    private final int maximumFileNumber;

    /**
     * The connection timeout period in seconds.
     */
    private final int timeout;

    /**
     * How deep in the link graph the crawler will go in the recursive case.
     * A value of 0 (or less) means the crawler will not descend further down
     * the link graph.
     */
    private final int depth;

    /**
     * Set whether image links will be followed.
     */
    private final boolean followImgLinks;

    /**
     * The agent name with which the crawler is "introduced" to a web server.
     */
    private final String agent;

    /**
     * Set whether the URL provided by the user should be treated as a sitemap
     * with which the crawling process should be initialized.
     */
    private final boolean sitemapAssisted;

    /**
     * The crawler's configurator.
     */
    private static final WebCrawlerConfigurator configurator =
            WebCrawlerConfigurator.getInstance();

    /**
     * The thread pool.
     */
    private final ExecutorService executor;

    /**
     * A queue of futures for the submitted threads. A concurrent queue is
     * used because spiders may offer new jobs from pool threads while the
     * main thread drains the queue in {@link #block()}.
     */
    private final Queue<Future<?>> futures;

    /**
     * A map for storing the web pages already visited by the web crawler.
     * Using {@code ConcurrentMap<K, V>} to allow multiple reads, single write
     * by the crawler's threads.
     */
    private final ConcurrentMap<String, URL> visited;

    /**
     * A writer for full-text indexing.
     */
    private IndexWriter luceneIndexWriter = null;

    /**
     * Constructor.
     */
    public WebCrawler() {
        threadNumber = configurator.propertyInteger("threadNumber");
        verbose = configurator.propertyBoolean("verbose");
        logFilePath = configurator.property("logFilePath");
        redirect = !logFilePath.trim().isEmpty();
        storagePath = configurator.property("storagePath");
        indexPath = configurator.property("indexPath");
        maximumFileNumber = configurator.propertyInteger("maximumFileNumber");
        timeout = configurator.propertyInteger("timeout");
        depth = configurator.propertyInteger("depth");
        followImgLinks = configurator.propertyBoolean("followImgLinks");
        sitemapAssisted = configurator.propertyBoolean("sitemapAssisted");
        agent = configurator.property("agent");

        executor = Executors.newFixedThreadPool(this.threadNumber);
        futures = new ConcurrentLinkedQueue<Future<?>>();
        visited = new ConcurrentHashMap<String, URL>();

        try {
            Directory directory = new NIOFSDirectory(new File(this.indexPath));
            Version lv = Version.LUCENE_41;
            Analyzer a = new EnglishAnalyzer(lv);
            IndexWriterConfig iwc = new IndexWriterConfig(lv, a);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwc.setWriteLockTimeout(20000);
            luceneIndexWriter = new IndexWriter(directory, iwc);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
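    // Note: the Lucene setup in the constructor targets the Lucene 4.x API
    // (NIOFSDirectory(File), EnglishAnalyzer(Version), IndexWriterConfig(Version,
    // Analyzer), setWriteLockTimeout(long)). Later Lucene releases changed these
    // signatures (the Version parameters were dropped and File gave way to
    // java.nio.file.Path), so pin the dependency to 4.x for this tutorial.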
    /**
     * The main function that drives the execution.
     *
     * @param args
     *            The command-line arguments.
     */
    public static void main(String[] args) {
        URL url = null;

        System.out.println(strAppHeader + "\n");

        boolean abort = false;

        // Parse CMD arguments.
        // At least one argument is needed (that of the start URL).
        if (args.length < 1) {
            System.out.println("Error: Defining the starting URL is mandatory!");
            System.out.println(strAppUsage);
            abort = true;
        } else {
            int i = 0;
            // Cycle through the command-line arguments.
            while (!abort && (i < args.length)) {
                if (args[i].charAt(0) == '-') {
                    // It is a switch... which switch? Note that option values
                    // are glued to their switch (e.g. -d3), hence substring(2).
                    switch (args[i].charAt(1)) {
                    case 'a':
                        // The name with which the crawler introduces itself to
                        // the various web servers it traverses.
                        configurator.assign("agent", args[i].substring(2));
                        break;
                    // case 'A':
                    //     // Comma-separated list of accepted extensions.
                    //     configurator.assign("accept", args[i].substring(2));
                    //     break;
                    case 'd':
                        // Define the depth of the crawl in levels.
                        configurator.assign("depth", args[i].substring(2));
                        break;
                    case 'h':
                        // Print help information.
                        System.out.println(strAppHelp);
                        abort = true;
                        break;
                    case 'H':
                        // Print the application header.
                        System.out.println(strAppHeader);
                        abort = true;
                        break;
                    case 'i':
                        // Determine the index area on the local FS.
                        configurator.assign("indexPath", args[i].substring(2));
                        break;
                    case 'm':
                        // Maximum number of threads.
                        configurator.assign("threadNumber", args[i].substring(2));
                        break;
                    case 'n':
                        // Set the maximum allowed number of downloaded files.
                        configurator.assign("maximumFileNumber", args[i].substring(2));
                        break;
                    case 'o':
                        // Redirect crawling output to a file.
                        configurator.assign("logFilePath", args[i].substring(2));
                        break;
                    case 'p':
                        // Determine the storage area on the local FS.
                        configurator.assign("storagePath", args[i].substring(2));
                        break;
                    // case 'R':
                    //     // Comma-separated list of rejected extensions.
                    //     configurator.assign("reject", args[i].substring(2));
                    //     break;
                    case 's':
                        // Treat the given URL as a sitemap.
                        configurator.assign("sitemapAssisted", "true");
                        break;
                    case 't':
                        // Define the connection time-out period.
                        configurator.assign("timeout", args[i].substring(2));
                        break;
                    case 'v':
                        // Turn on verbose mode.
                        configurator.assign("verbose", "true");
                        break;
                    case 'x':
                        // Do not follow links incorporated into images.
                        configurator.assign("followImgLinks", "false");
                        break;
                    default:
                        // The command line contained an unidentifiable switch.
                        System.out.println("Error: Switch " + args[i] + " is not valid.");
                        System.out.println(strAppUsage);
                        break;
                    }
                } else {
                    // It is not a switch... it is a URL.
                    if (url == null) {
                        // If no URL is set already, then set it...
                        try {
                            url = new URL(args[i]);
                        } catch (MalformedURLException murle) {
                            murle.printStackTrace();
                        }
                    } else {
                        // ...else there is a problem, since we need exactly
                        // one URL from the command line.
                        System.out.println("Warning: A URL has already been defined. Ignoring " + args[i] + ".");
                        System.out.println(strAppUsage);
                    }
                }
                // Prepare for the next CMD argument.
                i++;
            }
        }
        // End - Parse CMD arguments.

        if (!abort) {
            if (url != null) {
                WebCrawler spiderman = new WebCrawler();
                spiderman.start(url);
                spiderman.block();
                spiderman.stop();
            } else {
                System.out.println("Error: Defining the starting URL is mandatory!");
                System.out.println(strAppUsage);
            }
        }
    }

    /**
     * Offers the given spider job to the crawler. The job's URL is checked
     * as to whether it has already been processed. If not, the job is
     * submitted and its future is added to the futures queue; otherwise the
     * job is discarded.
     *
     * @param spider
     *            The {@code Runnable} job to be submitted.
     *
     * @return
     *            {@code true} if the offered job was accepted; {@code false}
     *            otherwise.
     */
    public boolean crawl(Spider spider) {
        boolean accepted = false;
        // Put the job's URL into the visited registry.
        if (visited.putIfAbsent(spider.getUrl().toString(), spider.getUrl()) == null) {
            futures.add(executor.submit(spider));
            accepted = true;
        }
        return accepted;
    }

    /**
     * Triggers the crawling process.
     *
     * @param url
     *            The initial URL.
     */
    public void start(URL url) {
        System.out.println("Starting crawler...");
        if (isSitemapAssisted()) {
            System.out.println("Accessing sitemap: " + url.toString());
            SAXSitemapParser parser = new SAXSitemapParser(url);
            List<WebPage> webpages = parser.parse();
            for (WebPage page : webpages) {
                crawl(new Spider(0, page.getLocation(), 0, this));
            }
        } else {
            crawl(new Spider(0, url, 0, this));
        }
    }

    /**
     * <p>
     * Waits on the futures queue and consumes them. This function blocks
     * until all futures are consumed (i.e. all submitted spider jobs have
     * finished).
     * </p>
     *
     * <p>
     * This is probably not the optimal solution to the problem of "waiting
     * until all tasks are finished before shutting down the executor", and
     * it may not work properly in all situations. It needs rework.
     * </p>
     */
    public void block() {
        while (!futures.isEmpty()) {
            System.out.println(futures.size());
            try {
                futures.remove().get();
            } catch (InterruptedException | ExecutionException e) {
                e.printStackTrace();
            }
        }
    }
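    // Design note: block() drains a queue of futures instead of simply calling
    // shutdown()/awaitTermination(), because spiders keep offering new jobs
    // (and thus new futures) from pool threads while the main thread waits;
    // each get() may therefore be followed by more entries in the queue.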
    /**
     * <p>
     * Stops the crawler.
     * </p>
     *
     * <p>
     * The following method shuts down the {@code ExecutorService} of the
     * crawler in two phases; first by calling {@code shutdown()} to reject
     * incoming tasks, and then by calling {@code shutdownNow()}, if
     * necessary, to cancel any lingering tasks.
     * </p>
     */
    public void stop() {
        System.out.print("Stopping crawler: ");
        executor.shutdown(); // Disable new tasks from being submitted.
        try {
            // Wait a while for existing tasks to terminate.
            if (!executor.awaitTermination(60, TimeUnit.SECONDS)) {
                executor.shutdownNow(); // Cancel currently executing tasks.
                // Wait a while for tasks to respond to being cancelled.
                if (!executor.awaitTermination(60, TimeUnit.SECONDS)) {
                    System.err.println("Pool did not terminate");
                }
            }
            // Close the index writer.
            try {
                if (luceneIndexWriter != null) {
                    luceneIndexWriter.close();
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
            System.out.println("OK\n");
        } catch (InterruptedException ie) {
            ie.printStackTrace();
            // (Re-)Cancel if the current thread was also interrupted.
            executor.shutdownNow();
            // Preserve the interrupt status.
            Thread.currentThread().interrupt();
        }
    }

    // -- Getters/Setters

    /**
     * Gets the number of threads in the pool.
     *
     * @return
     *            The number of threads in the pool.
     */
    public int getThreadNumber() {
        return threadNumber;
    }

    /**
     * Gets the verbose switch.
     *
     * @return
     *            {@code true} if verbose mode is enabled; {@code false}
     *            otherwise.
     */
    public boolean isVerbose() {
        return verbose;
    }

    /**
     * Gets the redirect switch.
     *
     * @return
     *            {@code true} if redirect mode is enabled; {@code false}
     *            otherwise.
     */
    public boolean isRedirect() {
        return redirect;
    }

    /**
     * Gets the path to the log file.
     *
     * @return
     *            The path to the log file.
     */
    public String getLogFilePath() {
        return logFilePath;
    }

    /**
     * Gets the file-system path where the downloaded files will be stored.
     *
     * @return
     *            The file-system path where the downloaded files will be
     *            stored.
     */
    public String getStoragePath() {
        return storagePath;
    }

    /**
     * Gets the file-system path where the index will be stored.
     *
     * @return
     *            The file-system path where the index will be stored.
     */
    public String getIndexPath() {
        return indexPath;
    }

    /**
     * Gets the maximum number of files to be downloaded.
     *
     * @return
     *            The maximum number of files to be downloaded.
     */
    public int getMaximumFileNumber() {
        return maximumFileNumber;
    }

    /**
     * Gets the connection timeout period.
     *
     * @return
     *            The connection timeout period (in seconds).
     */
    public int getTimeout() {
        return timeout;
    }

    /**
     * Gets the number of levels to which crawling should take place.
     *
     * @return
     *            The number of levels the crawler is going to descend.
     */
    public int getDepth() {
        return depth;
    }

    /**
     * Gets the follow-image-links switch.
     *
     * @return
     *            {@code true} if image links are to be followed;
     *            {@code false} otherwise.
     */
    public boolean doFollowImgLinks() {
        return followImgLinks;
    }

    /**
     * Gets the agent name used to "introduce" the crawler to web servers.
     *
     * @return
     *            The agent name.
     */
    public String getAgent() {
        return agent;
    }

    /**
     * Gets the sitemap-assisted switch.
     *
     * @return
     *            {@code true} if the crawl is to be assisted by a sitemap;
     *            {@code false} otherwise.
     */
    public boolean isSitemapAssisted() {
        return sitemapAssisted;
    }

    /**
     * Gets the web crawler's executor service.
     *
     * @return
     *            The executor service.
     */
    public ExecutorService getExecutor() {
        return executor;
    }

    /**
     * Gets the web crawler's futures.
     *
     * @return
     *            The futures.
     */
    public Queue<Future<?>> getFutures() {
        return futures;
    }

    /**
     * Gets the web crawler's visited URLs.
     *
     * @return
     *            A map containing the web crawler's visited URLs.
     */
    public ConcurrentMap<String, URL> getVisited() {
        return visited;
    }
    /**
     * Gets the web crawler's writer used for full-text indexing.
     *
     * @return
     *            The web crawler's Lucene index writer.
     */
    public IndexWriter getLuceneIndexWriter() {
        return luceneIndexWriter;
    }
}
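The listing refers to two collaborators that are not part of this tutorial: the Spider runnable that does the actual fetching, and the WebCrawlerConfigurator singleton that backs the configuration properties (getInstance(), property(), propertyInteger(), propertyBoolean(), assign()). Below is a minimal, hypothetical sketch of the Spider shape implied by the call sites above (new Spider(id, url, level, crawler) and getUrl()); its field names and run() behaviour are assumptions for illustration, not the tutorial's actual implementation.

package crawler;

import java.net.URL;

/*
 * Hypothetical sketch only: Spider is referenced by WebCrawler but not shown
 * in this tutorial. The constructor shape and getUrl() are taken from the
 * call sites above; everything else is an assumption.
 */
public class Spider implements Runnable {

    private final int id;              // assumed: a numeric job identifier
    private final URL url;             // the page this spider fetches
    private final int level;           // assumed: current depth in the link graph
    private final WebCrawler crawler;  // used to offer newly found links back

    public Spider(int id, URL url, int level, WebCrawler crawler) {
        this.id = id;
        this.url = url;
        this.level = level;
        this.crawler = crawler;
    }

    public URL getUrl() {
        return url;
    }

    @Override
    public void run() {
        // Assumed behaviour: fetch the URL, store/index the content, extract
        // its links and, while level < crawler.getDepth(), offer each link
        // back so the crawler can deduplicate it against its visited map:
        // crawler.crawl(new Spider(id, link, level + 1, crawler));
    }
}

As for the configurator, any WebCrawlerConfigurator exposing the five methods named above (for example, backed by a java.util.Properties file) would satisfy the listing.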