Crawler.java :  » Web-Crawler » crawler » com » torunski » crawler » Java Open Source

Java Open Source » Web Crawler » crawler 
crawler » com » torunski » crawler » Crawler.java
/*
 * Copyright 2005 by Lars Torunski
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.torunski.crawler;

import java.util.Collection;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.torunski.crawler.core.AbstractCrawler;
import com.torunski.crawler.link.Link;
import com.torunski.crawler.parser.PageData;
import com.torunski.crawler.util.StopWatch;

/**
 * Project: Smart &amp; Simple Web Crawler
 * 
 * Crawls through the web with a single thread.
 * 
 * @author Lars Torunski
 * @version $Revision: 1.17 $
 */
public class Crawler extends AbstractCrawler {

    private static final Log log = LogFactory.getLog(Crawler.class);

    /** Times the complete run started by {@link #start(String, String)}. */
    private final StopWatch total = new StopWatch();

    /** Times the calls to <code>parser.load(...)</code>. */
    private final StopWatch loading = new StopWatch();

    /** Times the calls to <code>parser.parse(...)</code>. */
    private final StopWatch parsing = new StopWatch();

    /** Times the calls to <code>fireParserEvent(...)</code>. */
    private final StopWatch listener = new StopWatch();

    /**
     * Constructor for Crawler.
     */
    public Crawler() {
    }

    /**
     * Starts the crawling process in a single thread, beginning at the URI
     * formed by concatenating <code>server</code> and <code>start</code>.
     *
     * If no parser or no model has been set, a
     * <code>SimpleHttpClientParser</code> respectively a
     * <code>MaxIterationsModel</code> is installed first. All stop watches
     * are reset, the start URI is added to the model, {@link #start()} is
     * run, and finally some statistics are logged.
     *
     * @param server the first part of the start URI
     *               (presumably scheme and host - confirm against callers)
     * @param start  the second part of the start URI, appended to
     *               <code>server</code> without any separator
     */
    public void start(String server, String start) {

        // set the default parser
        if (parser == null) {
            log.debug("No parser set, defautling to SimpleHttpClientParser.");
            parser = new com.torunski.crawler.parser.httpclient.SimpleHttpClientParser();
        }

        // set default crawler model
        if (model == null) {
            log.debug("No model set, defautling to MaxIterationsModel.");
            model = new com.torunski.crawler.model.MaxIterationsModel();
        }

        // initialize stop watches
        total.reset();
        loading.reset();
        parsing.reset();
        listener.reset();

        total.start();
        try {
            // add at least one link to the list
            model.add(null, server + start);

            // starts the crawling process
            start();
        } finally {
            // stop the watch even if crawling failed, so that it is not
            // left running for a later run
            total.stop();
        }

        // output some statistics
        if (log.isInfoEnabled()) {
            log.info("Visited URIs: " + model.getVisitedURIs().size());
        }

        // The warning has its own level check: it must not be suppressed
        // just because INFO output is disabled.
        if (log.isWarnEnabled()) {
            Collection toVisitURIs = model.getToVisitURIs();
            if (!toVisitURIs.isEmpty()) {
                log.warn("still URIs to be visited, at least: " + toVisitURIs.size());
            }
        }

        // output stop watch data
        if (log.isInfoEnabled()) {
            log.info("Total time: " + total.getTime() + " ms");
            log.info("- loading:  " + loading.getTime() + " ms");
            log.info("- parsing:  " + parsing.getTime() + " ms");
            log.info("- listener: " + listener.getTime() + " ms");
        }
    }

    /**
     * Starts the crawling process in a single thread.
     *
     * Links are popped from the model until it is empty. Each link is
     * loaded by the parser; if the page status is <code>PageData.OK</code>
     * the page is parsed for new URIs, the parser event is fired, the URIs
     * the model already reports as visited are removed and the remaining
     * ones are added to the model.
     *
     * This method does not reset the stop watches and does not install a
     * default parser or model; {@link #start(String, String)} does both.
     *
     * @see com.torunski.crawler.core.ICrawler#start()
     */
    public void start() {
        // loop until there aren't any URIs anymore
        while (!model.isEmpty()) {

            // remove a link from the stack
            Link link = model.pop();

            // load the page
            PageData pageData;
            loading.start();
            try {
                pageData = parser.load(link);
            } finally {
                loading.stop();
            }

            // pages which were not loaded successfully contribute no links
            if (pageData.getStatus() != PageData.OK) {
                continue;
            }

            // get the links in the page
            Collection newURIs;
            parsing.start();
            try {
                newURIs = parser.parse(pageData, linkFilter);
            } finally {
                parsing.stop();
            }

            // the event is fired with the complete list of parsed URIs,
            // i.e. before the already visited ones are removed
            listener.start();
            try {
                fireParserEvent(link, pageData, newURIs);
            } finally {
                listener.stop();
            }

            // remove already visited URIs from the new URI list
            newURIs.removeAll(model.getVisitedURIs());

            // the rest of the URIs can be visited
            model.add(link, newURIs);
        }
    }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.