/*
* Copyright 2005 by Lars Torunski
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.torunski.crawler;
import java.util.Collection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.torunski.crawler.core.AbstractCrawler;
import com.torunski.crawler.link.Link;
import com.torunski.crawler.parser.PageData;
import com.torunski.crawler.util.StopWatch;
/**
* Project: Smart & Simple Web Crawler
*
* Crawls through the web with a single thread.
*
* @author Lars Torunski
* @version $Revision: 1.17 $
*/
public class Crawler extends AbstractCrawler {

    /** Class-wide logger. */
    private static final Log log = LogFactory.getLog(Crawler.class);

    /** Measures the complete run triggered by {@link #start(String, String)}. */
    private final StopWatch total = new StopWatch();

    /** Measures the time spent inside <code>parser.load(...)</code>. */
    private final StopWatch loading = new StopWatch();

    /** Measures the time spent inside <code>parser.parse(...)</code>. */
    private final StopWatch parsing = new StopWatch();

    /** Measures the time spent notifying listeners via <code>fireParserEvent(...)</code>. */
    private final StopWatch listener = new StopWatch();

    /**
     * Constructor for Crawler.
     */
    public Crawler() {
    }

    /**
     * Starts the crawling process in a single thread.
     *
     * <p>If no parser or model has been configured, a
     * <code>SimpleHttpClientParser</code> respectively a
     * <code>MaxIterationsModel</code> is installed first. All stop watches are
     * reset, the concatenation <code>server + start</code> is added to the
     * model as the first link (with <code>null</code> as its originating
     * link), and {@link #start()} is run. After a successful run some
     * statistics are written to the log.</p>
     *
     * @param server the first part of the initial URI; it is concatenated
     *               with <code>start</code> without any separator
     * @param start  the second part of the initial URI
     */
    public void start(String server, String start) {
        // set the default parser
        if (parser == null) {
            log.debug("No parser set, defautling to SimpleHttpClientParser.");
            parser = new com.torunski.crawler.parser.httpclient.SimpleHttpClientParser();
        }

        // set default crawler model
        if (model == null) {
            log.debug("No model set, defautling to MaxIterationsModel.");
            model = new com.torunski.crawler.model.MaxIterationsModel();
        }

        // initialize stop watches
        total.reset();
        loading.reset();
        parsing.reset();
        listener.reset();

        total.start();
        try {
            // add at least one link to the list
            model.add(null, server + start);

            // starts the crawling process
            start();
        } finally {
            // stop the watch even if the crawl aborts with an exception,
            // so it is never left running
            total.stop();
        }

        logStatistics();
    }

    /**
     * Starts the crawling process in a single thread.
     *
     * <p>Links are popped from the model until the model reports that it is
     * empty. Each link is loaded by the parser; when the returned page has the
     * status <code>PageData.OK</code> it is parsed for new URIs, the listeners
     * are notified, URIs that the model already lists as visited are removed,
     * and the remainder is added to the model. Pages with any other status
     * are skipped.</p>
     *
     * @see com.torunski.crawler.core.ICrawler#start()
     */
    public void start() {
        // loop until there aren't any URIs anymore
        while (!model.isEmpty()) {
            // remove a link from the stack
            Link link = model.pop();

            // load the page
            PageData pageData;
            loading.start();
            try {
                pageData = parser.load(link);
            } finally {
                loading.stop();
            }

            if (pageData.getStatus() != PageData.OK) {
                // nothing to parse for this link
                continue;
            }

            // get the links in the page
            Collection newURIs;
            parsing.start();
            try {
                newURIs = parser.parse(pageData, linkFilter);
            } finally {
                parsing.stop();
            }

            // listeners see the complete list, before visited URIs are removed
            listener.start();
            try {
                fireParserEvent(link, pageData, newURIs);
            } finally {
                listener.stop();
            }

            // remove already visited URIs from the new URI list
            newURIs.removeAll(model.getVisitedURIs());

            // the rest of the URIs can be visited
            model.add(link, newURIs);
        }
    }

    /**
     * Writes the crawl statistics to the log.
     *
     * <p>The warning about URIs that are still waiting to be visited is
     * guarded by the WARN level only, so it is not lost when INFO logging is
     * switched off.</p>
     */
    private void logStatistics() {
        final boolean infoEnabled = log.isInfoEnabled();

        if (infoEnabled) {
            Collection visitedURIs = model.getVisitedURIs();
            log.info("Visited URIs: " + visitedURIs.size());
        }

        if (log.isWarnEnabled()) {
            Collection toVisitURIs = model.getToVisitURIs();
            if (toVisitURIs.size() > 0) {
                log.warn("still URIs to be visited, at least: " + toVisitURIs.size());
            }
        }

        // output stop watch data
        if (infoEnabled) {
            log.info("Total time: " + total.getTime() + " ms");
            log.info("- loading: " + loading.getTime() + " ms");
            log.info("- parsing: " + parsing.getTime() + " ms");
            log.info("- listener: " + listener.getTime() + " ms");
        }
    }
}
|