/*
* Project: ExampleVakanzenMaxDepthModel
*
* $Id: ExampleVakanzenMaxDepthModel.java,v 1.2 2006/08/26 10:33:12 ltorunski Exp $
*/
package com.torunski.crawler.examples;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import com.torunski.crawler.Crawler;
import com.torunski.crawler.filter.ServerFilter;
import com.torunski.crawler.model.MaxDepthModel;
/**
* Example for the vakanzen site and a max depth model.
*
* Description: Uses the "MaxDepthModel" with a maximum depth of 2 and follows only links on the same server.
* Result: Visits around 50 pages and stops when the depth of 2 is reached.
*
* @author Lars Torunski
* @version $Id: ExampleVakanzenMaxDepthModel.java,v 1.2 2006/08/26 10:33:12 ltorunski Exp $
*/
public class ExampleVakanzenMaxDepthModel {

    /** Server base URL; used both as the crawl root and as the link filter argument. */
    private static final String SERVER = "http://www.oppenheim-karriere.de";

    /** Path on {@link #SERVER} at which the crawl starts. */
    private static final String START = "/Vakanzen_main";

    /**
     * Runs the example: configures a crawler with a {@link MaxDepthModel} of depth 2 and a
     * {@link ServerFilter} for {@link #SERVER}, starts it at {@link #START}, and then prints
     * the visited links followed by the links that are still to be visited.
     *
     * @param args command line arguments (not used)
     */
    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.setModel(new MaxDepthModel(2));
        crawler.setLinkFilter(new ServerFilter(SERVER));
        crawler.start(SERVER, START);

        // show visited links
        printSortedLinks("Links visited=", crawler.getModel().getVisitedURIs());

        // show not visited links
        printSortedLinks("Links NOT visited=", crawler.getModel().getToVisitURIs());
    }

    /**
     * Prints a header line made of the label and the number of links, then one line per
     * link in sorted order.
     *
     * @param label the text printed directly in front of the link count
     * @param links the links collection to sort and print
     */
    private static void printSortedLinks(String label, Collection links) {
        Collection sorted = sortMaxDepthLinks(links);
        System.out.println(label + sorted.size());
        for (Iterator it = sorted.iterator(); it.hasNext();) {
            System.out.println(it.next());
        }
    }

    /**
     * Builds a sorted list of the string representations of the given links.
     * The passed collection itself is not modified; each element is converted with
     * {@code toString()} and the resulting strings are sorted in their natural order.
     *
     * @param links the links collection
     * @return a new sorted collection holding the string form of every link
     */
    private static Collection sortMaxDepthLinks(Collection links) {
        List asStrings = new ArrayList(links.size());
        for (Iterator it = links.iterator(); it.hasNext();) {
            Object link = it.next();
            asStrings.add(link.toString());
        }
        Collections.sort(asStrings);
        return asStrings;
    }
}
|