package bdd.search.spider;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Hashtable;

import bdd.search.EnginePrefs;
import bdd.search.Monitor;
import bdd.util.FIFOQueue;
/** Written by Tim Macinta 1997 <br>
* Distributed under the GNU Public License
* (a copy of which is enclosed with the source). <br>
* <br>
* Calling the Crawler's start() method will cause the Crawler to
* index all of the sites in its queue and then replace the main
* index with the updated index when it completes. The Crawler's
* queue should be filled with the starting URLs before calling
* start().
*/
public class Crawler extends Thread {

    /** Directory used for the temporary files written while crawling. */
    File working_dir;

    /** Indexer that receives every successfully loaded URL. */
    Indexer indexer;

    /** Queue of URLs that still have to be fetched. */
    FIFOQueue q = new FIFOQueue();

    /**
     * Every URL that has ever been queued, keyed by itself; used to make
     * sure a URL is queued at most once.
     */
    Hashtable urls_done = new Hashtable(40);

    /** Preferences consulted for URL filtering, pausing and monitoring. */
    EnginePrefs eng_prefs;

    /** Value handed to the Indexer's stopWhenDone() once crawling ends. */
    boolean exit_when_done = false;

    /** "working_dir" should be a directory that only this
     *  Crawler and a given Indexer will be
     *  accessing.  This means that if several Crawlers are running
     *  simultaneously, they should all be given different "working_dir"
     *  directories.  Also, no other threads should write to this
     *  directory (except for the selected Indexer).
     *  <br>
     *  Note that the constructor creates the Indexer and starts it
     *  immediately, before the Crawler itself is started.
     */
    public Crawler(File working_dir, EnginePrefs eng_prefs) {
        this.eng_prefs = eng_prefs;
        this.working_dir = working_dir;
        indexer = new Indexer(working_dir, this, eng_prefs);
        indexer.start();
    }

    /** Takes "url_to_queue" and adds it to this Crawler's queue of URLs.
     *  This method should be used to add all of the desired starting URLs to
     *  the queue before the Crawler is started.  If the URL has already
     *  been processed or if it is an unallowed URL it is not added.
     *  A null URL is ignored.  When a URL is actually queued, the Monitor
     *  (if the preferences supply one) is told via indexing().
     */
    public void addURL(URL url_to_queue) {
        // run() passes on redirect targets; guard so a missing target
        // cannot kill the calling thread with a NullPointerException
        if (url_to_queue == null) return;
        if (!eng_prefs.URLAllowed(url_to_queue)) return;    // URL is not allowed
        if (eng_prefs.URLNotIndexable(url_to_queue)) return; // don't index non-text
        // NOTE(review): the checks above see the URL before it is simplified,
        // so "/../" segments are still present at that point - confirm that
        // EnginePrefs does not rely on path prefixes.
        url_to_queue = simplify(url_to_queue);               // remove loops/anchors
        // put() returns null only when the key was not present before
        if (urls_done.put(url_to_queue, url_to_queue) == null) {
            q.addElement(url_to_queue);
            Monitor m = eng_prefs.getMonitor();
            if (m != null) m.indexing(url_to_queue);
        }
    }

    /** Takes "url" and removes all references to "/./" and "/../" from its
     *  path.  This can be used to help eliminate looping.  Also removes all
     *  anchors (i.e., everything after and including a '#') and makes the
     *  default port of "http" URLs explicit (80), so that equivalent URLs
     *  compare equal in "urls_done".  A query string ("?...") is carried
     *  over unchanged.  If nothing needs changing, or the rebuilt URL is
     *  malformed, the original URL is returned.
     */
    URL simplify(URL url) {
        String file = url.getFile();

        // getFile() is the path plus any "?query"; only the path may be
        // rewritten, otherwise "/../" inside a query would eat the path
        String query = "";
        int query_start = file.indexOf('?');
        if (query_start >= 0) {
            query = file.substring(query_start);
            file = file.substring(0, query_start);
        }

        String path = collapseDotSegments(file);
        boolean changed = !path.equals(file);

        // an anchor is dropped simply by rebuilding the URL without it
        if (url.getRef() != null) changed = true;

        // set port if it's not set already
        int port = url.getPort();
        String proto = url.getProtocol().toLowerCase();
        if (port < 0 && proto.equals("http")) {
            changed = true;
            port = 80;
        }

        // create a new URL if anything changed
        if (changed) {
            try {
                url = new URL(proto, url.getHost(), port, path + query);
            } catch (MalformedURLException e) {
                // keep the original URL if the rebuilt one is rejected
                e.printStackTrace();
            }
        }
        return url;
    }

    /** Collapses every "/./" to "/" and every "/../" together with the
     *  directory preceding it.  A "/../" with no preceding directory is
     *  reduced to "/". */
    private static String collapseDotSegments(String path) {
        // collapse all occurrences of "/./"
        int i = path.indexOf("/./");
        while (i >= 0) {
            path = path.substring(0, i) + path.substring(i + 2);
            i = path.indexOf("/./");
        }
        // collapse all occurrences of "/../" (by removing preceding directory)
        i = path.indexOf("/../");
        while (i >= 0) {
            int prev = path.lastIndexOf('/', i - 1);
            if (prev < 0) prev = i;   // nothing to remove in front of it
            path = path.substring(0, prev) + path.substring(i + 3);
            i = path.indexOf("/../");
        }
        return path;
    }

    /** This is where the actual crawling occurs.  Returns immediately if
     *  the queue is empty when the thread starts.  Otherwise URLs are taken
     *  from the queue one at a time: loaded URLs are handed to the Indexer,
     *  moved URLs are re-queued under their actual URL, and anything else
     *  is reported to the Monitor as an error.  When both this queue and
     *  the Indexer's queue are empty, the Monitor is notified and the
     *  Indexer is told to stop when done.
     */
    public void run() {
        if (!q.hasMoreElements()) return;  // return if there's nothing to do
        int tmp_file = 0;  // used to generate unique temporary filenames
        URLStatus url_status;
        while (true) {
            url_status = new URLStatus((URL) q.nextElement(),
                                       new File(working_dir, tmp_file + ".tmp"),
                                       eng_prefs);
            tmp_file++;
            url_status.readContent();
            if (url_status.loaded()) {
                indexer.queueURL(url_status);
            } else if (url_status.moved()) {
                addURL(url_status.actual_url);
            } else {
                Monitor m = eng_prefs.getMonitor();
                if (m != null) m.reportError(url_status.actual_url);
            }
            if (q.hasMoreElements()) {
                eng_prefs.pauseBetweenURLs();
            } else {
                // Our queue is empty, but the Indexer still has work queued;
                // presumably it can feed newly found links back through
                // addURL() - verify in Indexer.  Wait until either queue
                // changes state.
                while (!q.hasMoreElements() && indexer.q.hasMoreElements()) {
                    eng_prefs.pauseBetweenURLs();
                }
                if (!q.hasMoreElements()) {
                    break;  // both queues drained: crawling is finished
                }
            }
        }
        Monitor m = eng_prefs.getMonitor();
        if (m != null) m.crawlerDone(this);
        indexer.stopWhenDone(exit_when_done);
    }

    /** This is the method that is called when this class is invoked from
     *  the command line.  Calling this method will cause a Crawler to be
     *  created and started with the starting URLs being listed in a file
     *  specified by the first argument (arg[0]).  The file listing the URLs
     *  should contain only the URLs with each URL on a line by itself.  Blank
     *  lines are allowed and lines beginning with "#" are considered comments
     *  and are ignored.  If no argument is given a usage message is printed
     *  and nothing else happens.
     */
    public static void main(String arg[]) {
        if (arg.length < 1) {
            System.err.println("Usage: java bdd.search.spider.Crawler <url-list-file>");
            return;
        }
        main(new File(arg[0]), new EnginePrefs(), true);
    }

    /** Same as main(file, prefs, false). */
    public static void main(File file, EnginePrefs prefs) {
        main(file, prefs, false);
    }

    /** Creates a Crawler using the working directory from "prefs", queues
     *  every URL listed in "file" (one per line; blank lines and lines
     *  starting with "#" are skipped; malformed URLs are reported and
     *  skipped) and then starts the Crawler.  "exit" is stored in
     *  "exit_when_done".  If the file cannot be read the stack trace is
     *  printed and the Crawler is not started.
     */
    public static void main(File file, EnginePrefs prefs, boolean exit) {
        Crawler cr = new Crawler(prefs.getWorkingDir(), prefs);
        try {
            BufferedReader in = new BufferedReader(new FileReader(file));
            try {
                String line = in.readLine();
                while (line != null) {
                    line = line.trim();
                    if (!line.equals("") && !line.startsWith("#")) {
                        try {
                            cr.addURL(new URL(line));
                        } catch (MalformedURLException e2) {
                            e2.printStackTrace();
                        }
                    }
                    line = in.readLine();
                }
            } finally {
                // always release the file handle; a failure to close must
                // not prevent an otherwise successful crawl from starting
                try {
                    in.close();
                } catch (IOException e3) {
                    e3.printStackTrace();
                }
            }
            cr.exit_when_done = exit;
            cr.start();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
|