package bdd.search;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.File;
import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;
import java.io.IOException;
import java.io.DataInputStream;
import java.io.FileInputStream;
/** Written by Tim Macinta 1997 <br>
* Distributed under the GNU Public License
* (a copy of which is enclosed with the source). <br>
* <br>
* Encapsulates the preferences for the crawler and the search
* engine.
*/
public class EnginePrefs {

    /** The time to pause between URL fetches (in seconds). */
    public int pause_time = 5;

    File main_dir = new File("searchdb");                  // directory containing main index
                                                           // and custom html files
    File main_index = new File(main_dir, "main.db");       // main index
    File rules = new File(main_dir, "rules.txt");          // inclusion/exclusion rules
    File header = new File(main_dir, "header.html");       // header file
    File footer = new File(main_dir, "footer.html");       // footer file
    File notfound = new File(main_dir, "notfound.html");   // query not found file
    File url_list = new File(main_dir, "urls.txt");        // list of starting URLs
    File working_dir = new File("searchtmp");              // temporary working directory
    Vector excluded = new Vector(3, 10);                   // excluded URLs
    Vector included = new Vector(3, 10);                   // included URLs
    Hashtable hosts = new Hashtable(3, 6);                 // hosts where we've read robots.txt
    String user_agent = "BDDBot";                          // name used when retrieving URLs
    String email_address = "nobody@nowhere.edu";           // administrator's email address
    boolean filter_cgi = true;                             // filter out cgi urls?
    Monitor monitor = null;                                // query and url monitor
    public static int port = 8001;                         // default web server port

    /**
     * Creates the main and working directories if they do not exist yet and
     * loads the inclusion/exclusion rules.  A failure to read the rules file
     * is reported on standard error and otherwise ignored, which leaves the
     * rule lists empty.
     */
    public EnginePrefs() {
        if (!main_dir.exists()) main_dir.mkdir();
        if (!working_dir.exists()) working_dir.mkdir();
        try {
            readRulesFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns true if "url" is allowed to be indexed and false otherwise.
     * <p>
     * The checks are applied in this order: the CGI filter (if enabled), the
     * exclusion rules, the "file:" default (always allowed unless excluded)
     * and finally the inclusion rules.  The first time an included "http"
     * host/port is seen its robots.txt is fetched, which may add exclusion
     * rules, and the exclusion rules are then applied again.
     */
    public boolean URLAllowed(URL url) {
        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port < 0 && protocol.equals("http")) port = 80;
        String file = url.getFile();

        // filter out cgi scripts
        if (filter_cgi) {
            if (file.indexOf('?') > -1) return false;
            if (file.startsWith("/cgi-bin/")) return false;
        }

        // check exclusion rules
        if (matchesAnyRule(excluded, protocol, host, port, file)) return false;

        // include all files that aren't excluded
        if (protocol.equals("file")) return true;

        // check inclusion rules
        if (!matchesAnyRule(included, protocol, host, port, file)) return false;

        String hostKey = host + ":" + port;
        if (protocol.equals("http") && hosts.get(hostKey) == null) {
            // First visit to this host: robots.txt may add exclusions, so the
            // exclusion rules have to be consulted once more.
            readRobotsDotText(host, port);
            hosts.put(hostKey, Boolean.TRUE);
            return !matchesAnyRule(excluded, protocol, host, port, file);
        }
        return true;
    }

    /**
     * Returns true if any rule URL in "ruleList" has the same protocol, host
     * and port as the given URL parts and a file that is a prefix of "file".
     * A rule without an explicit port (-1) matches port 80.
     */
    private static boolean matchesAnyRule(Vector ruleList, String protocol,
                                          String host, int port, String file) {
        Enumeration en = ruleList.elements();
        while (en.hasMoreElements()) {
            URL rule = (URL) en.nextElement();
            int rulePort = rule.getPort();
            if (protocol.equals(rule.getProtocol()) &&
                host.equals(rule.getHost()) &&
                (port == rulePort || (port == 80 && rulePort == -1)) &&
                file.startsWith(rule.getFile())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Pauses for the amount of time that has been specified for pausing
     * between URL fetches.  If the thread is interrupted while sleeping, the
     * remainder of the pause is still honoured and the thread's interrupt
     * flag is restored before returning.
     */
    public void pauseBetweenURLs() {
        long total = pause_time * 1000L;
        long start = System.currentTimeMillis();
        long remaining = total;
        boolean interrupted = false;
        while (remaining > 0) {
            try {
                Thread.sleep(remaining);
            } catch (InterruptedException e) {
                // remember the interrupt; sleeping resumes for the time left
                interrupted = true;
            }
            remaining = total - (System.currentTimeMillis() - start);
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /** Returns the main index file. */
    public File getMainIndex() {
        return main_index;
    }

    /** Returns the directory containing the main index and custom html files. */
    public File getMainDir() {
        return main_dir;
    }

    /** Returns the working directory for use by a crawler.  If more than
     *  one crawler is running at the same time they should be given different
     *  working directories.
     */
    public File getWorkingDir() {
        return working_dir;
    }

    /** Returns the header file. */
    public File getHeaderFile() {
        return header;
    }

    /** Returns the footer file. */
    public File getFooterFile() {
        return footer;
    }

    /** Returns the "query not found" file. */
    public File getNotFoundFile() {
        return notfound;
    }

    /** Returns the file containing the list of starting URLs. */
    public File getStartingFile() {
        return url_list;
    }

    /** The rules file contains rules which determine what URLs are allowed
     *  and what URLs should be excluded.  A line that is in the form:
     *  <pre>
     *  include http://gsd.mit.edu/
     *  </pre>
     *  will cause all URLs that start with "http://gsd.mit.edu/" to be
     *  included.  Similarly, to exclude URLs, use the keyword "exclude"
     *  instead of "include".  Blank lines and lines starting with "#" are
     *  ignored.
     *  <p>
     *  When an URL is checked against the inclusion/exclusion rules the
     *  exclusion rules are checked first and if the URL matches an
     *  exclusion rule it is not included.  If an URL is not covered by
     *  either rule it is not included, unless it is a "file://" URL in
     *  which case it is included by default.
     */
    public File getRulesFile() {
        return rules;
    }

    /** Causes the inclusion/exclusion rules to be read.  This method should
     *  be called if the rules file is changed.  The current rules and the
     *  record of hosts whose robots.txt has been read are discarded first.
     *  Lines holding a malformed URL are reported on standard error and
     *  skipped.
     *
     *  @throws IOException if the rules file cannot be opened or read
     */
    public void readRulesFile() throws IOException {
        excluded.removeAllElements();
        included.removeAllElements();
        hosts.clear();
        DataInputStream in = new DataInputStream(new FileInputStream(rules));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = line.trim();
                try {
                    if (line.startsWith("include ")) {
                        included.addElement(new URL(line.substring(8).trim()));
                    } else if (line.startsWith("exclude ")) {
                        excluded.addElement(new URL(line.substring(8).trim()));
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // release the file handle even when reading fails part-way
            in.close();
        }
    }

    /** Reads the "robots.txt" file on the given host and uses the results
     *  to determine what files on "host" are crawlable.  Every "Disallow"
     *  path in a group that applies to our user agent is added to the
     *  exclusion rules.  Comments introduced by "#" are ignored, directives
     *  with an empty value are skipped, and several consecutive "User-agent"
     *  lines share the rules that follow them.
     *  <p>
     *  This is best effort: if the file cannot be fetched or read, whatever
     *  was parsed up to that point is kept and no error is reported.
     *
     *  @param host host name to fetch robots.txt from
     *  @param port port to connect to; a negative value means port 80
     */
    public void readRobotsDotText(String host, int port) {
        if (port < 0) port = 80;
        DataInputStream in = null;
        try {
            URL url = new URL("http", host, port, "/robots.txt");
            in = new DataInputStream(url.openStream());
            boolean relevant = false;       // do the current group's rules apply to us?
            boolean lastWasAgent = false;   // was the previous directive a User-agent line?
            String line = in.readLine();
            while (line != null) {
                int hash = line.indexOf('#');
                if (hash >= 0) line = line.substring(0, hash);   // strip comment
                line = line.trim();
                String lower = line.toLowerCase();
                if (lower.startsWith("user-agent:")) {
                    // determine if the following directives apply to us;
                    // stacked User-agent lines form a single group
                    boolean match = agentMatches(firstToken(line, 11));
                    relevant = lastWasAgent ? (relevant || match) : match;
                    lastWasAgent = true;
                } else if (line.length() > 0) {
                    lastWasAgent = false;
                    if (relevant && lower.startsWith("disallow:")) {
                        // assimilate directive; an empty Disallow excludes nothing
                        String path = firstToken(line, 9);
                        if (path.length() > 0) {
                            excluded.addElement(new URL("http", host, port, path));
                        }
                    }
                }
                line = in.readLine();
            }
        } catch (IOException e) {
            // deliberately ignored: a missing or unreadable robots.txt
            // simply adds no further exclusions
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /**
     * Returns the first whitespace-delimited token of "text" at or after
     * index "from", or the empty string if only whitespace remains.
     */
    private static String firstToken(String text, int from) {
        int len = text.length();
        int start = from;
        while (start < len && Character.isWhitespace(text.charAt(start))) start++;
        int end = start;
        while (end < len && !Character.isWhitespace(text.charAt(end))) end++;
        return text.substring(start, end);
    }

    /**
     * Returns true if a robots.txt User-agent value applies to us: our
     * lower-cased user agent must start with the lower-cased value once one
     * trailing "*" has been removed (so a bare "*" matches everyone).  An
     * empty value never matches.
     */
    private boolean agentMatches(String agent) {
        if (agent.length() == 0) return false;
        String pattern = agent.toLowerCase();
        if (pattern.endsWith("*")) {
            pattern = pattern.substring(0, pattern.length() - 1);
        }
        return user_agent.toLowerCase().startsWith(pattern);
    }

    /** Returns the name used when retrieving URLs. */
    public String getUserAgent() {
        return user_agent;
    }

    /** Returns the administrator's email address. */
    public String getEmailAddress() {
        return email_address;
    }

    /** Returns the query and url monitor, which is null unless one has been set. */
    public Monitor getMonitor() {
        return monitor;
    }

    /** Returns true if this URL represents a file type that is not indexable.
     *  The decision is based on the (case-insensitive) ending of the URL's
     *  file part.
     */
    public boolean URLNotIndexable(URL url) {
        String f = url.getFile().toLowerCase();
        return (f.endsWith(".gif") || f.endsWith(".tif") ||
                f.endsWith(".map") || f.endsWith(".jpg") ||
                f.endsWith(".ppt") || f.endsWith(".doc") ||
                f.endsWith(".pdf") || f.endsWith(".xls") ||
                f.endsWith(".rtf"));
    }
}
|