// EnginePrefs.java :  » Search-Engine » BDDBot » bdd » search » Java Open Source
//
// Java Open Source » Search Engine » BDDBot
// BDDBot » bdd » search » EnginePrefs.java
package bdd.search;

import java.net.URL;
import java.net.MalformedURLException;
import java.io.File;
import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;
import java.io.IOException;
import java.io.DataInputStream;
import java.io.FileInputStream;

/** Written by Tim Macinta 1997                           <br>
 *  Distributed under the GNU Public License
 *       (a copy of which is enclosed with the source).   <br>
 *                                                        <br> 
 *  Encapsulates the preferences for the crawler and the search
 *  engine.
 */

public class EnginePrefs {
  /** The time to pause between URL fetches (in seconds).  */
  public int pause_time = 5;

  File main_dir = new File("searchdb");    // directory containing main index
  //                                       // and custom html files
  File main_index = new File(main_dir, "main.db"); // main index
  File rules = new File(main_dir, "rules.txt");    // inclusion/exclusion rules
  File header = new File(main_dir, "header.html"); // header file
  File footer = new File(main_dir, "footer.html"); // footer file
  File notfound = new File(main_dir, "notfound.html");  // query not found file
  File url_list = new File(main_dir, "urls.txt");  // list of starting URLs
  File working_dir = new File("searchtmp");  // temporary working directory

  Vector excluded = new Vector(3, 10);   // excluded URLs
  Vector included = new Vector(3, 10);   // included URLs
  Hashtable hosts = new Hashtable(3, 6); // hosts where we've read robots.txt
  String user_agent = "BDDBot";          // name used when retrieving URLs
  String email_address = "nobody@nowhere.edu"; // administrator's email address
  boolean filter_cgi = true;             // filter out cgi urls?

  Monitor monitor = null;                // query and url monitor
  public static int port = 8001;         // default web server port

  // File name suffixes (compared without regard to case) of documents
  // that are never indexed.
  private static final String[] NOT_INDEXABLE_SUFFIXES = {
    ".gif", ".tif", ".map", ".jpg", ".ppt", ".doc", ".pdf", ".xls", ".rtf"
  };

  /** Creates the main and working directories if they do not exist yet and
   *  loads the inclusion/exclusion rules.  A failure to read the rules file
   *  is reported on standard error and is not fatal.
   */
  public EnginePrefs() {
    if (!main_dir.exists()) main_dir.mkdir();
    if (!working_dir.exists()) working_dir.mkdir();
    try {
      readRulesFile();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** Returns true if "url" is allowed to be indexed and false otherwise.
   *  <p>
   *  The checks are made in this order: the cgi filter (if enabled), the
   *  exclusion rules, the "file" protocol default (always allowed unless
   *  excluded) and finally the inclusion rules.  The first time an included
   *  "http" URL is seen for a given host and port, the "robots.txt" file of
   *  that host is fetched and the exclusion rules are checked once more.
   */
  public boolean URLAllowed(URL url) {
    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = normalizePort(protocol, url.getPort());
    String file = url.getFile();

    // filter out cgi scripts

    if (filter_cgi) {
      if (file.indexOf('?') > -1) return false;
      if (file.startsWith("/cgi-bin/")) return false;
    }

    // check exclusion rules

    if (matchesAny(excluded, protocol, host, port, file)) return false;

    // include all files that aren't excluded

    if (protocol.equals("file")) return true;

    // check inclusion rules

    if (!matchesAny(included, protocol, host, port, file)) return false;

    // the first time we see a host, read its robots.txt and then re-check
    // the exclusion rules since robots.txt may have added to them

    if (protocol.equals("http")) {
      String key = host + ":" + port;
      if (hosts.get(key) == null) {
        readRobotsDotText(host, port);
        hosts.put(key, Boolean.TRUE);
        return !matchesAny(excluded, protocol, host, port, file);
      }
    }
    return true;
  }

  /** Returns the port to use when comparing URLs: an unspecified port
   *  (negative) on an "http" URL is treated as port 80, anything else is
   *  returned unchanged.
   */
  private static int normalizePort(String protocol, int rawPort) {
    if (rawPort < 0 && protocol.equals("http")) return 80;
    return rawPort;
  }

  /** Returns true if any URL in "rule_list" has the given protocol, host
   *  and (normalized) port and a file part that is a prefix of "file".
   */
  private static boolean matchesAny(Vector rule_list, String protocol,
                                    String host, int port, String file) {
    Enumeration en = rule_list.elements();
    while (en.hasMoreElements()) {
      URL rule = (URL) en.nextElement();
      if (protocol.equals(rule.getProtocol()) &&
          host.equals(rule.getHost()) &&
          port == normalizePort(protocol, rule.getPort()) &&
          file.startsWith(rule.getFile())) return true;
    }
    return false;
  }

  /** Pauses for the amount of time that has been specified for pausing
   *  between URL fetches.  If the sleep is interrupted the method goes back
   *  to sleep for the time that remains, so the full pause is always
   *  observed.
   */
  public void pauseBetweenURLs() {
    long total = pause_time * 1000L;
    long start = System.currentTimeMillis();
    long remaining = total;
    while (remaining > 0) {
      try {
        Thread.sleep(remaining);
      } catch (InterruptedException ignored) {
        // deliberately ignored: the pause protects remote servers, so we
        // resume waiting for whatever time is left
      }
      remaining = total - (System.currentTimeMillis() - start);
    }
  }

  /** Returns the main index file. */
  public File getMainIndex() {
    return main_index;
  }

  /** Returns the directory containing the main index and the custom
   *  html files.
   */
  public File getMainDir() {
    return main_dir;
  }

  /** Returns the working directory for use by a crawler.  If more than
   *  one crawler is running at the same time they should be given different
   *  working directories.
   */
  public File getWorkingDir() {
    return working_dir;
  }

  /** Returns the header file. */
  public File getHeaderFile() {
    return header;
  }

  /** Returns the footer file. */
  public File getFooterFile() {
    return footer;
  }

  /** Returns the file used when a query is not found. */
  public File getNotFoundFile() {
    return notfound;
  }

  /** Returns the file containing the list of starting URLs. */
  public File getStartingFile() {
    return url_list;
  }

  /** The rules file contains rules which determine what URLs are allowed
   *  and what URLs should be excluded.  A line that is in the form:
   *  <pre>
   *  include http://gsd.mit.edu/
   *  </pre>
   *  will cause all URLs that start with "http://gsd.mit.edu/" to be
   *  included.  Similarly, to exclude URLs, use the keyword "exclude"
   *  instead of "include".  Blank lines and lines starting with "#" are
   *  ignored.
   *  <p>
   *  When an URL is checked against the inclusion/exclusion rules the
   *  exclusion rules are checked first and if the URL matches an
   *  exclusion rule it is not included.  If an URL is not covered by
   *  either rule it is not included, unless it is a "file://" URL in
   *  which case it is included by default.
   */
  public File getRulesFile() {
    return rules;
  }

  /** Causes the inclusion/exclusion rules to be read.  This method should
   *  be called if the rules file is changed.  The current rules and the
   *  record of hosts whose "robots.txt" has been read are discarded before
   *  the file is opened.  Lines holding a malformed URL are reported on
   *  standard error and skipped.
   *
   *  @throws IOException if the rules file cannot be opened or read
   */
  public void readRulesFile() throws IOException {
    excluded.removeAllElements();
    included.removeAllElements();
    hosts.clear();
    DataInputStream in = new DataInputStream(new FileInputStream(rules));
    try {
      String line = in.readLine();
      while (line != null) {
        line = line.trim();
        try {
          if (line.startsWith("include ")) {
            included.addElement(new URL(line.substring(8)));
          } else if (line.startsWith("exclude ")) {
            excluded.addElement(new URL(line.substring(8)));
          }
        } catch (MalformedURLException e) {
          e.printStackTrace();
        }
        line = in.readLine();
      }
    } finally {
      // close the file even when reading fails part way through
      in.close();
    }
  }

  /** Reads the "robots.txt" file on the given host and uses the results
   *  to determine what files on "host" are crawlable.  Every applicable
   *  "Disallow" path is added to the exclusion rules.
   *  <p>
   *  A "User-agent" line applies to us when our user agent name starts
   *  with the name given on the line (ignoring case and one trailing "*"),
   *  so "*" applies to every crawler.  Text following a "#" is treated as a
   *  comment.  A "Disallow" line without a path disallows nothing and is
   *  skipped.  If the file cannot be retrieved or read, whatever was parsed
   *  up to that point is kept and the problem is otherwise ignored.
   *
   *  @param host the host to read "robots.txt" from
   *  @param port the port on "host"; a negative value means port 80
   */
  public void readRobotsDotText(String host, int port) {
    if (port < 0) port = 80;
    DataInputStream in = null;
    try {
      URL url = new URL("http", host, port, "/robots.txt");
      in = new DataInputStream(url.openStream());
      boolean relevant = false;
      String line = in.readLine();
      while (line != null) {
        line = stripComment(line).trim();
        if (startsWithIgnoreCase(line, "user-agent:")) {

          // determine if the following directives apply to us

          String agent = firstToken(line.substring(11));
          if (agent.length() == 0) {
            relevant = false;   // no agent named, so it can't be us
          } else {
            if (agent.endsWith("*")) {
              agent = agent.substring(0, agent.length() - 1);
            }
            relevant = startsWithIgnoreCase(user_agent, agent);
          }
        } else if (relevant && startsWithIgnoreCase(line, "disallow:")) {

          // assimilate directive; an empty path means "nothing is
          // disallowed" and must not become an exclude-everything rule

          String path = firstToken(line.substring(9));
          if (path.length() > 0) {
            excluded.addElement(new URL("http", host, port, path));
          }
        }
        line = in.readLine();
      }
    } catch (IOException e) {
      // best effort: a missing or unreadable robots.txt restricts nothing
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // nothing useful can be done if closing fails
        }
      }
    }
  }

  /** Returns "line" without its "#" comment, if it has one. */
  private static String stripComment(String line) {
    int hash = line.indexOf('#');
    return (hash < 0) ? line : line.substring(0, hash);
  }

  /** Returns the first whitespace delimited token of "s", or the empty
   *  string if "s" contains nothing but whitespace.
   */
  private static String firstToken(String s) {
    s = s.trim();
    int end = 0;
    while (end < s.length() && !Character.isWhitespace(s.charAt(end))) end++;
    return s.substring(0, end);
  }

  /** Returns true if "s" starts with "prefix", ignoring case.  The
   *  comparison does not depend on the default locale.
   */
  private static boolean startsWithIgnoreCase(String s, String prefix) {
    return s.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  /** Returns true if "s" ends with "suffix", ignoring case.  The
   *  comparison does not depend on the default locale.
   */
  private static boolean endsWithIgnoreCase(String s, String suffix) {
    // regionMatches returns false when the offset is negative, that is
    // when "s" is shorter than "suffix"
    return s.regionMatches(true, s.length() - suffix.length(),
                           suffix, 0, suffix.length());
  }

  /** Returns the name used when retrieving URLs. */
  public String getUserAgent() {
    return user_agent;
  }

  /** Returns the administrator's email address. */
  public String getEmailAddress() {
    return email_address;
  }

  /** Returns the query and url monitor, which may be null. */
  public Monitor getMonitor() {
    return monitor;
  }

  /** Returns true if this URL represents a file type that is not indexable.
   *  The decision is based only on the ending of the file part of the URL,
   *  ignoring case.
   */
  public boolean URLNotIndexable(URL url) {
    String f = url.getFile();
    for (int i = 0; i < NOT_INDEXABLE_SUFFIXES.length; i++) {
      if (endsWithIgnoreCase(f, NOT_INDEXABLE_SUFFIXES[i])) return true;
    }
    return false;
  }

}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.