Crawler.java :  » Search-Engine » BDDBot » bdd » search » spider » Java Open Source

Java Open Source » Search Engine » BDDBot 
BDDBot » bdd » search » spider » Crawler.java
package bdd.search.spider;

import java.net.URL;
import java.net.MalformedURLException;
import java.io.File;
import java.io.FileInputStream;
import java.io.DataInputStream;
import java.util.Hashtable;
import bdd.search.EnginePrefs;
import bdd.search.Monitor;
import bdd.util.FIFOQueue;

/** Written by Tim Macinta 1997                           <br>
 *  Distributed under the GNU Public License
 *       (a copy of which is enclosed with the source).   <br>
 *                                                        <br> 
 *  Calling the Crawler's start() method will cause the Crawler to
 *  index all of the sites in its queue and then replace the main
 *  index with the updated index when it completes.  The Crawler's
 *  queue should be filled with the starting URLs before calling
 *  start().
 */
public class Crawler extends Thread {

  File working_dir;                            // directory for temp files
  Indexer indexer;                             // handles post-crawl indexing
  FIFOQueue q = new FIFOQueue();               // URLs waiting to be fetched
  Hashtable urls_done = new Hashtable(40);     // every URL queued so far, so
  //                                              that none is queued twice
  EnginePrefs eng_prefs;                       // preferences
  boolean exit_when_done = false;              // handed to the Indexer's
  //                                              stopWhenDone() after crawling

  /** Creates a Crawler and immediately starts the Indexer it feeds.
   *  "working_dir" should be a directory that only this
   *  Crawler and a given Indexer will be
   *  accessing.  This means that if several Crawlers are running
   *  simultaneously, they should all be given different "working_dir"
   *  directories.  Also, no other threads should write to this
   *  directory (except for the selected Indexer).
   *
   *  @param working_dir directory in which temporary files are created
   *  @param eng_prefs   preferences consulted for URL filtering, pacing
   *                     and monitoring
   */
  public Crawler(File working_dir, EnginePrefs eng_prefs) {
    this.eng_prefs = eng_prefs;
    this.working_dir = working_dir;
    indexer = new Indexer(working_dir, this, eng_prefs);
    indexer.start();
  }

  /** Takes "url_to_queue" and adds it to this Crawler's queue of URLs.
   *  This method should be used to add all of the desired starting URLs to
   *  the queue before the Crawler is started.  If the URL has already
   *  been processed or if it is an unallowed URL it is not added.
   *  A null URL is ignored.
   *
   *  @param url_to_queue the URL to crawl; it is simplified (see
   *                      simplify()) before being queued
   */
  public void addURL(URL url_to_queue) {
    // run() forwards URLStatus.actual_url here without checking it, so be
    // defensive rather than let a null take down the crawling thread.
    if (url_to_queue == null) return;
    if (!eng_prefs.URLAllowed(url_to_queue)) return; // check if URL is allowed
    if (eng_prefs.URLNotIndexable(url_to_queue)) return; //don't index non-text
    url_to_queue = simplify(url_to_queue);    // remove loops/anchors

    // put() returns null only when the key was not present before, i.e. the
    // first time this (simplified) URL is seen.
    if (urls_done.put(url_to_queue, url_to_queue) == null) {
      q.addElement(url_to_queue);
      Monitor m = eng_prefs.getMonitor();
      if (m != null) m.indexing(url_to_queue);
    }
  }

  /** Takes "url" and removes all references to "/./" and "/../" from its
   *  path.  This can be used to help eliminate looping.  Also removes all
   *  anchors (i.e., everything after and including a '#'), makes the
   *  default http port (80) explicit and turns an empty http path into
   *  "/", so that equivalent spellings of one address compare equal.
   *  The query string (everything from the first '?') is carried over
   *  untouched.
   *
   *  @param url the URL to normalise
   *  @return a normalised URL, or "url" itself if nothing needed changing
   *          or if the normalised form could not be constructed
   */
  URL simplify(URL url) {
    String file = url.getFile();
    String proto = url.getProtocol().toLowerCase();
    boolean changed = false;     // keep track of whether we change anything

    // getFile() includes the query string.  Set it aside so that dot
    // segments appearing inside a query value are not rewritten.

    String query = "";
    int query_start = file.indexOf('?');
    if (query_start >= 0) {
      query = file.substring(query_start);
      file = file.substring(0, query_start);
    }

    // collapse all occurrences of "/./"

    int i = file.indexOf("/./");
    while (i >= 0) {
      changed = true;
      file = file.substring(0, i) + file.substring(i + 2);
      i = file.indexOf("/./");
    }

    // collapse all occurrences of "/../" (by removing preceding directory)

    i = file.indexOf("/../");
    while (i >= 0) {
      changed = true;
      int i2 = file.lastIndexOf('/', i - 1);
      if (i2 < 0) i2 = i;        // no parent directory: just drop the "/.."
      file = file.substring(0, i2) + file.substring(i + 3);
      i = file.indexOf("/../");
    }

    // for http an empty path addresses the same resource as "/"

    if (file.length() == 0 && proto.equals("http")) {
      changed = true;
      file = "/";
    }

    // remove anchor if necessary (rebuilding the URL below drops it)

    if (url.getRef() != null) changed = true;

    // set port if it's not set already

    int port = url.getPort();
    if (port < 0 && proto.equals("http")) {
      changed = true;
      port = 80;
    }

    // create a new URL if anything changed

    if (changed) {
      try {
        url = new URL(proto, url.getHost(), port, file + query);
      } catch (MalformedURLException e) {
        e.printStackTrace();     // fall back to the URL we were given
      }
    }
    return url;
  }

  /** This is where the actual crawling occurs.  URLs are taken from the
   *  queue one at a time and read into a temporary file in the working
   *  directory; loaded pages are handed to the Indexer, redirected ones
   *  are re-queued under their actual URL and anything else is reported
   *  to the Monitor (if there is one).  When both this Crawler's queue and
   *  the Indexer's queue are empty the Monitor is notified and the Indexer
   *  is told to stop when done.
   */
  public void run() {
    // NOTE(review): this early return skips indexer.stopWhenDone(), so the
    // Indexer started by the constructor is never told to finish - confirm
    // that this is intended.
    if (!q.hasMoreElements()) return;  // return if there's nothing to do
    int tmp_file = 0;  // used to generate unique temporary filenames

    URLStatus url_status;
    while (true) {
      url_status = new URLStatus((URL) q.nextElement(),
                                 new File(working_dir, tmp_file + ".tmp"),
                                 eng_prefs);
      tmp_file++;
      url_status.readContent();
      if (url_status.loaded()) {
        indexer.queueURL(url_status);
      } else if (url_status.moved()) {
        addURL(url_status.actual_url);
      } else {
        Monitor m = eng_prefs.getMonitor();
        if (m != null) m.reportError(url_status.actual_url);
      }

      if (q.hasMoreElements()) {
        eng_prefs.pauseBetweenURLs();
      } else {
        // Our queue is empty, but keep polling for as long as the Indexer
        // still has pages queued - presumably it adds the links it finds
        // through addURL(); verify against Indexer.
        // NOTE(review): a page the Indexer has already dequeued but not
        // finished would not be seen by this check - confirm.
        while (!q.hasMoreElements() && indexer.q.hasMoreElements()) {
          eng_prefs.pauseBetweenURLs();
        }
        if (!q.hasMoreElements()) {
          break;
        }
      }
    }

    Monitor m = eng_prefs.getMonitor();
    if (m != null) m.crawlerDone(this);
    indexer.stopWhenDone(exit_when_done);
  }

  /** This is the method that is called when this class is invoked from
   *  the command line.  Calling this method will cause a Crawler to be
   *  created and started with the starting URLs being listed in a file
   *  specified by the first argument (arg[0]).  The file listing the URLs
   *  should contain only the URLs with each URL on a line by itself.  Blank
   *  lines are allowed and lines beginning with "#" are considered comments
   *  and are ignored.  Does nothing if no argument is given.
   */
  public static void main(String arg[]) {
    if (arg.length < 1) return;
    main(new File(arg[0]), new EnginePrefs(), true);
  }

  /** Same as main(file, prefs, false).
   *
   *  @param file  file listing the starting URLs, one per line
   *  @param prefs preferences for the new Crawler
   */
  public static void main(File file, EnginePrefs prefs) {
    main(file, prefs, false);
  }

  /** Creates a Crawler working in prefs.getWorkingDir(), queues every URL
   *  listed in "file" and starts the Crawler.  Blank lines and lines
   *  beginning with "#" are skipped; a malformed URL is reported on
   *  standard error and skipped.  If the file cannot be read the error is
   *  reported on standard error and the Crawler is not started.
   *
   *  @param file  file listing the starting URLs, one per line
   *  @param prefs preferences for the new Crawler
   *  @param exit  stored in the Crawler's "exit_when_done" field
   */
  public static void main(File file, EnginePrefs prefs, boolean exit) {
    Crawler cr = new Crawler(prefs.getWorkingDir(), prefs);
    try {
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      try {
        // readLine() maps each byte straight to a char, which is adequate
        // for a list of plain ASCII URLs.
        String line = in.readLine();
        while (line != null) {
          line = line.trim();
          if (!line.equals("") && !line.startsWith("#")) {
            try {
              cr.addURL(new URL(line));
            } catch (MalformedURLException e2) {
              e2.printStackTrace();
            }
          }
          line = in.readLine();
        }
      } finally {
        // Always release the file handle; a failure to close a stream we
        // only read from must not prevent the crawl from starting.
        try {
          in.close();
        } catch (Exception e3) {
          e3.printStackTrace();
        }
      }
      cr.exit_when_done = exit;
      cr.start();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.