Web crawler : Crawler « Network Protocol « Java






Web crawler

 

Revised from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;

public class WebCrawler implements Runnable {
  public static final String SEARCH = "Search";
  public static final String STOP = "Stop";
  public static final String DISALLOW = "Disallow:";
  public static final int SEARCH_LIMIT = 50;

  Vector vectorToSearch = new Vector();
  Vector vectorSearched = new Vector();
  Vector vectorMatches = new Vector();

  Thread searchThread;

  public WebCrawler() {
    // ("text/html");
    // ("audio/basic");
    // ("audio/au");
    // ("audio/aiff");
    // ("audio/wav");
    // ("video/mpeg");
    // ("video/x-avi");

    URLConnection.setDefaultAllowUserInteraction(false);
    searchThread = new Thread(this);
    searchThread.start();
  }

  public void run() {
    String strURL = "http://www.google.com";
    String strTargetType = "text/html";
    int numberSearched = 0;
    int numberFound = 0;

    if (strURL.length() == 0) {
      System.out.println("ERROR: must enter a starting URL");
      return;
    }

    vectorToSearch = new Vector();
    vectorSearched = new Vector();
    vectorMatches = new Vector();

    vectorToSearch.addElement(strURL);

    while ((vectorToSearch.size() > 0)
        && (Thread.currentThread() == searchThread)) {
      strURL = (String) vectorToSearch.elementAt(0);

      System.out.println("searching " + strURL);

      URL url = null;
      try {
        url = new URL(strURL);
      } catch (MalformedURLException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
      }

      vectorToSearch.removeElementAt(0);
      vectorSearched.addElement(strURL);

      try {
        URLConnection urlConnection = url.openConnection();

        urlConnection.setAllowUserInteraction(false);

        InputStream urlStream = url.openStream();
        String type = urlConnection.guessContentTypeFromStream(urlStream);
        if (type == null)
          break;
        if (type.compareTo("text/html") != 0)
          break;

        byte b[] = new byte[5000];
        int numRead = urlStream.read(b);
        String content = new String(b, 0, numRead);
        while (numRead != -1) {
          if (Thread.currentThread() != searchThread)
            break;
          numRead = urlStream.read(b);
          if (numRead != -1) {
            String newContent = new String(b, 0, numRead);
            content += newContent;
          }
        }
        urlStream.close();

        if (Thread.currentThread() != searchThread)
          break;

        String lowerCaseContent = content.toLowerCase();

        int index = 0;
        while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
          if ((index = lowerCaseContent.indexOf("href", index)) == -1)
            break;
          if ((index = lowerCaseContent.indexOf("=", index)) == -1)
            break;

          if (Thread.currentThread() != searchThread)
            break;

          index++;
          String remaining = content.substring(index);

          StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
          String strLink = st.nextToken();

          URL urlLink;
          try {
            urlLink = new URL(url, strLink);
            strLink = urlLink.toString();
          } catch (MalformedURLException e) {
            System.out.println("ERROR: bad URL " + strLink);
            continue;
          }

          if (urlLink.getProtocol().compareTo("http") != 0)
            break;

          if (Thread.currentThread() != searchThread)
            break;

          try {
            URLConnection urlLinkConnection = urlLink.openConnection();
            urlLinkConnection.setAllowUserInteraction(false);
            InputStream linkStream = urlLink.openStream();
            String strType = urlLinkConnection
                .guessContentTypeFromStream(linkStream);
            linkStream.close();

            if (strType == null)
              break;
            if (strType.compareTo("text/html") == 0) {
              if ((!vectorSearched.contains(strLink))
                  && (!vectorToSearch.contains(strLink))) {

                vectorToSearch.addElement(strLink);
              }
            }

            if (strType.compareTo(strTargetType) == 0) {
              if (vectorMatches.contains(strLink) == false) {
                System.out.println(strLink);
                vectorMatches.addElement(strLink);
                numberFound++;
                if (numberFound >= SEARCH_LIMIT)
                  break;
              }
            }
          } catch (IOException e) {
            System.out.println("ERROR: couldn't open URL " + strLink);
            continue;
          }
        }
      } catch (IOException e) {
        System.out.println("ERROR: couldn't open URL " + strURL);
        break;
      }

      numberSearched++;
      if (numberSearched >= SEARCH_LIMIT)
        break;
    }

    if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
      System.out.println("reached search limit of " + SEARCH_LIMIT);
    else
      System.out.println("done");
    searchThread = null;
  }

  public static void main(String argv[]) {
    WebCrawler applet = new WebCrawler();
    /*
     * Behind a firewall set your proxy and port here!
     */
    Properties props = new Properties(System.getProperties());
    props.put("http.proxySet", "true");
    props.put("http.proxyHost", "webcache-cup");
    props.put("http.proxyPort", "8080");

    Properties newprops = new Properties(props);
    System.setProperties(newprops);
  }

}
       
           
         
  








Related examples in the same category

1.Search CrawlerSearch Crawler
2.Web Crawler from Sun Microsystems