// HttpDocManager.java : Web-Crawler » JoBo » net » matuschek » http » Java Open Source
//
// Java Open Source » Web Crawler » JoBo
// JoBo » net » matuschek » http » HttpDocManager.java
package net.matuschek.http;

/*********************************************
    Copyright (c) 2001 by Daniel Matuschek
*********************************************/

import java.io.IOException;
import java.net.URL;

/**
 * Contract for components that handle an {@link HttpDoc} after it has been
 * retrieved. It is used by the WebRobot to store the retrieved documents.
 *
 * <p>An implementation may process documents, store them, detect duplicates
 * and, if it keeps complete documents, serve as a cache.
 *
 * @see net.matuschek.http.AbstractHttpDocManager
 *
 * @author Daniel Matuschek
 * @version $Id: HttpDocManager.java,v 1.3 2003/02/27 18:40:19 oliver_schmidt Exp $
 */
public interface HttpDocManager {

  /**
   * "Processes" a document (without storing it).
   * This is either direct processing, or collecting URLs for later processing.
   * Most documents should be stored (for reruns), but not all of them should
   * be processed (maybe you only want to process PDF documents).
   *
   * @param doc the HttpDoc object to process; this may also be null
   * @throws DocManagerException if an error occurs while processing the
   *         document
   */
  void processDocument(HttpDoc doc) throws DocManagerException;

  /**
   * Stores a document. Usually this will store the document somewhere (file
   * system, database, ...). It is also possible that an implementation does
   * not store the whole document, but extracts information from it and
   * processes this information.
   * Most documents should be stored (for reruns), but not all of them should
   * be processed (maybe you only want to process PDF documents).
   *
   * @param doc the HttpDoc object to store; this may also be null
   * @throws DocManagerException if an error occurs while storing the document
   */
  void storeDocument(HttpDoc doc) throws DocManagerException;

  /**
   * Removes a document from the cache.
   *
   * <p>This method declares no checked exception.
   *
   * @param url the URL identifying the document to remove
   */
  void removeDocument(URL url);

  /**
   * Returns the URL of a stored document with the same content, or null.
   *
   * @param doc the document to look up a duplicate for
   * @return URL of the duplicate document as a String, or null
   * @throws IOException if an I/O error occurs during the lookup
   */
  String findDuplicate(HttpDoc doc) throws IOException;

  /**
   * Gives access to cached documents. If an HttpDocManager stores the
   * complete HttpDocs, it is possible to use it as a cache. If an
   * HttpDocManager can't be used as a cache, it should always return null.
   *
   * @param u the URL of the requested document
   * @return a cached HttpDoc for this URL, or null
   */
  HttpDoc retrieveFromCache(URL u);

  /**
   * Should be called if the instance is not used any more.
   * Some resources might need to be released.
   */
  void finish();

}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.