/*
* Copyright 2005 by Lars Torunski
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.torunski.crawler.parser;
import java.util.Collection;
import com.torunski.crawler.filter.ILinkFilter;
import com.torunski.crawler.link.Link;
/**
* Defines an interface for the parsers. With the load method it is possible to download
* different pages and to parse them later in a different thread.
*
* @author Lars Torunski
* @version $Revision: 1.6 $
*/
public interface IParser {

    /**
     * Fetches the data behind the URI of a link so that it can be parsed at a
     * later point in time.
     *
     * Loading and parsing are separate steps: a crawler can load several URIs
     * at the same time, e.g. in different threads working in parallel, and
     * parse the results afterwards in a different order. For that reason all
     * information required for parsing has to be stored in the returned
     * PageData object.
     *
     * @param link the link of the page to load
     * @return the page data of the URI, or <code>null</code> if preloading
     *         the data failed
     */
    PageData load(Link link);

    /**
     * Examines a previously loaded PageData object, e.g. for the links it
     * contains, and hands the result back as a Collection.
     *
     * @param pageData the page data of the page to parse
     * @param linkFilter the filter for the URIs
     * @return a collection of the new URIs found in the pageData, filtered by
     *         the linkFilter
     */
    Collection parse(PageData pageData, ILinkFilter linkFilter);

}
|