RobotsTXTRule.java :  » Web-Crawler » jspider » net » javacoding » jspider » core » rule » impl » Java Open Source

Java Open Source » Web Crawler » jspider 
jspider » net » javacoding » jspider » core » rule » impl » RobotsTXTRule.java
package net.javacoding.jspider.core.rule.impl;


import net.javacoding.jspider.api.model.Decision;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.model.DecisionInternal;
import net.javacoding.jspider.core.model.SiteInternal;
import net.javacoding.jspider.core.util.URLUtil;
import net.javacoding.jspider.core.util.html.RobotsTXTLine;
import net.javacoding.jspider.core.util.html.RobotsTXTLineSet;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;


/**
 * Rule implementation that applies the rules expressed by a site's robots.txt
 * file to the resources we want to fetch on that site.
 * This file allows webmasters to exclude certain resources and folders not to
 * be spidered by web robots, to disallow inclusion in search engines, etc ...
 *
 * $Id: RobotsTXTRule.java,v 1.13 2003/03/28 17:26:28 vanrogu Exp $
 *
 * @author Gnther Van Roey
 */
public class RobotsTXTRule extends BaseRuleImpl {

    /** user agent under which we're operating. */
    protected String effectiveUserAgent;

    /** user agent in the robots.txt file we're obeying */
    protected String obeyedUserAgent;

    /** all lines in the robots.txt file that apply to us and forbid access to a part of the site. */
    protected RobotsTXTLine[] forbiddenPaths;


    /**
     * Public constructor.
     * @param userAgent the userAgent under which we're operating
     * @param is the inputstream to read the robots.txt file from
     * @throws IOException in case something goes wrong reading the robots.txt
     */
    public RobotsTXTRule(String userAgent, InputStream is) throws IOException {
        RobotsTXTLineSet lineSet = RobotsTXTLineSet.findLineSet(is, userAgent);
        this.effectiveUserAgent = userAgent;
        if (lineSet == null) {
            this.obeyedUserAgent = null;
            forbiddenPaths = new RobotsTXTLine[0];
        } else {
            this.obeyedUserAgent = lineSet.getUserAgent();
            forbiddenPaths = lineSet.getLines();
        }
    }

    /**
     * Returns the user agent from robots.txt we're obeying. (can be '*').
     * This user agent identification is the first match we encountered in the file,
     * a match is given if our effective user agent contains the user agent
     * identification as a substring in a case-insensitive way.
     * @return the useragent selector we're obeying.
     */
    public String getObeyedUserAgent() {
        return obeyedUserAgent;
    }

    /**
     * Applies the rule to a given URL
     * @param context the spider context we're working in
     * @param currentSite the site we're spidering
     * @param url the url of the resource to be tested for spider permission
     * @return Decision object expressing this rule's decision on the resource
     */
    public Decision apply(SpiderContext context, Site currentSite, URL url) {
        String path = url.getPath();
        Decision decision = new DecisionInternal();

        if ((context.getStorage().getSiteDAO().find(URLUtil.getSiteURL(url))).getObeyRobotsTXT()) {

            for (int i = 0; i < forbiddenPaths.length; i++) {
                RobotsTXTLine forbiddenPath = forbiddenPaths[i];
                if (forbiddenPath.matches(url)) {
                    decision = new DecisionInternal(Decision.RULE_FORBIDDEN, "access to '" + path + "' forbidden");
                    break;
                }
            }
        }
        return decision;
    }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.