com.norconex.collector.http.crawler.URLCrawlScopeStrategy.java Source code

Java tutorial

Introduction

Here is the source code for com.norconex.collector.http.crawler.URLCrawlScopeStrategy.java

Source

/* Copyright 2015 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.collector.http.crawler;

import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import com.norconex.commons.lang.url.HttpURL;

/**
 * <p>By default a crawler will try to follow all links it discovers. You can
 * define your own filters to limit the scope of the pages being crawled.
 * When you have multiple URLs defined as start URLs, it can be tricky to
 * perform global filtering that applies to each URL without causing
 * URL filtering conflicts.  This class offers an easy way to address
 * a frequent URL filtering need: to "stay on site". That is,
 * when following a page and extracting URLs found in it, make sure to
 * only keep URLs that are on the same site as the page URL we are on.
 * </p>
 * <p>
 * By default this class does not request to stay on a site: all three
 * "stay on" settings (protocol, domain, port) are initially
 * <code>false</code>.
 * </p>
 * @author Pascal Essiembre
 * @since 2.3.0
 */
//TODO make this an interface so developers can provide their own?
public class URLCrawlScopeStrategy {

    private static final Logger LOG = LogManager.getLogger(URLCrawlScopeStrategy.class);

    // All three settings rely on the Java default value (false).
    private boolean stayOnDomain;
    private boolean stayOnPort;
    private boolean stayOnProtocol;

    /**
     * Whether the crawler should always stay on the same domain name as
     * the domain for each URL specified as a start URL.  By default (false)
     * the crawler will try to follow any discovered links not otherwise
     * rejected by other settings (like regular filtering rules you may have).
     * @return <code>true</code> if the crawler should stay on a domain
     */
    public boolean isStayOnDomain() {
        return stayOnDomain;
    }

    /**
     * Sets whether the crawler should always stay on the same domain name as
     * the domain for each URL specified as a start URL.
     * @param stayOnDomain <code>true</code> for the crawler to stay on domain
     */
    public void setStayOnDomain(boolean stayOnDomain) {
        this.stayOnDomain = stayOnDomain;
    }

    /**
     * Whether the crawler should always stay on the same port as
     * the port for each URL specified as a start URL.  By default (false)
     * the crawler will try to follow any discovered links not otherwise
     * rejected by other settings (like regular filtering rules you may have).
     * @return <code>true</code> if the crawler should stay on a port
     */
    public boolean isStayOnPort() {
        return stayOnPort;
    }

    /**
     * Sets whether the crawler should always stay on the same port as
     * the port for each URL specified as a start URL.
     * @param stayOnPort <code>true</code> for the crawler to stay on port
     */
    public void setStayOnPort(boolean stayOnPort) {
        this.stayOnPort = stayOnPort;
    }

    /**
     * Whether the crawler should always stay on the same protocol as
     * the protocol for each URL specified as a start URL.  By default (false)
     * the crawler will try to follow any discovered links not otherwise
     * rejected by other settings (like regular filtering rules you may have).
     * @return <code>true</code> if the crawler should stay on protocol
     */
    public boolean isStayOnProtocol() {
        return stayOnProtocol;
    }

    /**
     * Sets whether the crawler should always stay on the same protocol as
     * the protocol for each URL specified as a start URL.
     * @param stayOnProtocol
     *        <code>true</code> for the crawler to stay on protocol
     */
    public void setStayOnProtocol(boolean stayOnProtocol) {
        this.stayOnProtocol = stayOnProtocol;
    }

    /**
     * Tells whether a candidate URL is within the scope established by a
     * reference URL, according to the "stay on" settings enabled on this
     * instance.  Enabled checks are applied in this order: protocol,
     * domain (host), then port.  The candidate is rejected as soon as one
     * enabled check fails, and a debug message naming the failed check is
     * logged.  When none of the settings is enabled, every candidate is
     * accepted and neither URL is parsed.
     * <p>
     * Protocols and hosts are compared ignoring case.  The comparison is
     * null-safe: two absent values are considered the same, while an absent
     * value never matches a present one.
     * </p>
     * @param inScopeURL the reference URL the candidate is compared against
     * @param candidateURL the URL to test
     * @return <code>true</code> if the candidate URL passes every enabled
     *         check, or if no check is enabled
     */
    public boolean isInScope(String inScopeURL, String candidateURL) {
        // if not specifying any scope, candidate URL is good
        if (!stayOnProtocol && !stayOnDomain && !stayOnPort) {
            return true;
        }

        HttpURL inScope = new HttpURL(inScopeURL);
        HttpURL candidate = new HttpURL(candidateURL);

        // NOTE(review): HttpURL presumably reports a null protocol/host for
        // URLs that are not absolute -- confirm against HttpURL.  Comparing
        // null-safely avoids a NullPointerException in that situation.
        if (stayOnProtocol && !sameIgnoreCase(
                inScope.getProtocol(), candidate.getProtocol())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Rejected protocol for: " + candidateURL);
            }
            return false;
        }
        if (stayOnDomain && !sameIgnoreCase(
                inScope.getHost(), candidate.getHost())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Rejected domain for: " + candidateURL);
            }
            return false;
        }
        if (stayOnPort && inScope.getPort() != candidate.getPort()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Rejected port for: " + candidateURL);
            }
            return false;
        }
        return true;
    }

    /**
     * Null-safe, case-insensitive string comparison.
     * @param first first value (may be <code>null</code>)
     * @param second second value (may be <code>null</code>)
     * @return <code>true</code> if both values are <code>null</code>, or
     *         both are non-null and equal ignoring case
     */
    private static boolean sameIgnoreCase(String first, String second) {
        if (first == null) {
            return second == null;
        }
        // String#equalsIgnoreCase returns false for a null argument.
        return first.equalsIgnoreCase(second);
    }

    /**
     * Two strategies are equal when their three "stay on" settings match.
     * @param other the object to compare with
     * @return <code>true</code> if <code>other</code> is a
     *         <code>URLCrawlScopeStrategy</code> with identical settings
     */
    @Override
    public boolean equals(final Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof URLCrawlScopeStrategy)) {
            return false;
        }
        URLCrawlScopeStrategy castOther = (URLCrawlScopeStrategy) other;
        return new EqualsBuilder()
                .append(stayOnProtocol, castOther.stayOnProtocol)
                .append(stayOnDomain, castOther.stayOnDomain)
                .append(stayOnPort, castOther.stayOnPort)
                .isEquals();
    }

    /**
     * Hash code derived from the same three settings used by
     * {@link #equals(Object)}.
     * @return hash code
     */
    @Override
    public int hashCode() {
        return new HashCodeBuilder()
                .append(stayOnProtocol)
                .append(stayOnDomain)
                .append(stayOnPort)
                .toHashCode();
    }

    /**
     * String representation listing the three "stay on" settings.
     * @return string representation
     */
    @Override
    public String toString() {
        return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
                .append("stayOnProtocol", stayOnProtocol)
                .append("stayOnDomain", stayOnDomain)
                .append("stayOnPort", stayOnPort)
                .toString();
    }
}