org.cloudcrawler.domain.crawler.robotstxt.RobotsTxtService.java Source code

Introduction

Here is the source code for org.cloudcrawler.domain.crawler.robotstxt.RobotsTxtService.java, a service that builds the robots.txt URI for a given page, fetches and caches the parsed rules, and evaluates whether the page may be crawled.

Source

package org.cloudcrawler.domain.crawler.robotstxt;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 * Inspired by crawler commons
 *
 * @coauthor Timo Schmidt <timo-schmidt@gmx.net>
 */

import org.cloudcrawler.domain.crawler.robotstxt.cache.Cache;
import org.cloudcrawler.domain.crawler.robotstxt.parser.SimpleRobotRulesParser;
import org.cloudcrawler.domain.crawler.robotstxt.rules.BaseRobotRules;
import org.cloudcrawler.system.http.HttpService;
import com.google.inject.Inject;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.util.EntityUtils;

import java.io.StringWriter;
import java.net.URI;

public class RobotsTxtService {

    protected HttpService httpService;

    protected SimpleRobotRulesParser robotsTxtParser;

    protected Cache cache;

    @Inject
    RobotsTxtService(HttpService httpService, SimpleRobotRulesParser robotsTxtParser, Cache cache) {
        this.httpService = httpService;
        this.robotsTxtParser = robotsTxtParser;
        this.cache = cache;
        System.out.println("Using cache: " + cache.getClass().getCanonicalName());
    }

    /**
     * Evaluates whether the provided URI may be crawled according to the
     * robots.txt of the site it belongs to.
     *
     * @param uri the URI that should be checked
     * @return true if crawling the URI is allowed, false otherwise
     * @throws Exception if the robots.txt could not be fetched or parsed
     */
    public boolean isAllowedUri(URI uri) throws Exception {
        URIBuilder uriBuilder = new URIBuilder();
        uriBuilder.setScheme(uri.getScheme());
        uriBuilder.setHost(uri.getHost());
        uriBuilder.setUserInfo(uri.getUserInfo());
        uriBuilder.setPath("/robots.txt");

        URI robotsTxtUri = uriBuilder.build();
        BaseRobotRules rules = (BaseRobotRules) cache.get(robotsTxtUri.toString());

        if (rules == null) {
            HttpResponse response = httpService.get(robotsTxtUri);

            try {

                // HACK! DANGER! Some sites redirect the robots.txt request to their
                // top-level page instead of returning a 404. So treat any response that
                // was not served as plain text as one of these cases, which is the same
                // as not having a robots.txt file at all.

                Header contentTypeHeader = response.getEntity().getContentType();
                String contentType = (contentTypeHeader != null) ? contentTypeHeader.getValue() : null;
                boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));

                if (response.getStatusLine().getStatusCode() == 404 || !isPlainText) {
                    rules = robotsTxtParser.failedFetch(HttpStatus.SC_GONE);
                } else {
                    StringWriter writer = new StringWriter();

                    IOUtils.copy(response.getEntity().getContent(), writer);

                    rules = robotsTxtParser.parseContent(uri.toString(), writer.toString().getBytes(),
                            contentType, httpService.getUserAgent());

                }
            } finally {
                // Release the HTTP connection whether parsing succeeded or failed.
                EntityUtils.consume(response.getEntity());
            }

            // Cache the parsed rules for 24 hours so robots.txt is fetched at most once a day per host.
            cache.set(robotsTxtUri.toString(), 60 * 60 * 24, rules);
        }

        return rules.isAllowed(uri.toString());
    }
}
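
Example

Below is a minimal usage sketch, not part of the listing above. It assumes a hypothetical Guice module (here called CrawlerModule) that binds HttpService, SimpleRobotRulesParser, and a Cache implementation; the module name and the example URI are illustrative.

package org.cloudcrawler.examples;

import com.google.inject.Guice;
import com.google.inject.Injector;
import org.cloudcrawler.domain.crawler.robotstxt.RobotsTxtService;

import java.net.URI;

public class RobotsTxtServiceExample {

    public static void main(String[] args) throws Exception {
        // CrawlerModule is a hypothetical module assumed to bind HttpService,
        // SimpleRobotRulesParser and a Cache implementation.
        Injector injector = Guice.createInjector(new CrawlerModule());
        RobotsTxtService robotsTxtService = injector.getInstance(RobotsTxtService.class);

        URI uri = new URI("http://www.example.com/some/page.html");

        // The service derives http://www.example.com/robots.txt from the URI,
        // fetches and parses it (or reuses the cached rules) and evaluates the path.
        if (robotsTxtService.isAllowedUri(uri)) {
            System.out.println("Allowed by robots.txt: " + uri);
        } else {
            System.out.println("Disallowed by robots.txt: " + uri);
        }
    }
}

Note that the service caches the parsed rules per robots.txt URI for 60 * 60 * 24 seconds, so repeated checks for URIs on the same host hit the cache instead of re-fetching the file.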