com.esri.geoportal.commons.http.BotsHttpClient.java Source code

Java tutorial

Introduction

Here is the source code for com.esri.geoportal.commons.http.BotsHttpClient.java

Source

/*
 * Copyright 2016 Esri, Inc..
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.esri.geoportal.commons.http;

import com.esri.geoportal.commons.robots.Access;
import com.esri.geoportal.commons.robots.Bots;
import static com.esri.geoportal.commons.robots.BotsUtils.requestAccess;
import java.io.Closeable;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpRequestWrapper;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Bots http client.
 * <p>
 * Decorates another {@link CloseableHttpClient}. When a {@link Bots} instance (parsed robots.txt) is supplied,
 * every request is first evaluated against it: a disallowed path is rejected with a 403
 * {@link HttpResponseException}, an allowed one is registered with {@link CrawlLocker} together with the
 * crawl delay, and the request URI is rewritten according to the robots.txt host directive before being
 * delegated to the wrapped client. When no {@link Bots} instance is supplied requests are passed through.
 */
public class BotsHttpClient extends CloseableHttpClient {
    private static final Logger LOG = LoggerFactory.getLogger(BotsHttpClient.class);
    private static final String PROTOCOL_SEPARATOR = "://";
    private static final String PORT_SEPARATOR = ":";

    private final CloseableHttpClient client;
    private final Bots bots;

    /**
     * Creates instance of the client.
     *
     * @param client underlying client all requests are delegated to
     * @param bots robots.txt information or <code>null</code> if no restrictions apply
     */
    public BotsHttpClient(CloseableHttpClient client, Bots bots) {
        this.client = client;
        this.bots = bots;
    }

    /**
     * Creates instance of the client backed by a default client configured from system properties.
     *
     * @param bots robots.txt information or <code>null</code> if no restrictions apply
     */
    public BotsHttpClient(Bots bots) {
        this(HttpClientBuilder.create().useSystemProperties().build(), bots);
    }

    @Override
    public HttpParams getParams() {
        return client.getParams();
    }

    @Override
    public ClientConnectionManager getConnectionManager() {
        return client.getConnectionManager();
    }

    /**
     * Routes the base class hook through the robots.txt aware
     * {@link #execute(HttpHost, HttpRequest, HttpContext)}.
     */
    @Override
    protected CloseableHttpResponse doExecute(HttpHost target, HttpRequest request, HttpContext context)
            throws IOException, ClientProtocolException {
        return execute(target, request, context);
    }

    @Override
    public CloseableHttpResponse execute(HttpUriRequest request) throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request), null));
    }

    @Override
    public CloseableHttpResponse execute(HttpUriRequest request, HttpContext context)
            throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request), null), context);
    }

    @Override
    public CloseableHttpResponse execute(HttpHost target, HttpRequest request)
            throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request, target), target));
    }

    @Override
    public CloseableHttpResponse execute(HttpHost target, HttpRequest request, HttpContext context)
            throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request, target), target), context);
    }

    @Override
    public <T> T execute(HttpUriRequest request, ResponseHandler<? extends T> responseHandler)
            throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request), null), responseHandler);
    }

    @Override
    public <T> T execute(HttpUriRequest request, ResponseHandler<? extends T> responseHandler, HttpContext context)
            throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request), null), responseHandler, context);
    }

    @Override
    public <T> T execute(HttpHost target, HttpRequest request, ResponseHandler<? extends T> responseHandler)
            throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request, target), target), responseHandler);
    }

    @Override
    public <T> T execute(HttpHost target, HttpRequest request, ResponseHandler<? extends T> responseHandler,
            HttpContext context) throws IOException, ClientProtocolException {
        return client.execute(prepare(HttpRequestWrapper.wrap(request, target), target), responseHandler, context);
    }

    /**
     * Closes the underlying client.
     *
     * @throws IOException if closing the underlying client fails
     */
    @Override
    public void close() throws IOException {
        if (client != null) {
            client.close();
        }
    }

    /**
     * Applies the robots.txt policy to a wrapped request: checks access, registers the visit and rewrites
     * the request URI according to the host directive.
     *
     * @param wrap wrapped request; its URI is replaced with the effective one
     * @param target explicit target host or <code>null</code> if the request carries its own host
     * @return the same wrapper, ready to be passed to the underlying client
     * @throws IOException if access is disallowed or the URI cannot be processed
     */
    private HttpRequestWrapper prepare(HttpRequestWrapper wrap, HttpHost target) throws IOException {
        URI uri = resolveRequestUri(wrap, target);
        adviseRobotsTxt(uri);
        wrap.setURI(applyPHP(uri));
        return wrap;
    }

    /**
     * Determines the URI a wrapped request is aimed at.
     * <p>
     * The wrapper exposes a URI only for {@link HttpUriRequest}; for any other request the URI is taken from
     * the request line. A non-absolute URI is completed with the explicit target host (if any) so that the
     * host is not lost when the request is later executed through the URI-based overloads.
     *
     * @param wrap wrapped request
     * @param target explicit target host or <code>null</code>
     * @return request URI, absolute whenever the request or the target provides a host
     * @throws ClientProtocolException if the request line does not hold a valid URI
     */
    private URI resolveRequestUri(HttpRequestWrapper wrap, HttpHost target) throws ClientProtocolException {
        URI uri = wrap.getURI();
        try {
            if (uri == null) {
                uri = new URI(wrap.getRequestLine().getUri());
            }
            if (!uri.isAbsolute() && target != null) {
                // origin-form request target: everything after the host is the path (and query)
                String reference = uri.toString();
                uri = new URI(target.toURI() + (reference.startsWith("/") ? "" : "/") + reference);
            }
        } catch (URISyntaxException ex) {
            throw new ClientProtocolException("Invalid request URI.", ex);
        }
        return uri;
    }

    /**
     * Converts crawl delay declared in robots.txt into milliseconds.
     *
     * @return delay in milliseconds or <code>null</code> if no delay has been declared
     */
    private Long resolveThrottleDelay() {
        return bots != null && bots.getCrawlDelay() != null ? 1000L * bots.getCrawlDelay() : null;
    }

    /**
     * Extracts path, query and fragment of the URI; this is the value matched against robots.txt rules.
     *
     * @param u URI
     * @return path (at least "/") followed by optional "?query" and "#fragment"
     */
    private String getRelativePath(URI u) {
        // URI.getPath() yields an empty string (not null) for "http://host"; both mean the root path
        String path = StringUtils.defaultIfEmpty(u.getPath(), "/");
        String query = u.getQuery() != null ? "?" + u.getQuery() : "";
        String fragment = u.getFragment() != null ? "#" + u.getFragment() : "";
        return path + query + fragment;
    }

    /**
     * Extracts "scheme://host[:port]" of the URI; used as the server key passed to {@link CrawlLocker}.
     *
     * @param u URI
     * @return protocol, host and (if explicitly given) port
     */
    private String getProtocolHostPort(URI u) {
        return String.format("%s://%s%s", u.getScheme(), u.getHost(), u.getPort() >= 0 ? ":" + u.getPort() : "");
    }

    /**
     * Consults robots.txt before accessing the URI.
     *
     * @param u request URI
     * @throws IOException (a 403 {@link HttpResponseException}) if access is disallowed by robots.txt
     */
    private void adviseRobotsTxt(URI u) throws IOException {
        if (bots == null) {
            return;
        }
        String url = getRelativePath(u);
        LOG.debug("Evaluating access to {} using robots.txt", u);
        Access access = requestAccess(bots, url);
        if (!access.hasAccess()) {
            LOG.info("Access to {} disallowed by robots.txt", u);
            throw new HttpResponseException(403, String.format("Access to %s disallowed by robots.txt", url));
        }
        LOG.debug("Access to {} allowed by robots.txt", u);
        // NOTE(review): presumably blocks until the crawl delay for this server has elapsed - confirm in CrawlLocker
        CrawlLocker.getInstance().enterServer(getProtocolHostPort(u), resolveThrottleDelay());
    }

    /**
     * Applies robots.txt host directive (protocol-host-port) to the URI.
     *
     * @param uri request URI
     * @return URI redirected to the declared host or the input URI if there is nothing to apply
     * @throws ClientProtocolException if the updated URI is invalid
     */
    private URI applyPHP(URI uri) throws ClientProtocolException {
        if (bots != null) {
            try {
                String orgUri = uri.toString();
                PHP php = parsePHP(bots.getHost());
                if (php != null) {
                    uri = updateURI(uri, php);
                }
                if (!uri.toString().equals(orgUri)) {
                    LOG.debug("Uri updated from {} to {}", orgUri, uri);
                }
            } catch (URISyntaxException ex) {
                throw new ClientProtocolException("Unable to apply host robots.txt host directive.", ex);
            }
        }
        return uri;
    }

    /**
     * Replaces protocol, host and port of the URI with the values found in the host directive.
     * <p>
     * Protocol is replaced only if declared. Port is replaced only together with the host; a host declared
     * without a port resets the port to default.
     *
     * @param uri original URI
     * @param php parsed host directive
     * @return updated URI
     * @throws URISyntaxException if the resulting URI is invalid
     */
    private URI updateURI(URI uri, PHP php) throws URISyntaxException {
        String scheme = php.protocol != null ? php.protocol : uri.getScheme();
        String host = php.host != null ? php.host : uri.getHost();
        int port = php.host != null ? (php.port != null ? php.port : -1) : uri.getPort();

        // Reassemble from the raw (still encoded) components. Going through the decoded getters would turn
        // escaped reserved characters such as %2F or %26 into real delimiters and alter the request.
        StringBuilder sb = new StringBuilder();
        if (scheme != null) {
            sb.append(scheme).append(':');
        }
        if (host != null) {
            sb.append("//");
            if (uri.getRawUserInfo() != null) {
                sb.append(uri.getRawUserInfo()).append('@');
            }
            sb.append(host);
            if (port != -1) {
                sb.append(':').append(port);
            }
        }
        if (uri.getRawPath() != null) {
            sb.append(uri.getRawPath());
        }
        if (uri.getRawQuery() != null) {
            sb.append('?').append(uri.getRawQuery());
        }
        if (uri.getRawFragment() != null) {
            sb.append('#').append(uri.getRawFragment());
        }
        // insist on a server-based authority so that an invalid host is reported instead of silently accepted
        return new URI(sb.toString()).parseServerAuthority();
    }

    /**
     * Parses host directive given as "[protocol://]host[:port]".
     *
     * @param host value of the host directive (may be <code>null</code> or blank)
     * @return parsed directive or <code>null</code> if none of protocol, host or port could be read
     */
    private PHP parsePHP(String host) {
        host = StringUtils.trimToEmpty(host);
        if (host.isEmpty()) {
            return null;
        }

        // parse protocol
        String protocolPart = null;
        int protocolStopIdx = host.indexOf(PROTOCOL_SEPARATOR);
        if (protocolStopIdx >= 0) {
            protocolPart = protocolStopIdx > 0 ? host.substring(0, protocolStopIdx) : null;
            host = host.substring(protocolStopIdx + PROTOCOL_SEPARATOR.length());
        }

        // parse host:port
        String hostPart = null;
        Integer portPart = null;
        if (!host.isEmpty()) {
            int hostStopIdx = host.indexOf(PORT_SEPARATOR);
            if (hostStopIdx < 0) {
                hostPart = host;
            } else {
                hostPart = hostStopIdx > 0 ? host.substring(0, hostStopIdx) : null;
                try {
                    portPart = Integer.parseInt(host.substring(hostStopIdx + PORT_SEPARATOR.length()));
                } catch (NumberFormatException ex) {
                    // best effort: an unreadable port is ignored and the default port will be used
                    LOG.debug("Ignoring invalid port in robots.txt host directive: {}", host);
                }
            }
        }

        if (protocolPart == null && hostPart == null && portPart == null) {
            return null;
        }
        return new PHP(protocolPart, hostPart, portPart);
    }

    /**
     * Protocol-host-port
     */
    private static class PHP {
        final String protocol;
        final String host;
        final Integer port;

        public PHP(String protocol, String host, Integer port) {
            this.protocol = protocol;
            this.host = host;
            this.port = port;
        }
    }
}