org.berlin.crawl.net.RobotsConnector.java Source code

Java tutorial

Introduction

Here is the source code for org.berlin.crawl.net.RobotsConnector.java

Source

/* Copyright (c) 2013 Berlin Brown (berlin2research.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.berlin.crawl.net;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.Charset;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.berlin.crawl.error.CrawlerError;
import org.berlin.crawl.util.OctaneCrawlerConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Fetches the {@code /robots.txt} document of a host over HTTP and returns its
 * text.
 *
 * <p>The last URI builder used and the last HTTP response received are kept on
 * the instance and exposed through {@link #getLastURIBuilder()} and
 * {@link #getResponse()}. {@link #connect(URIBuilder)} is {@code synchronized};
 * the getters are not.
 */
public class RobotsConnector {

    private static final Logger logger = LoggerFactory.getLogger(RobotsConnector.class);
    private static final String NL = System.getProperty("line.separator");

    /** The only status code that is treated as a successful fetch. */
    private static final int HTTP_OK = 200;

    /**
     * Charset used to decode the response body. Fixed to UTF-8 so that decoding
     * does not depend on the platform default charset.
     */
    private static final Charset ROBOTS_CHARSET = Charset.forName("UTF-8");

    private URIBuilder lastURIBuilder = null;
    private HttpResponse response = null;

    /**
     * Builds {@code scheme://host/robots.txt} and fetches it.
     *
     * @param scheme URI scheme passed to the builder, e.g. {@code "http"}
     * @param host host name passed to the builder
     * @return the document text, or {@code null} when the response has no body
     * @throws Exception a {@code CrawlerError} when the request fails or the
     *             status code is not 200
     */
    public String connect(final String scheme, final String host) throws Exception {
        final URIBuilder builder = new URIBuilder();
        builder.setScheme(scheme);
        builder.setHost(host);
        builder.setPath("/robots.txt");
        return this.connect(builder);
    } // End of the method //

    /**
     * Connect to the robots.txt file described by {@code builder}.
     *
     * <p>The body is read line by line and every line is terminated with the
     * platform line separator. The response stream is always closed and the
     * per-request HTTP client is always shut down before this method returns
     * or throws, so the entity of {@link #getResponse()} is no longer readable
     * afterwards (status line and headers still are).
     *
     * @param builder builder for the URI to request; remembered as the last
     *            URI builder even when the request fails
     * @return the document text, or {@code null} when the response carries no
     *         entity or no content stream
     * @throws Exception a {@code CrawlerError}: with message
     *             {@code "Invalid status code - N"} when the status is not 200,
     *             or with message {@code "Error at connect"} wrapping the
     *             underlying failure otherwise
     */
    protected synchronized String connect(final URIBuilder builder) throws Exception {
        this.lastURIBuilder = builder;
        HttpClient httpclient = null;
        InputStream instream = null;
        try {
            logger.info("Attempting request : {}", builder);
            final HttpParams params = new BasicHttpParams();
            final HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
            paramsBean.setUserAgent(OctaneCrawlerConstants.USER_AGENT);
            // Set this to false, or else you'll get an
            // Expectation Failed: error
            paramsBean.setUseExpectContinue(false);

            final URI uri = builder.build();
            httpclient = new DefaultHttpClient();
            final HttpGet httpget = new HttpGet(uri);
            httpget.setParams(params);

            // Connect //
            final HttpResponse httpResponse = httpclient.execute(httpget);
            this.response = httpResponse;

            if (httpResponse.getStatusLine() != null) {
                final int statusCode = httpResponse.getStatusLine().getStatusCode();
                if (statusCode != HTTP_OK) {
                    // Log the error line
                    logger.error("Invalid status code - " + statusCode);
                    throw new CrawlerError("Invalid status code - " + statusCode);
                }
            }

            final HttpEntity entity = httpResponse.getEntity();
            if (entity != null) {
                instream = entity.getContent();
                if (instream != null) {
                    return readDocument(instream);
                }
            }
            // Kept from the original implementation: brief pause when the
            // response has no body. NOTE(review): purpose not evident from
            // this file - confirm whether it is still needed.
            Thread.sleep(100);
            return null;
        } catch (final CrawlerError e) {
            // Already logged where it was raised; do not log or wrap it again.
            throw e;
        } catch (final InterruptedException e) {
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
            logger.error("Error at robots connect", e);
            throw new CrawlerError("Error at connect", e);
        } catch (final Exception e) {
            logger.error("Error at robots connect", e);
            throw new CrawlerError("Error at connect", e);
        } finally {
            closeQuietly(instream);
            if (httpclient != null) {
                // Release the sockets held by this per-request client, also on
                // the error paths where the entity was never consumed.
                httpclient.getConnectionManager().shutdown();
            }
        } // End of the try - catch block //
    } // End of the method //

    /**
     * Reads the whole stream as text, appending the platform line separator
     * after every line. Does not close the stream.
     */
    private static String readDocument(final InputStream instream) throws IOException {
        final StringBuilder document = new StringBuilder();
        final BufferedReader reader = new BufferedReader(new InputStreamReader(instream, ROBOTS_CHARSET));
        String line;
        while ((line = reader.readLine()) != null) {
            document.append(line);
            document.append(NL);
        } // End of the while //
        return document.toString();
    }

    /** Closes the stream if it is non-null, logging instead of propagating a close failure. */
    private static void closeQuietly(final InputStream instream) {
        if (instream == null) {
            return;
        }
        try {
            instream.close();
        } catch (final IOException e) {
            logger.warn("Error closing robots response stream", e);
        }
    }

    /**
     * @return the URI builder passed to the most recent
     *         {@link #connect(URIBuilder)} call, or {@code null} if none yet
     */
    public URIBuilder getLastURIBuilder() {
        return lastURIBuilder;
    }

    /**
     * @return the HTTP response of the most recent request that received one,
     *         or {@code null} if none yet
     */
    public HttpResponse getResponse() {
        return response;
    }

} // End of the class //