org.berlin.crawl.net.WebConnector.java Source code

Java tutorial

Introduction

Here is the source code for org.berlin.crawl.net.WebConnector.java

Source

/* Copyright (c) 2013 Berlin Brown (berlin2research.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.berlin.crawl.net;

/*
 * Octane crawler is a simple web crawler written in Java, released as open source under a liberal license.
 * 
 * http://code.google.com/p/octane-crawler/
 * http://berlin2research.com/
 * 
 * Author: Berlin Brown (berlin dot brown at gmail.com)
 * 
 * Libraries used:
 * ---------------- 
 * dom4j-1.6.1.jar, hibernate-core-4.0.1.Final.jar, hsqldb-1.8.0.10.jar, httpclient-4.2.3.jar, jackson-core-asl-1.9.12.jar, 
 * log4j-1.2.16.jar, mysql-connector-java-5.1.23.jar, opennlp-maxent-3.0.2-incubating.jar
 * opennlp-tools-1.5.2-incubating.jar, spring-core-3.1.1.RELEASE.jar, spring-web-3.1.1.RELEASE.jar, 
 * struts-core-1.3.10.jar, tagsoup-1.2.1.jar, tika-core-1.3.jar
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.berlin.crawl.bean.BotLink;
import org.berlin.crawl.bom.LinkProcessQueueDatabase;
import org.berlin.crawl.error.CrawlerError;
import org.berlin.crawl.util.OctaneCrawlerConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Downloads the body of a single link with an HTTP GET request and returns it
 * as text.
 *
 * <p>{@link #connect(BotLink, URIBuilder)} is {@code synchronized}, so calls on
 * the same instance run one at a time. {@link #getResponse()} is not
 * synchronized.
 */
public class WebConnector {

    /** Pause, in milliseconds, applied after every successful download. */
    public static final int LINK_PROCESS_DELAY = 200;

    /** User-Agent value sent with every request. */
    public static final String USER_AGENT = OctaneCrawlerConstants.USER_AGENT;

    private static final String NL = System.getProperty("line.separator");

    /** The only status code that {@link #connect(BotLink, URIBuilder)} accepts. */
    private static final int HTTP_OK = 200;

    /** Extracts the {@code charset} parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile(";\\s*charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    private static final Logger logger = LoggerFactory.getLogger(WebConnector.class);
    private final LinkProcessQueueDatabase db;

    /** Response received by the most recent request that got one; see {@link #getResponse()}. */
    private HttpResponse response;

    /**
     * @param db queue database that each successfully downloaded link is passed to
     */
    public WebConnector(final LinkProcessQueueDatabase db) {
        this.db = db;
    }

    /**
     * Issues a GET request for the URI described by {@code builder} and returns
     * the response body as text.
     *
     * <p>On a 200 response with a body, the status line and status code are
     * recorded on {@code blink}, the body is read line by line (each line
     * terminated with the platform line separator), {@code blink} is handed to
     * {@code db.proc}, and the method sleeps for {@link #LINK_PROCESS_DELAY}
     * milliseconds while still holding this instance's monitor.
     *
     * <p>The body is decoded with the charset named in the response's
     * Content-Type header; when none is declared, or the declared one is not
     * supported by this JVM, the platform default charset is used.
     *
     * @param blink link bean updated with the status line and status code
     * @param builder describes the URI to fetch
     * @return the downloaded document, or {@code null} when the response has no
     *         entity or the entity has no content stream
     * @throws CrawlerError on a non-200 status code or on any other failure;
     *         the original problem is attached as the cause. If the calling
     *         thread is interrupted during the delay, its interrupt flag is
     *         restored before the error is thrown.
     */
    public synchronized String connect(final BotLink blink, final URIBuilder builder) throws Exception {
        HttpClient httpclient = null;
        InputStream instream = null;
        try {
            logger.info("!* Attempting download and connect request : {}", builder);
            final HttpParams params = new BasicHttpParams();
            final HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
            paramsBean.setUserAgent(USER_AGENT);
            // Set this to false, or else you'll get an
            // Expectation Failed: error
            paramsBean.setUseExpectContinue(false);

            final URI uri = builder.build();
            httpclient = new DefaultHttpClient();
            final HttpGet httpget = new HttpGet(uri);
            httpget.setParams(params);

            // Connect //
            final HttpResponse httpResponse = httpclient.execute(httpget);
            this.response = httpResponse;

            if (httpResponse.getStatusLine() != null) {
                final int statusCode = httpResponse.getStatusLine().getStatusCode();
                if (statusCode != HTTP_OK) {
                    // Log the error line
                    logger.error("Invalid status code - " + statusCode);
                    throw new CrawlerError("Invalid status code - " + statusCode);
                }
            }

            final HttpEntity entity = httpResponse.getEntity();
            if (entity == null) {
                return null;
            }
            blink.setStatusline(String.valueOf(httpResponse.getStatusLine()));
            blink.setCode(httpResponse.getStatusLine().getStatusCode());
            instream = entity.getContent();
            if (instream == null) {
                return null;
            }

            final String document = readAll(instream, resolveCharset(entity));

            db.proc(blink);
            Thread.sleep(LINK_PROCESS_DELAY);

            return document;
        } catch (final InterruptedException e) {
            // Restore the flag so callers further up can still see the interruption.
            Thread.currentThread().interrupt();
            logger.error("Error at connect to LINK", e);
            throw new CrawlerError("Error at connect to LINK", e);
        } catch (final Throwable e) {
            logger.error("Error at connect to LINK", e);
            throw new CrawlerError("Error at connect to LINK", e);
        } finally {
            if (instream != null) {
                try {
                    instream.close();
                } catch (final IOException e) {
                    logger.warn("Could not close response stream", e);
                }
            }
            if (httpclient != null) {
                // A new client is created per call, so its connections must be
                // released here or they leak (also on the non-200 path, where
                // the entity is never consumed).
                httpclient.getConnectionManager().shutdown();
            }
        } // End of the try - catch block //
    } // End of the method //

    /**
     * Reads the whole stream as text, terminating every line with the platform
     * line separator.
     */
    private static String readAll(final InputStream in, final Charset charset) throws IOException {
        final StringBuilder document = new StringBuilder();
        final BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
        String line;
        while ((line = reader.readLine()) != null) {
            document.append(line).append(NL);
        }
        return document.toString();
    }

    /**
     * Picks the charset used to decode the entity: the one named by the
     * Content-Type header's {@code charset} parameter, or the platform default
     * when the header is absent, carries no charset, or names a charset this
     * JVM does not know.
     */
    private static Charset resolveCharset(final HttpEntity entity) {
        if (entity.getContentType() == null || entity.getContentType().getValue() == null) {
            return Charset.defaultCharset();
        }
        final Matcher matcher = CHARSET_PARAM.matcher(entity.getContentType().getValue());
        if (!matcher.find()) {
            return Charset.defaultCharset();
        }
        try {
            return Charset.forName(matcher.group(1));
        } catch (final IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            logger.warn("Unusable charset '{}' in Content-Type, using platform default", matcher.group(1));
            return Charset.defaultCharset();
        }
    }

    /**
     * @return the response received by the most recent request that got one,
     *         or {@code null} if no request has received a response yet
     */
    public HttpResponse getResponse() {
        return response;
    }

} // End of the class //