org.eweb4j.spiderman.plugin.util.PageFetcherImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.eweb4j.spiderman.plugin.util.PageFetcherImpl.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.eweb4j.spiderman.plugin.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpOptions;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntity;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.eweb4j.mvc.Http;
import org.eweb4j.spiderman.fetcher.FetchRequest;
import org.eweb4j.spiderman.fetcher.FetchResult;
import org.eweb4j.spiderman.fetcher.Page;
import org.eweb4j.spiderman.fetcher.PageFetcher;
import org.eweb4j.spiderman.fetcher.Status;
import org.eweb4j.spiderman.xml.Site;
import org.eweb4j.util.CommonUtil;

/**
 * Web page fetcher: downloads pages over HTTP on behalf of the spider.
 * @author weiwei l.weiwei@163.com
 * @date 2013-1-7 ?11:04:50
 */
public class PageFetcherImpl implements PageFetcher {

    private ThreadSafeClientConnManager connectionManager;
    private DefaultHttpClient httpClient;
    private final Object mutex = new Object();
    private long lastFetchTime = 0;
    private SpiderConfig config;
    private Map<String, String> headers = new Hashtable<String, String>();
    private Map<String, List<String>> cookies = new Hashtable<String, List<String>>();
    private Site site;

    /**
     * Creates an unconfigured fetcher. The HTTP client is not built here; it is
     * created by {@link #init(Site)}, which reads the configuration supplied
     * through {@link #setConfig(SpiderConfig)}.
     */
    public PageFetcherImpl() {
    }

    /**
     * ?GZIP
     * @author weiwei l.weiwei@163.com
     * @date 2013-1-7 ?11:26:24
     */
    private static class GzipDecompressingEntity extends HttpEntityWrapper {
        public GzipDecompressingEntity(final HttpEntity entity) {
            super(entity);
        }

        public InputStream getContent() throws IOException, IllegalStateException {
            InputStream wrappedin = wrappedEntity.getContent();
            return new GZIPInputStream(wrappedin);
        }

        public long getContentLength() {
            return -1;
        }
    }

    /**
     * Sets the spider configuration. It is read by {@link #init(Site)} (user agent,
     * timeouts, connection limits, HTTPS and redirect switches) and on every fetch
     * (politeness delay, charset), so it must be set before either is called.
     *
     * @param config the configuration to use
     */
    public void setConfig(SpiderConfig config) {
        this.config = config;
    }

    /**
     * Registers a cookie both in this fetcher's bookkeeping map (copied into every
     * request's cookie map) and in the HTTP client's cookie store.
     *
     * @param key  cookie name
     * @param val  cookie value
     * @param host domain the cookie applies to
     * @param path path the cookie applies to
     * @throws IllegalStateException if called before {@link #init(Site)} has created the HTTP client
     */
    public void addCookie(String key, String val, String host, String path) {
        // Fail fast, before touching any state, instead of hitting a NullPointerException
        // after the bookkeeping map has already been modified.
        if (httpClient == null)
            throw new IllegalStateException("init(Site) must be called before addCookie");

        Cookie c = new Cookie(key, val, host, path);
        String name = c.name();
        String value = c.value();

        // Look up and store under the SAME key; the original read with name but wrote with key.
        List<String> vals = this.cookies.get(name);
        if (vals == null) {
            vals = new ArrayList<String>();
            this.cookies.put(name, vals);
        }
        vals.add(value);

        BasicClientCookie clientCookie = new BasicClientCookie(name, value);
        clientCookie.setPath(c.path());
        clientCookie.setDomain(c.domain());
        httpClient.getCookieStore().addCookie(clientCookie);
    }

    /**
     * Registers a header to be sent with every request. When the header name is
     * already present, the new value is appended to the existing one, separated by "; ".
     *
     * @param key header name
     * @param val header value
     */
    public void addHeader(String key, String val) {
        final String merged;
        if (!this.headers.containsKey(key))
            merged = val;
        else
            merged = this.headers.get(key) + "; " + val;
        this.headers.put(key, merged);
    }

    /**
     * Builds the pooled HTTP client from the current configuration, installs the
     * GZIP-decoding response interceptor, and registers the headers and cookies
     * declared by the given site.
     * <p>
     * {@link #setConfig(SpiderConfig)} must have been called first.
     *
     * @param _site site definition whose headers and cookies are registered; when
     *              {@code null} only the HTTP client is set up
     */
    public void init(Site _site) {
        // Basic HTTP parameters taken from the configuration.
        HttpParams params = new BasicHttpParams();
        params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
        params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
        params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());

        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);

        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));

        // HTTPS is only registered when the configuration asks for it.
        if (config.isIncludeHttpsPages())
            schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
        connectionManager.setMaxTotal(config.getMaxTotalConnections());
        connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
        httpClient = new DefaultHttpClient(connectionManager, params);

        // NOTE(review): this hard-coded 60s value appears to override the socket timeout
        // taken from the configuration above - confirm whether that is intended.
        httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
        httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
        httpClient.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS, config.isFollowRedirects());

        // Transparently decode GZIP-compressed response bodies.
        httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
            public void process(final HttpResponse response, final HttpContext context)
                    throws HttpException, IOException {
                HttpEntity entity = response.getEntity();
                // Responses without a body (HEAD, 204, 304, ...) carry no entity at all.
                if (entity == null)
                    return;

                Header contentEncoding = entity.getContentEncoding();
                if (contentEncoding == null)
                    return;

                for (HeaderElement codec : contentEncoding.getElements()) {
                    if ("gzip".equalsIgnoreCase(codec.getName())) {
                        response.setEntity(new GzipDecompressingEntity(entity));
                        return;
                    }
                }
            }
        });

        if (_site != null) {
            this.site = _site;
            if (this.site.getHeaders() != null && this.site.getHeaders().getHeader() != null) {
                for (org.eweb4j.spiderman.xml.Header header : this.site.getHeaders().getHeader()) {
                    this.addHeader(header.getName(), header.getValue());
                }
            }
            if (this.site.getCookies() != null && this.site.getCookies().getCookie() != null) {
                for (org.eweb4j.spiderman.xml.Cookie cookie : this.site.getCookies().getCookie()) {
                    this.addCookie(cookie.getName(), cookie.getValue(), cookie.getHost(), cookie.getPath());
                }
            }
        }
    }

    /**
     * Fetches the page addressed by {@code req}.
     * <p>
     * A request whose HTTP method is {@code null} or exactly {@code Http.Method.GET} is
     * issued as a plain GET. For any other method the URL's query-string parameters are
     * first copied into {@code req.getParams()} and the call is handed to
     * {@link #request(FetchRequest)}.
     *
     * @param req the request to perform; its header and cookie maps are updated with what is sent
     * @return the fetch result; failures while talking to the server are reported through
     *         the result's status code instead of being thrown
     * @throws Exception if the URL of a non-GET request cannot be parsed
     */
    public FetchResult fetch(FetchRequest req) throws Exception {
        if (req.getHttpMethod() != null && !Http.Method.GET.equals(req.getHttpMethod())) {
            mergeQueryIntoParams(req);
            return request(req);
        }
        return doExecute(req, true);
    }

    /**
     * Performs the request using the HTTP method named by {@code req.getHttpMethod()}
     * (GET, POST, PUT, HEAD, OPTIONS or DELETE, case-insensitive). POST and PUT send the
     * request's params, and its files when present, as the request body.
     *
     * @param req the request to perform; its header and cookie maps are updated with what is sent
     * @return the fetch result; failures (including an unknown method name) are reported
     *         through the result's status code instead of being thrown
     * @throws Exception declared for interface compatibility
     */
    public FetchResult request(FetchRequest req) throws Exception {
        return doExecute(req, false);
    }

    /**
     * Copies the query-string parameters of the request URL into {@code req.getParams()}.
     * A URL without a query string, empty pairs and parameters without a value are
     * tolerated; a valueless parameter is stored with an empty string.
     */
    private void mergeQueryIntoParams(FetchRequest req) throws IOException {
        String query = new URL(req.getUrl()).getQuery();
        if (query == null || query.length() == 0)
            return;

        for (String pair : query.split("\\&")) {
            if (pair.length() == 0)
                continue;
            // Split on the first '=' only so values that contain '=' stay intact.
            String[] nameAndValue = pair.split("=", 2);
            String name = nameAndValue[0];
            String val = nameAndValue.length > 1 ? nameAndValue[1] : "";

            List<Object> vals = req.getParams().get(name);
            if (vals == null) {
                vals = new ArrayList<Object>();
                req.getParams().put(name, vals);
            }
            vals.add(val);
        }
    }

    /**
     * Shared implementation of {@link #fetch(FetchRequest)} and {@link #request(FetchRequest)}.
     *
     * @param req      the request to perform
     * @param forceGet when {@code true} a GET is issued regardless of the request's method
     */
    private FetchResult doExecute(FetchRequest req, boolean forceGet) {
        FetchResult fetchResult = new FetchResult();
        HttpUriRequest request = null;
        // True once the response body has been handed to assemPage successfully.
        boolean bodyRead = false;
        String toFetchURL = req.getUrl();
        try {
            if (forceGet)
                request = new HttpGet(toFetchURL);
            else
                request = createRequest(req.getHttpMethod(), toFetchURL);

            awaitPolitenessDelay();

            // Ask for GZIP; the response interceptor installed by init() decodes it.
            request.addHeader("Accept-Encoding", "gzip");
            for (Entry<String, String> header : this.headers.entrySet())
                request.addHeader(header.getKey(), header.getValue());

            // Record what is actually being sent on the request object.
            collectHeaders(request.getAllHeaders(), req.getHeaders());
            req.getCookies().putAll(this.cookies);
            fetchResult.setReq(req);

            if (request instanceof HttpPost)
                ((HttpPost) request).setEntity(buildRequestBody(req));
            else if (request instanceof HttpPut)
                ((HttpPut) request).setEntity(buildRequestBody(req));

            HttpResponse response = httpClient.execute(request);
            collectHeaders(response.getAllHeaders(), fetchResult.getHeaders());

            fetchResult.setFetchedUrl(toFetchURL);
            String uri = request.getURI().toString();
            if (!uri.equals(toFetchURL) && !URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL))
                fetchResult.setFetchedUrl(uri);

            HttpEntity entity = response.getEntity();
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                // For 301/302 remember where the server wants us to go.
                Header locationHeader = response.getFirstHeader("Location");
                if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY))
                    fetchResult.setMovedToUrl(
                            URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL));

                // The site may list status codes whose body should be kept anyway.
                if (entity != null && isSkipStatusCode(statusCode)) {
                    assemPage(fetchResult, entity);
                    bodyRead = true;
                }
                fetchResult.setStatusCode(statusCode);
                return fetchResult;
            }

            if (entity != null) {
                fetchResult.setStatusCode(statusCode);
                assemPage(fetchResult, entity);
                bodyRead = true;
                return fetchResult;
            }
        } catch (Throwable e) {
            // Keep the interrupt visible to the caller's thread management.
            if (e instanceof InterruptedException)
                Thread.currentThread().interrupt();
            // The failure description is reported through the fetched-URL field.
            fetchResult.setFetchedUrl(e.toString());
            fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal());
            return fetchResult;
        } finally {
            // Whenever the body was not read (no entity, unwanted status, or a failure),
            // abort so the pooled connection is given back instead of leaking.
            if (request != null && !bodyRead)
                request.abort();
        }

        fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal());
        return fetchResult;
    }

    /**
     * Creates the HttpClient request object matching the given method name (case-insensitive).
     */
    private HttpUriRequest createRequest(String method, String url) throws Exception {
        if (Http.Method.GET.equalsIgnoreCase(method))
            return new HttpGet(url);
        if (Http.Method.POST.equalsIgnoreCase(method))
            return new HttpPost(url);
        if (Http.Method.PUT.equalsIgnoreCase(method))
            return new HttpPut(url);
        if (Http.Method.HEAD.equalsIgnoreCase(method))
            return new HttpHead(url);
        if (Http.Method.OPTIONS.equalsIgnoreCase(method))
            return new HttpOptions(url);
        if (Http.Method.DELETE.equalsIgnoreCase(method))
            return new HttpDelete(url);
        throw new Exception("Unknown http method name");
    }

    /**
     * Blocks until at least the configured politeness delay has passed since the previous
     * fetch made through this instance, then records the current time as the last fetch time.
     */
    private void awaitPolitenessDelay() throws InterruptedException {
        synchronized (mutex) {
            long delay = config.getPolitenessDelay();
            long elapsed = System.currentTimeMillis() - lastFetchTime;
            if (elapsed < delay)
                Thread.sleep(delay - elapsed);
            lastFetchTime = System.currentTimeMillis();
        }
    }

    /**
     * Appends every header in {@code source} to the multi-valued map {@code target}.
     */
    private static void collectHeaders(Header[] source, Map<String, List<String>> target) {
        for (Header h : source) {
            String key = h.getName();
            List<String> values = target.get(key);
            if (values == null) {
                values = new ArrayList<String>();
                target.put(key, values);
            }
            values.add(h.getValue());
        }
    }

    /**
     * Builds the POST/PUT body: a multipart entity when files are attached, otherwise a
     * UTF-8 URL-encoded form made from the request params.
     */
    private HttpEntity buildRequestBody(FetchRequest req) throws IOException {
        if (!req.getFiles().isEmpty()) {
            MultipartEntity multipart = new MultipartEntity(HttpMultipartMode.BROWSER_COMPATIBLE);
            for (Entry<String, List<File>> e : req.getFiles().entrySet()) {
                for (File file : e.getValue())
                    multipart.addPart(e.getKey(), new FileBody(file));
            }
            for (Entry<String, List<Object>> e : req.getParams().entrySet()) {
                for (Object paramValue : e.getValue())
                    multipart.addPart(e.getKey(), new StringBody(
                            String.valueOf(paramValue), "text/plain", Charset.forName("UTF-8")));
            }
            return multipart;
        }

        List<NameValuePair> params = new ArrayList<NameValuePair>(req.getParams().size());
        for (Entry<String, List<Object>> e : req.getParams().entrySet()) {
            for (Object paramValue : e.getValue())
                params.add(new BasicNameValuePair(e.getKey(), String.valueOf(paramValue)));
        }
        return new UrlEncodedFormEntity(params, HTTP.UTF_8);
    }

    /**
     * Tells whether the site lists {@code statusCode} among the comma-separated status
     * codes whose response body should still be loaded. Returns {@code false} when no
     * site was supplied to init() or the site declares no such codes.
     */
    private boolean isSkipStatusCode(int statusCode) {
        if (this.site == null)
            return false;
        String configured = this.site.getSkipStatusCode();
        if (configured == null || configured.trim().length() == 0)
            return false;
        for (String code : configured.split(",")) {
            if (statusCode == CommonUtil.toInt(code.trim()))
                return true;
        }
        return false;
    }

    /**
     * Loads the response entity into a {@code Page}, stamps it with the result's
     * fetched URL and attaches it to the result.
     *
     * @param fetchResult result that receives the page
     * @param entity      response entity to load
     * @throws Exception if loading the entity fails
     */
    private void assemPage(FetchResult fetchResult, HttpEntity entity) throws Exception {
        final Page loaded = load(entity);
        loaded.setUrl(fetchResult.getFetchedUrl());
        fetchResult.setPage(loaded);
    }

    /**
     * Converts a response entity into a {@code Page}: content type, content encoding,
     * declared charset and the decoded body text.
     * <p>
     * The body is decoded with the configured charset. When none is configured, the
     * charset declared by the response is used if this JVM supports it; only when
     * neither is available does decoding fall back to the platform default.
     *
     * @param entity the response entity to read
     * @return the populated page
     * @throws Exception if the entity content cannot be obtained
     */
    private Page load(HttpEntity entity) throws Exception {
        Page page = new Page();

        // Content type, when the response declares one.
        String contentType = null;
        Header type = entity.getContentType();
        if (type != null)
            contentType = type.getValue();
        page.setContentType(contentType);

        // Content encoding (e.g. gzip), when the response declares one.
        String contentEncoding = null;
        Header encoding = entity.getContentEncoding();
        if (encoding != null)
            contentEncoding = encoding.getValue();
        page.setEncoding(contentEncoding);

        // Charset declared by the response.
        String contentCharset = EntityUtils.getContentCharSet(entity);
        page.setCharset(contentCharset);

        // The configured charset wins; otherwise prefer what the server declared over
        // the platform default, which would garble pages in any other encoding.
        String charset = config.getCharset();
        if ((charset == null || charset.trim().length() == 0) && isUsableCharset(contentCharset))
            charset = contentCharset;

        String content = this.read(entity.getContent(), charset);
        page.setContent(content);

        return page;
    }

    /**
     * Tells whether {@code name} is a non-blank charset name this JVM can decode.
     */
    private static boolean isUsableCharset(String name) {
        if (name == null || name.trim().length() == 0)
            return false;
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException illegalName) {
            // Syntactically illegal charset name sent by the server.
            return false;
        }
    }

    /**
     * Reads the whole stream as text and closes it.
     * <p>
     * Lines are concatenated without their line terminators. Reading is best effort:
     * if an I/O error occurs (including an unsupported charset name) the text decoded
     * so far is returned.
     *
     * @param inputStream stream to read; {@code null} yields an empty string
     * @param charset     charset name used for decoding; when {@code null} or blank the
     *                    platform default charset is used
     * @return the decoded text, possibly empty
     */
    private String read(final InputStream inputStream, String charset) {
        StringBuilder sb = new StringBuilder();
        if (inputStream == null)
            return sb.toString();

        BufferedReader reader = null;
        try {
            if (charset == null || charset.trim().length() == 0)
                reader = new BufferedReader(new InputStreamReader(inputStream));
            else
                reader = new BufferedReader(new InputStreamReader(inputStream, charset));

            String line = null;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException ignored) {
            // Deliberately best effort: keep whatever was decoded before the failure.
        } finally {
            // Always close so the underlying connection/stream is released, even when
            // the reader could not be created or reading stopped early.
            try {
                if (reader != null)
                    reader.close();
                else
                    inputStream.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }

        return sb.toString();
    }

    /**
     * Reads the whole stream into a byte array. The stream is not closed.
     *
     * @param inputStream stream to read until end of stream
     * @return exactly the bytes read, possibly an empty array
     * @throws Exception if reading from the stream fails
     */
    private byte[] read(final InputStream inputStream) throws Exception {
        byte[] buffer = new byte[1000];
        int size = 0;
        try {
            int count;
            // Bulk reads into the free tail of the buffer; the buffer always has free
            // space here because it is grown as soon as it becomes full.
            while ((count = inputStream.read(buffer, size, buffer.length - size)) != -1) {
                size += count;
                if (size == buffer.length)
                    buffer = Arrays.copyOf(buffer, (buffer.length * 3) / 2 + 1);
            }
        } catch (IOException e) {
            throw new Exception("There was a problem reading stream.", e);
        }

        return Arrays.copyOf(buffer, size);
    }

    /**
     * Returns the underlying HTTP client.
     *
     * @return the client created by {@link #init(Site)}, or {@code null} if
     *         {@code init} has not been called yet
     */
    public HttpClient getHttpClient() {
        return httpClient;
    }

    /**
     * Proxy
     * if (config.getProxyHost() != null) {
     if (config.getProxyUsername() != null) {
        httpClient.getCredentialsProvider().setCredentials(
              new AuthScope(config.getProxyHost(), config.getProxyPort()),
              new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
     }
        
     HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
     httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }
     */
}