org.eweb4j.spiderman.plugin.util.WebDriverDownloader.java Source code

Java tutorial

Introduction

Here is the source code for org.eweb4j.spiderman.plugin.util.WebDriverDownloader.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.eweb4j.spiderman.plugin.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;

import org.apache.http.HttpStatus;
import org.eweb4j.spiderman.fetcher.FetchRequest;
import org.eweb4j.spiderman.fetcher.FetchResult;
import org.eweb4j.spiderman.fetcher.Page;
import org.eweb4j.spiderman.fetcher.PageFetcher;
import org.eweb4j.spiderman.fetcher.SpiderConfig;
import org.eweb4j.spiderman.fetcher.Status;
import org.eweb4j.spiderman.xml.Site;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

/**
 * Web page downloader backed by a Selenium WebDriver (Chrome).
 * @author weiwei l.weiwei@163.com
 * @date 2013-1-7 11:04:50
 */
public class WebDriverDownloader extends PageFetcher {

    /** System property / site option naming the chromedriver executable. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";

    private WebDriver client = null;
    /** Guards {@link #lastFetchTime} so that concurrent fetches are spaced out. */
    private final Object mutex = new Object();
    /** Wall-clock time (ms) recorded just before the most recent fetch was issued. */
    private long lastFetchTime = 0;
    private SpiderConfig config;
    private Map<String, String> headers = new Hashtable<String, String>();
    private Map<String, List<String>> cookies = new Hashtable<String, List<String>>();
    private Site site;

    public WebDriverDownloader() {
    }

    /**
     * Returns the underlying driver.
     *
     * @return the {@link WebDriver} created by {@link #init}, or {@code null} if
     *         {@code init} has not been called yet
     */
    public Object getClient() {
        return this.client;
    }

    /**
     * Records a cookie value that {@link #fetch} later copies onto each request.
     *
     * @param key  cookie name
     * @param val  cookie value
     * @param host cookie host
     * @param path cookie path
     */
    public void addCookie(String key, String val, String host, String path) {
        Cookie c = new Cookie(key, val, host, path);
        String name = c.name();
        String value = c.value();
        // NOTE(review): the lookup uses c.name() while the store below uses key;
        // presumably both are the same string - confirm against Cookie.
        List<String> vals = this.cookies.get(name);
        if (vals == null) {
            vals = new ArrayList<String>();
        }
        vals.add(value);
        this.cookies.put(key, vals);
    }

    /**
     * Records a header value that {@link #fetch} later copies onto each request.
     * Repeated calls with the same key append the new value, separated by "; ".
     *
     * @param key header name
     * @param val header value
     */
    public void addHeader(String key, String val) {
        String existing = this.headers.get(key);
        if (existing != null) {
            this.headers.put(key, existing + "; " + val);
        } else {
            this.headers.put(key, val);
        }
    }

    /**
     * Creates the Chrome driver and loads the headers and cookies declared on the site.
     *
     * <p>If the site defines the {@code webdriver.chrome.driver} option, it is exported
     * as a system property before the driver is created. A {@code null} site or a
     * missing option is tolerated; in that case the system property is left untouched.
     *
     * @param config crawler configuration (supplies the politeness delay used by fetch)
     * @param _site  site definition; may be {@code null}
     */
    public void init(SpiderConfig config, Site _site) {
        this.config = config;

        // Only touch the system property when a driver path is actually configured:
        // Properties.setProperty rejects null values, and _site itself may be null.
        if (_site != null) {
            String driverPath = _site.getOption(CHROME_DRIVER_PROPERTY);
            if (driverPath != null && driverPath.trim().length() > 0) {
                System.setProperty(CHROME_DRIVER_PROPERTY, driverPath);
            }
        }
        client = new ChromeDriver();

        if (_site != null) {
            this.site = _site;
            if (this.site.getHeaders() != null && this.site.getHeaders().getHeader() != null) {
                for (org.eweb4j.spiderman.xml.Header header : this.site.getHeaders().getHeader()) {
                    this.addHeader(header.getName(), header.getValue());
                }
            }
            if (this.site.getCookies() != null && this.site.getCookies().getCookie() != null) {
                for (org.eweb4j.spiderman.xml.Cookie cookie : this.site.getCookies().getCookie()) {
                    this.addCookie(cookie.getName(), cookie.getValue(), cookie.getHost(), cookie.getPath());
                }
            }
        }
    }

    /**
     * Loads the requested URL in the browser and wraps the rendered HTML in a
     * {@link FetchResult}.
     *
     * <p>Failures are not propagated: any {@link Throwable} is printed, its
     * {@code toString()} is stored as the fetched URL and the status code is set to
     * {@code Status.INTERNAL_SERVER_ERROR.ordinal()}.
     *
     * @param req the request describing the URL to load; the configured headers and
     *            cookies are copied onto it
     * @return the fetch result, never {@code null}
     */
    public FetchResult fetch(FetchRequest req) throws Exception {
        FetchResult fetchResult = new FetchResult();
        String toFetchURL = req.getUrl();
        try {
            waitForPolitenessDelay();

            // Record the configured headers on the request. They are only stored on the
            // request object here; nothing in this class passes them to the browser.
            // NOTE(review): addHeader joins values with "; " but this splits on "," - confirm intent.
            for (Map.Entry<String, String> header : this.headers.entrySet()) {
                List<String> values = Arrays.asList(header.getValue().split(","));
                req.getHeaders().put(header.getKey(), values);
            }

            req.getCookies().putAll(this.cookies);

            fetchResult.setReq(req);

            // First load, then install the site cookies, then load again.
            // Presumably the first load is needed because the browser only accepts
            // cookies for the page it is currently on - TODO confirm.
            this.client.get(toFetchURL);
            applySiteCookies();
            this.client.get(toFetchURL);

            WebElement html = this.client.findElement(By.tagName("html"));

            // Report the URL the browser actually ended up on when it differs from the
            // requested one (e.g. after a redirect), unless it is merely the
            // non-canonical spelling of the same URL.
            fetchResult.setFetchedUrl(toFetchURL);
            String currentUrl = this.client.getCurrentUrl();
            if (currentUrl != null && !currentUrl.equals(toFetchURL)
                    && !toFetchURL.equals(URLCanonicalizer.getCanonicalURL(currentUrl))) {
                fetchResult.setFetchedUrl(currentUrl);
            }

            if (html != null) {
                fetchResult.setStatusCode(HttpStatus.SC_OK);
                assemPage(fetchResult, html);
            }
        } catch (Throwable e) {
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the caller's thread management.
                Thread.currentThread().interrupt();
            }
            e.printStackTrace();
            fetchResult.setFetchedUrl(e.toString());
            fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal());
        }

        return fetchResult;
    }

    /**
     * Blocks until at least the configured politeness delay has elapsed since the
     * previous fetch, then records the current time as the new last-fetch time.
     * The wait happens while holding {@link #mutex}, so concurrent callers queue up.
     */
    private void waitForPolitenessDelay() throws InterruptedException {
        synchronized (mutex) {
            long delay = config.getPolitenessDelay();
            long elapsed = System.currentTimeMillis() - lastFetchTime;
            if (elapsed < delay) {
                Thread.sleep(delay - elapsed);
            }
            lastFetchTime = System.currentTimeMillis();
        }
    }

    /** Adds every cookie declared on the site (if any) to the browser session. */
    private void applySiteCookies() {
        if (this.site == null || this.site.getCookies() == null
                || this.site.getCookies().getCookie() == null) {
            return;
        }
        for (org.eweb4j.spiderman.xml.Cookie cookie : this.site.getCookies().getCookie()) {
            org.openqa.selenium.Cookie cok = new org.openqa.selenium.Cookie(cookie.getName(),
                    cookie.getValue(), cookie.getHost(), cookie.getPath(), null);
            this.client.manage().addCookie(cok);
        }
    }

    /** Builds the page from the html element and attaches it to the result. */
    private void assemPage(FetchResult fetchResult, WebElement html) throws Exception {
        Page page = load(html);
        page.setUrl(fetchResult.getFetchedUrl());
        fetchResult.setPage(page);
    }

    /** Creates a page whose content is the element's {@code outerHTML} attribute. */
    private Page load(WebElement html) throws Exception {
        Page page = new Page();
        page.setContent(html.getAttribute("outerHTML"));
        return page;
    }

    /**
     * Shuts the browser down on a best-effort basis. Does nothing if {@link #init}
     * was never called; shutdown failures are printed and not propagated.
     */
    public void close() throws Exception {
        if (this.client == null) {
            return;
        }
        try {
            // A single quit() is used instead of walking the window handles: per the
            // WebDriver documentation it closes every associated window.
            this.client.quit();
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }

}