org.sbs.goodcrawler.jobconf.FetchConfig.java Source code

Java tutorial

Introduction

Here is the source code for org.sbs.goodcrawler.jobconf.FetchConfig.java

Source

/**
 * ########################  SHENBAISE'S WORK  ##########################
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sbs.goodcrawler.jobconf;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.sbs.goodcrawler.conf.Configuration;
import org.sbs.goodcrawler.exception.ConfigurationException;
import org.sbs.goodcrawler.exception.QueueException;
import org.sbs.pendingqueue.PendingManager;
import org.sbs.url.WebURL;
import org.sbs.util.BloomfilterHelper;

import com.google.common.collect.Lists;

/**
 * @author whiteme
 * @date 201383
 * @desc 
 */
public class FetchConfig extends Configuration {
    private Log log = LogFactory.getLog(this.getClass());

    public FetchConfig() {

    }

    private String type;
    /**
     * job??
     */
    private int threadNum;
    /**
     * Socket??
     */
    private int socketTimeoutMilliseconds = 10000;
    /**
     * connection??
     */
    private int connectionTimeout = 20000;
    /**
     * 
     */
    private int delayBetweenRequests = 200;
    /**
     * ?-1?
     */
    private int maxDepthOfCrawling = -1;
    /**
     * ???
     */
    private int maxOutgoingLinksToFollow = 5000;
    /**
     * ?
     */
    private boolean fetchBinaryContent = false;
    /**
     * ??
     */
    private String fileSuffix = "jpg,gif,png,avi,mtk";
    /**
     * agent
     */
    private String agent;
    /**
     * ?https
     */
    private boolean https = true;
    /**
     * ?????
     */
    private boolean onlyDomain = true;
    /**
     * ??robots??
     */
    private boolean robots = true;
    /**
     * 
     */
    private int maxTotalConnections = 100;
    /**
     * ?
     */
    private int maxConnectionsPerHost = 100;
    /**
     * ?????
     */
    private int maxDownloadSizePerPage = 1048576;
    /**
     * ?
     */
    private String proxyHost = null;
    /**
     * ??
     */
    private int proxyPort = 80;

    /**
     * ???
     */
    private String proxyUsername = null;

    /**
     * ??
     */
    private String proxyPassword = null;
    /**
     * ???
     */
    private List<String> seeds = Lists.newArrayList();
    /**
     * url
     */
    private List<String> fetchUrlFilters = Lists.newArrayList();
    /**
     * ?Url?
     */
    private List<String> extractUrlfilters = Lists.newArrayList();

    public int getThreadNum() {
        return threadNum;
    }

    public void setThreadNum(int threadNum) {
        this.threadNum = threadNum;
    }

    public int getSocketTimeoutMilliseconds() {
        return socketTimeoutMilliseconds;
    }

    public void setSocketTimeoutMilliseconds(int socketTimeoutMilliseconds) {
        this.socketTimeoutMilliseconds = socketTimeoutMilliseconds;
    }

    public int getConnectionTimeout() {
        return connectionTimeout;
    }

    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    public int getDelayBetweenRequests() {
        return delayBetweenRequests;
    }

    public void setDelayBetweenRequests(int delayBetweenRequests) {
        this.delayBetweenRequests = delayBetweenRequests;
    }

    public int getMaxDepthOfCrawling() {
        return maxDepthOfCrawling;
    }

    public void setMaxDepthOfCrawling(int maxDepthOfCrawling) {
        this.maxDepthOfCrawling = maxDepthOfCrawling;
    }

    public int getMaxOutgoingLinksToFollow() {
        return maxOutgoingLinksToFollow;
    }

    public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) {
        this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow;
    }

    public boolean isFetchBinaryContent() {
        return fetchBinaryContent;
    }

    public void setFetchBinaryContent(boolean fetchBinaryContent) {
        this.fetchBinaryContent = fetchBinaryContent;
    }

    public String getFileSuffix() {
        return fileSuffix;
    }

    public void setFileSuffix(String fileSuffix) {
        this.fileSuffix = fileSuffix;
    }

    public String getAgent() {
        return agent;
    }

    public void setAgent(String agent) {
        this.agent = agent;
    }

    public boolean isHttps() {
        return https;
    }

    public void setHttps(boolean https) {
        this.https = https;
    }

    public boolean isOnlyDomain() {
        return onlyDomain;
    }

    public void setOnlyDomain(boolean onlyDomain) {
        this.onlyDomain = onlyDomain;
    }

    public boolean isRobots() {
        return robots;
    }

    public void setRobots(boolean robots) {
        this.robots = robots;
    }

    public int getMaxTotalConnections() {
        return maxTotalConnections;
    }

    public void setMaxTotalConnections(int maxTotalConnections) {
        this.maxTotalConnections = maxTotalConnections;
    }

    public int getMaxConnectionsPerHost() {
        return maxConnectionsPerHost;
    }

    public void setMaxConnectionsPerHost(int maxConnectionsPerHost) {
        this.maxConnectionsPerHost = maxConnectionsPerHost;
    }

    public int getMaxDownloadSizePerPage() {
        return maxDownloadSizePerPage;
    }

    public void setMaxDownloadSizePerPage(int maxDownloadSizePerPage) {
        this.maxDownloadSizePerPage = maxDownloadSizePerPage;
    }

    public String getProxyHost() {
        return proxyHost;
    }

    public void setProxyHost(String proxyHost) {
        this.proxyHost = proxyHost;
    }

    public int getProxyPort() {
        return proxyPort;
    }

    public void setProxyPort(int proxyPort) {
        this.proxyPort = proxyPort;
    }

    public String getProxyUsername() {
        return proxyUsername;
    }

    public void setProxyUsername(String proxyUsername) {
        this.proxyUsername = proxyUsername;
    }

    public String getProxyPassword() {
        return proxyPassword;
    }

    public void setProxyPassword(String proxyPassword) {
        this.proxyPassword = proxyPassword;
    }

    public List<String> getSeeds() {
        return seeds;
    }

    public void setSeeds(List<String> seeds) {
        this.seeds = seeds;
    }

    public List<String> getFetchUrlFilters() {
        return fetchUrlFilters;
    }

    public void setFetchUrlFilters(List<String> fetchUrlFilters) {
        this.fetchUrlFilters = fetchUrlFilters;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public List<String> getExtractUrlfilters() {
        return extractUrlfilters;
    }

    public void setExtractUrlfilters(List<String> extractUrlfilters) {
        this.extractUrlfilters = extractUrlfilters;
    }

    /**
     * ???
     * @param confFile
     * @return
     */
    public FetchConfig loadConfig(Document confDoc) throws ConfigurationException {
        try {
            Document doc = confDoc;
            super.jobName = doc.select("job").attr("name");
            super.indexName = doc.select("job").attr("indexName");
            Elements e = doc.select("fetch");
            this.type = e.select("type").text();
            this.agent = e.select("agent").text();
            String temp = e.select("threadNum").text();
            if (StringUtils.isNotBlank(temp)) {
                this.threadNum = Integer.parseInt(temp);
            }

            temp = e.select("delayBetweenRequests").text();
            if (StringUtils.isNotBlank(temp)) {
                this.delayBetweenRequests = Integer.parseInt(temp);
            }

            temp = e.select("maxDepthOfCrawling").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxDepthOfCrawling = Integer.parseInt(temp);
            }

            temp = e.select("fetchBinaryContent").text();
            if (StringUtils.isNotBlank(temp)) {
                this.fetchBinaryContent = Boolean.parseBoolean(temp);
            }

            if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) {
                this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text());
            }

            temp = e.select("fileSuffix").text();
            if (StringUtils.isNotBlank(temp)) {
                this.fileSuffix = temp;
            }

            temp = e.select("maxDownloadSizePerPage").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxDownloadSizePerPage = Integer.parseInt(temp);
            }

            temp = e.select("https").text();
            if (StringUtils.isNotBlank(temp)) {
                this.https = Boolean.parseBoolean(temp);
            }

            temp = e.select("onlyDomain").text();
            if (StringUtils.isNotBlank(temp)) {
                this.onlyDomain = Boolean.parseBoolean(temp);
            }

            temp = e.select("socketTimeoutMilliseconds").text();
            if (StringUtils.isNotBlank(temp)) {
                this.socketTimeoutMilliseconds = Integer.parseInt(temp);
            }

            temp = e.select("connectionTimeout").text();
            if (StringUtils.isNotBlank(temp)) {
                this.connectionTimeout = Integer.parseInt(temp);
            }

            temp = e.select("maxTotalConnections").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxTotalConnections = Integer.parseInt(temp);
            }

            temp = e.select("maxConnectionsPerHost").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text());
            }

            temp = e.select("maxConnectionsPerHost").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxConnectionsPerHost = Integer.parseInt(temp);
            }

            if (StringUtils.isNotBlank(e.select("proxyHost").text())) {
                this.proxyHost = e.select("proxyHost").text();
            }
            if (StringUtils.isNotBlank(e.select("proxyPort").text())) {
                this.proxyPort = Integer.parseInt(e.select("proxyPort").text());
            }
            if (StringUtils.isNotBlank(e.select("proxyUsername").text())) {
                this.proxyUsername = e.select("proxyUsername").text();
            }
            if (StringUtils.isNotBlank(e.select("proxyPassword").text())) {
                this.proxyPassword = e.select("proxyPassword").text();
            }
            if (StringUtils.isNotBlank(e.select("proxyHost").text())) {
                this.proxyHost = e.select("proxyHost").text();
            }

            // seed
            Elements seeds = doc.select("fetch seeds seed");
            for (Element element : seeds) {
                WebURL url = new WebURL();
                String seed = element.text();
                this.seeds.add(seed);
                url.setURL(seed);
                url.setJobName(jobName);
                url.setDepth((short) 0);
                try {
                    PendingManager.getPendingUlr(jobName).addElement(url);
                    BloomfilterHelper.getInstance().add(url.getURL());
                } catch (QueueException e1) {
                    e1.printStackTrace();
                }
            }

            /*
             * ??Url
             */
            Elements fetchUrlFilters = doc.select("fetchUrlFilters filter");
            for (Element element : fetchUrlFilters) {
                this.fetchUrlFilters.add(element.text());
            }

            /*
             * ?????Url
             */
            Elements extractUrlfilters = doc.select("extractUrlfilters filter");
            for (Element element : extractUrlfilters) {
                this.extractUrlfilters.add(element.text());
            }
        } catch (NumberFormatException e) {
            throw new ConfigurationException("?" + e.getMessage());
        }

        return this;
    }

    @Override
    public String toString() {
        final int maxLen = 10;
        StringBuilder builder = new StringBuilder();
        builder.append("FetchConfig [log=").append(log).append(", type=").append(type).append(", threadNum=")
                .append(threadNum).append(", socketTimeoutMilliseconds=").append(socketTimeoutMilliseconds)
                .append(", connectionTimeout=").append(connectionTimeout).append(", delayBetweenRequests=")
                .append(delayBetweenRequests).append(", maxDepthOfCrawling=").append(maxDepthOfCrawling)
                .append(", maxOutgoingLinksToFollow=").append(maxOutgoingLinksToFollow)
                .append(", fetchBinaryContent=").append(fetchBinaryContent).append(", fileSuffix=")
                .append(fileSuffix).append(", agent=").append(agent).append(", https=").append(https)
                .append(", onlyDomain=").append(onlyDomain).append(", robots=").append(robots)
                .append(", maxTotalConnections=").append(maxTotalConnections).append(", maxConnectionsPerHost=")
                .append(maxConnectionsPerHost).append(", maxDownloadSizePerPage=").append(maxDownloadSizePerPage)
                .append(", proxyHost=").append(proxyHost).append(", proxyPort=").append(proxyPort)
                .append(", proxyUsername=").append(proxyUsername).append(", proxyPassword=").append(proxyPassword)
                .append(", seeds=").append(seeds != null ? seeds.subList(0, Math.min(seeds.size(), maxLen)) : null)
                .append(", fetchUrlFilters=")
                .append(fetchUrlFilters != null
                        ? fetchUrlFilters.subList(0, Math.min(fetchUrlFilters.size(), maxLen))
                        : null)
                .append(", extractUrlfilters=")
                .append(extractUrlfilters != null
                        ? extractUrlfilters.subList(0, Math.min(extractUrlfilters.size(), maxLen))
                        : null)
                .append("]");
        return builder.toString();
    }

    // test
    public static void main(String[] args) {
        FetchConfig fetchConfig = new FetchConfig();
        Document document;
        try {
            document = Jsoup.parse(new File("conf/youku_conf.xml"), "utf-8");
            System.out.println(fetchConfig.loadConfig(document).toString());
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ConfigurationException e) {
            e.printStackTrace();
        }
    }
}