com.fatwire.dta.sscrawler.URLReaderService.java Source code

Java tutorial

Introduction

Here is the source code for com.fatwire.dta.sscrawler.URLReaderService.java

Source

/*
 * Copyright 2008 FatWire Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fatwire.dta.sscrawler;

import java.io.IOException;
import java.net.ConnectException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicInteger;

import javax.management.MBeanServer;
import javax.management.ObjectName;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.fatwire.dta.sscrawler.domain.HostConfig;
import com.fatwire.dta.sscrawler.events.PageletRenderedEvent;
import com.fatwire.dta.sscrawler.events.PageletRenderingListener;
import com.fatwire.dta.sscrawler.handlers.BodyHandler;
import com.fatwire.dta.sscrawler.jobs.ProgressMonitor;
import com.fatwire.dta.sscrawler.util.HelperStrings;
import com.fatwire.dta.sscrawler.util.SSUriHelper;

public class URLReaderService {
    // Logger named after the runtime class of this instance.
    private final Log log = LogFactory.getLog(getClass());

    // Flag set by stop() and reported by isStopped(); volatile so that a
    // write from one thread is visible to readers on other threads.
    private volatile boolean stopped = false;

    // Target host settings; read by initClient() (host, port, protocol,
    // proxy, cookie) and by start() for the progress task name.
    private HostConfig hostConfig;

    // Turns a QueryString into the URI string that gets requested
    // (see Scheduler.checkUri).
    private SSUriHelper uriHelper;

    // Receives every downloaded page that has a non-null body
    // (see Harvester.run).
    private BodyHandler handler;

    // Connection manager handed to every HttpClient built by initClient();
    // created in start() and shut down there once crawling has finished.
    private MultiThreadedHttpConnectionManager connectionManager;

    /**
     * Hands out one {@link HttpClient} per calling thread. The client is built
     * by {@link #initClient()} the first time a thread asks for one, and that
     * same instance is returned to the thread on later calls.
     */
    private final HttpClientService httpClientService = new HttpClientService() {
        private final ThreadLocal<HttpClient> clientPerThread = new ThreadLocal<HttpClient>() {
            @Override
            protected HttpClient initialValue() {
                final HttpClient freshClient = initClient();
                return freshClient;
            }
        };

        public HttpClient get() {
            final HttpClient clientForCaller = clientPerThread.get();
            return clientForCaller;
        }
    };

    // Upper bound on the number of Link instances that get scheduled;
    // the default Integer.MAX_VALUE is reported to the progress monitor
    // as -1 by start().
    private int maxPages = Integer.MAX_VALUE;

    // Listeners notified with a PageletRenderedEvent for every completed page.
    private final Set<PageletRenderingListener> listeners = new CopyOnWriteArraySet<PageletRenderingListener>();

    // Links that start() schedules first; filled through addStartUris().
    private final List<Link> startUrls = new ArrayList<Link>();

    // Schedules downloads on the executor passed to the constructor and
    // tracks the outstanding work.
    private final Scheduler scheduler;

    /**
     * Creates the service and binds its scheduler to the given executor.
     *
     * @param readerPool executor on which the scheduler runs its page-download tasks
     */
    public URLReaderService(final Executor readerPool) {
        this.scheduler = new Scheduler(readerPool);
    }

    /**
     * Source of {@link HttpClient} instances for the download tasks; an
     * implementation is passed to every UrlRenderingCallable created by the
     * scheduler.
     */
    interface HttpClientService {
        /**
         * @return the client the caller should use for its request
         */
        HttpClient get();
    }

    /**
     * Builds a new {@link HttpClient} on top of the shared connection manager
     * and configures it from the current {@code hostConfig}: target host,
     * optional proxy (with optional credentials), user agent, cookie policy
     * and a pre-set client-indicator cookie.
     *
     * @return the configured client
     */
    protected HttpClient initClient() {
        final HttpClient httpClient = new HttpClient(connectionManager);
        httpClient.getHostConfiguration().setHost(hostConfig.getHostname(), hostConfig.getPort(),
                hostConfig.getProtocol());

        // Proxy settings are optional; credentials are only looked at when a
        // proxy host is configured.
        final boolean viaProxy = hostConfig.getProxyHost() != null;
        if (viaProxy) {
            httpClient.getHostConfiguration().setProxyHost(hostConfig.getProxyHost());
        }
        if (viaProxy && hostConfig.getProxyCredentials() != null) {
            httpClient.getState().setProxyCredentials(AuthScope.ANY, hostConfig.getProxyCredentials());
        }

        httpClient.getParams().setParameter(HttpMethodParams.USER_AGENT, "ss-crawler-0.9");

        // Cookies are handled with the browser-compatibility policy, chosen
        // explicitly here instead of relying on the library default.
        httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        // Set on the default (parent) parameter set of this client's params.
        httpClient.getParams().getDefaults().setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);

        // Seed the client state with the client-indicator cookie, value "true".
        // NOTE(review): in commons-httpclient 3.x the fourth Cookie constructor
        // argument is the cookie path - confirm that hostConfig.getDomain()
        // is the value intended for that position.
        final Cookie clientIndicator = new Cookie(hostConfig.getHostname(), HelperStrings.SS_CLIENT_INDICATOR,
                Boolean.TRUE.toString(), hostConfig.getDomain(), -1, false);
        httpClient.getState().addCookie(clientIndicator);
        return httpClient;
    }

    /**
     * Runs one crawl and blocks until it has finished.
     * <p>
     * Creates the shared connection manager, registers a {@code ReaderService}
     * MBean under {@code com.fatwire.crawler:name=scheduler}, schedules every
     * start URL and then waits until the scheduler reports that no task is
     * outstanding. Afterwards the connection manager is shut down, the MBean
     * is unregistered and {@code monitor.done()} is called - also when
     * scheduling fails with an exception.
     * <p>
     * NOTE(review): the scheduler and its completion latch are created once
     * per service instance and a CountDownLatch cannot be reset, so a second
     * call on the same instance will not wait for its tasks.
     *
     * @param monitor receives the begin/done notifications and is polled for
     *            cancellation by the scheduler and its tasks
     */
    public void start(final ProgressMonitor monitor) {

        connectionManager = new MultiThreadedHttpConnectionManager();
        connectionManager.getParams().setConnectionTimeout(30000);
        connectionManager.getParams().setDefaultMaxConnectionsPerHost(1500);
        connectionManager.getParams().setMaxTotalConnections(30000);
        final MBeanServer platform = java.lang.management.ManagementFactory.getPlatformMBeanServer();
        try {
            // Best effort: a failed JMX registration must not prevent the crawl.
            platform.registerMBean(new ReaderService(scheduler, connectionManager),
                    new ObjectName("com.fatwire.crawler:name=scheduler"));
        } catch (final Throwable t) {
            log.error(t.getMessage(), t);
        }

        try {
            monitor.beginTask("Crawling on " + hostConfig.toString(), maxPages == Integer.MAX_VALUE ? -1 : maxPages);
            scheduler.monitor = monitor;

            // Hold one "seeding" ticket while the start URLs are handed over.
            // Without it the outstanding-task counter could reach zero between
            // two start URLs (opening the completion latch while work is still
            // being added), and with no start URLs or a cancelled monitor
            // nothing would ever open the latch, so the wait below would hang.
            scheduler.scheduledCounter.incrementAndGet();
            try {
                for (final QueryString thingToDo : startUrls) {
                    scheduler.schedulePage(thingToDo);
                }
            } finally {
                // Releases the seeding ticket; opens the latch if nothing is outstanding.
                scheduler.taskFinished();
            }
            scheduler.waitForlAllTasksToFinish();
        } finally {
            try {
                connectionManager.shutdown();
            } catch (final Throwable t) {
                log.error(t.getMessage(), t);
            }
            try {
                platform.unregisterMBean(new ObjectName("com.fatwire.crawler:name=scheduler"));
            } catch (final Throwable t) {
                log.error(t.getMessage(), t);
            }

            monitor.done();
        }

    }

    /**
     * Hands page downloads to the reader pool and keeps track of how many of
     * them are still outstanding, so that the crawl can tell when it is done.
     */
    class Scheduler {

        /** Query strings accepted by {@link #schedulePage}; accessed only while holding this scheduler's lock. */
        private final Set<QueryString> urlsDone = new HashSet<QueryString>();

        private final Executor executor;

        /** Opened by {@link #taskFinished()} when the outstanding-task counter drops to zero. */
        private final CountDownLatch complete = new CountDownLatch(1);

        /** Tasks handed to the executor that have not yet called {@link #taskFinished()}. */
        private final AtomicInteger scheduledCounter = new AtomicInteger();

        /** Number of Link instances offered to {@link #schedulePage}, including those rejected by maxPages. */
        private final AtomicInteger count = new AtomicInteger();

        /** Number of pages reported through {@link #pageComplete}. */
        private final AtomicInteger completeCount = new AtomicInteger();

        private ProgressMonitor monitor;

        /** Always true here, so {@link #checkUri} always takes the page-data branch. */
        private final boolean requestPageData = true;

        /**
         * @param readerPool the executor that runs the download tasks
         */
        public Scheduler(final Executor readerPool) {
            super();
            executor = readerPool;
        }

        /**
         * Queues a download for the given query string unless the monitor is
         * cancelled or, for Link instances, the maxPages limit is exceeded.
         * A failure to hand the task to the executor is logged and the
         * outstanding-task counter is rolled back.
         *
         * @param qs the pagelet or link to download
         */
        synchronized void schedulePage(final QueryString qs) {
            if (monitor.isCanceled()) {
                return;
            }

            if (qs instanceof Link) {
                if (count.incrementAndGet() > maxPages) {
                    return; // do not schedule beyond max number of pages
                }
            }
            urlsDone.add(qs);
            final String uri = checkUri(qs);

            scheduledCounter.incrementAndGet();
            try {
                final UrlRenderingCallable downloader = new UrlRenderingCallable(httpClientService, uri, qs);
                // Links get priority value 5, everything else 0.
                final int priority = qs instanceof Link ? 5 : 0;
                executor.execute(new Harvester(downloader, monitor, priority, count.get()));

            } catch (final Exception e) {
                scheduledCounter.decrementAndGet();
                log.error(e.getMessage(), e);

            }

        }

        /**
         * Builds the URI to request and appends the page-data request
         * parameter (or, if page data were disabled, the client indicator)
         * when the query string does not carry it yet.
         *
         * @param ssuri the query string to turn into a URI
         * @return the URI to request
         */
        private String checkUri(final QueryString ssuri) {
            final String uri = uriHelper.toLink(ssuri);
            // NOTE(review): appending with "&" assumes toLink() already
            // produced a query part - confirm against SSUriHelper.
            if (requestPageData) {
                if (ssuri.has(HelperStrings.SS_PAGEDATA_REQUEST) == false) {
                    return uri + "&" + HelperStrings.SS_PAGEDATA_REQUEST + "=true";
                }
            } else {
                if (ssuri.has(HelperStrings.SS_CLIENT_INDICATOR) == false) {
                    return uri + "&" + HelperStrings.SS_CLIENT_INDICATOR + "=true";
                }

            }
            return uri;
        }

        /**
         * Records a downloaded page: schedules its markers and links that
         * have not been seen before (markers first), then notifies the
         * registered listeners outside of the scheduler lock.
         *
         * @param page the page that was just downloaded
         */
        void pageComplete(final ResultPage page) {

            synchronized (this) {
                completeCount.incrementAndGet();
                for (final QueryString ssUri : page.getMarkers()) {
                    scheduleIfNew(ssUri);
                }
                for (final QueryString ssUri : page.getLinks()) {
                    scheduleIfNew(ssUri);
                }
            }
            final PageletRenderedEvent event = new PageletRenderedEvent(page);
            for (final PageletRenderingListener listener : listeners) {
                listener.renderPerformed(event);
            }
        }

        /** Schedules the query string unless it was accepted before; the caller holds the scheduler lock. */
        private void scheduleIfNew(final QueryString ssUri) {
            if (!urlsDone.contains(ssUri)) {
                if (log.isDebugEnabled()) {
                    log.debug("adding " + ssUri);
                }
                schedulePage(ssUri);
            }
        }

        /**
         * Called once per task when it ends; opens the completion latch when
         * no task is outstanding any more.
         */
        public void taskFinished() {
            log.debug("Active workers: " + scheduledCounter.get());
            if (scheduledCounter.decrementAndGet() == 0) {
                complete.countDown();
            }

        }

        /**
         * Blocks until the completion latch is opened. If the waiting thread
         * is interrupted the wait ends early, the interruption is logged and
         * the thread's interrupt flag is restored so that callers can still
         * observe it.
         */
        void waitForlAllTasksToFinish() {
            try {
                complete.await();
            } catch (final InterruptedException e) {
                log.warn(e, e);
                // await() cleared the flag when it threw; set it again rather
                // than swallowing the interruption.
                Thread.currentThread().interrupt();
            }

        }

        /**
         * @return the number of Link instances offered for scheduling so far
         */
        public int getCount() {
            return count.get();
        }

        /**
         * @return the number of tasks that are currently outstanding
         */
        public int getScheduledCount() {
            return scheduledCounter.get();
        }

        /**
         * @return the number of pages completed so far
         */
        public int getCompleteCount() {
            return completeCount.get();
        }

    }

    /**
     * A single download task: fetches one page, passes it to the body handler
     * and reports it to the scheduler. Instances are comparable, ordered by
     * priority value and then by order number.
     */
    class Harvester implements Runnable, Comparable<Harvester> {
        private final UrlRenderingCallable downloader;

        private final ProgressMonitor monitor;

        private final int priority;

        private final int orderNumber;

        /**
         * @param downloader performs the actual request
         * @param monitor polled for cancellation before the download starts
         * @param priority primary sort key; a lower value sorts first
         * @param orderNumber secondary sort key; a lower value sorts first
         */
        public Harvester(final UrlRenderingCallable downloader, final ProgressMonitor monitor, final int priority,
                final int orderNumber) {
            super();
            this.downloader = downloader;
            this.monitor = monitor;
            this.priority = priority;
            this.orderNumber = orderNumber;
        }

        /**
         * Downloads the page unless the service was stopped or the monitor
         * was cancelled, hands a page with a body to the handler and reports
         * the page to the scheduler. Failures are logged, never propagated
         * (I/O problems without a stack trace). The scheduler is always told
         * that the task has ended.
         */
        public void run() {
            try {
                // Honour stop() as well as cancellation: the stopped flag was
                // previously never consulted, so stop() had no effect on
                // queued work.
                if (stopped || monitor.isCanceled()) {
                    return;
                }
                final ResultPage page = downloader.call();
                if (page.getBody() != null) {
                    handler.visit(page);
                }
                scheduler.pageComplete(page);
            } catch (final IOException e) {
                // Also covers ConnectException and HttpException, which are
                // IOException subclasses and were logged identically.
                log.error(e + " for " + downloader.getUri());
            } catch (final Exception e) {
                log.error(e + " for " + downloader.getUri(), e);
            } finally {
                scheduler.taskFinished();
            }

        }

        /**
         * Orders by priority value first and by order number second; in both
         * cases the lower value comes first.
         *
         * @param o the task to compare with
         * @return -1, 0 or 1
         */
        public int compareTo(final Harvester o) {
            if (priority != o.priority) {
                return priority < o.priority ? -1 : 1;
            }
            if (orderNumber != o.orderNumber) {
                return orderNumber < o.orderNumber ? -1 : 1;
            }
            return 0;
        }

    }

    /**
     * Reports whether {@link #stop()} has been called on this service.
     *
     * @return true if the service is stopped
     */
    public boolean isStopped() {
        return stopped;
    }

    /**
     * Marks the service as stopped by setting the flag that
     * {@link #isStopped()} reports. The flag is never reset by this class.
     */
    public void stop() {
        stopped = true;
    }

    /**
     * Returns the page limit; the default is {@code Integer.MAX_VALUE}.
     *
     * @return the maxPages
     */
    public int getMaxPages() {
        return maxPages;
    }

    /**
     * Sets the page limit. The scheduler compares it against its running
     * count of Link instances; other query strings are not counted.
     *
     * @param maxPages the maxPages to set
     */
    public void setMaxPages(final int maxPages) {
        this.maxPages = maxPages;
    }

    /**
     * Registers a listener that is notified with a PageletRenderedEvent for
     * every page the scheduler completes.
     *
     * @param listener the listener to add
     */
    public void addListener(final PageletRenderingListener listener) {
        listeners.add(listener);
    }

    /**
     * Unregisters a listener previously passed to
     * {@link #addListener(PageletRenderingListener)}.
     *
     * @param listener the listener to remove
     */
    public void removeListener(final PageletRenderingListener listener) {
        listeners.remove(listener);
    }

    /**
     * Sets the host settings that {@link #initClient()} uses to configure
     * new HTTP clients and that start() uses for the progress task name.
     *
     * @param hostConfig the host configuration to crawl against
     */
    public void setHostConfig(final HostConfig hostConfig) {
        this.hostConfig = hostConfig;

    }

    /**
     * Appends the given links to the list of start URLs that start()
     * schedules first.
     *
     * @param uri the links to start crawling from
     */
    public void addStartUris(final Collection<Link> uri) {
        startUrls.addAll(uri);
    }

    /**
     * Returns the handler that download tasks call for every page with a
     * non-null body.
     *
     * @return the handler
     */
    public BodyHandler getHandler() {
        return handler;
    }

    /**
     * Sets the handler that download tasks call for every page with a
     * non-null body.
     *
     * @param handler the handler to set
     */
    public void setHandler(final BodyHandler handler) {
        this.handler = handler;
    }

    /**
     * Returns the helper the scheduler uses to turn a query string into the
     * URI that is requested.
     *
     * @return the uriHelper
     */
    public SSUriHelper getUriHelper() {
        return uriHelper;
    }

    /**
     * Sets the helper the scheduler uses to turn a query string into the
     * URI that is requested.
     *
     * @param uriHelper the uriHelper to set
     */
    public void setUriHelper(final SSUriHelper uriHelper) {
        this.uriHelper = uriHelper;
    }

}