com.threadswarm.imagefeedarchiver.driver.CommandLineDriver.java Source code

Java tutorial

Introduction

Here is the source code for com.threadswarm.imagefeedarchiver.driver.CommandLineDriver.java

Source

/*
 * Copyright 2014 steve(at)threadswarm.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.threadswarm.imagefeedarchiver.driver;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import com.threadswarm.imagefeedarchiver.FeedUtils;
import com.threadswarm.imagefeedarchiver.dao.ProcessedRssItemDAO;
import com.threadswarm.imagefeedarchiver.filter.ChainedRssItemFilter;
import com.threadswarm.imagefeedarchiver.filter.PreviouslyDownloadedItemFilter;
import com.threadswarm.imagefeedarchiver.filter.RssItemFilter;
import com.threadswarm.imagefeedarchiver.model.ProcessedRssItem;
import com.threadswarm.imagefeedarchiver.model.RssChannel;
import com.threadswarm.imagefeedarchiver.model.RssItem;
import com.threadswarm.imagefeedarchiver.parser.FeedParserException;
import com.threadswarm.imagefeedarchiver.parser.RssDOMFeedParser;
import com.threadswarm.imagefeedarchiver.processor.RssItemProcessor;

/**
 * Command-line driver which downloads an RSS feed, filters out items that
 * the configured {@link PreviouslyDownloadedItemFilter} rejects, and hands
 * every remaining item to an {@link RssItemProcessor} running on a
 * fixed-size worker pool.
 * <p>
 * Instances are created through the nested {@link Builder}; {@link #main(String[])}
 * wires one up from command-line options and a Spring application context.
 */
public class CommandLineDriver implements Runnable {

    private static final Header DNT_HEADER = new BasicHeader("DNT", "1");
    private static final Header RSS_ACCEPT_HEADER = new BasicHeader("Accept",
            "application/rss+xml, application/xml, text/xml");
    private static final Logger LOGGER = LoggerFactory.getLogger(CommandLineDriver.class);

    /** Seconds to wait for the worker pool to drain before cancelling outstanding work. */
    private static final long EXECUTOR_SHUTDOWN_TIMEOUT_SECONDS = 10;

    // configuration parameters
    private final URI rssFeedUri;
    private final File outputDirectory;
    private final int threadCount;
    private final long downloadDelay;
    private final boolean doNotTrackRequested;
    private final boolean forceHttps;

    // components
    private final HttpClient httpClient;
    private final ProcessedRssItemDAO processedRssItemDAO;

    private CommandLineDriver(Builder builder) {
        this.rssFeedUri = builder.rssFeedUri;
        this.outputDirectory = builder.outputDirectory;
        this.threadCount = builder.threadCount;
        this.downloadDelay = builder.downloadDelay;
        this.doNotTrackRequested = builder.doNotTrackRequested;
        this.forceHttps = builder.forceHttps;

        this.httpClient = builder.httpClient;
        this.processedRssItemDAO = builder.processedRssItemDAO;
    }

    /**
     * Parses the command line, builds a {@link CommandLineDriver} and runs it
     * on the calling thread.
     * <p>
     * Usage information is printed (and nothing else happens) when no
     * arguments are given or when {@code -h}/{@code --help} is present.
     *
     * @param args the raw command-line arguments
     * @throws ParseException if the arguments cannot be parsed against the declared options
     * @throws IllegalStateException if the feed URL or the output directory option is missing
     * @throws IllegalArgumentException if the output directory is not a directory, or if the
     *         thread-count / delay values are not valid numbers in their permitted range
     * @throws InterruptedException declared for backwards compatibility
     * @throws ExecutionException declared for backwards compatibility
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException, ParseException {
        Options options = buildOptions();

        CommandLineParser commandLineParser = new BasicParser();
        CommandLine commandLine = commandLineParser.parse(options, args);

        // print usage information if 'h'/'help' or no-args were given
        if (args.length == 0 || commandLine.hasOption("h")) {
            HelpFormatter helpFormatter = new HelpFormatter();
            helpFormatter.printHelp("java -jar ImageFeedArchiver.jar", options);
            return; //abort execution
        }

        URI rssFeedUri = null;
        if (commandLine.hasOption("u")) {
            String rssFeedUrlString = commandLine.getOptionValue("u");
            try {
                rssFeedUri = FeedUtils.getUriFromUrlString(rssFeedUrlString);
            } catch (MalformedURLException | URISyntaxException e) {
                LOGGER.error("The Feed URL you supplied was malformed or violated syntax rules.. exiting", e);
                System.exit(1);
            }
            LOGGER.info("Target RSS feed URL: {}", rssFeedUri);
        } else {
            throw new IllegalStateException("RSS feed URL was not specified!");
        }

        File outputDirectory = null;
        if (commandLine.hasOption("o")) {
            outputDirectory = new File(commandLine.getOptionValue("o"));
            if (!outputDirectory.isDirectory()) {
                throw new IllegalArgumentException("output directory must be a *directory*!");
            }
            LOGGER.info("Using output directory: '{}'", outputDirectory);
        } else {
            throw new IllegalStateException("output directory was not specified!");
        }

        String userAgentString = null;
        if (commandLine.hasOption("a")) {
            userAgentString = commandLine.getOptionValue("a");
            LOGGER.info("Setting 'User-Agent' header value to '{}'", userAgentString);
        }

        int threadCount;
        if (commandLine.hasOption("t")) {
            threadCount = parseThreadCount(commandLine.getOptionValue("t"));
        } else {
            threadCount = Runtime.getRuntime().availableProcessors() + 1;
        }
        LOGGER.info("Using {} worker threads", threadCount);

        long downloadDelay = 0;
        if (commandLine.hasOption("d")) {
            downloadDelay = parseDownloadDelay(commandLine.getOptionValue("d"));
        }
        LOGGER.info("Using a download-delay of {} milliseconds", downloadDelay);

        boolean doNotTrackRequested = commandLine.hasOption("p");

        boolean forceHttps = commandLine.hasOption("s");

        ApplicationContext context = new ClassPathXmlApplicationContext("META-INF/applicationContext.xml");
        // make sure the context is closed when the JVM exits
        ((ConfigurableApplicationContext) context).registerShutdownHook();

        HttpClient httpClient = (HttpClient) context.getBean("httpClient");
        if (userAgentString != null) {
            httpClient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgentString);
        }

        ProcessedRssItemDAO processedRssItemDAO = (ProcessedRssItemDAO) context.getBean("processedRssItemDAO");

        CommandLineDriver.Builder driverBuilder = new CommandLineDriver.Builder(rssFeedUri);
        driverBuilder.setDoNotTrackRequested(doNotTrackRequested).setOutputDirectory(outputDirectory)
                .setDownloadDelay(downloadDelay).setThreadCount(threadCount).setHttpClient(httpClient)
                .setForceHttps(forceHttps).setProcessedRssItemDAO(processedRssItemDAO);

        CommandLineDriver driver = driverBuilder.build();
        driver.run();
    }

    /**
     * Declares the command-line options understood by {@link #main(String[])}.
     */
    private static Options buildOptions() {
        Options options = new Options();
        options.addOption("h", "help", false, "display usage information");
        options.addOption("u", "url", true, "RSS feed URL");
        options.addOption("a", "user-agent", true, "User-Agent header value to use when making HTTP requests");
        options.addOption("o", "output-directory", true, "output directory for downloaded images");
        options.addOption("t", "thread-count", true, "number of worker threads, defaults to cpu-count + 1");
        options.addOption("d", "delay", true, "delay between image downloads (in milliseconds)");
        options.addOption("p", "notrack", false, "tell websites that you don't wish to be tracked (DNT)");
        options.addOption("s", "https", false, "Rewrite image URLs to leverage SSL/TLS");
        return options;
    }

    /**
     * Parses the {@code thread-count} option value, rejecting anything that
     * is not a whole number of at least one. The worker pool created in
     * {@link #run()} cannot be built with fewer than one thread.
     */
    private static int parseThreadCount(String value) {
        final int parsed;
        try {
            parsed = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("thread-count must be a whole number, but was '" + value + "'", e);
        }
        if (parsed < 1) {
            throw new IllegalArgumentException("thread-count must be at least 1, but was " + parsed);
        }
        return parsed;
    }

    /**
     * Parses the {@code delay} option value (milliseconds), rejecting
     * anything that is not a non-negative whole number.
     */
    private static long parseDownloadDelay(String value) {
        final long parsed;
        try {
            parsed = Long.parseLong(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "delay must be a whole number of milliseconds, but was '" + value + "'", e);
        }
        if (parsed < 0) {
            throw new IllegalArgumentException("delay must not be negative, but was " + parsed);
        }
        return parsed;
    }

    /**
     * Downloads the feed at {@code targetUri} and parses it into an
     * {@link RssChannel}.
     *
     * @param targetUri the location of the RSS feed
     * @return the channel produced by {@link RssDOMFeedParser#readFeed}
     * @throws IOException if the request fails or the response carries no body
     * @throws FeedParserException if the response body cannot be parsed as a feed
     */
    private RssChannel fetchRssChannel(URI targetUri) throws IOException, FeedParserException {
        RssChannel rssChannel = null;
        HttpEntity responseEntity = null;
        try {
            LOGGER.info("Attempting to fetch feed from URI: {}", targetUri);
            HttpGet rssFeedGet = new HttpGet(targetUri);
            if (doNotTrackRequested) {
                LOGGER.debug("Adding 'DNT' header to feed-fetch request");
                rssFeedGet.addHeader(DNT_HEADER);
            }
            rssFeedGet.addHeader(RSS_ACCEPT_HEADER);

            HttpResponse feedResponse = httpClient.execute(rssFeedGet);
            responseEntity = feedResponse.getEntity();
            if (responseEntity == null) {
                // A body-less response would otherwise surface as an unchecked
                // exception from EntityUtils.toString and bypass the caller's handling.
                throw new IOException("The response for feed URI " + targetUri + " did not contain a body");
            }
            // NOTE(review): EntityUtils.toString falls back to the library default
            // charset when the response declares none -- confirm that suits the feeds.
            String rssFeedXmlString = EntityUtils.toString(responseEntity);
            RssDOMFeedParser parser = new RssDOMFeedParser();
            rssChannel = parser.readFeed(rssFeedXmlString);
        } finally {
            // always release the underlying connection
            EntityUtils.consumeQuietly(responseEntity);
        }

        return rssChannel;
    }

    /**
     * Fetches the feed, filters its items and processes the survivors on a
     * fixed-size worker pool, logging the download status of each item.
     * <p>
     * The JVM is terminated with exit code {@code 1} if the feed cannot be
     * downloaded or parsed. When at least one item is processed the
     * {@link HttpClient}'s connection manager is shut down before returning.
     */
    @Override
    public void run() {
        //setup filters
        List<RssItemFilter> filterList = new ArrayList<RssItemFilter>();
        filterList.add(new PreviouslyDownloadedItemFilter(processedRssItemDAO));
        RssItemFilter chainedItemFilter = new ChainedRssItemFilter(filterList);

        RssChannel rssChannel = null;
        try {
            rssChannel = fetchRssChannel(rssFeedUri);
        } catch (IOException | FeedParserException e) {
            LOGGER.error(
                    "An Exception was thrown while attempting to download and parse the target RSS feed.. exiting",
                    e);
            System.exit(1);
        }

        List<RssItem> filteredItemList = applyFilter(rssChannel, chainedItemFilter);
        if (filteredItemList.isEmpty()) {
            return;
        }

        //create list of headers to be used when downloading images
        List<Header> headerList = new ArrayList<Header>(2);
        if (doNotTrackRequested) {
            LOGGER.debug("Adding 'DNT' header to worker requests");
            headerList.add(DNT_HEADER);
        }
        headerList.add(new BasicHeader(HttpHeaders.REFERER, rssFeedUri.toString()));
        headerList = Collections.unmodifiableList(headerList);

        ExecutorService executorService = null;
        try {
            executorService = Executors.newFixedThreadPool(threadCount);
            CompletionService<ProcessedRssItem> completionService = new ExecutorCompletionService<ProcessedRssItem>(
                    executorService);
            Set<URI> processedURISet = new ConcurrentSkipListSet<URI>();
            int itemCount = 0;
            for (RssItem rssItem : filteredItemList) {
                completionService.submit(new RssItemProcessor(httpClient, rssItem, processedRssItemDAO,
                        outputDirectory, headerList, processedURISet, downloadDelay, forceHttps));
                itemCount++;
            }

            LOGGER.info("{} jobs submitted for execution", itemCount);

            for (int x = 0; x < itemCount; x++) {
                try {
                    ProcessedRssItem processedItem = completionService.take().get();
                    LOGGER.info("Item status: {} --> [{}]", processedItem.getRssItem().getTitle(),
                            processedItem.getDownloadStatus());
                } catch (ExecutionException e) {
                    // a single failed worker must not prevent the remaining results from being reported
                    LOGGER.error("An Exception was thrown during worker execution and subsequently propagated", e);
                }
            }
        } catch (InterruptedException e) {
            LOGGER.warn("Thread interrupted while blocking", e);
            Thread.currentThread().interrupt(); // restore interrupt
        } finally {
            // the pool is null if its creation failed (e.g. a thread count below one)
            if (executorService != null) {
                shutdownExecutor(executorService);
            }
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Runs every item of {@code rssChannel} through {@code itemFilter} and
     * collects the non-{@code null} results.
     *
     * @return the retained items; empty when the channel or its item list is {@code null}
     */
    private static List<RssItem> applyFilter(RssChannel rssChannel, RssItemFilter itemFilter) {
        List<RssItem> retainedItems = new ArrayList<RssItem>();
        if (rssChannel == null || rssChannel.getItems() == null) {
            return retainedItems;
        }
        for (RssItem candidate : rssChannel.getItems()) {
            RssItem filtered = itemFilter.filter(candidate);
            if (filtered != null) {
                retainedItems.add(filtered);
            }
        }
        return retainedItems;
    }

    /**
     * Stops {@code executorService}: no new work is accepted, running work
     * gets {@link #EXECUTOR_SHUTDOWN_TIMEOUT_SECONDS} seconds to finish, and
     * anything still outstanding after that (or after an interrupt) is
     * cancelled so it does not outlive the shared HTTP connection manager.
     */
    private static void shutdownExecutor(ExecutorService executorService) {
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(EXECUTOR_SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
                LOGGER.warn("Worker pool did not terminate within {} seconds.. cancelling outstanding work",
                        EXECUTOR_SHUTDOWN_TIMEOUT_SECONDS);
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            LOGGER.warn("Thread interrupted while blocking", e);
            executorService.shutdownNow();
            Thread.currentThread().interrupt(); // restore interrupt
        }
    }

    /**
     * Builder used for creating instances of {@link CommandLineDriver}.
     * <p>
     * This class implements the "fluent-builder" pattern which allows for
     * multiple setters to be called in a chain-like manner.
     * <p>
     * No validation is performed by {@link #build()}; values that were never
     * set keep their Java defaults ({@code null}, {@code 0}, {@code false}).
     *
     * @author steve(at)threadswarm.com
     */
    public static class Builder {

        private URI rssFeedUri;
        private File outputDirectory;
        private int threadCount;
        private long downloadDelay;
        private boolean doNotTrackRequested;
        private boolean forceHttps;
        private HttpClient httpClient;
        private ProcessedRssItemDAO processedRssItemDAO;

        /**
         * Default no-arg constructor
         */
        public Builder() {
        }

        /**
         * Constructor which accepts a {@code URI} as an argument.
         * <p>
         * This constructor can be used in lieu of the {@code setRssFeedUri(URI)}
         * method.  That being said, a call to the aforementioned method
         * will result in the value provided to this constructor being
         * overwritten.
         *
         * @param rssFeedUri the URI corresponding to the RSS feed
         */
        public Builder(URI rssFeedUri) {
            this.rssFeedUri = rssFeedUri;
        }

        /**
         * Sets the {@code URI} corresponding to the RSS feed to be parsed.
         *
         * @param rssFeedUri the URI corresponding to the RSS feed.
         * @return the Builder upon which the method call was invoked
         */
        public Builder setRssFeedUri(URI rssFeedUri) {
            this.rssFeedUri = rssFeedUri;

            return this;
        }

        /**
         * Sets the directory in which downloaded images should be written.
         *
         * @param outputDirectory the directory to be used for storing downloaded images
         * @return the Builder upon which the method call was invoked
         */
        public Builder setOutputDirectory(File outputDirectory) {
            this.outputDirectory = outputDirectory;

            return this;
        }

        /**
         * Sets the number of threads to be used for downloading images.
         * <p>
         * This value sets the limit on the number of parallel downloads
         * from a given feed.  Typically you should set this to a
         * respectful value of three or less.
         *
         * @param threadCount the number of worker threads to be used for downloading images
         * @return the Builder upon which the method call was invoked
         */
        public Builder setThreadCount(int threadCount) {
            this.threadCount = threadCount;

            return this;
        }

        /**
         * Sets the delay between image downloads in milliseconds.
         * <p>
         * Please note that this value is not global, rather it is on
         * a per-worker/thread basis.
         *
         * @param downloadDelay the delay between image downloads in milliseconds
         * @return the Builder upon which the method call was invoked
         */
        public Builder setDownloadDelay(long downloadDelay) {
            this.downloadDelay = downloadDelay;

            return this;
        }

        /**
         * Sets a boolean flag indicating if the "DO NOT TRACK" header should be used.
         * <p>
         * A value of {@code true} indicates that the client wishes that the "DNT" header should
         * be included with a value of {@code 1}, telling the website that the user does not
         * wish to be tracked.
         *
         * @param doNotTrackRequested {@code true} to send the "DNT" header with requests
         * @return the Builder upon which the method call was invoked
         */
        public Builder setDoNotTrackRequested(boolean doNotTrackRequested) {
            this.doNotTrackRequested = doNotTrackRequested;

            return this;
        }

        /**
         * Sets a boolean flag indicating if "http://" URLs should be rewritten to use "https://".
         *
         * @param forceHttps {@code true} to request that image URLs be rewritten to "https://"
         * @return the Builder upon which the method call was invoked
         */
        public Builder setForceHttps(boolean forceHttps) {
            this.forceHttps = forceHttps;

            return this;
        }

        /**
         * Sets the {@link HttpClient} to be used when making all HTTP requests.
         *
         * @param httpClient the HttpClient instance to be used for all HTTP requests.
         * @return the Builder upon which the method call was invoked
         */
        public Builder setHttpClient(HttpClient httpClient) {
            this.httpClient = httpClient;

            return this;
        }

        /**
         * Sets the {@link ProcessedRssItemDAO} instance to be used during operations.
         *
         * @param processedRssItemDAO the DAO handed to the item filter and to every worker
         * @return the Builder upon which the method call was invoked
         */
        public Builder setProcessedRssItemDAO(ProcessedRssItemDAO processedRssItemDAO) {
            this.processedRssItemDAO = processedRssItemDAO;

            return this;
        }

        /**
         * Returns a configured instance of {@link CommandLineDriver}.
         *
         * @return an instance of CommandLineDriver that has been configured using this Builder
         */
        public CommandLineDriver build() {
            return new CommandLineDriver(this);
        }
    }

}