org.apache.nutch.tools.ResolveUrls.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.tools.ResolveUrls.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.lang.invoke.MethodHandles;
import java.net.InetAddress;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.util.URLUtil;

/**
 * A simple tool that will spin up multiple threads to resolve urls to ip
 * addresses. This can be used to verify that pages that are failing due to
 * UnknownHostException during fetching are actually bad and are not failing due
 * to a dns problem in fetching.
 */
public class ResolveUrls {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    private String urlsFile = null;
    private int numThreads = 100;
    private ExecutorService pool = null;
    private static AtomicInteger numTotal = new AtomicInteger(0);
    private static AtomicInteger numErrored = new AtomicInteger(0);
    private static AtomicInteger numResolved = new AtomicInteger(0);
    private static AtomicLong totalTime = new AtomicLong(0L);

    /**
     * A Thread which gets the ip address of a single host by name.
     */
    private static class ResolverThread extends Thread {

        private String url = null;

        public ResolverThread(String url) {
            this.url = url;
        }

        public void run() {

            numTotal.incrementAndGet();
            String host = URLUtil.getHost(url);
            long start = System.currentTimeMillis();
            try {

                // get the address by name and if no error is thrown then it
                // is resolved successfully
                InetAddress.getByName(host);
                LOG.info("Resolved: " + host);
                numResolved.incrementAndGet();
            } catch (Exception uhe) {
                LOG.info("Error Resolving: " + host);
                numErrored.incrementAndGet();
            }
            long end = System.currentTimeMillis();
            long total = (end - start);
            totalTime.addAndGet(total);
            LOG.info(", " + total + " millis");
        }
    }

    /**
     * Creates a thread pool for resolving urls. Reads in the url file on the
     * local filesystem. For each url it attempts to resolve it keeping a total
     * account of the number resolved, errored, and the amount of time.
     */
    public void resolveUrls() {

        try {

            // create a thread pool with a fixed number of threads
            pool = Executors.newFixedThreadPool(numThreads);

            // read in the urls file and loop through each line, one url per line
            BufferedReader buffRead = new BufferedReader(new FileReader(new File(urlsFile)));
            String urlStr = null;
            while ((urlStr = buffRead.readLine()) != null) {

                // spin up a resolver thread per url
                LOG.info("Starting: " + urlStr);
                pool.execute(new ResolverThread(urlStr));
            }

            // close the file and wait for up to 60 seconds before shutting down
            // the thread pool to give urls time to finish resolving
            buffRead.close();
            pool.awaitTermination(60, TimeUnit.SECONDS);
        } catch (Exception e) {

            // on error shutdown the thread pool immediately
            pool.shutdownNow();
            LOG.info(StringUtils.stringifyException(e));
        }

        // shutdown the thread pool and log totals
        pool.shutdown();
        LOG.info("Total: " + numTotal.get() + ", Resovled: " + numResolved.get() + ", Errored: " + numErrored.get()
                + ", Average Time: " + totalTime.get() / numTotal.get());
    }

    /**
     * Create a new ResolveUrls with a file from the local file system.
     * 
     * @param urlsFile
     *          The local urls file, one url per line.
     */
    public ResolveUrls(String urlsFile) {
        this(urlsFile, 100);
    }

    /**
     * Create a new ResolveUrls with a urls file and a number of threads for the
     * Thread pool. Number of threads is 100 by default.
     * 
     * @param urlsFile
     *          The local urls file, one url per line.
     * @param numThreads
     *          The number of threads used to resolve urls in parallel.
     */
    public ResolveUrls(String urlsFile, int numThreads) {
        this.urlsFile = urlsFile;
        this.numThreads = numThreads;
    }

    /**
     * Runs the resolve urls tool.
     */
    public static void main(String[] args) {

        Options options = new Options();
        OptionBuilder.withArgName("help");
        OptionBuilder.withDescription("show this help message");
        Option helpOpts = OptionBuilder.create("help");
        options.addOption(helpOpts);

        OptionBuilder.withArgName("urls");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the urls file to check");
        Option urlOpts = OptionBuilder.create("urls");
        options.addOption(urlOpts);

        OptionBuilder.withArgName("numThreads");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("the number of threads to use");
        Option numThreadOpts = OptionBuilder.create("numThreads");
        options.addOption(numThreadOpts);

        CommandLineParser parser = new GnuParser();
        try {
            // parse out common line arguments
            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("urls")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("ResolveUrls", options);
                return;
            }

            // get the urls and the number of threads and start the resolver
            String urls = line.getOptionValue("urls");
            int numThreads = 100;
            String numThreadsStr = line.getOptionValue("numThreads");
            if (numThreadsStr != null) {
                numThreads = Integer.parseInt(numThreadsStr);
            }
            ResolveUrls resolve = new ResolveUrls(urls, numThreads);
            resolve.resolveUrls();
        } catch (Exception e) {
            LOG.error("ResolveUrls: " + StringUtils.stringifyException(e));
        }
    }
}