opendap.metacat.old_code.ThreddsCrawler.java Source code

Introduction

Here is the source code for opendap.metacat.old_code.ThreddsCrawler.java
Source

/////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2010 OPeNDAP, Inc.
// Author: James Gallagher <jgallagher@opendap.org>
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
/////////////////////////////////////////////////////////////////////////////

package opendap.metacat.old_code;

import java.io.PrintStream;
import java.util.Enumeration;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import opendap.metacat.ThreddsCatalogUtil;

/**
 * This class is a simple thredds catalog crawler that uses an Enumeration
 * object returned by the ThreddsCatalogUtil() class. This crawler makes an
 * instance of TCU and configures it so that caching is turned on. It can 
 * crawl a set of catalogs starting at some root node and then re-play the
 * crawl using cached data.
 * 
 * @note The 're-play' option is cool but it's also a bit of a fraud because
 * the order of the catalog URLs is not the same as for the real crawl.
 * 
 * @author jimg
 *
 */
public class ThreddsCrawler {

    private ThreddsCatalogUtil tcc;

    private int catalogsVisited = 0;
    private boolean verbose = false;
    private String cacheNamePrefix = "";

    /**
     * @param args
     */
    @SuppressWarnings("static-access")
    public static void main(String[] args) {

        ThreddsCrawler crawler = new ThreddsCrawler();

        // create the command line parser
        CommandLineParser parser = new PosixParser();

        // create the Options
        Options options = new Options();

        options.addOption("r", "read-cache", false, "Read catalogs from the cache");
        options.addOption("n", "no-cache", false, "Do not cache catalogs");
        options.addOption("p", "print-catalogs", false, "Print the THREDDS catalogs");
        options.addOption("v", "verbose", false, "Verbose output");

        options.addOption(OptionBuilder.withLongOpt("cache-name")
                .withDescription("Use this to set a prefix for the cache name.").hasArg().withArgName("cacheName")
                .create());

        options.addOption(OptionBuilder.withLongOpt("catalog-root").withDescription("use this as the root catalog")
                .hasArg().withArgName("catalogRoot").create());

        try {
            // parse the command line arguments
            CommandLine line = parser.parse(options, args);

            String catalogURL = line.getOptionValue("catalog-root");
            System.out.println("Catalog Root: " + catalogURL);

            crawler.cacheNamePrefix = line.getOptionValue("cache-name");
            System.out.println("Cache name: " + crawler.cacheNamePrefix);

            if (line.hasOption("v"))
                crawler.verbose = true;

            if (line.hasOption("n")) {
                if (crawler.verbose)
                    System.out.println("Caching is off");
                crawler.crawlCatalog(catalogURL, false, System.out);
                if (crawler.verbose)
                    System.out.println("Found " + new Integer(crawler.catalogsVisited).toString() + " catalogs.");
            } else if (line.hasOption("r")) {
                if (crawler.verbose)
                    System.out.println("Reading from cache, no network accesses");
                crawler.crawlCatalogCache(System.out, line.hasOption("p"));
                if (crawler.verbose)
                    System.out.println("Read " + new Integer(crawler.catalogsVisited).toString() + " catalogs.");
            } else if (line.hasOption("p")) {
                throw new Exception(
                        "The Print Catalog option (-p) can only be used when reading from the cache (-r).");
            } else {
                if (crawler.verbose)
                    System.out.println("Caching is on");
                crawler.crawlCatalog(catalogURL, true, System.out);
                crawler.tcc.saveCatalogCache();
                if (crawler.verbose)
                    System.out.println("Found " + new Integer(crawler.catalogsVisited).toString() + " catalogs.");
            }

        } catch (ParseException exp) {
            System.err.println("Unexpected exception:" + exp.getMessage());
        }

        catch (Exception e) {
            System.err.println("Error : " + e.getMessage());
            e.printStackTrace();
        }
    }

    ThreddsCrawler() {
        catalogsVisited = 0;
    }

    public void crawlCatalog(String catalogURL, boolean useCache, PrintStream ps) throws Exception {
        tcc = new ThreddsCatalogUtil(useCache, cacheNamePrefix, false);

        // The ThreddsCatalogUtil caches by default, so each catalog URL
        // is recorded (but not the catalog itself) every time nextElement()
        // is called.
        Enumeration<String> catalogs = tcc.getCatalogEnumeration(catalogURL);

        if (verbose)
            ps.println("Root catalog: " + catalogURL);
        ++catalogsVisited;

        while (catalogs.hasMoreElements()) {
            String childURL = catalogs.nextElement();
            if (verbose)
                ps.println("catalog: " + childURL);
            ++catalogsVisited;
        }
    }

    public void crawlCatalogCache(PrintStream ps, boolean printCatalog) throws Exception {
        tcc = new ThreddsCatalogUtil(true, cacheNamePrefix, true);

        Enumeration<String> catalogs = tcc.getCachedCatalogEnumeration();

        while (catalogs.hasMoreElements()) {
            String childURL = catalogs.nextElement();
            ps.println("catalog: " + childURL);
            if (printCatalog)
                ps.println(tcc.getCachedCatalog(childURL));
            ++catalogsVisited;
        }
    }
}