cascading.load.Main.java Source code

Java tutorial

Introduction

Here is the source code for cascading.load.Main.java

Source

/*
 * Copyright (c) 2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.concurrentinc.com/
 */

package cascading.load;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.load.countsort.CountSort;
import cascading.load.generate.GenerateData;
import cascading.load.join.MultiJoin;
import cascading.load.pipeline.Pipeline;
import cascading.load.util.StatsPrinter;
import cascading.load.util.Util;
import cascading.operation.DebugLevel;
import cascading.pipe.cogroup.CoGroupClosure;
import cascading.scheme.TextLine;
import cascading.stats.CascadeStats;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryCollector;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

/**
 * Command-line entry point for the Cascading load tool.
 * <p>
 * Parses the command line into an {@link Options} instance, creates one flow
 * for every load selected by the options (data generation, count/sort,
 * multi-join, pipeline), runs all of them as a single {@link Cascade} and
 * finally prints the collected statistics, or writes them to the configured
 * stats root.
 */
public class Main {
    private static final Logger LOG = Logger.getLogger(Main.class);

    /** Prefix of the line in the license resource that names the binary license. */
    private static final String LICENSE_PREFIX = "Binary License:";

    private Options options;

    /**
     * Runs the load tool and terminates the JVM with status 1 when the
     * cascade fails, so calling scripts can detect the failure.
     *
     * @param args command line arguments, parsed into {@link Options}
     * @throws Exception if setting up the flows or printing the summary fails
     */
    public static void main(String[] args) throws Exception {
        if (!new Main(args).execute())
            System.exit(1);
    }

    /**
     * Creates a new instance configured from the given command line.
     * Invalid arguments cause the usage text to be printed and the JVM to exit.
     *
     * @param args command line arguments
     * @throws IOException declared by {@link #initOptions(String[], Options)}
     */
    public Main(String[] args) throws IOException {
        options = new Options();

        initOptions(args, options);
    }

    /**
     * Builds the flows selected by the options, runs them as one cascade and
     * prints the statistics summary on success.
     *
     * @return {@code true} if the cascade completed, {@code false} if running
     *         it threw an exception (the exception is logged, not rethrown)
     * @throws Exception if creating the flows or printing the summary fails
     */
    public boolean execute() throws Exception {
        List<Flow> flows = new ArrayList<Flow>();

        if (options.isDataGenerate())
            flows.add(new GenerateData(options, getDefaultProperties()).createFlow());

        if (options.isCountSort())
            flows.add(new CountSort(options, getDefaultProperties()).createFlow());

        if (options.isMultiJoin())
            flows.add(new MultiJoin(options, getDefaultProperties()).createFlow());

        if (options.isPipeline())
            flows.add(new Pipeline(options, getDefaultProperties()).createFlow());

        Cascade cascade = new CascadeConnector(getDefaultProperties()).connect(flows.toArray(new Flow[0]));

        CascadeStats stats = cascade.getCascadeStats();

        try {
            cascade.complete();
        } catch (Exception exception) {
            LOG.error("failed running cascade ", exception);

            return false;
        }

        printSummary(stats);

        return true;
    }

    /**
     * Prints the options and the cascade statistics. When a stats root is
     * configured the report is written there line by line (replacing any
     * previous content); otherwise it goes to standard out.
     *
     * @param stats statistics of the cascade that was run
     * @throws IOException if writing the report fails
     */
    private void printSummary(CascadeStats stats) throws IOException {
        stats.captureDetail();

        boolean toStatsRoot = options.hasStatsRoot();
        OutputStream outputStream = toStatsRoot ? new ByteArrayOutputStream() : System.out;
        PrintWriter writer = new PrintWriter(outputStream);

        writer.println(options);

        StatsPrinter.printCascadeStats(writer, stats);

        // The writer buffers and was created without auto-flush: without this
        // the report may never reach the underlying stream.
        writer.flush();

        if (!toStatsRoot)
            return;

        // println() emits the platform line separator, so accept both LF and CRLF.
        String[] lines = outputStream.toString().split("\r?\n");

        Hfs statsTap = new Hfs(new TextLine(), options.getStatsRoot(), SinkMode.REPLACE);

        TupleEntryCollector tapWriter = statsTap.openForWrite(new JobConf());

        try {
            for (String line : lines)
                tapWriter.add(new Tuple(line));
        } finally {
            tapWriter.close();
        }
    }

    /**
     * Creates the properties handed to every flow and to the cascade
     * connector: logging/debug levels, spill threshold, compression settings,
     * child JVM options, task counts, speculative execution, block size and
     * finally any user supplied {@code name=value} Hadoop properties, which
     * therefore override the defaults set before them.
     *
     * @return a freshly created properties instance
     * @throws IllegalArgumentException if a user supplied Hadoop property is
     *                                  not of the form {@code name=value}
     * @throws IOException              kept for interface compatibility
     */
    protected Properties getDefaultProperties() throws IOException {
        Properties properties = new Properties();

        if (options.isDebugLogging()) {
            properties.put("log4j.logger", "cascading=DEBUG,load=DEBUG");
            FlowConnector.setDebugLevel(properties, DebugLevel.VERBOSE);
        } else {
            properties.put("log4j.logger", "cascading=INFO,load=INFO");
            FlowConnector.setDebugLevel(properties, DebugLevel.NONE);
        }

        properties.setProperty(CoGroupClosure.SPILL_THRESHOLD, Integer.toString(options.getTupleSpillThreshold()));

        // "mapred.output.compress" is not set here; only the codec and type
        // for job output compression are preset.
        properties.setProperty("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
        properties.setProperty("mapred.output.compression.type", "BLOCK");
        properties.setProperty("mapred.compress.map.output", "true");

        // alternative GC flags: -XX:+UseParallelOldGC -XX:ParallelGCThreads=1
        properties.setProperty("mapred.child.java.opts", "-server -Xmx1000m -XX:+UseParallelOldGC");

        // -1 means "not specified": leave the Hadoop default untouched
        if (options.getNumDefaultMappers() != -1)
            properties.setProperty("mapred.map.tasks", Integer.toString(options.getNumDefaultMappers()));

        if (options.getNumDefaultReducers() != -1)
            properties.setProperty("mapred.reduce.tasks", Integer.toString(options.getNumDefaultReducers()));

        properties.setProperty("mapred.map.tasks.speculative.execution",
                options.isMapSpecExec() ? "true" : "false");
        properties.setProperty("mapred.reduce.tasks.speculative.execution",
                options.isReduceSpecExec() ? "true" : "false");

        // multiply in long arithmetic so large block sizes cannot overflow an int
        properties.setProperty("dfs.block.size", Long.toString(options.getBlockSizeMB() * 1024L * 1024L));

        // need to try and detect if native codecs are loaded, if so, use gzip
        if (Util.hasNativeZlib()) {
            properties.setProperty("mapred.map.output.compression.codec",
                    "org.apache.hadoop.io.compress.GzipCodec");
            LOG.info("using native codec for gzip");
        } else {
            properties.setProperty("mapred.map.output.compression.codec",
                    "org.apache.hadoop.io.compress.DefaultCodec");
            LOG.info("native codec not found");
        }

        for (String property : options.getHadoopProperties()) {
            // split on the first '=' only, so values may themselves contain '='
            int separator = property.indexOf('=');

            if (separator < 1)
                throw new IllegalArgumentException(
                        "invalid hadoop property, expected name=value: " + property);

            properties.setProperty(property.substring(0, separator), property.substring(separator + 1));
        }

        FlowConnector.setApplicationJarClass(properties, Main.class);

        return properties;
    }

    /**
     * Parses the command line into the given options, prepares them and logs
     * the result. On a parse error the message and usage are printed and the
     * JVM exits.
     *
     * @param args    command line arguments
     * @param options instance to populate
     * @return the same {@code options} instance
     * @throws IOException kept for interface compatibility
     */
    protected static Options initOptions(String[] args, Options options) throws IOException {
        CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (CmdLineException exception) {
            System.out.print("error: ");
            System.out.println(exception.getMessage());
            printUsageAndExit(parser);
        }

        options.prepare();

        LOG.info(options);

        return options;
    }

    /**
     * Prints the Cascading version found on the classpath, or
     * "Unknown Cascading Version" if it cannot be determined.
     */
    private static void printCascadingVersion() {
        try {
            Properties versionProperties = new Properties();

            boolean found = loadResource(versionProperties, "cascading/version.properties");

            // the build number is optional
            loadResource(versionProperties, "cascading/build.number.properties");

            String releaseMajor = versionProperties.getProperty("cascading.release.major");

            if (!found || releaseMajor == null) {
                System.out.println("Unknown Cascading Version");
                return;
            }

            String releaseMinor = versionProperties.getProperty("cascading.release.minor", null);
            String releaseBuild = versionProperties.getProperty("build.number", null);
            String releaseFull;

            if (releaseMinor == null)
                releaseFull = releaseMajor;
            else if (releaseBuild == null)
                releaseFull = String.format("%s.%s", releaseMajor, releaseMinor);
            else
                releaseFull = String.format("%s.%s%s", releaseMajor, releaseMinor, releaseBuild);

            System.out.println(String.format("Using Cascading %s", releaseFull));
        } catch (IOException exception) {
            System.out.println("Unknown Cascading Version");
        }
    }

    /**
     * Loads a properties resource through the class loader of {@link Cascade}
     * into the given properties, always closing the stream.
     *
     * @return {@code false} if the resource does not exist
     */
    private static boolean loadResource(Properties properties, String resource) throws IOException {
        InputStream stream = Cascade.class.getClassLoader().getResourceAsStream(resource);

        if (stream == null)
            return false;

        try {
            properties.load(stream);
        } finally {
            stream.close();
        }

        return true;
    }

    /**
     * Prints the name of the binary license of this release, or
     * "Unspecified License" if it cannot be read from the license resource.
     */
    private static void printLicense() {
        System.out.print("This release is licensed under the ");

        String license = null;

        try {
            license = readBinaryLicense();
        } catch (IOException ignored) {
            // best effort only: an unreadable license file is reported as unspecified
        }

        System.out.println(license == null ? "Unspecified License" : license);
    }

    /**
     * Reads the value of the first "Binary License:" line of
     * {@code /LOAD-LICENSE.txt}.
     *
     * @return the trimmed license name, or {@code null} if the resource or
     *         the line is missing
     */
    private static String readBinaryLicense() throws IOException {
        InputStream stream = Main.class.getResourceAsStream("/LOAD-LICENSE.txt");

        if (stream == null)
            return null;

        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));

        try {
            String line;

            while ((line = reader.readLine()) != null) {
                if (line.startsWith(LICENSE_PREFIX))
                    return line.substring(LICENSE_PREFIX.length()).trim();
            }

            return null;
        } finally {
            reader.close();
        }
    }

    /**
     * Prints license, Cascading version and the option usage to standard out,
     * then terminates the JVM with status 1.
     *
     * @param parser parser whose usage should be printed
     */
    protected static void printUsageAndExit(CmdLineParser parser) {
        System.out.println("cascading.load [options...]");

        printLicense();
        printCascadingVersion();

        System.out.println("");
        System.out.println("Usage:");
        parser.printUsage(System.out);

        System.exit(1);
    }

}