cascading.flow.Flow.java Source code

Java tutorial

Introduction

Here is the source code for cascading.flow.Flow.java

Source

/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading.  If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.flow;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import cascading.CascadingException;
import cascading.cascade.Cascade;
import cascading.pipe.Pipe;
import cascading.stats.FlowStats;
import cascading.tap.Tap;
import cascading.tap.hadoop.HttpFileSystem;
import cascading.tap.hadoop.S3HttpFileSystem;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TupleIterator;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.jgrapht.Graphs;
import org.jgrapht.traverse.TopologicalOrderIterator;
import riffle.process.DependencyIncoming;
import riffle.process.DependencyOutgoing;
import riffle.process.ProcessCleanup;
import riffle.process.ProcessComplete;
import riffle.process.ProcessPrepare;
import riffle.process.ProcessStart;
import riffle.process.ProcessStop;

/**
 * A {@link Pipe} assembly is connected to the necessary number of {@link Tap} sinks and
 * sources into a Flow.  A Flow is then executed to push the incoming source data through
 * the assembly into one or more sinks.
 * <p/>
 * Note that {@link Pipe} assemblies can be reused in multiple Flow instances. They maintain
 * no state regarding the Flow execution. Subsequently, {@link Pipe} assemblies can be given
 * parameters through its calling Flow so they can be built in a generic fashion.
 * <p/>
 * When a Flow is created, an optimized internal representation is created that is then executed
 * within the cluster. Thus any overhead inherent to a given {@link Pipe} assembly will be removed
 * once it's placed in context with the actual execution environment.
 * <p/>
 * <p/>
 * Flows are submitted in order of dependency. If two or more steps do not share the same dependencies and all
 * can be scheduled simultaneously, the {@link #getSubmitPriority()} value determines the order in which
 * all steps will be submitted for execution. The default submit priority is 5.
 * </p>
 * <strong>Properties</strong><br/>
 * <ul>
 * <li>cascading.flow.preservetemporaryfiles</li>
 * <li>cascading.flow.stopjobsonexit</li>
 * </ul>
 *
 * @see cascading.flow.FlowConnector
 */
@riffle.process.Process
public class Flow implements Runnable {
    /** Field LOG is the log4j logger for this class; it gates the info-level messages written while a Flow runs. */
    private static final Logger LOG = Logger.getLogger(Flow.class);

    /** Field hdfsShutdown. NOTE(review): not referenced in this part of the file; presumably a shared shutdown thread - confirm. */
    private static Thread hdfsShutdown = null;
    /** Field shutdownCount. NOTE(review): not referenced in this part of the file; presumably counts registered shutdown hooks - confirm. */
    private static int shutdownCount = 0;

    /** Field id is the unique ID of this Flow, created lazily by {@link #getID()}. */
    private String id;
    /** Field name is the name of this Flow. */
    private String name;
    /** Field listeners holds the wrapped {@link FlowListener}s, created lazily by {@code getListeners()}. */
    private List<SafeFlowListener> listeners;
    /** Field flowSkipStrategy is the strategy consulted by {@link #isSkipFlow()}; defaults to {@link FlowSkipIfSinkStale}. */
    private FlowSkipStrategy flowSkipStrategy = new FlowSkipIfSinkStale();
    /** Field submitPriority is the submit priority of this Flow; defaults to 5. */
    private int submitPriority = 5;
    /** Field flowStats records the running, stopped, failed and successful state of this Flow. */
    private final FlowStats flowStats; // don't use a listener to set values
    /** Field sources maps names to source Taps. */
    protected Map<String, Tap> sources;
    /** Field sinks maps names to sink Taps. */
    private Map<String, Tap> sinks;
    /** Field traps maps names to trap Taps. */
    private Map<String, Tap> traps;
    /** Field preserveTemporaryFiles, when true, prevents temporary files from being cleaned; read from the properties given at construction. */
    private boolean preserveTemporaryFiles = false;
    /** Field stopJobsOnExit is read from the properties given at construction; defaults to true. */
    protected boolean stopJobsOnExit = true;

    /** Field stepGraph is the graph of FlowStep instances that is iterated in topological order. */
    private StepGraph stepGraph;
    /** Field jobConf is this Flow's private copy of the job configuration, see {@link #getJobConf()}. */
    private transient JobConf jobConf;
    /** Field thread is the thread executing {@link #run()}; null when the Flow is not started. */
    private transient Thread thread;
    /** Field throwable holds the failure captured by {@link #run()}, if any; cleared by {@link #complete()}. */
    private Throwable throwable;
    /** Field stop is set to true once {@link #stop()} has been called on a started Flow. */
    private boolean stop;

    /** Field pipeGraph */
    private ElementGraph pipeGraph; // only used for documentation purposes

    /** Field steps caches the topologically ordered steps returned by {@link #getSteps()}. */
    private transient List<FlowStep> steps;
    /** Field jobsMap maps step names to their jobs, in topological order. */
    private transient Map<String, Callable<Throwable>> jobsMap;
    /** Field executor is the thread pool created to run the jobs in jobsMap. */
    private transient ExecutorService executor;
    /** Field shutdownHook. NOTE(review): not referenced in this part of the file; presumably the JVM shutdown hook registered by start() - confirm. */
    private transient Thread shutdownHook;

    /**
     * Stores the {@code cascading.flow.preservetemporaryfiles} property in the given properties Map.
     * <p/>
     * When {@code true}, a Flow instance keeps its temporary intermediate data sets, which is useful for
     * debugging. The value is stored in its String form. Defaults to {@code false} when never set.
     *
     * @param properties             of type Map, the properties to update
     * @param preserveTemporaryFiles of type boolean, the value to store
     */
    public static void setPreserveTemporaryFiles(Map<Object, Object> properties, boolean preserveTemporaryFiles) {
        String value = String.valueOf(preserveTemporaryFiles);

        properties.put("cascading.flow.preservetemporaryfiles", value);
    }

    /**
     * Reads the {@code cascading.flow.preservetemporaryfiles} property from the given properties Map.
     *
     * @param properties of type Map, the properties to read
     * @return the parsed boolean value, using "false" when the property is not found
     */
    public static boolean getPreserveTemporaryFiles(Map<Object, Object> properties) {
        String value = Util.getProperty(properties, "cascading.flow.preservetemporaryfiles", "false");

        return Boolean.parseBoolean(value);
    }

    /**
     * Stores the {@code cascading.flow.stopjobsonexit} property in the given properties Map.
     * <p/>
     * When {@code true}, the Flow is told to add a JVM shutdown hook that will kill all running processes if the
     * underlying computing system supports it. Defaults to {@code true} when never set.
     *
     * @param properties     of type Map, the properties to update
     * @param stopJobsOnExit of type boolean, the value to store
     */
    public static void setStopJobsOnExit(Map<Object, Object> properties, boolean stopJobsOnExit) {
        String value = String.valueOf(stopJobsOnExit);

        properties.put("cascading.flow.stopjobsonexit", value);
    }

    /**
     * Reads the {@code cascading.flow.stopjobsonexit} property from the given properties Map.
     *
     * @param properties of type Map, the properties to read
     * @return the parsed boolean value, using "true" when the property is not found
     */
    public static boolean getStopJobsOnExit(Map<Object, Object> properties) {
        String value = Util.getProperty(properties, "cascading.flow.stopjobsonexit", "true");

        return Boolean.parseBoolean(value);
    }

    /**
     * Stores the {@code cascading.flow.job.pollinginterval} property in the given properties Map.
     * <p/>
     * The interval is the time to wait between polling the remote server for the status of a job, in
     * milliseconds. The default value is 5000 msec (5 seconds).
     *
     * @param properties of type Map, the properties to update
     * @param interval   of type long, the polling interval to store
     */
    public static void setJobPollingInterval(Map<Object, Object> properties, long interval) {
        String value = String.valueOf(interval);

        properties.put("cascading.flow.job.pollinginterval", value);
    }

    /**
     * Returns property jobPollingInterval, read from {@code cascading.flow.job.pollinginterval}.
     * <p/>
     * The default is 5000 (5 sec), the same default used when the value is read from a {@link JobConf}.
     *
     * @param properties of type Map, the properties to read
     * @return the polling interval as a long
     * @throws NumberFormatException if the stored value is not a parsable long
     */
    public static long getJobPollingInterval(Map<Object, Object> properties) {
        // fallback was "500", contradicting the documented default and the JobConf based overload
        return Long.parseLong(Util.getProperty(properties, "cascading.flow.job.pollinginterval", "5000"));
    }

    /**
     * Returns property jobPollingInterval, read from {@code cascading.flow.job.pollinginterval} on the given JobConf.
     *
     * @param jobConf of type JobConf, the configuration to read
     * @return the polling interval as a long, 5000 when the property is not set
     */
    public static long getJobPollingInterval(JobConf jobConf) {
        long interval = jobConf.getLong("cascading.flow.job.pollinginterval", 5000);

        return interval;
    }

    /**
     * Method setMaxConcurrentSteps sets the maximum number of steps that a Flow can run concurrently.
     * <p/>
     * By default a Flow will attempt to run all given steps at the same time. But there are occasions
     * where limiting the number of steps helps manage resources. The value is stored under
     * {@code cascading.flow.maxconcurrentsteps}.
     *
     * @param properties         of type Map<Object, Object>, the properties to update
     * @param numConcurrentSteps of type int, the limit to store
     */
    public static void setMaxConcurrentSteps(Map<Object, Object> properties, int numConcurrentSteps) {
        String value = String.valueOf(numConcurrentSteps);

        properties.put("cascading.flow.maxconcurrentsteps", value);
    }

    /**
     * Returns the maximum number of concurrent steps, read from {@code cascading.flow.maxconcurrentsteps}.
     *
     * @param jobConf of type JobConf, the configuration to read
     * @return the configured limit as an int, 0 when the property is not set
     */
    public static int getMaxConcurrentSteps(JobConf jobConf) {
        int maxSteps = jobConf.getInt("cascading.flow.maxconcurrentsteps", 0);

        return maxSteps;
    }

    /**
     * Used for testing. Creates a Flow named "NA" with new FlowStats; no JobConf, taps or step graph are set.
     */
    protected Flow() {
        this.name = "NA";
        // the name must be assigned first, getID() derives the ID from getName()
        this.flowStats = new FlowStats(name, getID());
    }

    /**
     * Creates a named Flow without taps or a step graph.
     *
     * @param properties of type Map, read for the preserveTemporaryFiles and stopJobsOnExit settings
     * @param jobConf    of type JobConf, copied if not null
     * @param name       of type String, the name of this Flow
     */
    protected Flow(Map<Object, Object> properties, JobConf jobConf, String name) {
        this.name = name;
        // the name must be assigned first, getID() derives the ID from getName()
        this.flowStats = new FlowStats(name, getID());
        setJobConf(jobConf);
        initFromProperties(properties);
    }

    /**
     * Creates a Flow from a pipe graph, a step graph and the given source, sink and trap Taps.
     * <p/>
     * Any Tap that is also a {@link FlowListener} is registered as a listener, and every Tap has
     * {@code flowInit} called with this Flow.
     *
     * @param properties of type Map, read for the preserveTemporaryFiles and stopJobsOnExit settings
     * @param jobConf    of type JobConf, copied if not null
     * @param name       of type String, the name of this Flow
     * @param pipeGraph  of type ElementGraph
     * @param stepGraph  of type StepGraph
     * @param sources    of type Map, the source Taps by name
     * @param sinks      of type Map, the sink Taps by name
     * @param traps      of type Map, the trap Taps by name
     */
    protected Flow(Map<Object, Object> properties, JobConf jobConf, String name, ElementGraph pipeGraph,
            StepGraph stepGraph, Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps) {
        this.name = name;
        this.pipeGraph = pipeGraph;
        this.stepGraph = stepGraph;
        // the name must be assigned first, getID() derives the ID from getName()
        this.flowStats = new FlowStats(name, getID());
        setJobConf(jobConf);
        setSources(sources);
        setSinks(sinks);
        setTraps(traps);
        initFromProperties(properties);
        // taps are initialized last, once all three maps are in place
        initFromTaps();
    }

    /**
     * Creates a Flow from a step graph and the given source, sink and trap Taps, without a pipe graph.
     * <p/>
     * Any Tap that is also a {@link FlowListener} is registered as a listener, and every Tap has
     * {@code flowInit} called with this Flow.
     *
     * @param properties of type Map, read for the preserveTemporaryFiles and stopJobsOnExit settings
     * @param jobConf    of type JobConf, copied if not null
     * @param name       of type String, the name of this Flow
     * @param stepGraph  of type StepGraph
     * @param sources    of type Map, the source Taps by name
     * @param sinks      of type Map, the sink Taps by name
     * @param traps      of type Map, the trap Taps by name
     */
    protected Flow(Map<Object, Object> properties, JobConf jobConf, String name, StepGraph stepGraph,
            Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps) {
        this.name = name;
        this.stepGraph = stepGraph;
        // the name must be assigned first, getID() derives the ID from getName()
        this.flowStats = new FlowStats(name, getID());
        setJobConf(jobConf);
        setSources(sources);
        setSinks(sinks);
        setTraps(traps);
        initFromProperties(properties);
        // taps are initialized last, once all three maps are in place
        initFromTaps();
    }

    /**
     * Copies the preserveTemporaryFiles and stopJobsOnExit settings from the given properties into this Flow.
     *
     * @param properties of type Map, the properties to read
     */
    private void initFromProperties(Map<Object, Object> properties) {
        this.preserveTemporaryFiles = getPreserveTemporaryFiles(properties);
        this.stopJobsOnExit = getStopJobsOnExit(properties);
    }

    /** Calls {@code flowInit} on every source, sink and trap Tap, in that order. */
    private void initFromTaps() {
        initFromTaps(this.sources);
        initFromTaps(this.sinks);
        initFromTaps(this.traps);
    }

    /**
     * Calls {@code flowInit} with this Flow on every Tap in the given Map.
     *
     * @param taps of type Map, the Taps to initialize
     */
    private void initFromTaps(Map<String, Tap> taps) {
        Collection<Tap> values = taps.values();

        for (Tap each : values) {
            each.flowInit(this);
        }
    }

    /**
     * Method getName returns the name of this Flow object.
     *
     * @return the name (type String) given to this Flow
     */
    public String getName() {
        return this.name;
    }

    /**
     * Method setName sets the name of this Flow object.
     *
     * @param name of type String, the new name
     */
    protected void setName(String name) {
        this.name = name;
    }

    /**
     * Method getID returns the ID of this Flow object.
     * <p/>
     * The ID value is a long HEX String used to identify this instance globally. Subsequent Flow
     * instances created with identical parameters will not return the same ID.
     * <p/>
     * The ID is created from the current name on first call and the same value is returned afterwards.
     *
     * @return the ID (type String) of this Flow object.
     */
    public String getID() {
        if (id != null)
            return id;

        id = Util.createUniqueID(getName());

        return id;
    }

    /**
     * Method getSubmitPriority returns the submitPriority of this Flow object.
     * <p/>
     * 10 is lowest, 1 is the highest, 5 is the default.
     *
     * @return the submitPriority (type int) of this Flow object.
     */
    public int getSubmitPriority() {
        return this.submitPriority;
    }

    /**
     * Method setSubmitPriority sets the submitPriority of this Flow object.
     * <p/>
     * 10 is lowest, 1 is the highest, 5 is the default. The given value is stored as is.
     *
     * @param submitPriority the submitPriority of this Flow object.
     */
    public void setSubmitPriority(int submitPriority) {
        this.submitPriority = submitPriority;
    }

    /**
     * Method setSources sets the source Taps of this Flow, first registering any that are FlowListeners.
     *
     * @param sources of type Map, the source Taps by name
     */
    protected void setSources(Map<String, Tap> sources) {
        Collection<Tap> taps = sources.values();

        addListeners(taps);
        this.sources = sources;
    }

    /**
     * Method setSinks sets the sink Taps of this Flow, first registering any that are FlowListeners.
     *
     * @param sinks of type Map, the sink Taps by name
     */
    protected void setSinks(Map<String, Tap> sinks) {
        Collection<Tap> taps = sinks.values();

        addListeners(taps);
        this.sinks = sinks;
    }

    /**
     * Method setTraps sets the trap Taps of this Flow, first registering any that are FlowListeners.
     *
     * @param traps of type Map, the trap Taps by name
     */
    protected void setTraps(Map<String, Tap> traps) {
        Collection<Tap> taps = traps.values();

        addListeners(taps);
        this.traps = traps;
    }

    /**
     * Method setStepGraph sets the stepGraph of this Flow object.
     *
     * @param stepGraph of type StepGraph
     */
    protected void setStepGraph(StepGraph stepGraph) {
        this.stepGraph = stepGraph;
    }

    /**
     * Method setJobConf stores a private copy of the given JobConf on this Flow.
     * <p/>
     * The copy has the http, https and s3tp file system implementations registered, and the ID of this
     * Flow stored under {@code cascading.flow.id}. A null argument leaves this Flow unchanged.
     *
     * @param jobConf of type JobConf, may be null
     */
    private void setJobConf(JobConf jobConf) {
        if (jobConf == null) // this is ok, getJobConf will pass a default parent in
            return;

        JobConf copy = new JobConf(jobConf); // prevent local values from being shared
        this.jobConf = copy;

        String httpFileSystem = HttpFileSystem.class.getName();

        copy.set("fs.http.impl", httpFileSystem);
        copy.set("fs.https.impl", httpFileSystem);
        copy.set("fs.s3tp.impl", S3HttpFileSystem.class.getName());

        // set the ID for future reference
        copy.set("cascading.flow.id", getID());
    }

    /**
     * Method getJobConf returns the jobConf of this Flow object.
     * <p/>
     * If none has been set yet, one is created from a new default JobConf on first call.
     *
     * @return the jobConf (type JobConf) of this Flow object.
     */
    public JobConf getJobConf() {
        if (jobConf != null)
            return jobConf;

        setJobConf(new JobConf());

        return jobConf;
    }

    /**
     * Method setProperty sets the given key and value on the underlying properties system, which is the
     * JobConf returned by {@link #getJobConf()}.
     *
     * @param key   of type String
     * @param value of type String
     */
    public void setProperty(String key, String value) {
        JobConf conf = getJobConf();

        conf.set(key, value);
    }

    /**
     * Method getProperty returns the value associated with the given key from the underlying properties
     * system, which is the JobConf returned by {@link #getJobConf()}.
     *
     * @param key of type String
     * @return String
     */
    public String getProperty(String key) {
        JobConf conf = getJobConf();

        return conf.get(key);
    }

    /**
     * Method getFlowStats returns the flowStats of this Flow object.
     *
     * @return the flowStats (type FlowStats) created with this Flow.
     */
    public FlowStats getFlowStats() {
        return this.flowStats;
    }

    /**
     * Method addListeners registers every element of the given Collection that is a {@link FlowListener};
     * all other elements are ignored.
     *
     * @param listeners of type Collection, the candidate objects
     */
    void addListeners(Collection listeners) {
        for (Object candidate : listeners) {
            if (!(candidate instanceof FlowListener))
                continue;

            addListener((FlowListener) candidate);
        }
    }

    /**
     * Method getListeners returns the list of wrapped listeners, creating an empty list on first call.
     *
     * @return the listeners (type List) of this Flow object, never null
     */
    List<SafeFlowListener> getListeners() {
        if (listeners != null)
            return listeners;

        listeners = new LinkedList<SafeFlowListener>();

        return listeners;
    }

    /**
     * Method hasListeners returns true if {@link FlowListener} instances have been registered.
     * <p/>
     * Does not create the listener list if it does not exist yet.
     *
     * @return boolean
     */
    public boolean hasListeners() {
        return !(listeners == null || listeners.isEmpty());
    }

    /**
     * Method addListener registers the given flowListener with this instance, wrapped in a
     * {@code SafeFlowListener}.
     *
     * @param flowListener of type FlowListener
     */
    public void addListener(FlowListener flowListener) {
        List<SafeFlowListener> registered = getListeners();

        registered.add(new SafeFlowListener(flowListener));
    }

    /**
     * Method removeListener removes the given flowListener from this instance.
     * <p/>
     * The listener is looked up through a new {@code SafeFlowListener} wrapper, so removal depends on that
     * wrapper's equality.
     *
     * @param flowListener of type FlowListener
     * @return true if the listener was removed
     */
    public boolean removeListener(FlowListener flowListener) {
        List<SafeFlowListener> registered = getListeners();

        return registered.remove(new SafeFlowListener(flowListener));
    }

    /**
     * Method getSources returns the sources of this Flow object as an unmodifiable view.
     *
     * @return the sources (type Map) of this Flow object.
     */
    public Map<String, Tap> getSources() {
        Map<String, Tap> readOnly = Collections.unmodifiableMap(sources);

        return readOnly;
    }

    /**
     * Method getSourcesCollection returns a {@link Collection} of source {@link Tap}s for this Flow object,
     * taken from the unmodifiable view returned by {@link #getSources()}.
     *
     * @return the sourcesCollection (type Collection<Tap>) of this Flow object.
     */
    @DependencyIncoming
    public Collection<Tap> getSourcesCollection() {
        Map<String, Tap> readOnly = getSources();

        return readOnly.values();
    }

    /**
     * Method getSinks returns the sinks of this Flow object as an unmodifiable view.
     *
     * @return the sinks (type Map) of this Flow object.
     */
    public Map<String, Tap> getSinks() {
        Map<String, Tap> readOnly = Collections.unmodifiableMap(sinks);

        return readOnly;
    }

    /**
     * Method getSinksCollection returns a {@link Collection} of sink {@link Tap}s for this Flow object,
     * taken from the unmodifiable view returned by {@link #getSinks()}.
     *
     * @return the sinkCollection (type Collection<Tap>) of this Flow object.
     */
    @DependencyOutgoing
    public Collection<Tap> getSinksCollection() {
        Map<String, Tap> readOnly = getSinks();

        return readOnly.values();
    }

    /**
     * Method getTraps returns the traps of this Flow object as an unmodifiable view.
     *
     * @return the traps (type Map<String, Tap>) of this Flow object.
     */
    public Map<String, Tap> getTraps() {
        Map<String, Tap> readOnly = Collections.unmodifiableMap(traps);

        return readOnly;
    }

    /**
     * Method getTrapsCollection returns a {@link Collection} of trap {@link Tap}s for this Flow object,
     * taken from the unmodifiable view returned by {@link #getTraps()}.
     *
     * @return the trapsCollection (type Collection<Tap>) of this Flow object.
     */
    public Collection<Tap> getTrapsCollection() {
        Map<String, Tap> readOnly = getTraps();

        return readOnly.values();
    }

    /**
     * Method getSink returns the first sink of this Flow object, in the iteration order of the sinks Map.
     *
     * @return the sink (type Tap) of this Flow object.
     * @throws java.util.NoSuchElementException if this Flow has no sinks
     */
    public Tap getSink() {
        Collection<Tap> allSinks = sinks.values();

        return allSinks.iterator().next();
    }

    /**
     * Method isPreserveTemporaryFiles returns false if temporary files will be cleaned when this Flow
     * completes, true if they are kept.
     *
     * @return the preserveTemporaryFiles (type boolean) of this Flow object.
     */
    public boolean isPreserveTemporaryFiles() {
        return this.preserveTemporaryFiles;
    }

    /**
     * Method isStopJobsOnExit returns the stopJobsOnExit of this Flow object. Defaults to {@code true}.
     *
     * @return the stopJobsOnExit (type boolean) of this Flow object.
     */
    public boolean isStopJobsOnExit() {
        return this.stopJobsOnExit;
    }

    /**
     * Method getFlowSkipStrategy returns the current {@link cascading.flow.FlowSkipStrategy} used by this Flow.
     *
     * @return FlowSkipStrategy
     */
    public FlowSkipStrategy getFlowSkipStrategy() {
        return this.flowSkipStrategy;
    }

    /**
     * Method setFlowSkipStrategy sets a new {@link cascading.flow.FlowSkipStrategy}, the previous strategy is returned.
     * <p/>
     * FlowSkipStrategy instances define when a Flow instance should be skipped. The default strategy is {@link cascading.flow.FlowSkipIfSinkStale}.
     * An alternative strategy would be {@link cascading.flow.FlowSkipIfSinkExists}.
     * <p/>
     * A FlowSkipStrategy will not be consulted when executing a Flow directly through {@link #start()} or {@link #complete()}. Only
     * when the Flow is executed through a {@link Cascade} instance.
     *
     * @param flowSkipStrategy of type FlowSkipStrategy, may not be null
     * @return the FlowSkipStrategy that was in place before this call
     * @throws IllegalArgumentException if flowSkipStrategy is null
     */
    public FlowSkipStrategy setFlowSkipStrategy(FlowSkipStrategy flowSkipStrategy) {
        if (flowSkipStrategy == null)
            throw new IllegalArgumentException("flowSkipStrategy may not be null");

        FlowSkipStrategy previous = this.flowSkipStrategy;

        this.flowSkipStrategy = flowSkipStrategy;

        return previous;
    }

    /**
     * Method isSkipFlow returns true if the parent {@link Cascade} should skip this Flow instance. True is returned
     * if the current {@link cascading.flow.FlowSkipStrategy} returns true.
     *
     * @return the skipFlow (type boolean) of this Flow object.
     * @throws IOException when thrown by the current strategy
     */
    public boolean isSkipFlow() throws IOException {
        boolean skip = flowSkipStrategy.skipFlow(this);

        return skip;
    }

    /**
     * Method areSinksStale returns true if any of the sinks referenced are out of date in relation to the sources. Or
     * if any sink method {@link Tap#isReplace()} returns true.
     * <p/>
     * Implemented by passing the result of {@link #getSinkModified()} to {@link #areSourcesNewer(long)}.
     *
     * @return boolean
     * @throws IOException when a sink or source cannot be inspected
     */
    public boolean areSinksStale() throws IOException {
        long sinkModified = getSinkModified();

        return areSourcesNewer(sinkModified);
    }

    /**
     * Method areSourcesNewer returns true if any source is newer than the given sinkModified date value.
     * <p/>
     * Sources are checked in iteration order and checking stops at the first newer source. The modification
     * date of the last source inspected is logged at info level, whether or not the method completes normally.
     *
     * @param sinkModified of type long
     * @return boolean
     * @throws IOException   when a source cannot be inspected
     * @throws FlowException when a source does not exist
     */
    public boolean areSourcesNewer(long sinkModified) throws IOException {
        JobConf conf = new JobConf(getJobConf()); // let's not add unused values by accident
        long lastChecked = 0;
        boolean newer = false;

        try {
            for (Tap source : sources.values()) {
                if (!source.pathExists(conf))
                    throw new FlowException("source does not exist: " + source);

                lastChecked = source.getPathModified(conf);

                if (sinkModified < lastChecked) {
                    newer = true;
                    break;
                }
            }

            return newer;
        } finally {
            if (LOG.isInfoEnabled())
                logInfo("source modification date at: " + new Date(lastChecked)); // not oldest, we didnt check them all
        }
    }

    /**
     * Method getSinkModified returns the oldest (smallest) modified date of the sink {@link Tap}s managed by this
     * Flow instance, or one of two marker values.
     * <p/>
     * If zero (0) is returned, at least one of the sink resources does not exist. If minus one (-1) is returned,
     * at least one of the sinks is marked for replace or update ({@link Tap#isReplace()} or {@code isUpdate()}
     * returns true). The markers are assigned, not merged, so when several sinks produce a marker the last one in
     * iteration order is returned. If this Flow has no sinks, {@link Long#MAX_VALUE} is returned.
     *
     * @return the sinkModified (type long) of this Flow object.
     * @throws IOException when a sink cannot be inspected
     */
    public long getSinkModified() throws IOException {
        JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident
        long sinkModified = Long.MAX_VALUE;

        for (Tap sink : sinks.values()) {
            if (sink.isReplace() || sink.isUpdate())
                sinkModified = -1L;
            else {
                if (!sink.pathExists(confCopy))
                    sinkModified = 0L;
                else
                    sinkModified = Math.min(sinkModified, sink.getPathModified(confCopy)); // keep the oldest mod date
            }
        }

        if (LOG.isInfoEnabled()) {
            // the branches are exclusive: without the else a -1 result was also logged as a date
            if (sinkModified == -1L)
                logInfo("atleast one sink is marked for delete");
            else if (sinkModified == 0L)
                logInfo("atleast one sink does not exist");
            else
                logInfo("sink oldest modified date: " + new Date(sinkModified));
        }

        return sinkModified;
    }

    /**
     * Method getSteps returns the steps of this Flow object. They will be in topological order.
     * <p/>
     * The list is built from the step graph on first call and the same list is returned afterwards.
     *
     * @return the steps (type List<FlowStep>) of this Flow object.
     */
    public List<FlowStep> getSteps() {
        if (steps == null) {
            TopologicalOrderIterator<FlowStep, Integer> ordered = new TopologicalOrderIterator<FlowStep, Integer>(stepGraph);

            steps = new ArrayList<FlowStep>();

            while (ordered.hasNext()) {
                FlowStep next = ordered.next();

                steps.add(next);
            }
        }

        return steps;
    }

    /**
     * Method prepare is used by a {@link Cascade} to notify the given Flow it should initialize or clear any resources
     * necessary for {@link #start()} to be called successfully.
     * <p/>
     * Specifically, this implementation calls {@link #deleteSinksIfNotUpdate()}.
     *
     * @throws FlowException when the sinks cannot be deleted, wrapping the underlying IOException
     */
    @ProcessPrepare
    public void prepare() {
        try {
            deleteSinksIfNotUpdate();
        } catch (IOException cause) {
            throw new FlowException("unable to prepare flow", cause);
        }
    }

    /**
     * Method start begins the execution of this Flow instance. It will return immediately. Use the method {@link #complete()}
     * to block until this Flow completes.
     * <p/>
     * Does nothing if this Flow already has a thread or has been stopped. Otherwise the shutdown hook is
     * registered and this Flow is run on a new thread named "flow " plus the Flow name.
     */
    @ProcessStart
    public synchronized void start() {
        // already started, or already stopped
        if (thread != null || stop)
            return;

        registerShutdownHook();

        String threadName = ("flow " + Util.toNull(getName())).trim();

        thread = new Thread(this, threadName);
        thread.start();
    }

    /**
     * Method stop stops all running jobs, killing any currently executing.
     * <p/>
     * Does nothing if this Flow was already stopped or has no thread. Otherwise listeners are notified,
     * the stats are marked stopped unless already finished, all jobs and the executor are shut down, and
     * temporary files are cleaned unless they are to be preserved.
     */
    @ProcessStop
    public synchronized void stop() {
        // already stopped, or never started
        if (stop || thread == null)
            return;

        stop = true;

        fireOnStopping();

        boolean finished = flowStats.isFinished();

        if (!finished)
            flowStats.markStopped();

        internalStopAllJobs();

        handleExecutorShutdown();

        if (isPreserveTemporaryFiles())
            return;

        cleanTemporaryFiles(false); // force cleanup
    }

    /**
     * Method complete starts the current Flow instance if it has not been previously started, then blocks until
     * completion.
     * <p/>
     * Any failure captured while running is rethrown: a {@link CascadingException} as is (a
     * {@link FlowException} first has the Flow name set on it), anything else wrapped in a FlowException. If
     * the Flow itself did not fail but a listener recorded a failure, that failure is thrown wrapped in a
     * FlowException. In all cases the thread, the captured failure and the listener failures are cleared
     * before returning.
     *
     * @throws FlowException when the waiting thread is interrupted, or the Flow or a listener failed
     */
    @ProcessComplete
    public void complete() {
        start();

        try {
            try {
                thread.join();
            } catch (InterruptedException exception) {
                throw new FlowException(getName(), "thread interrupted", exception);
            }

            if (throwable instanceof FlowException)
                ((FlowException) throwable).setFlowName(getName());

            if (throwable instanceof CascadingException)
                throw (CascadingException) throwable;

            if (throwable != null)
                throw new FlowException(getName(), "unhandled exception", throwable);

            if (hasListeners()) {
                for (SafeFlowListener safeFlowListener : getListeners()) {
                    // the flow level throwable is always null here, report the listener's own failure as the cause
                    if (safeFlowListener.throwable != null)
                        throw new FlowException(getName(), "unhandled listener exception",
                                safeFlowListener.throwable);
                }
            }
        } finally {
            thread = null;
            throwable = null;

            if (hasListeners()) {
                for (SafeFlowListener safeFlowListener : getListeners())
                    safeFlowListener.throwable = null;
            }
        }
    }

    /** Method cleanup is the {@link ProcessCleanup} callback of this Flow; this implementation is empty. */
    @ProcessCleanup
    public void cleanup() {
        // intentionally empty
    }

    /**
     * Method openSource opens the first source Tap for reading, in the iteration order of the sources Map.
     *
     * @return TupleEntryIterator
     * @throws IOException when the Tap cannot be opened
     */
    public TupleEntryIterator openSource() throws IOException {
        Tap firstSource = sources.values().iterator().next();

        return firstSource.openForRead(getJobConf());
    }

    /**
     * Method openSource opens the named source Tap for reading.
     *
     * @param name of type String, the key of the source Tap
     * @return TupleEntryIterator
     * @throws IOException when the Tap cannot be opened
     */
    public TupleEntryIterator openSource(String name) throws IOException {
        Tap namedSource = sources.get(name);

        return namedSource.openForRead(getJobConf());
    }

    /**
     * Method openSink opens the first sink Tap for reading, in the iteration order of the sinks Map.
     *
     * @return TupleEntryIterator
     * @throws IOException when the Tap cannot be opened
     */
    public TupleEntryIterator openSink() throws IOException {
        Tap firstSink = sinks.values().iterator().next();

        return firstSink.openForRead(getJobConf());
    }

    /**
     * Method openSink opens the named sink Tap for reading.
     *
     * @param name of type String, the key of the sink Tap
     * @return TupleEntryIterator
     * @throws IOException when the Tap cannot be opened
     */
    public TupleEntryIterator openSink(String name) throws IOException {
        Tap namedSink = sinks.get(name);

        return namedSink.openForRead(getJobConf());
    }

    /**
     * Method openTrap opens the first trap Tap for reading, in the iteration order of the traps Map.
     *
     * @return TupleEntryIterator
     * @throws IOException when the Tap cannot be opened
     */
    public TupleEntryIterator openTrap() throws IOException {
        Tap firstTrap = traps.values().iterator().next();

        return firstTrap.openForRead(getJobConf());
    }

    /**
     * Method openTrap opens the named trap Tap for reading.
     *
     * @param name of type String, the key of the trap Tap
     * @return TupleEntryIterator
     * @throws IOException when the Tap cannot be opened
     */
    public TupleEntryIterator openTrap(String name) throws IOException {
        Tap namedTrap = traps.get(name);

        return namedTrap.openForRead(getJobConf());
    }

    /**
     * Method deleteSinks deletes all sinks, whether or not they are configured for {@link cascading.tap.SinkMode#UPDATE}.
     * <p/>
     * Use with caution.
     *
     * @throws IOException when a sink cannot be deleted
     * @see Flow#deleteSinksIfNotUpdate()
     */
    public void deleteSinks() throws IOException {
        Collection<Tap> targets = sinks.values();

        for (Tap sink : targets) {
            sink.deletePath(getJobConf());
        }
    }

    /**
     * Method deleteSinksIfNotAppend deletes every sink for which {@code Tap#isUpdate()} returns false.
     * <p/>
     * Typically used by a {@link Cascade} before executing the flow if the sinks are stale.
     * <p/>
     * Use with caution.
     *
     * @throws IOException when a sink cannot be deleted
     * @deprecated performs the same check as {@link #deleteSinksIfNotUpdate()}
     */
    @Deprecated
    public void deleteSinksIfNotAppend() throws IOException {
        for (Tap sink : sinks.values()) {
            if (sink.isUpdate())
                continue;

            sink.deletePath(getJobConf());
        }
    }

    /**
     * Method deleteSinksIfNotUpdate deletes all sinks if they are not configured with the {@link cascading.tap.SinkMode#UPDATE} flag,
     * that is, every sink for which {@code Tap#isUpdate()} returns false.
     * <p/>
     * Typically used by a {@link Cascade} before executing the flow if the sinks are stale.
     * <p/>
     * Use with caution.
     *
     * @throws IOException when a sink cannot be deleted
     */
    public void deleteSinksIfNotUpdate() throws IOException {
        for (Tap sink : sinks.values()) {
            if (sink.isUpdate())
                continue;

            sink.deletePath(getJobConf());
        }
    }

    /**
     * Method tapPathExists returns true if the resource represented by the given Tap instance exists, as
     * reported by the Tap for this Flow's JobConf.
     *
     * @param tap of type Tap
     * @return boolean
     * @throws IOException when the resource cannot be inspected
     */
    public boolean tapPathExists(Tap tap) throws IOException {
        JobConf conf = getJobConf();

        return tap.pathExists(conf);
    }

    /**
     * Method openTapForRead returns a {@link TupleEntryIterator} for the given Tap instance, opened with
     * this Flow's JobConf.
     *
     * @param tap of type Tap
     * @return TupleEntryIterator
     * @throws IOException when there is an error opening the resource
     */
    public TupleEntryIterator openTapForRead(Tap tap) throws IOException {
        JobConf conf = getJobConf();

        return tap.openForRead(conf);
    }

    /**
     * Method openTapForWrite returns a {@link TupleEntryCollector} for the given Tap instance, opened with
     * this Flow's JobConf.
     *
     * @param tap of type Tap
     * @return TupleEntryCollector
     * @throws IOException when there is an error opening the resource
     */
    public TupleEntryCollector openTapForWrite(Tap tap) throws IOException {
        JobConf conf = getJobConf();

        return tap.openForWrite(conf);
    }

    /**
     * Method jobsAreLocal returns true if all jobs are executed in-process as a single map and reduce task.
     * <p/>
     * This is decided by comparing the {@code mapred.job.tracker} property to "local", ignoring case. A
     * configuration that does not define the property is treated as local instead of failing with a
     * NullPointerException.
     *
     * @return boolean
     */
    public boolean jobsAreLocal() {
        // NOTE(review): "local" is assumed to be the Hadoop default for an unset job tracker - confirm
        String jobTracker = getJobConf().get("mapred.job.tracker", "local");

        return "local".equalsIgnoreCase(jobTracker);
    }

    /**
     * Method run implements the Runnable run method and should not be called by users.
     * <p/>
     * It marks the stats as running, notifies listeners, builds the jobs map, executes the jobs on a thread
     * pool and inspects each result. Any failure is stored in the throwable field instead of being thrown.
     * Temporary file cleanup, failure handling, the completed notification and shutdown hook removal always
     * run afterwards.
     *
     * @throws IllegalStateException if called while this Flow has no thread, i.e. not through start()
     */
    public void run() {
        if (thread == null)
            throw new IllegalStateException("to start a Flow call start() or complete(), not Runnable#run()");

        Cascade.printBanner();

        try {
            flowStats.markRunning();

            fireOnStarting();

            if (LOG.isInfoEnabled()) {
                logInfo("starting");

                for (Tap source : getSourcesCollection())
                    logInfo(" source: " + source);
                for (Tap sink : getSinksCollection())
                    logInfo(" sink: " + sink);
            }

            initializeNewJobsMap();

            // if jobs are run local, then only use one thread to force execution serially
            int numThreads = jobsAreLocal() ? 1 : getMaxConcurrentSteps(getJobConf());

            // zero means no limit was configured, so use one thread per job
            if (numThreads == 0)
                numThreads = jobsMap.size();

            // still zero means the jobs map is empty
            if (numThreads == 0)
                throw new IllegalStateException("no jobs rendered for flow: " + getName());

            if (LOG.isInfoEnabled()) {
                logInfo(" parallel execution is enabled: " + !jobsAreLocal());
                logInfo(" starting jobs: " + jobsMap.size());
                logInfo(" allocating threads: " + numThreads);
            }

            List<Future<Throwable>> futures = spawnJobs(numThreads);

            // each job returns its failure, or null; the first failure ends the loop
            for (Future<Throwable> future : futures) {
                throwable = future.get();

                if (throwable != null) {
                    if (!stop)
                        internalStopAllJobs();

                    handleExecutorShutdown();
                    break;
                }
            }
        } catch (Throwable throwable) {
            // the catch parameter shadows the field, hence the explicit this
            this.throwable = throwable;
        } finally {
            if (!isPreserveTemporaryFiles())
                cleanTemporaryFiles(stop);

            // marks the stats failed and notifies listeners, unless this Flow was stopped
            handleThrowableAndMarkFailed();

            if (!stop && !flowStats.isFinished())
                flowStats.markSuccessful();

            try {
                fireOnCompleted();
            } finally {
                // the hook is removed even if a completion listener throws
                deregisterShutdownHook();
            }
        }
    }

    /**
     * Method spawnJobs creates a fixed size thread pool and submits every job in the jobs map to it,
     * blocking until all of them have completed.
     * <p/>
     * If the flow has already been stopped, no executor is created and an empty list is returned.
     *
     * @param numThreads of type int, the size of the thread pool
     * @return List<Future<Throwable>> one future per job, in the iteration order of the jobs map
     * @throws InterruptedException when the calling thread is interrupted while waiting for the jobs
     */
    private List<Future<Throwable>> spawnJobs(int numThreads) throws InterruptedException {
        if (stop)
            return new ArrayList<Future<Throwable>>();

        executor = Executors.newFixedThreadPool(numThreads);

        try {
            return executor.invokeAll(jobsMap.values()); // todo: consider submit()
        } finally {
            // always shut down, even when invokeAll is interrupted, so the pool threads are not leaked
            executor.shutdown(); // don't accept any more work
        }
    }

    /**
     * Marks the flow stats as failed and fires the onThrowable event, but only when a throwable
     * has been captured and the flow was not stopped.
     */
    private void handleThrowableAndMarkFailed() {
        boolean nothingToReport = throwable == null || stop;

        if (nothingToReport)
            return;

        flowStats.markFailed(throwable);
        fireOnThrowable();
    }

    /**
     * Method getJobsMap returns the map currently held in the jobsMap field.
     *
     * @return the jobs map; may be null if it has not been initialized yet
     */
    synchronized Map<String, Callable<Throwable>> getJobsMap() {
        return this.jobsMap;
    }

    /**
     * Rebuilds the jobs map from the step graph. Steps are visited in topological order and each
     * new job is given the jobs of its predecessor steps and registered with the flow stats.
     *
     * @throws IOException declared for failures while creating a step job
     */
    private synchronized void initializeNewJobsMap() throws IOException {
        // a LinkedHashMap retains insertion order, which here is the topological order
        jobsMap = new LinkedHashMap<String, Callable<Throwable>>();

        for (TopologicalOrderIterator iterator = stepGraph.getTopologicalIterator(); iterator.hasNext();) {
            FlowStep current = (FlowStep) iterator.next();
            FlowStepJob job = current.createFlowStepJob(getJobConf());

            jobsMap.put(current.getName(), job);

            // predecessors were visited earlier, so their jobs are already in the map
            List<FlowStepJob> upstreamJobs = new ArrayList<FlowStepJob>();

            for (FlowStep upstream : Graphs.predecessorListOf(stepGraph, current)) {
                upstreamJobs.add((FlowStepJob) jobsMap.get(upstream.getName()));
            }

            job.setPredecessors(upstreamJobs);
            flowStats.addStepStats(job.getStepStats());
        }
    }

    /**
     * Stops every job in the jobs map, walking the map's values in reverse iteration order.
     * Does nothing besides logging when the jobs map has not been created.
     */
    private void internalStopAllJobs() {
        LOG.warn("stopping jobs");

        try {
            if (jobsMap != null) {
                List<Callable<Throwable>> snapshot = new ArrayList<Callable<Throwable>>(jobsMap.values());

                // walk the snapshot back to front so the last job added is stopped first
                for (int index = snapshot.size() - 1; index >= 0; index--) {
                    ((FlowStepJob) snapshot.get(index)).stop();
                }
            }
        } finally {
            LOG.warn("stopped jobs");
        }
    }

    /**
     * Waits up to five minutes for the job executor to terminate. Does nothing when no executor
     * has been created. An interrupt while waiting is ignored.
     */
    private void handleExecutorShutdown() {
        if (executor != null) {
            LOG.warn("shutting down job executor");

            final long timeoutSeconds = 5 * 60;

            try {
                executor.awaitTermination(timeoutSeconds, TimeUnit.SECONDS);
            } catch (InterruptedException ignored) {
                // ignore
            }

            LOG.warn("shutdown complete");
        }
    }

    /** Calls onCompleted on every registered listener; does nothing when there are no listeners. */
    private void fireOnCompleted() {
        if (!hasListeners())
            return;

        if (LOG.isDebugEnabled()) {
            logDebug("firing onCompleted event: " + getListeners().size());
        }

        for (FlowListener listener : getListeners()) {
            listener.onCompleted(this);
        }
    }

    /**
     * Calls onThrowable on every registered listener with the captured throwable. If at least one
     * listener reports the throwable as handled, the throwable field is cleared.
     */
    private void fireOnThrowable() {
        if (!hasListeners())
            return;

        if (LOG.isDebugEnabled()) {
            logDebug("firing onThrowable event: " + getListeners().size());
        }

        boolean handledByAny = false;

        // every listener is always invoked, even after one has already handled the throwable
        for (FlowListener listener : getListeners()) {
            if (listener.onThrowable(this, throwable))
                handledByAny = true;
        }

        if (handledByAny) {
            throwable = null;
        }
    }

    /** Calls onStopping on every registered listener; does nothing when there are no listeners. */
    private void fireOnStopping() {
        if (!hasListeners())
            return;

        if (LOG.isDebugEnabled()) {
            logDebug("firing onStopping event: " + getListeners().size());
        }

        for (FlowListener listener : getListeners()) {
            listener.onStopping(this);
        }
    }

    /** Calls onStarting on every registered listener; does nothing when there are no listeners. */
    private void fireOnStarting() {
        if (!hasListeners())
            return;

        if (LOG.isDebugEnabled()) {
            logDebug("firing onStarting event: " + getListeners().size());
        }

        for (FlowListener listener : getListeners()) {
            listener.onStarting(this);
        }
    }

    /**
     * Asks every step to clean up using this Flow's job configuration, unless the given flag is set.
     *
     * @param stop of type boolean, when true nothing is cleaned
     */
    private void cleanTemporaryFiles(boolean stop) {
        if (!stop) { // unstable to call fs operations during shutdown
            for (FlowStep flowStep : getSteps()) {
                flowStep.clean(getJobConf());
            }
        }
    }

    /**
     * Registers a JVM shutdown hook that stops this Flow and then calls callHdfsShutdownHook().
     * Nothing is registered when isStopJobsOnExit() is false.
     */
    private void registerShutdownHook() {
        if (isStopJobsOnExit()) {
            // counts this registration and fetches the HDFS shutdown hook on first use
            getHdfsShutdownHook();

            Thread hook = new Thread() {
                @Override
                public void run() {
                    // qualified so the call targets the Flow, not Thread#stop()
                    Flow.this.stop();

                    callHdfsShutdownHook();
                }
            };

            shutdownHook = hook;
            Runtime.getRuntime().addShutdownHook(shutdownHook);
        }
    }

    /**
     * Decrements the shared shutdown counter and, once it reaches zero, starts the stored
     * HDFS shutdown hook if there is one.
     */
    private synchronized static void callHdfsShutdownHook() {
        shutdownCount -= 1;

        boolean counterDrained = shutdownCount == 0;

        if (counterDrained && hdfsShutdown != null) {
            hdfsShutdown.start();
        }
    }

    /**
     * Increments the shared shutdown counter and, if no HDFS shutdown hook has been stored yet,
     * obtains one from Util.getHDFSShutdownHook().
     */
    private synchronized static void getHdfsShutdownHook() {
        shutdownCount += 1;

        boolean hookMissing = hdfsShutdown == null;

        if (hookMissing) {
            hdfsShutdown = Util.getHDFSShutdownHook();
        }
    }

    /**
     * Removes this Flow's shutdown hook from the runtime, but only when isStopJobsOnExit() is true
     * and the flow has not been stopped.
     */
    private void deregisterShutdownHook() {
        boolean hookShouldBeRemoved = isStopJobsOnExit() && !stop;

        if (hookShouldBeRemoved) {
            Runtime.getRuntime().removeShutdownHook(shutdownHook);
        }
    }

    /**
     * Returns the flow name followed by ": " (when a name is set) and then the string form of
     * every step, concatenated without separators.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();

        if (getName() != null) {
            text.append(getName());
            text.append(": ");
        }

        for (FlowStep flowStep : getSteps()) {
            text.append(flowStep);
        }

        return text.toString();
    }

    /** Logs the message at INFO level, prefixed with the flow name truncated to 25 characters in brackets. */
    private void logInfo(String message) {
        String prefix = "[" + Util.truncate(getName(), 25) + "] ";

        LOG.info(prefix + message);
    }

    /** Logs the message at DEBUG level, prefixed with the flow name truncated to 25 characters in brackets. */
    private void logDebug(String message) {
        String prefix = "[" + Util.truncate(getName(), 25) + "] ";

        LOG.debug(prefix + message);
    }

    /**
     * Logs the message and throwable at WARN level, with the message prefixed by the flow name
     * truncated to 25 characters in brackets.
     */
    private void logWarn(String message, Throwable throwable) {
        String prefix = "[" + Util.truncate(getName(), 25) + "] ";

        LOG.warn(prefix + message, throwable);
    }

    /**
     * Method writeDOT writes the pipe graph of this Flow instance to the given filename as a DOT file,
     * suitable for import into a graphics package.
     *
     * @param filename of type String
     * @throws UnsupportedOperationException when this Flow has no pipe graph
     */
    public void writeDOT(String filename) {
        if (pipeGraph != null) {
            pipeGraph.writeDOT(filename);
            return;
        }

        throw new UnsupportedOperationException("this flow instance cannot write a DOT file");
    }

    /**
     * Method writeStepsDOT writes the step graph of this Flow instance to the given filename as a DOT file,
     * suitable for import into a graphics package.
     *
     * @param filename of type String
     * @throws UnsupportedOperationException when this Flow has no step graph
     */
    public void writeStepsDOT(String filename) {
        if (stepGraph != null) {
            stepGraph.writeDOT(filename);
            return;
        }

        throw new UnsupportedOperationException("this flow instance cannot write a DOT file");
    }

    /**
     * Returns a new, simple wrapper around this Flow, for use as an edge in a graph where there
     * can only be one instance of every edge. Each call creates a distinct wrapper.
     *
     * @return FlowHolder
     */
    public FlowHolder getHolder() {
        FlowHolder holder = new FlowHolder(this);

        return holder;
    }

    /**
     * Class FlowHolder is a helper class for wrapping Flow instances. It does not override
     * equals or hashCode, so two holders of the same Flow are distinct objects.
     */
    public static class FlowHolder {
        /** Field flow is the wrapped Flow instance, null until assigned */
        public Flow flow;

        /** Creates a holder that wraps nothing yet; the flow field stays null. */
        public FlowHolder() {
            // intentionally empty
        }

        /**
         * Creates a holder wrapping the given Flow.
         *
         * @param flow of type Flow
         */
        public FlowHolder(Flow flow) {
            this.flow = flow;
        }
    }

    /**
     * Class SafeFlowListener safely calls a wrapped FlowListener.
     * <p/>
     * This is done for a few reasons, the primary reason is so exceptions thrown by the Listener
     * can be caught by the calling Thread. Since Flow is asynchronous, much of the work is done in the run() method
     * which in turn is run in a new Thread.
     * <p/>
     * Anything thrown by the wrapped listener is recorded in the throwable field, logged as a warning,
     * and causes the enclosing Flow to be stopped.
     */
    private class SafeFlowListener implements FlowListener {
        /** Field flowListener is the wrapped listener all calls are delegated to */
        final FlowListener flowListener;
        /** Field throwable holds the most recent throwable raised by the wrapped listener, if any */
        Throwable throwable;

        private SafeFlowListener(FlowListener flowListener) {
            this.flowListener = flowListener;
        }

        /** Delegates to the wrapped listener, trapping anything it throws. */
        public void onStarting(Flow flow) {
            try {
                flowListener.onStarting(flow);
            } catch (Throwable caught) {
                handleThrowable(caught);
            }
        }

        /** Delegates to the wrapped listener, trapping anything it throws. */
        public void onStopping(Flow flow) {
            try {
                flowListener.onStopping(flow);
            } catch (Throwable caught) {
                handleThrowable(caught);
            }
        }

        /** Delegates to the wrapped listener, trapping anything it throws. */
        public void onCompleted(Flow flow) {
            try {
                flowListener.onCompleted(flow);
            } catch (Throwable caught) {
                handleThrowable(caught);
            }
        }

        /**
         * Delegates to the wrapped listener, trapping anything it throws.
         *
         * @return the wrapped listener's answer, or false when the wrapped listener threw
         */
        public boolean onThrowable(Flow flow, Throwable flowThrowable) {
            boolean handled = false;

            try {
                handled = flowListener.onThrowable(flow, flowThrowable);
            } catch (Throwable caught) {
                handleThrowable(caught);
            }

            return handled;
        }

        /** Records and logs the listener failure, then stops the enclosing Flow. */
        private void handleThrowable(Throwable throwable) {
            this.throwable = throwable;

            String warning = String.format("flow listener %s threw throwable", flowListener);
            logWarn(warning, throwable);

            // stop this flow
            Flow.this.stop();
        }

        /**
         * Compares the wrapped listeners: another SafeFlowListener is unwrapped first, any other
         * object is handed to the wrapped listener's equals as is.
         */
        public boolean equals(Object object) {
            Object candidate = object;

            if (object instanceof SafeFlowListener) {
                candidate = ((SafeFlowListener) object).flowListener;
            }

            return flowListener.equals(candidate);
        }

        /** Uses the wrapped listener's hash code, in line with equals. */
        public int hashCode() {
            return flowListener.hashCode();
        }
    }

}