// Java tutorial
/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.flow; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import cascading.CascadingException; import cascading.cascade.Cascade; import cascading.pipe.Pipe; import cascading.stats.FlowStats; import cascading.tap.Tap; import cascading.tap.hadoop.HttpFileSystem; import cascading.tap.hadoop.S3HttpFileSystem; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntryIterator; import cascading.tuple.TupleIterator; import cascading.util.Util; import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.Logger; import org.jgrapht.Graphs; import org.jgrapht.traverse.TopologicalOrderIterator; import riffle.process.DependencyIncoming; import riffle.process.DependencyOutgoing; import riffle.process.ProcessCleanup; import riffle.process.ProcessComplete; import 
riffle.process.ProcessPrepare; import riffle.process.ProcessStart; import riffle.process.ProcessStop; /** * A {@link Pipe} assembly is connected to the necessary number of {@link Tap} sinks and * sources into a Flow. A Flow is then executed to push the incoming source data through * the assembly into one or more sinks. * <p/> * Note that {@link Pipe} assemblies can be reused in multiple Flow instances. They maintain * no state regarding the Flow execution. Subsequently, {@link Pipe} assemblies can be given * parameters through its calling Flow so they can be built in a generic fashion. * <p/> * When a Flow is created, an optimized internal representation is created that is then executed * within the cluster. Thus any overhead inherent to a give {@link Pipe} assembly will be removed * once it's placed in context with the actual execution environment. * <p/> * <p/> * Flows are submitted in order of dependency. If two or more steps do not share the same dependencies and all * can be scheduled simultaneously, the {@link #getSubmitPriority()} value determines the order in which * all steps will be submitted for execution. The default submit priority is 5. 
* </p> * <strong>Properties</strong><br/> * <ul> * <li>cascading.flow.preservetemporaryfiles</li> * <li>cascading.flow.stopjobsonexit</li> * </ul> * * @see cascading.flow.FlowConnector */ @riffle.process.Process public class Flow implements Runnable { /** Field LOG */ private static final Logger LOG = Logger.getLogger(Flow.class); /** Field hdfsShutdown */ private static Thread hdfsShutdown = null; /** Field shutdownCount */ private static int shutdownCount = 0; /** Field id */ private String id; /** Field name */ private String name; /** Field listeners */ private List<SafeFlowListener> listeners; /** Field skipStrategy */ private FlowSkipStrategy flowSkipStrategy = new FlowSkipIfSinkStale(); /** Field submitPriority */ private int submitPriority = 5; /** Field flowStats */ private final FlowStats flowStats; // don't use a listener to set values /** Field sources */ protected Map<String, Tap> sources; /** Field sinks */ private Map<String, Tap> sinks; /** Field traps */ private Map<String, Tap> traps; /** Field preserveTemporaryFiles */ private boolean preserveTemporaryFiles = false; /** Field stopJobsOnExit */ protected boolean stopJobsOnExit = true; /** Field stepGraph */ private StepGraph stepGraph; /** Field jobConf */ private transient JobConf jobConf; /** Field thread */ private transient Thread thread; /** Field throwable */ private Throwable throwable; /** Field stop */ private boolean stop; /** Field pipeGraph */ private ElementGraph pipeGraph; // only used for documentation purposes /** Field steps */ private transient List<FlowStep> steps; /** Field jobsMap */ private transient Map<String, Callable<Throwable>> jobsMap; /** Field executor */ private transient ExecutorService executor; /** Field shutdownHook */ private transient Thread shutdownHook; /** * Property preserveTemporaryFiles forces the Flow instance to keep any temporary intermediate data sets. Useful * for debugging. Defaults to {@code false}. 
* * @param properties of type Map * @param preserveTemporaryFiles of type boolean */ public static void setPreserveTemporaryFiles(Map<Object, Object> properties, boolean preserveTemporaryFiles) { properties.put("cascading.flow.preservetemporaryfiles", Boolean.toString(preserveTemporaryFiles)); } /** * Returns property preserveTemporaryFiles. * * @param properties of type Map * @return a boolean */ public static boolean getPreserveTemporaryFiles(Map<Object, Object> properties) { return Boolean.parseBoolean(Util.getProperty(properties, "cascading.flow.preservetemporaryfiles", "false")); } /** * Property stopJobsOnExit will tell the Flow to add a JVM shutdown hook that will kill all running processes if the * underlying computing system supports it. Defaults to {@code true}. * * @param properties of type Map * @param stopJobsOnExit of type boolean */ public static void setStopJobsOnExit(Map<Object, Object> properties, boolean stopJobsOnExit) { properties.put("cascading.flow.stopjobsonexit", Boolean.toString(stopJobsOnExit)); } /** * Returns property stopJobsOnExit. * * @param properties of type Map * @return a boolean */ public static boolean getStopJobsOnExit(Map<Object, Object> properties) { return Boolean.parseBoolean(Util.getProperty(properties, "cascading.flow.stopjobsonexit", "true")); } /** * Property jobPollingInterval will set the time to wait between polling the remote server for the status of a job. * The default value is 5000 msec (5 seconds). * * @param properties of type Map * @param interval of type long */ public static void setJobPollingInterval(Map<Object, Object> properties, long interval) { properties.put("cascading.flow.job.pollinginterval", Long.toString(interval)); } /** * Returns property jobPollingInterval. The default is 5000 (5 sec). 
* * @param properties of type Map * @return a long */ public static long getJobPollingInterval(Map<Object, Object> properties) { return Long.parseLong(Util.getProperty(properties, "cascading.flow.job.pollinginterval", "500")); } public static long getJobPollingInterval(JobConf jobConf) { return jobConf.getLong("cascading.flow.job.pollinginterval", 5000); } /** * Method setMaxConcurrentSteps sets the maximum number of steps that a Flow can run concurrently. * <p/> * By default a Flow will attempt to run all give steps at the same time. But there are occasions * where limiting the number of steps helps manages resources. * * @param properties of type Map<Object, Object> * @param numConcurrentSteps of type int */ public static void setMaxConcurrentSteps(Map<Object, Object> properties, int numConcurrentSteps) { properties.put("cascading.flow.maxconcurrentsteps", Integer.toString(numConcurrentSteps)); } public static int getMaxConcurrentSteps(JobConf jobConf) { return jobConf.getInt("cascading.flow.maxconcurrentsteps", 0); } /** Used for testing. 
*/ protected Flow() { this.name = "NA"; this.flowStats = new FlowStats(name, getID()); } protected Flow(Map<Object, Object> properties, JobConf jobConf, String name) { this.name = name; this.flowStats = new FlowStats(name, getID()); setJobConf(jobConf); initFromProperties(properties); } protected Flow(Map<Object, Object> properties, JobConf jobConf, String name, ElementGraph pipeGraph, StepGraph stepGraph, Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps) { this.name = name; this.pipeGraph = pipeGraph; this.stepGraph = stepGraph; this.flowStats = new FlowStats(name, getID()); setJobConf(jobConf); setSources(sources); setSinks(sinks); setTraps(traps); initFromProperties(properties); initFromTaps(); } protected Flow(Map<Object, Object> properties, JobConf jobConf, String name, StepGraph stepGraph, Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps) { this.name = name; this.stepGraph = stepGraph; this.flowStats = new FlowStats(name, getID()); setJobConf(jobConf); setSources(sources); setSinks(sinks); setTraps(traps); initFromProperties(properties); initFromTaps(); } private void initFromProperties(Map<Object, Object> properties) { preserveTemporaryFiles = getPreserveTemporaryFiles(properties); stopJobsOnExit = getStopJobsOnExit(properties); } private void initFromTaps() { initFromTaps(sources); initFromTaps(sinks); initFromTaps(traps); } private void initFromTaps(Map<String, Tap> taps) { for (Tap tap : taps.values()) tap.flowInit(this); } /** * Method getName returns the name of this Flow object. * * @return the name (type String) of this Flow object. */ public String getName() { return name; } protected void setName(String name) { this.name = name; } /** * Method getID returns the ID of this Flow object. * <p/> * The ID value is a long HEX String used to identify this instance globally. Subsequent Flow * instances created with identical parameters will not return the same ID. 
* * @return the ID (type String) of this Flow object. */ public String getID() { if (id == null) id = Util.createUniqueID(getName()); return id; } /** * Method getSubmitPriority returns the submitPriority of this Flow object. * <p/> * 10 is lowest, 1 is the highest, 5 is the default. * * @return the submitPriority (type int) of this FlowStep object. */ public int getSubmitPriority() { return submitPriority; } /** * Method setSubmitPriority sets the submitPriority of this Flow object. * <p/> * 10 is lowest, 1 is the highest, 5 is the default. * * @param submitPriority the submitPriority of this FlowStep object. */ public void setSubmitPriority(int submitPriority) { this.submitPriority = submitPriority; } protected void setSources(Map<String, Tap> sources) { addListeners(sources.values()); this.sources = sources; } protected void setSinks(Map<String, Tap> sinks) { addListeners(sinks.values()); this.sinks = sinks; } protected void setTraps(Map<String, Tap> traps) { addListeners(traps.values()); this.traps = traps; } protected void setStepGraph(StepGraph stepGraph) { this.stepGraph = stepGraph; } private void setJobConf(JobConf jobConf) { if (jobConf == null) // this is ok, getJobConf will pass a default parent in return; this.jobConf = new JobConf(jobConf); // prevent local values from being shared this.jobConf.set("fs.http.impl", HttpFileSystem.class.getName()); this.jobConf.set("fs.https.impl", HttpFileSystem.class.getName()); this.jobConf.set("fs.s3tp.impl", S3HttpFileSystem.class.getName()); // set the ID for future reference this.jobConf.set("cascading.flow.id", getID()); } /** * Method getJobConf returns the jobConf of this Flow object. * * @return the jobConf (type JobConf) of this Flow object. */ public JobConf getJobConf() { if (jobConf == null) setJobConf(new JobConf()); return jobConf; } /** * Method setProperty sets the given key and value on the underlying properites system. 
* * @param key of type String * @param value of type String */ public void setProperty(String key, String value) { getJobConf().set(key, value); } /** * Method getProperty returns the value associated with the given key from the underlying properties system. * * @param key of type String * @return String */ public String getProperty(String key) { return getJobConf().get(key); } /** * Method getFlowStats returns the flowStats of this Flow object. * * @return the flowStats (type FlowStats) of this Flow object. */ public FlowStats getFlowStats() { return flowStats; } void addListeners(Collection listeners) { for (Object listener : listeners) { if (listener instanceof FlowListener) addListener((FlowListener) listener); } } List<SafeFlowListener> getListeners() { if (listeners == null) listeners = new LinkedList<SafeFlowListener>(); return listeners; } /** * Method hasListeners returns true if {@link FlowListener} instances have been registered. * * @return boolean */ public boolean hasListeners() { return listeners != null && !listeners.isEmpty(); } /** * Method addListener registers the given flowListener with this instance. * * @param flowListener of type FlowListener */ public void addListener(FlowListener flowListener) { getListeners().add(new SafeFlowListener(flowListener)); } /** * Method removeListener removes the given flowListener from this instance. * * @param flowListener of type FlowListener * @return true if the listener was removed */ public boolean removeListener(FlowListener flowListener) { return getListeners().remove(new SafeFlowListener(flowListener)); } /** * Method getSources returns the sources of this Flow object. * * @return the sources (type Map) of this Flow object. */ public Map<String, Tap> getSources() { return Collections.unmodifiableMap(sources); } /** * Method getSourcesCollection returns a {@link Collection} of source {@link Tap}s for this Flow object. * * @return the sourcesCollection (type Collection<Tap>) of this Flow object. 
*/ @DependencyIncoming public Collection<Tap> getSourcesCollection() { return getSources().values(); } /** * Method getSinks returns the sinks of this Flow object. * * @return the sinks (type Map) of this Flow object. */ public Map<String, Tap> getSinks() { return Collections.unmodifiableMap(sinks); } /** * Method getSinksCollection returns a {@link Collection} of sink {@link Tap}s for this Flow object. * * @return the sinkCollection (type Collection<Tap>) of this Flow object. */ @DependencyOutgoing public Collection<Tap> getSinksCollection() { return getSinks().values(); } /** * Method getTraps returns the traps of this Flow object. * * @return the traps (type Map<String, Tap>) of this Flow object. */ public Map<String, Tap> getTraps() { return Collections.unmodifiableMap(traps); } /** * Method getTrapsCollection returns a {@link Collection} of trap {@link Tap}s for this Flow object. * * @return the trapsCollection (type Collection<Tap>) of this Flow object. */ public Collection<Tap> getTrapsCollection() { return getTraps().values(); } /** * Method getSink returns the first sink of this Flow object. * * @return the sink (type Tap) of this Flow object. */ public Tap getSink() { return sinks.values().iterator().next(); } /** * Method isPreserveTemporaryFiles returns false if temporary files will be cleaned when this Flow completes. * * @return the preserveTemporaryFiles (type boolean) of this Flow object. */ public boolean isPreserveTemporaryFiles() { return preserveTemporaryFiles; } /** * Method isStopJobsOnExit returns the stopJobsOnExit of this Flow object. Defaults to {@code true}. * * @return the stopJobsOnExit (type boolean) of this Flow object. */ public boolean isStopJobsOnExit() { return stopJobsOnExit; } /** * Method getFlowSkipStrategy returns the current {@link cascading.flow.FlowSkipStrategy} used by this Flow. 
* * @return FlowSkipStrategy */ public FlowSkipStrategy getFlowSkipStrategy() { return flowSkipStrategy; } /** * Method setFlowSkipStrategy sets a new {@link cascading.flow.FlowSkipStrategy}, the current strategy is returned. * <p/> * FlowSkipStrategy instances define when a Flow instance should be skipped. The default strategy is {@link cascading.flow.FlowSkipIfSinkStale}. * An alternative strategy would be {@link cascading.flow.FlowSkipIfSinkExists}. * <p/> * A FlowSkipStrategy will not be consulted when executing a Flow directly through {@link #start()} or {@link #complete()}. Only * when the Flow is executed through a {@link Cascade} instance. * * @param flowSkipStrategy of type FlowSkipStrategy * @return FlowSkipStrategy */ public FlowSkipStrategy setFlowSkipStrategy(FlowSkipStrategy flowSkipStrategy) { if (flowSkipStrategy == null) throw new IllegalArgumentException("flowSkipStrategy may not be null"); try { return this.flowSkipStrategy; } finally { this.flowSkipStrategy = flowSkipStrategy; } } /** * Method isSkipFlow returns true if the parent {@link Cascade} should skip this Flow instance. True is returned * if the current {@link cascading.flow.FlowSkipStrategy} returns true. * * @return the skipFlow (type boolean) of this Flow object. * @throws IOException when */ public boolean isSkipFlow() throws IOException { return flowSkipStrategy.skipFlow(this); } /** * Method areSinksStale returns true if any of the sinks referenced are out of date in relation to the sources. Or * if any sink method {@link Tap#isReplace()} returns true. * * @return boolean * @throws IOException when */ public boolean areSinksStale() throws IOException { return areSourcesNewer(getSinkModified()); } /** * Method areSourcesNewer returns true if any source is newer than the given sinkModified date value. 
* * @param sinkModified of type long * @return boolean * @throws IOException when */ public boolean areSourcesNewer(long sinkModified) throws IOException { JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident long sourceMod = 0; try { for (Tap source : sources.values()) { if (!source.pathExists(confCopy)) throw new FlowException("source does not exist: " + source); sourceMod = source.getPathModified(confCopy); if (sinkModified < sourceMod) return true; } return false; } finally { if (LOG.isInfoEnabled()) logInfo("source modification date at: " + new Date(sourceMod)); // not oldest, we didnt check them all } } /** * Method getSinkModified returns the youngest modified date of any sink {@link Tap} managed by this Flow instance. * <p/> * If zero (0) is returned, atleast one of the sink resources does not exist. If minus one (-1) is returned, * atleast one of the sinks are marked for delete ({@link Tap#isReplace() returns true}). * * @return the sinkModified (type long) of this Flow object. * @throws IOException when */ public long getSinkModified() throws IOException { JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident long sinkModified = Long.MAX_VALUE; for (Tap sink : sinks.values()) { if (sink.isReplace() || sink.isUpdate()) sinkModified = -1L; else { if (!sink.pathExists(confCopy)) sinkModified = 0L; else sinkModified = Math.min(sinkModified, sink.getPathModified(confCopy)); // return youngest mod date } } if (LOG.isInfoEnabled()) { if (sinkModified == -1L) logInfo("atleast one sink is marked for delete"); if (sinkModified == 0L) logInfo("atleast one sink does not exist"); else logInfo("sink oldest modified date: " + new Date(sinkModified)); } return sinkModified; } /** * Method getSteps returns the steps of this Flow object. They will be in topological order. * * @return the steps (type List<FlowStep>) of this Flow object. 
*/ public List<FlowStep> getSteps() { if (steps != null) return steps; TopologicalOrderIterator topoIterator = new TopologicalOrderIterator<FlowStep, Integer>(stepGraph); steps = new ArrayList<FlowStep>(); while (topoIterator.hasNext()) steps.add((FlowStep) topoIterator.next()); return steps; } /** * Method prepare is used by a {@link Cascade} to notify the given Flow it should initialize or clear any resources * necessary for {@link #start()} to be called successfully. * <p/> * Specifically, this implementation calls {@link #deleteSinksIfNotUpdate()}. * * @throws IOException when */ @ProcessPrepare public void prepare() { try { deleteSinksIfNotUpdate(); } catch (IOException exception) { throw new FlowException("unable to prepare flow", exception); } } /** * Method start begins the execution of this Flow instance. It will return immediately. Use the method {@link #complete()} * to block until this Flow completes. */ @ProcessStart public synchronized void start() { if (thread != null) return; if (stop) return; registerShutdownHook(); thread = new Thread(this, ("flow " + Util.toNull(getName())).trim()); thread.start(); } /** Method stop stops all running jobs, killing any currently executing. */ @ProcessStop public synchronized void stop() { if (stop) return; if (thread == null) return; stop = true; fireOnStopping(); if (!flowStats.isFinished()) flowStats.markStopped(); internalStopAllJobs(); handleExecutorShutdown(); if (!isPreserveTemporaryFiles()) cleanTemporaryFiles(false); // force cleanup } /** Method complete starts the current Flow instance if it has not be previously started, then block until completion. 
*/ @ProcessComplete public void complete() { start(); try { try { thread.join(); } catch (InterruptedException exception) { throw new FlowException(getName(), "thread interrupted", exception); } if (throwable instanceof FlowException) ((FlowException) throwable).setFlowName(getName()); if (throwable instanceof CascadingException) throw (CascadingException) throwable; if (throwable != null) throw new FlowException(getName(), "unhandled exception", throwable); if (hasListeners()) { for (SafeFlowListener safeFlowListener : getListeners()) { if (safeFlowListener.throwable != null) throw new FlowException(getName(), "unhandled listener exception", throwable); } } } finally { thread = null; throwable = null; if (hasListeners()) { for (SafeFlowListener safeFlowListener : getListeners()) safeFlowListener.throwable = null; } } } @ProcessCleanup public void cleanup() { // do nothing } /** * Method openSource opens the first source Tap. * * @return TupleIterator * @throws IOException when */ public TupleEntryIterator openSource() throws IOException { return sources.values().iterator().next().openForRead(getJobConf()); } /** * Method openSource opens the named source Tap. * * @param name of type String * @return TupleIterator * @throws IOException when */ public TupleEntryIterator openSource(String name) throws IOException { return sources.get(name).openForRead(getJobConf()); } /** * Method openSink opens the first sink Tap. * * @return TupleIterator * @throws IOException when */ public TupleEntryIterator openSink() throws IOException { return sinks.values().iterator().next().openForRead(getJobConf()); } /** * Method openSink opens the named sink Tap. * * @param name of type String * @return TupleIterator * @throws IOException when */ public TupleEntryIterator openSink(String name) throws IOException { return sinks.get(name).openForRead(getJobConf()); } /** * Method openTrap opens the first trap Tap. 
* * @return TupleIterator * @throws IOException when */ public TupleEntryIterator openTrap() throws IOException { return traps.values().iterator().next().openForRead(getJobConf()); } /** * Method openTrap opens the named trap Tap. * * @param name of type String * @return TupleIterator * @throws IOException when */ public TupleEntryIterator openTrap(String name) throws IOException { return traps.get(name).openForRead(getJobConf()); } /** * Method deleteSinks deletes all sinks, whether or not they are configured for {@link cascading.tap.SinkMode#UPDATE}. * <p/> * Use with caution. * * @throws IOException when * @see Flow#deleteSinksIfNotUpdate() */ public void deleteSinks() throws IOException { for (Tap tap : sinks.values()) tap.deletePath(getJobConf()); } /** * Method deleteSinksIfNotAppend deletes all sinks if they are not configured with the {@link cascading.tap.SinkMode#APPEND} flag. * <p/> * Typically used by a {@link Cascade} before executing the flow if the sinks are stale. * <p/> * Use with caution. * * @throws IOException when */ @Deprecated public void deleteSinksIfNotAppend() throws IOException { for (Tap tap : sinks.values()) { if (!tap.isUpdate()) tap.deletePath(getJobConf()); } } /** * Method deleteSinksIfNotUpdate deletes all sinks if they are not configured with the {@link cascading.tap.SinkMode#UPDATE} flag. * <p/> * Typically used by a {@link Cascade} before executing the flow if the sinks are stale. * <p/> * Use with caution. * * @throws IOException when */ public void deleteSinksIfNotUpdate() throws IOException { for (Tap tap : sinks.values()) { if (!tap.isUpdate()) tap.deletePath(getJobConf()); } } /** * Method tapExists returns true if the resource represented by the given Tap instance exists. 
* * @param tap of type Tap * @return boolean * @throws IOException when */ public boolean tapPathExists(Tap tap) throws IOException { return tap.pathExists(getJobConf()); } /** * Method openTapForRead return a {@link TupleIterator} for the given Tap instance. * * @param tap of type Tap * @return TupleIterator * @throws IOException when there is an error opening the resource */ public TupleEntryIterator openTapForRead(Tap tap) throws IOException { return tap.openForRead(getJobConf()); } /** * Method openTapForWrite returns a (@link TupleCollector} for the given Tap instance. * * @param tap of type Tap * @return TupleCollector * @throws IOException when there is an error opening the resource */ public TupleEntryCollector openTapForWrite(Tap tap) throws IOException { return tap.openForWrite(getJobConf()); } /** * Method jobsAreLocal returns true if all jobs are executed in-process as a single map and reduce task. * * @return boolean */ public boolean jobsAreLocal() { return getJobConf().get("mapred.job.tracker").equalsIgnoreCase("local"); } /** Method run implements the Runnable run method and should not be called by users. */ public void run() { if (thread == null) throw new IllegalStateException("to start a Flow call start() or complete(), not Runnable#run()"); Cascade.printBanner(); try { flowStats.markRunning(); fireOnStarting(); if (LOG.isInfoEnabled()) { logInfo("starting"); for (Tap source : getSourcesCollection()) logInfo(" source: " + source); for (Tap sink : getSinksCollection()) logInfo(" sink: " + sink); } initializeNewJobsMap(); // if jobs are run local, then only use one thread to force execution serially int numThreads = jobsAreLocal() ? 
1 : getMaxConcurrentSteps(getJobConf()); if (numThreads == 0) numThreads = jobsMap.size(); if (numThreads == 0) throw new IllegalStateException("no jobs rendered for flow: " + getName()); if (LOG.isInfoEnabled()) { logInfo(" parallel execution is enabled: " + !jobsAreLocal()); logInfo(" starting jobs: " + jobsMap.size()); logInfo(" allocating threads: " + numThreads); } List<Future<Throwable>> futures = spawnJobs(numThreads); for (Future<Throwable> future : futures) { throwable = future.get(); if (throwable != null) { if (!stop) internalStopAllJobs(); handleExecutorShutdown(); break; } } } catch (Throwable throwable) { this.throwable = throwable; } finally { if (!isPreserveTemporaryFiles()) cleanTemporaryFiles(stop); handleThrowableAndMarkFailed(); if (!stop && !flowStats.isFinished()) flowStats.markSuccessful(); try { fireOnCompleted(); } finally { deregisterShutdownHook(); } } } private List<Future<Throwable>> spawnJobs(int numThreads) throws InterruptedException { if (stop) return new ArrayList<Future<Throwable>>(); executor = Executors.newFixedThreadPool(numThreads); List<Future<Throwable>> futures = executor.invokeAll(jobsMap.values()); // todo: consider submit() executor.shutdown(); // don't accept any more work return futures; } private void handleThrowableAndMarkFailed() { if (throwable != null && !stop) { flowStats.markFailed(throwable); fireOnThrowable(); } } synchronized Map<String, Callable<Throwable>> getJobsMap() { return jobsMap; } private synchronized void initializeNewJobsMap() throws IOException { // keep topo order jobsMap = new LinkedHashMap<String, Callable<Throwable>>(); TopologicalOrderIterator topoIterator = stepGraph.getTopologicalIterator(); while (topoIterator.hasNext()) { FlowStep step = (FlowStep) topoIterator.next(); FlowStepJob flowStepJob = step.createFlowStepJob(getJobConf()); jobsMap.put(step.getName(), flowStepJob); List<FlowStepJob> predecessors = new ArrayList<FlowStepJob>(); for (FlowStep flowStep : 
Graphs.predecessorListOf(stepGraph, step)) predecessors.add((FlowStepJob) jobsMap.get(flowStep.getName())); flowStepJob.setPredecessors(predecessors); flowStats.addStepStats(flowStepJob.getStepStats()); } } private void internalStopAllJobs() { LOG.warn("stopping jobs"); try { if (jobsMap == null) return; List<Callable<Throwable>> jobs = new ArrayList<Callable<Throwable>>(jobsMap.values()); Collections.reverse(jobs); for (Callable<Throwable> callable : jobs) ((FlowStepJob) callable).stop(); } finally { LOG.warn("stopped jobs"); } } private void handleExecutorShutdown() { if (executor == null) return; LOG.warn("shutting down job executor"); try { executor.awaitTermination(5 * 60, TimeUnit.SECONDS); } catch (InterruptedException exception) { // ignore } LOG.warn("shutdown complete"); } private void fireOnCompleted() { if (hasListeners()) { if (LOG.isDebugEnabled()) logDebug("firing onCompleted event: " + getListeners().size()); for (FlowListener flowListener : getListeners()) flowListener.onCompleted(this); } } private void fireOnThrowable() { if (hasListeners()) { if (LOG.isDebugEnabled()) logDebug("firing onThrowable event: " + getListeners().size()); boolean isHandled = false; for (FlowListener flowListener : getListeners()) isHandled = flowListener.onThrowable(this, throwable) || isHandled; if (isHandled) throwable = null; } } private void fireOnStopping() { if (hasListeners()) { if (LOG.isDebugEnabled()) logDebug("firing onStopping event: " + getListeners().size()); for (FlowListener flowListener : getListeners()) flowListener.onStopping(this); } } private void fireOnStarting() { if (hasListeners()) { if (LOG.isDebugEnabled()) logDebug("firing onStarting event: " + getListeners().size()); for (FlowListener flowListener : getListeners()) flowListener.onStarting(this); } } private void cleanTemporaryFiles(boolean stop) { if (stop) // unstable to call fs operations during shutdown return; for (FlowStep step : getSteps()) step.clean(getJobConf()); } private void 
registerShutdownHook() { if (!isStopJobsOnExit()) return; getHdfsShutdownHook(); shutdownHook = new Thread() { @Override public void run() { Flow.this.stop(); callHdfsShutdownHook(); } }; Runtime.getRuntime().addShutdownHook(shutdownHook); } private synchronized static void callHdfsShutdownHook() { if (--shutdownCount != 0) return; if (hdfsShutdown != null) hdfsShutdown.start(); } private synchronized static void getHdfsShutdownHook() { shutdownCount++; if (hdfsShutdown == null) hdfsShutdown = Util.getHDFSShutdownHook(); } private void deregisterShutdownHook() { if (!isStopJobsOnExit() || stop) return; Runtime.getRuntime().removeShutdownHook(shutdownHook); } @Override public String toString() { StringBuffer buffer = new StringBuffer(); if (getName() != null) buffer.append(getName()).append(": "); for (FlowStep step : getSteps()) buffer.append(step); return buffer.toString(); } private void logInfo(String message) { LOG.info("[" + Util.truncate(getName(), 25) + "] " + message); } private void logDebug(String message) { LOG.debug("[" + Util.truncate(getName(), 25) + "] " + message); } private void logWarn(String message, Throwable throwable) { LOG.warn("[" + Util.truncate(getName(), 25) + "] " + message, throwable); } /** * Method writeDOT writes this Flow instance to the given filename as a DOT file for import into a graphics package. * * @param filename of type String */ public void writeDOT(String filename) { if (pipeGraph == null) throw new UnsupportedOperationException("this flow instance cannot write a DOT file"); pipeGraph.writeDOT(filename); } /** * Method writeStepsDOT writes this Flow step graph to the given filename as a DOT file for import into a graphics package. 
* * @param filename of type String */ public void writeStepsDOT(String filename) { if (stepGraph == null) throw new UnsupportedOperationException("this flow instance cannot write a DOT file"); stepGraph.writeDOT(filename); } /** * Used to return a simple wrapper for use as an edge in a graph where there can only be * one instance of every edge. * * @return FlowHolder */ public FlowHolder getHolder() { return new FlowHolder(this); } /** Class FlowHolder is a helper class for wrapping Flow instances. */ public static class FlowHolder { /** Field flow */ public Flow flow; public FlowHolder() { } public FlowHolder(Flow flow) { this.flow = flow; } } /** * Class SafeFlowListener safely calls a wrapped FlowListener. * <p/> * This is done for a few reasons, the primary reason is so exceptions thrown by the Listener * can be caught by the calling Thread. Since Flow is asyncronous, much of the work is done in the run() method * which in turn is run in a new Thread. */ private class SafeFlowListener implements FlowListener { /** Field flowListener */ final FlowListener flowListener; /** Field throwable */ Throwable throwable; private SafeFlowListener(FlowListener flowListener) { this.flowListener = flowListener; } public void onStarting(Flow flow) { try { flowListener.onStarting(flow); } catch (Throwable throwable) { handleThrowable(throwable); } } public void onStopping(Flow flow) { try { flowListener.onStopping(flow); } catch (Throwable throwable) { handleThrowable(throwable); } } public void onCompleted(Flow flow) { try { flowListener.onCompleted(flow); } catch (Throwable throwable) { handleThrowable(throwable); } } public boolean onThrowable(Flow flow, Throwable flowThrowable) { try { return flowListener.onThrowable(flow, flowThrowable); } catch (Throwable throwable) { handleThrowable(throwable); } return false; } private void handleThrowable(Throwable throwable) { this.throwable = throwable; logWarn(String.format("flow listener %s threw throwable", flowListener), 
throwable); // stop this flow stop(); } public boolean equals(Object object) { if (object instanceof SafeFlowListener) return flowListener.equals(((SafeFlowListener) object).flowListener); return flowListener.equals(object); } public int hashCode() { return flowListener.hashCode(); } } }