/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.flow;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import cascading.operation.Operation;
import cascading.pipe.Group;
import cascading.pipe.Operator;
import cascading.pipe.Pipe;
import cascading.tap.Tap;
import cascading.tap.TempHfs;
import cascading.tap.hadoop.Hadoop18TapUtil;
import cascading.tap.hadoop.MultiInputFormat;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.Fields;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TuplePair;
import cascading.tuple.hadoop.CoGroupingComparator;
import cascading.tuple.hadoop.CoGroupingPartitioner;
import cascading.tuple.hadoop.GroupingComparator;
import cascading.tuple.hadoop.GroupingPartitioner;
import cascading.tuple.hadoop.GroupingSortingComparator;
import cascading.tuple.hadoop.IndexTupleCoGroupingComparator;
import cascading.tuple.hadoop.ReverseGroupingSortingComparator;
import cascading.tuple.hadoop.ReverseTupleComparator;
import cascading.tuple.hadoop.TupleComparator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.jgrapht.graph.SimpleDirectedGraph;

/**
 * Class FlowStep is an internal representation of a given Job to be executed on a remote cluster. During
 * planning, pipe assemblies are broken down into "steps" and encapsulated in this class.
 * <p/>
 * FlowSteps are submitted in order of dependency. If two or more steps do not share the same dependencies and
 * can be scheduled simultaneously, the {@link #getSubmitPriority()} value determines the order in which
 * they will be submitted for execution. The default submit priority is 5.
 * <p/>
 * This class is for internal use; there are no stable public methods.
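 * <p/>
 * For example, a sketch of raising the priority of every step in a flow (this assumes the
 * parent {@link Flow} exposes its planned steps via a {@code getSteps()} accessor; names
 * here are illustrative):
 * <pre>
 * for( FlowStep step : flow.getSteps() )
 *   step.setSubmitPriority( 1 ); // 1 is the highest, 10 the lowest, 5 is the default
 * </pre>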
 */
public class FlowStep implements Serializable
  {
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( FlowStep.class );

  /** Field properties */
  private Map<Object, Object> properties = null;
  /** Field parentFlowName */
  private String parentFlowName;
  /** Field submitPriority */
  private int submitPriority = 5;
  /** Field name */
  String name;
  /** Field id */
  private int id;
  /** Field graph */
  final SimpleDirectedGraph<FlowElement, Scope> graph = new SimpleDirectedGraph<FlowElement, Scope>( Scope.class );
  /** Field sources */
  final Map<Tap, String> sources = new HashMap<Tap, String>(); // all sources and all sinks must have the same scheme
  /** Field sink */
  protected Tap sink;
  /** Field mapperTraps */
  private final Map<String, Tap> mapperTraps = new HashMap<String, Tap>();
  /** Field reducerTraps */
  private final Map<String, Tap> reducerTraps = new HashMap<String, Tap>();
  /** Field tempSink */
  TempHfs tempSink; // used if we need to bypass
  /** Field group */
  private Group group;

  protected FlowStep( String name, int id )
    {
    this.name = name;
    this.id = id;
    }

  /**
   * Method getID returns the id of this FlowStep object.
   *
   * @return the id (type int) of this FlowStep object.
   */
  public int getID()
    {
    return id;
    }

  /**
   * Method getName returns the name of this FlowStep object.
   *
   * @return the name (type String) of this FlowStep object.
   */
  public String getName()
    {
    return name;
    }

  public void setName( String name )
    {
    if( name == null || name.isEmpty() )
      throw new IllegalArgumentException( "step name may not be null or empty" );

    this.name = name;
    }

  /**
   * Method getParentFlowName returns the parentFlowName of this FlowStep object.
   *
   * @return the parentFlowName (type String) of this FlowStep object.
   */
  public String getParentFlowName()
    {
    return parentFlowName;
    }

  /**
   * Method setParentFlowName sets the parentFlowName of this FlowStep object.
   *
   * @param parentFlowName the parentFlowName of this FlowStep object.
   */
  public void setParentFlowName( String parentFlowName )
    {
    this.parentFlowName = parentFlowName;
    }

  /**
   * Method getStepName returns the stepName of this FlowStep object.
   *
   * @return the stepName (type String) of this FlowStep object.
   */
  public String getStepName()
    {
    return String.format( "%s[%s]", getParentFlowName(), getName() );
    }

  /**
   * Method getSubmitPriority returns the submitPriority of this FlowStep object.
   * <p/>
   * 10 is the lowest, 1 is the highest, 5 is the default.
   *
   * @return the submitPriority (type int) of this FlowStep object.
   */
  public int getSubmitPriority()
    {
    return submitPriority;
    }

  /**
   * Method setSubmitPriority sets the submitPriority of this FlowStep object.
   * <p/>
   * 10 is the lowest, 1 is the highest, 5 is the default.
   *
   * @param submitPriority the submitPriority of this FlowStep object.
   */
  public void setSubmitPriority( int submitPriority )
    {
    this.submitPriority = submitPriority;
    }

  public Group getGroup()
    {
    return group;
    }

  protected void setGroup( Group group )
    {
    this.group = group;
    }

  public Map<String, Tap> getMapperTraps()
    {
    return mapperTraps;
    }

  public Map<String, Tap> getReducerTraps()
    {
    return reducerTraps;
    }

  /**
   * Method getProperties returns the properties of this FlowStep object.
   *
   * @return the properties (type Map<Object, Object>) of this FlowStep object.
   */
  public Map<Object, Object> getProperties()
    {
    if( properties == null )
      properties = new Properties();

    return properties;
    }

  /**
   * Method setProperties sets the properties of this FlowStep object.
   *
   * @param properties the properties of this FlowStep object.
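   * <p/>
   * For example (a sketch; {@code mapred.child.java.opts} is a standard Hadoop job
   * setting, not a key defined by this class):
   * <pre>
   * Properties props = new Properties();
   * props.put( "mapred.child.java.opts", "-Xmx512m" );
   * step.setProperties( props );
   * </pre>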
   */
  public void setProperties( Map<Object, Object> properties )
    {
    this.properties = properties;
    }

  /**
   * Method hasProperties returns {@code true} if there are properties associated with this FlowStep.
   *
   * @return boolean
   */
  public boolean hasProperties()
    {
    return properties != null && !properties.isEmpty();
    }

  protected JobConf getJobConf() throws IOException
    {
    return getJobConf( null );
    }

  protected JobConf getJobConf( JobConf parentConf ) throws IOException
    {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf( parentConf );

    // set values first so they can't break things downstream
    if( hasProperties() )
      {
      for( Map.Entry<Object, Object> entry : getProperties().entrySet() )
        conf.set( entry.getKey().toString(), entry.getValue().toString() );
      }

    // disable warning
    conf.setBoolean( "mapred.used.genericoptionsparser", true );

    conf.setJobName( getStepName() );

    conf.setOutputKeyClass( Tuple.class );
    conf.setOutputValueClass( Tuple.class );

    conf.setMapperClass( FlowMapper.class );
    conf.setReducerClass( FlowReducer.class );

    // set for use by the shuffling phase
    TupleSerialization.setSerializations( conf );

    initFromSources( conf );
    initFromSink( conf );
    initFromTraps( conf );

    if( sink.getScheme().getNumSinkParts() != 0 )
      {
      // if no reducer, set num map tasks to control parts
      if( getGroup() != null )
        conf.setNumReduceTasks( sink.getScheme().getNumSinkParts() );
      else
        conf.setNumMapTasks( sink.getScheme().getNumSinkParts() );
      }

    conf.setOutputKeyComparatorClass( TupleComparator.class );

    if( getGroup() == null )
      {
      conf.setNumReduceTasks( 0 ); // disable reducers
      }
    else
      {
      // must set map output defaults when performing a reduce
      conf.setMapOutputKeyClass( Tuple.class );
      conf.setMapOutputValueClass( Tuple.class );

      // handles the case the groupby sort should be reversed
      if( getGroup().isSortReversed() )
        conf.setOutputKeyComparatorClass( ReverseTupleComparator.class );

      addComparators( conf, "cascading.group.comparator", getGroup().getGroupingSelectors() );

      if( getGroup().isGroupBy() )
        addComparators( conf, "cascading.sort.comparator", getGroup().getSortingSelectors() );

      if( !getGroup().isGroupBy() )
        {
        conf.setPartitionerClass( CoGroupingPartitioner.class );
        conf.setMapOutputKeyClass( IndexTuple.class ); // allows groups to be sorted by index
        conf.setMapOutputValueClass( IndexTuple.class );
        conf.setOutputKeyComparatorClass( IndexTupleCoGroupingComparator.class ); // sorts by group, then by index
        conf.setOutputValueGroupingComparator( CoGroupingComparator.class );
        }

      if( getGroup().isSorted() )
        {
        conf.setPartitionerClass( GroupingPartitioner.class );
        conf.setMapOutputKeyClass( TuplePair.class );

        if( getGroup().isSortReversed() )
          conf.setOutputKeyComparatorClass( ReverseGroupingSortingComparator.class );
        else
          conf.setOutputKeyComparatorClass( GroupingSortingComparator.class );

        // no need to supply a reverse comparator, only equality is checked
        conf.setOutputValueGroupingComparator( GroupingComparator.class );
        }
      }

    // perform last so init above will pass to tasks
    conf.setInt( "cascading.flow.step.id", id );
    conf.set( "cascading.flow.step", Util.serializeBase64( this ) );

    return conf;
    }

  private void addComparators( JobConf conf, String property, Map<String, Fields> map ) throws IOException
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    Fields fields = fieldsIterator.next();

    if( fields.hasComparators() )
      {
      conf.set( property, Util.serializeBase64( fields ) );
      return;
      }

    // use resolved fields if there are no comparators.
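    // no custom Fields comparators were supplied, so only the arity of the resolved key
    // fields (the ".size" property) is recorded, which the grouping/sorting comparators
    // can use in place of a serialized Fields instance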
    Set<Scope> previousScopes = getPreviousScopes( getGroup() );

    fields = previousScopes.iterator().next().getOutValuesFields();

    if( fields.size() != 0 ) // allows Fields.UNKNOWN to be used
      conf.setInt( property + ".size", fields.size() );
    }

  private void initFromTraps( JobConf conf ) throws IOException
    {
    initFromTraps( conf, getMapperTraps() );
    initFromTraps( conf, getReducerTraps() );
    }

  private void initFromTraps( JobConf conf, Map<String, Tap> traps ) throws IOException
    {
    if( !traps.isEmpty() )
      {
      JobConf trapConf = new JobConf( conf );

      for( Tap tap : traps.values() )
        tap.sinkInit( trapConf );
      }
    }

  private void initFromSources( JobConf conf ) throws IOException
    {
    JobConf[] fromJobs = new JobConf[ sources.size() ];
    int i = 0;

    for( Tap tap : sources.keySet() )
      {
      fromJobs[ i ] = new JobConf( conf );
      tap.sourceInit( fromJobs[ i ] );
      fromJobs[ i ].set( "cascading.step.source", Util.serializeBase64( tap ) );
      i++;
      }

    MultiInputFormat.addInputFormat( conf, fromJobs );
    }

  private void initFromSink( JobConf conf ) throws IOException
    {
    // init sink first so tempSink can take precedence
    if( sink != null )
      sink.sinkInit( conf );

    // tempSink exists because sink is writeDirect
    if( tempSink != null )
      tempSink.sinkInit( conf );
    }

  public TapIterator openSourceForRead( JobConf conf ) throws IOException
    {
    return new TapIterator( sources.keySet().iterator().next(), conf );
    }

  public TupleEntryIterator openSinkForRead( JobConf conf ) throws IOException
    {
    return sink.openForRead( conf );
    }

  public Tap getMapperTrap( String name )
    {
    return getMapperTraps().get( name );
    }

  public Tap getReducerTrap( String name )
    {
    return getReducerTraps().get( name );
    }

  /**
   * Method getPreviousScopes returns the previous Scope instances. If the flowElement is a Group (specifically a CoGroup),
   * there will be more than one instance.
   *
   * @param flowElement of type FlowElement
   * @return Set<Scope>
   */
  public Set<Scope> getPreviousScopes( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    return graph.incomingEdgesOf( flowElement );
    }

  /**
   * Method getNextScope returns the next Scope instance in the graph. There will always be only one next.
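   * <p/>
   * When a FlowElement may have more than one outgoing edge, use {@link #getNextScopes(FlowElement)} instead.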
   *
   * @param flowElement of type FlowElement
   * @return Scope
   */
  public Scope getNextScope( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    Set<Scope> set = graph.outgoingEdgesOf( flowElement );

    if( set.size() != 1 )
      throw new IllegalStateException( "should only be one scope after current flow element: " + flowElement + " found: " + set.size() );

    return set.iterator().next();
    }

  public Set<Scope> getNextScopes( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    return graph.outgoingEdgesOf( flowElement );
    }

  private void assertFlowElement( FlowElement flowElement )
    {
    if( !graph.containsVertex( flowElement ) )
      {
      String message = "unable to find %s in plan, class and serializable fields must implement #hashCode() and #equals()";

      if( flowElement instanceof Pipe )
        message = Util.formatTrace( (Pipe) flowElement, String.format( message, "pipe" ) );
      else if( flowElement instanceof Tap )
        message = Util.formatTrace( (Tap) flowElement, String.format( message, "tap" ) );

      throw new IllegalStateException( message );
      }
    }

  public FlowElement getNextFlowElement( Scope scope )
    {
    return graph.getEdgeTarget( scope );
    }

  public String getSourceName( Tap source )
    {
    return sources.get( source );
    }

  public Collection<Operation> getAllOperations()
    {
    Set<FlowElement> vertices = graph.vertexSet();
    List<Operation> operations = new ArrayList<Operation>();

    // operations implement equals(), so two distinct instances may compare equal
    for( FlowElement vertex : vertices )
      {
      if( vertex instanceof Operator )
        operations.add( ( (Operator) vertex ).getOperation() );
      }

    return operations;
    }

  public boolean containsPipeNamed( String pipeName )
    {
    Set<FlowElement> vertices = graph.vertexSet();

    for( FlowElement vertex : vertices )
      {
      if( vertex instanceof Pipe && ( (Pipe) vertex ).getName().equals( pipeName ) )
        return true;
      }

    return false;
    }

  /**
   * Method clean removes any temporary files used by this FlowStep instance. It will log any IOExceptions thrown.
   *
   * @param jobConf of type JobConf
   */
  public void clean( JobConf jobConf )
    {
    if( tempSink != null )
      {
      try
        {
        tempSink.deletePath( jobConf );
        }
      catch( Exception exception )
        {
        // sink all exceptions, don't fail the app
        logWarn( "unable to remove temporary file: " + tempSink, exception );
        }
      }

    if( sink instanceof TempHfs )
      {
      try
        {
        sink.deletePath( jobConf );
        }
      catch( Exception exception )
        {
        // sink all exceptions, don't fail the app
        logWarn( "unable to remove temporary file: " + sink, exception );
        }
      }
    else
      {
      cleanTap( jobConf, sink );
      }

    for( Tap tap : getMapperTraps().values() )
      cleanTap( jobConf, tap );

    for( Tap tap : getReducerTraps().values() )
      cleanTap( jobConf, tap );
    }

  private void cleanTap( JobConf jobConf, Tap tap )
    {
    try
      {
      Hadoop18TapUtil.cleanupTap( jobConf, tap );
      }
    catch( IOException exception )
      {
      // ignore exception
      }
    }

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;
    if( object == null || getClass() != object.getClass() )
      return false;

    FlowStep flowStep = (FlowStep) object;

    if( name != null ? !name.equals( flowStep.name ) : flowStep.name != null )
      return false;

    return true;
    }

  @Override
  public int hashCode()
    {
    return name != null ? name.hashCode() : 0;
    }

  @Override
  public String toString()
    {
    StringBuffer buffer = new StringBuffer();

    buffer.append( getClass().getSimpleName() );
    buffer.append( "[name: " ).append( getName() ).append( "]" );

    return buffer.toString();
    }

  protected FlowStepJob createFlowStepJob( JobConf parentConf ) throws IOException
    {
    return new FlowStepJob( this, getName(), getJobConf( parentConf ) );
    }

  protected final boolean isInfoEnabled()
    {
    return LOG.isInfoEnabled();
    }

  protected final boolean isDebugEnabled()
    {
    return LOG.isDebugEnabled();
    }

  protected void logDebug( String message )
    {
    LOG.debug( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logInfo( String message )
    {
    LOG.info( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logWarn( String message )
    {
    LOG.warn( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logWarn( String message, Throwable throwable )
    {
    LOG.warn( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message, throwable );
    }

  protected void logError( String message, Throwable throwable )
    {
    LOG.error( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message, throwable );
    }
  }
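// A minimal usage sketch (hypothetical; not part of the original source). FlowStep instances
// are created by the flow planner, so this only illustrates how the pieces above fit together:
//
//   FlowStep step = ...;                                    // obtained from a planned Flow
//   step.setParentFlowName( "wordcount" );                  // used for job naming and log prefixes
//   step.setSubmitPriority( 1 );                            // 1 highest, 10 lowest, 5 default
//   JobConf stepConf = step.getJobConf( parentConf );       // fully configured Hadoop job
//   FlowStepJob job = step.createFlowStepJob( parentConf ); // wraps the conf for submission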