com.etsy.arbiter.workflow.WorkflowGraphBuilder.java Source code

Java tutorial

Introduction

Here is the source code for com.etsy.arbiter.workflow.WorkflowGraphBuilder.java

Source

/*
 * Copyright 2015-2016 Etsy
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.etsy.arbiter.workflow;

import com.etsy.arbiter.Action;
import com.etsy.arbiter.Workflow;
import com.etsy.arbiter.config.Config;
import com.etsy.arbiter.exception.WorkflowGraphException;
import com.etsy.arbiter.util.GraphvizGenerator;
import com.etsy.arbiter.util.NamedArgumentInterpolator;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
import org.jgrapht.Graphs;
import org.jgrapht.alg.ConnectivityInspector;
import org.jgrapht.experimental.dag.DirectedAcyclicGraph;
import org.jgrapht.graph.DefaultEdge;

import java.util.*;

/**
 * Takes a workflow graph specified by YAML and builds a graph in an Oozie-friendly format
 * This includes inserting start/end nodes, fork/join pairs, etc
 *
 * @author Andrew Johnson
 */
public class WorkflowGraphBuilder {
    private WorkflowGraphBuilder() {
    }

    // Every fork/join pair needs a unique name
    // To keep the names short, we just number them sequentially
    private static int forkCount = 0;

    /**
     * Build a workflow graph from the workflow definition, inserting fork/join pairs as appropriate for parallel
     *
     * @param workflow Arbiter Workflow object
     * @param config Arbiter Config object
     * @param outputDir Output directory for Graphviz graphs
     * @param generateGraphviz Indicate if Graphviz graphs should be generated for workflows
     * @param graphvizFormat The format in which Graphviz graphs should be generated if enabled
     * @return DirectedAcyclicGraph DAG of the workflow
     * @throws WorkflowGraphException
     */
    public static DirectedAcyclicGraph<Action, DefaultEdge> buildWorkflowGraph(Workflow workflow, Config config,
            String outputDir, boolean generateGraphviz, String graphvizFormat) throws WorkflowGraphException {
        forkCount = 0;
        Map<String, Action> actionsByName = new HashMap<>();
        List<Action> workflowActions = workflow.getActions();

        // Add all the actions to a map of string -> action
        for (Action a : workflowActions) {
            actionsByName.put(a.getName(), a);
        }

        // DAG of the workflow in its raw un-optimized state.
        DirectedAcyclicGraph<Action, DefaultEdge> inputGraph = new DirectedAcyclicGraph<>(DefaultEdge.class);

        // Add all the actions as vertices. At this point there are no connections within the graph, just vertices.
        for (Action a : workflowActions) {
            inputGraph.addVertex(a);
        }

        // We need to traverse a second time so all the vertices are present when adding edges
        for (Action a : workflowActions) {
            if (a.getDependencies() != null) {
                for (String d : a.getDependencies()) {
                    Action source = actionsByName.get(d);
                    if (source == null) {
                        throw new WorkflowGraphException("Missing action for dependency " + d);
                    }

                    // Add the edge between the dep and the action
                    try {
                        inputGraph.addDagEdge(source, a);
                    } catch (DirectedAcyclicGraph.CycleFoundException e) {
                        throw new WorkflowGraphException("Cycle found while building original graph", e);
                    }
                }
            }
        }

        if (generateGraphviz) {
            GraphvizGenerator.generateGraphviz(inputGraph, outputDir + "/" + workflow.getName() + "-input.dot",
                    graphvizFormat);
        }

        // Final DAG we will be returning
        DirectedAcyclicGraph<Action, DefaultEdge> workflowGraph;
        Action startTransitionNode;
        Action endTransitionNode;

        try {
            // Process the graph into its properly connected and organized structure.
            Triple<DirectedAcyclicGraph<Action, DefaultEdge>, Action, Action> workflowGraphTriple = processSubcomponents(
                    inputGraph);
            workflowGraph = workflowGraphTriple.getLeft();
            startTransitionNode = workflowGraphTriple.getMiddle();
            endTransitionNode = workflowGraphTriple.getRight();

            // These are the standard control flow nodes that must be present in every workflow
            Action start = new Action();
            start.setName("start");
            start.setType("start");
            workflowGraph.addVertex(start);
            workflowGraph.addDagEdge(start, startTransitionNode);

            Action end = new Action();
            end.setName("end");
            end.setType("end");
            workflowGraph.addVertex(end);

            if (workflow.getErrorHandler() != null) {
                workflowGraph.addVertex(workflow.getErrorHandler());
                workflowGraph.addDagEdge(workflow.getErrorHandler(), end);
                workflowGraph.addDagEdge(endTransitionNode, workflow.getErrorHandler());
            } else {
                workflowGraph.addDagEdge(endTransitionNode, end);
            }

            // The kill node will be used as the error transition when generating the XML as appropriate
            // These is no need to add any edges to it now
            if (config.getKillMessage() != null && config.getKillName() != null) {
                Action kill = new Action();
                kill.setType("kill");
                kill.setName(config.getKillName());
                kill.setProperty("message", NamedArgumentInterpolator.interpolate(config.getKillMessage(),
                        ImmutableMap.of("name", workflow.getName()), null));
                workflowGraph.addVertex(kill);
            }
        } catch (DirectedAcyclicGraph.CycleFoundException e) {
            throw new WorkflowGraphException("Cycle found while generating workflow", e);
        }

        return workflowGraph;
    }

    /**
     * Recursively insert fork/joins for connected subcomponents of a graph
     *
     * @param vertices The set of vertices to process
     * @param parentGraph The parentGraph graph of these vertices
     * @return DirectedAcyclicGraph A new graph containing all the given vertices with appropriate fork/join pairs inserted
     * @throws WorkflowGraphException
     * @throws DirectedAcyclicGraph.CycleFoundException
     */
    private static DirectedAcyclicGraph<Action, DefaultEdge> buildComponentGraph(Set<Action> vertices,
            DirectedAcyclicGraph<Action, DefaultEdge> parentGraph)
            throws WorkflowGraphException, DirectedAcyclicGraph.CycleFoundException {
        DirectedAcyclicGraph<Action, DefaultEdge> subgraph = buildSubgraph(parentGraph, vertices);

        // Start by pulling out the vertices with no incoming edges
        // These can run in parallel in a fork-join
        Set<Action> initialNodes = new HashSet<>();
        for (Action vertex : subgraph.vertexSet()) {
            if (subgraph.inDegreeOf(vertex) == 0) {
                initialNodes.add(vertex);
            }
        }

        DirectedAcyclicGraph<Action, DefaultEdge> result = new DirectedAcyclicGraph<>(DefaultEdge.class);

        if (initialNodes.isEmpty()) {
            // This is a very odd case, but just in case we'll fail if it happens
            throw new WorkflowGraphException("No nodes with inDegree = 0 found.  This shouldn't happen.");
        } else if (initialNodes.size() == 1) {
            // If there is only one node, we can't put it in a fork/join
            // In this case we'll add just that vertex to the resulting graph
            Action vertex = initialNodes.iterator().next();
            result.addVertex(vertex);
            // Remove the processed vertex so that we have new unprocessed subcomponents
            subgraph.removeVertex(vertex);
        } else {
            // If there are multiple nodes, insert a fork/join pair to run them in parallel
            Pair<Action, Action> forkJoin = addForkJoin(result);
            Action fork = forkJoin.getLeft();
            Action join = forkJoin.getRight();
            for (Action vertex : initialNodes) {
                result.addVertex(vertex);
                result.addDagEdge(fork, vertex);
                result.addDagEdge(vertex, join);
                // Remove the processed vertex so that we have new unprocessed subcomponents
                subgraph.removeVertex(vertex);
            }
        }

        // Now recursively process the graph with the processed nodes removed
        Triple<DirectedAcyclicGraph<Action, DefaultEdge>, Action, Action> subComponentGraphTriple = processSubcomponents(
                subgraph);
        DirectedAcyclicGraph<Action, DefaultEdge> subComponentGraph = subComponentGraphTriple.getLeft();

        // Having processed the subcomponents, we attach the "last" node of the graph created here to
        // the "first" node of the subcomponent graph
        Action noIncoming = subComponentGraphTriple.getMiddle();
        Action noOutgoing = null;

        for (Action vertex : result.vertexSet()) {
            if (noOutgoing == null && result.outDegreeOf(vertex) == 0) {
                noOutgoing = vertex;
            }
        }

        Graphs.addGraph(result, subComponentGraph);
        if (noOutgoing != null && noIncoming != null && !noOutgoing.equals(noIncoming)) {
            result.addDagEdge(noOutgoing, noIncoming);
        }
        return result;
    }

    /**
     * Processes all connected subcomponents of a given graph
     *
     * @param parentGraph The graph for which to process subcomponents
     * @return A Triple with these elements - A new graph with fork/join pairs inserted, the "first" node in this graph, and the "last" node in this graph
     * @throws WorkflowGraphException
     * @throws DirectedAcyclicGraph.CycleFoundException
     */
    private static Triple<DirectedAcyclicGraph<Action, DefaultEdge>, Action, Action> processSubcomponents(
            DirectedAcyclicGraph<Action, DefaultEdge> parentGraph)
            throws WorkflowGraphException, DirectedAcyclicGraph.CycleFoundException {
        ConnectivityInspector<Action, DefaultEdge> inspector = new ConnectivityInspector<>(parentGraph);
        List<Set<Action>> connectedComponents = inspector.connectedSets();

        // Recursively process each connected subcomponent of the graph
        List<DirectedAcyclicGraph<Action, DefaultEdge>> componentGraphs = new ArrayList<>(
                connectedComponents.size());
        for (Set<Action> subComponent : connectedComponents) {
            componentGraphs.add(buildComponentGraph(subComponent, parentGraph));
        }

        DirectedAcyclicGraph<Action, DefaultEdge> result = new DirectedAcyclicGraph<>(DefaultEdge.class);
        for (DirectedAcyclicGraph<Action, DefaultEdge> subSubgraph : componentGraphs) {
            Graphs.addGraph(result, subSubgraph);
        }

        // If we have more than one subcomponent, we must insert a fork/join to run them in parallel
        if (componentGraphs.size() > 1) {
            Pair<Action, Action> forkJoin = addForkJoin(result);
            Action fork = forkJoin.getLeft();
            Action join = forkJoin.getRight();
            for (DirectedAcyclicGraph<Action, DefaultEdge> subSubgraph : componentGraphs) {
                for (Action vertex : subSubgraph.vertexSet()) {
                    // Vertices with no incoming edges attach directly to the fork
                    if (subSubgraph.inDegreeOf(vertex) == 0) {
                        result.addDagEdge(fork, vertex);
                    }
                    // Vertices with no outgoing edges attach directly to the join
                    if (subSubgraph.outDegreeOf(vertex) == 0) {
                        result.addDagEdge(vertex, join);
                    }
                }
            }
        }

        // The graph will now have one node with no outgoing edges and one node with no incoming edges
        // The node with no outgoing edges is the "last" node in the resulting graph
        // The node with no incoming edges is the "first" node in the resulting graph
        // These are pulled out specifically to make it easier to attach the resulting graph into another one
        Action noOutgoing = null;
        Action noIncoming = null;

        for (Action vertex : result.vertexSet()) {
            if (noIncoming == null && result.inDegreeOf(vertex) == 0) {
                noIncoming = vertex;
            }
        }

        for (Action vertex : result.vertexSet()) {
            if (noOutgoing == null && result.outDegreeOf(vertex) == 0) {
                noOutgoing = vertex;
            }
        }

        return Triple.of(result, noIncoming, noOutgoing);
    }

    /**
     * Build a subgraph of a parentGraph graph given a set of vertices
     * This is a new object and not a view on the parentGraph graph
     *
     * @param parentGraph The parentGraph component for the new subgraph to start at
     * @param vertices The set of actions (vertices) making up the subgraph
     * @return DirectedAcyclicGraph A new graph containing only the given vertices
     */
    private static DirectedAcyclicGraph<Action, DefaultEdge> buildSubgraph(
            DirectedAcyclicGraph<Action, DefaultEdge> parentGraph, Set<Action> vertices)
            throws DirectedAcyclicGraph.CycleFoundException {
        // Create a new DAG to serve as the subgraph
        DirectedAcyclicGraph<Action, DefaultEdge> subgraph = new DirectedAcyclicGraph<>(DefaultEdge.class);

        // Add all the vertices for this subgraph
        for (Action vertex : vertices) {
            subgraph.addVertex(vertex);
        }

        // All vertices must exist in the graph before any edges can be added
        for (Action vertex : vertices) {
            // Grab the parentGraph's edges between the parentGraph and this vertex and then add an edge in the subgraph to
            // match what the parentGraph has.
            for (DefaultEdge edge : parentGraph.edgesOf(vertex)) {
                subgraph.addDagEdge(parentGraph.getEdgeSource(edge), parentGraph.getEdgeTarget(edge), edge);
            }
        }

        return subgraph;
    }

    /**
     * Create a fork/join pair and add it to a graph
     *
     * @param parentGraph The graph to which to add the fork/join actions
     * @return A Pair of actions. The left action is the fork and the right action is the join
     */
    private static Pair<Action, Action> addForkJoin(DirectedAcyclicGraph<Action, DefaultEdge> parentGraph) {
        Action fork = new Action();
        fork.setName("fork-" + forkCount);
        fork.setType("fork");

        Action join = new Action();
        join.setName("join-" + forkCount);
        join.setType("join");
        forkCount++;
        parentGraph.addVertex(fork);
        parentGraph.addVertex(join);

        return Pair.of(fork, join);
    }
}