com.dataartisans.flink.cascading.planner.FlinkFlowStep.java Source code

Introduction

Here is the source code for com.dataartisans.flink.cascading.planner.FlinkFlowStep.java. The class is part of the cascading-flink connector and translates a single Cascading flow step into a Flink DataSet program: it walks the step's FlowNodeGraph in topological order and maps each flow node to the corresponding Flink operators (sources, sinks, maps, GroupBy, CoGroup, HashJoin, and Merge).

Source

/*
 * Copyright 2015 data Artisans GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dataartisans.flink.cascading.planner;

import cascading.flow.FlowElement;
import cascading.flow.FlowException;
import cascading.flow.FlowNode;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.FlowStepJob;
import cascading.flow.planner.Scope;
import cascading.flow.planner.graph.ElementGraph;
import cascading.flow.planner.graph.Extent;
import cascading.flow.planner.process.FlowNodeGraph;
import cascading.management.state.ClientState;
import cascading.pipe.Boundary;
import cascading.pipe.CoGroup;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Merge;
import cascading.pipe.Pipe;
import cascading.pipe.Splice;
import cascading.pipe.joiner.BufferJoin;
import cascading.pipe.joiner.InnerJoin;
import cascading.pipe.joiner.Joiner;
import cascading.pipe.joiner.LeftJoin;
import cascading.property.ConfigDef;
import cascading.tap.Tap;
import cascading.tap.hadoop.io.MultiInputFormat;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import com.dataartisans.flink.cascading.runtime.coGroup.bufferJoin.BufferJoinKeyExtractor;
import com.dataartisans.flink.cascading.runtime.coGroup.bufferJoin.CoGroupBufferReducer;
import com.dataartisans.flink.cascading.runtime.coGroup.regularJoin.CoGroupReducer;
import com.dataartisans.flink.cascading.runtime.coGroup.regularJoin.TupleAppendOuterJoiner;
import com.dataartisans.flink.cascading.runtime.coGroup.regularJoin.TupleOuterJoiner;
import com.dataartisans.flink.cascading.runtime.groupBy.GroupByReducer;
import com.dataartisans.flink.cascading.runtime.hashJoin.NaryHashJoinJoiner;
import com.dataartisans.flink.cascading.runtime.util.FlinkFlowProcess;
import com.dataartisans.flink.cascading.runtime.hashJoin.BinaryHashJoinJoiner;
import com.dataartisans.flink.cascading.runtime.hashJoin.JoinPrepareMapper;
import com.dataartisans.flink.cascading.runtime.hashJoin.TupleAppendCrosser;
import com.dataartisans.flink.cascading.runtime.hashJoin.TupleAppendJoiner;
import com.dataartisans.flink.cascading.runtime.hashJoin.HashJoinMapper;
import com.dataartisans.flink.cascading.runtime.each.EachMapper;
import com.dataartisans.flink.cascading.runtime.sink.TapOutputFormat;
import com.dataartisans.flink.cascading.runtime.source.TapInputFormat;
import com.dataartisans.flink.cascading.runtime.util.IdMapper;
import com.dataartisans.flink.cascading.types.tuple.TupleTypeInfo;
import com.dataartisans.flink.cascading.types.tuplearray.TupleArrayTypeInfo;
import com.dataartisans.flink.cascading.util.FlinkConfigConverter;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.GroupReduceOperator;
import org.apache.flink.api.java.operators.JoinOperator;
import org.apache.flink.api.java.operators.Operator;
import org.apache.flink.api.java.operators.PartitionOperator;
import org.apache.flink.api.java.operators.SortPartitionOperator;
import org.apache.flink.api.java.operators.SortedGrouping;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class FlinkFlowStep extends BaseFlowStep<Configuration> {

    private static final Logger LOG = LoggerFactory.getLogger(FlinkFlowStep.class);

    private ExecutionEnvironment env;
    private List<String> classPath;

    public FlinkFlowStep(ExecutionEnvironment env, ElementGraph elementGraph, FlowNodeGraph flowNodeGraph,
            List<String> classPath) {
        super(elementGraph, flowNodeGraph);
        this.env = env;
        this.classPath = classPath;
    }

    /**
     * Configures the Flink program for this step
     */
    public Configuration createInitializedConfig(FlowProcess<Configuration> flowProcess,
            Configuration parentConfig) {

        this.env.getConfig().registerKryoType(Tuple.class);

        Configuration config = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);
        config.set("cascading.flow.step.num", Integer.toString(getOrdinal()));
        HadoopUtil.setIsInflow(config);

        this.setConfig(config);

        return config;
    }

    protected FlowStepJob<Configuration> createFlowStepJob(ClientState clientState,
            FlowProcess<Configuration> flowProcess, Configuration initializedStepConfig) {
        this.buildFlinkProgram(flowProcess);
        return new FlinkFlowStepJob(clientState, this, initializedStepConfig, classPath);
    }

    /**
     * Method clean removes any temporary files used by this FlowStep instance. It will log any IOExceptions thrown.
     *
     * @param config of type Configuration
     */
    public void clean(Configuration config) {

    }

    public ExecutionEnvironment getExecutionEnvironment() {
        return this.env;
    }

    public Plan getFlinkPlan() {
        return this.env.createProgramPlan();
    }

    private void printFlowStep() {
        Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

        LOG.info("Step Cnt: {} ", getFlowNodeGraph().vertexSet().size());
        LOG.info("Edge Cnt: {} ", getFlowNodeGraph().edgeSet().size());
        LOG.info("Src Set: {} ", getFlowNodeGraph().getSourceElements());
        LOG.info("Snk Set: {} ", getFlowNodeGraph().getSinkElements());
        LOG.info("##############");

        while (iterator.hasNext()) {

            FlowNode next = iterator.next();

            LOG.info("Node cnt: {} ", next.getElementGraph().vertexSet().size());
            LOG.info("Edge cnt: {} ", next.getElementGraph().edgeSet().size());

            LOG.info("Nodes: {} ", next.getElementGraph().vertexSet());

            LOG.info("-----------");
        }

    }

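    /**
     * Translates the FlowNodeGraph of this step into a Flink DataSet program.
     * Flow nodes are visited in topological order and matched against known patterns
     * (source, sink, split, GroupBy, CoGroup, HashJoin, Merge, map). The DataSet produced
     * for each node is memoized per boundary element so that successor nodes can pick it up.
     */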
    public void buildFlinkProgram(FlowProcess flowProcess) {

        printFlowStep();

        int numMappers;
        try {
            numMappers = Integer
                    .parseInt(((FlinkFlowProcess) flowProcess).getConfig().get("flink.num.sourceTasks"));
        } catch (NumberFormatException e) {
            numMappers = -1;
        }

        int numReducers;
        try {
            numReducers = Integer
                    .parseInt(((FlinkFlowProcess) flowProcess).getConfig().get("flink.num.shuffleTasks"));
        } catch (NumberFormatException e) {
            numReducers = -1;
        }

        numMappers = (numMappers > 0) ? numMappers : env.getParallelism();
        numReducers = (numReducers > 0) ? numReducers : env.getParallelism();

        FlowNodeGraph flowNodeGraph = getFlowNodeGraph();
        Iterator<FlowNode> iterator = flowNodeGraph.getTopologicalIterator();

        Map<FlowElement, DataSet<?>> flinkMemo = new HashMap<>();

        while (iterator.hasNext()) {
            FlowNode node = iterator.next();

            Set<FlowElement> all = node.getElementGraph().vertexSet();
            Set<FlowElement> sources = getSources(node);
            Set<FlowElement> sinks = getSinks(node);
            Set<FlowElement> inner = getInnerElements(node);

            // SOURCE
            if (sources.size() == 1 && allOfType(sources, Tap.class) && sinks.size() == 1
                    && allOfType(sinks, Boundary.class)) {

                DataSet<Tuple> sourceFlow = translateSource(flowProcess, env, node, numMappers);
                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, sourceFlow);
                }
            }
            // SINK
            else if (sources.size() == 1 && allOfType(sources, Boundary.class) && sinks.size() == 1
                    && allOfType(sinks, Tap.class)) {

                DataSet<Tuple> input = (DataSet<Tuple>) flinkMemo.get(getSingle(sources));
                translateSink(flowProcess, input, node);
            }
            // SPLIT or EMPTY NODE (Single boundary source, one or more boundary sinks & no intermediate nodes)
            else if (sources.size() == 1 && allOfType(sources, Boundary.class) && sinks.size() >= 1
                    && allOfType(sinks, Boundary.class) && inner.size() == 0) {

                // just forward
                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, flinkMemo.get(getSingle(sources)));
                }

            }
            // INPUT OF GROUPBY (one or more boundary sources, single groupBy sink, no inner)
            else if (sources.size() > 0 && allOfType(sources, Boundary.class) && sinks.size() == 1
                    && allOfType(sinks, GroupBy.class) && inner.size() == 0) {

                GroupBy groupBy = (GroupBy) getSingle(sinks);

                List<DataSet<Tuple>> groupByInputs = new ArrayList<>(sources.size());
                for (FlowElement e : sources) {
                    groupByInputs.add((DataSet<Tuple>) flinkMemo.get(e));
                }

                // prepare groupBy input
                DataSet<Tuple> groupByInput = prepareGroupByInput(groupByInputs, node);

                flinkMemo.put(groupBy, groupByInput);
            }
            // GROUPBY (Single groupBy source)
            else if (sources.size() == 1 && allOfType(sources, GroupBy.class)) {

                DataSet<Tuple> input = (DataSet<Tuple>) flinkMemo.get(getSingle(sources));
                DataSet<Tuple> grouped = translateGroupBy(input, node, numReducers);
                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, grouped);
                }
            }
            // INPUT OF COGROUP (one or more boundary sources, single coGroup sink, no inner)
            else if (sources.size() > 0 && allOfType(sources, Boundary.class) && sinks.size() == 1
                    && allOfType(sinks, CoGroup.class) && inner.size() == 0) {

                CoGroup coGroup = (CoGroup) getSingle(sinks);

                List<DataSet<Tuple>> coGroupInputs = new ArrayList<>(sources.size());
                for (FlowElement e : getNodeInputsInOrder(node, coGroup)) {
                    coGroupInputs.add((DataSet<Tuple>) flinkMemo.get(e));
                }

                // prepare coGroup input
                DataSet<?> input = prepareCoGroupInput(coGroupInputs, node, numReducers);
                flinkMemo.put(coGroup, input);
            }
            // COGROUP (Single CoGroup source)
            else if (sources.size() == 1 && allOfType(sources, CoGroup.class)) {

                CoGroup coGroup = (CoGroup) getSingle(sources);

                DataSet<?> input = flinkMemo.get(coGroup);
                DataSet<Tuple> coGrouped = translateCoGroup(input, node, numReducers);

                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, coGrouped);
                }
            }
            // HASHJOIN (one or more boundary source, followed by a single HashJoin)
            else if (sources.size() > 0 && allOfType(sources, Boundary.class)
                    && getCommonSuccessor(sources, node) instanceof HashJoin) {

                HashJoin hashJoin = (HashJoin) getCommonSuccessor(sources, node);
                List<DataSet<Tuple>> hashJoinInputs = new ArrayList<>(sources.size());
                for (FlowElement e : getNodeInputsInOrder(node, hashJoin)) {
                    hashJoinInputs.add((DataSet<Tuple>) flinkMemo.get(e));
                }

                DataSet<Tuple> joined = translateHashJoin(hashJoinInputs, node);
                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, joined);
                }

            }
            // MERGE (multiple boundary sources, single boundary sink, single merge inner)
            else if (sources.size() > 1 && allOfType(sources, Boundary.class) && sinks.size() == 1
                    && allOfType(sinks, Boundary.class) && inner.size() == 1 && allOfType(inner, Merge.class)) {

                List<DataSet<Tuple>> mergeInputs = new ArrayList<>(sources.size());
                for (FlowElement e : sources) {
                    mergeInputs.add((DataSet<Tuple>) flinkMemo.get(e));
                }

                DataSet<Tuple> unioned = translateMerge(mergeInputs, node);
                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, unioned);
                }
            }
            // MAP (Single boundary source AND nothing else matches)
            else if (sources.size() == 1 && allOfType(sources, Boundary.class)) {

                DataSet<Tuple> input = (DataSet<Tuple>) flinkMemo.get(getSingle(sources));
                DataSet<Tuple> mapped = translateMap(input, node);
                for (FlowElement sink : sinks) {
                    flinkMemo.put(sink, mapped);
                }
            } else {
                throw new RuntimeException("Could not translate this node: " + node.getElementGraph().vertexSet());
            }
        }

    }

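    /**
     * Translates a source node into a Flink data source that reads the node's Tap
     * through a Hadoop-based TapInputFormat.
     */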
    private DataSet<Tuple> translateSource(FlowProcess flowProcess, ExecutionEnvironment env, FlowNode node,
            int dop) {

        Tap tap = this.getSingle(node.getSourceTaps());
        JobConf tapConfig = new JobConf(this.getNodeConfig(node));
        tap.sourceConfInit(flowProcess, tapConfig);
        tapConfig.set("cascading.step.source", Tap.id(tap));

        Fields outFields = tap.getSourceFields();
        registerKryoTypes(outFields);

        JobConf sourceConfig = new JobConf(this.getNodeConfig(node));
        MultiInputFormat.addInputFormat(sourceConfig, tapConfig);

        DataSet<Tuple> src = env.createInput(new TapInputFormat(node), new TupleTypeInfo(outFields))
                .name(tap.getIdentifier()).setParallelism(dop)
                .withParameters(FlinkConfigConverter.toFlinkConfig(new Configuration(sourceConfig)));

        return src;

    }

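    /**
     * Translates a sink node by writing its input DataSet to the node's Tap through a
     * TapOutputFormat. The parallelism is derived from the input operator and the number
     * of sink parts requested by the Tap's scheme.
     */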
    private void translateSink(FlowProcess flowProcess, DataSet<Tuple> input, FlowNode node) {

        Tap tap = this.getSingle(node.getSinkTaps());
        Configuration sinkConfig = this.getNodeConfig(node);
        tap.sinkConfInit(flowProcess, sinkConfig);

        int desiredDop = tap.getScheme().getNumSinkParts();
        int inputDop = ((Operator) input).getParallelism();
        int dop;

        if (inputDop == 1) {
            // input operators have dop 1. Probably because they perform a non-keyed reduce or coGroup
            dop = 1;
        } else {
            if (desiredDop > 0) {
                // output dop explicitly set.
                if (input instanceof GroupReduceOperator) {
                    // input is a reduce and we must preserve its sorting.
                    // we must set the desired dop also for reduce and related operators
                    adjustDopOfReduceOrCoGroup((GroupReduceOperator) input, desiredDop);
                }
                dop = desiredDop;
            } else {
                dop = inputDop;
            }
        }

        input.output(new TapOutputFormat(node)).name(tap.getIdentifier()).setParallelism(dop)
                .withParameters(FlinkConfigConverter.toFlinkConfig(sinkConfig));

    }

    /**
     * Adjusts the parallelism of a GroupReduce operator (and all associated operators) that
     * belongs to a Cascading GroupBy or CoGroup pipe.
     * This needs to be done if the result must be emitted in order and a specific sink
     * parallelism is requested.
     *
     * @param reduceOp The operator whose DOP needs to be adjusted
     * @param dop The parallelism to set
     */
    private void adjustDopOfReduceOrCoGroup(GroupReduceOperator reduceOp, int dop) {

        reduceOp.setParallelism(dop);

        DataSet reduceInput = reduceOp.getInput();
        if (reduceInput instanceof SortPartitionOperator) {
            // We have a Reduce operator whose grouping keys need to be reversely ordered.
            // This yields: input -> PartitionOperator -> SortPartitionOperator -> GroupReduceOperator.
            // The DOPs of the PartitionOperator and SortPartitionOperator must be adjusted.
            SortPartitionOperator sortOp = (SortPartitionOperator) reduceInput;
            sortOp.setParallelism(dop);
            DataSet sortInput = sortOp.getInput();
            if (sortInput instanceof PartitionOperator) {
                PartitionOperator partitionOp = (PartitionOperator) sortInput;
                partitionOp.setParallelism(dop);
            }
        } else if (reduceInput instanceof JoinOperator
                && ((JoinOperator) reduceInput).getJoinHint() == JoinHint.REPARTITION_SORT_MERGE) {
            // We have a CoGroup operator whose input is processed by one or more sort-merge outer joins.
            // The DOPs of all outer joins must be adjusted.
            JoinOperator joinOp = (JoinOperator) reduceInput;
            while (joinOp != null && joinOp.getJoinHint() == JoinHint.REPARTITION_SORT_MERGE) {

                joinOp.setParallelism(dop);
                DataSet leftJoinInput = joinOp.getInput1();
                if (leftJoinInput instanceof JoinOperator) {
                    joinOp = (JoinOperator) leftJoinInput;
                } else {
                    joinOp = null;
                }
            }
        }
    }

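    /**
     * Translates a node with a single Boundary source into a mapPartition operator that
     * executes the node's pipe assembly via an EachMapper.
     */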
    private DataSet<Tuple> translateMap(DataSet<Tuple> input, FlowNode node) {

        Fields outFields = getOutScope(node).getOutValuesFields();
        registerKryoTypes(outFields);

        int dop = ((Operator) input).getParallelism();

        return input.mapPartition(new EachMapper(node)).returns(new TupleTypeInfo(outFields))
                .withParameters(this.getFlinkNodeConfig(node)).setParallelism(dop).name("map-" + node.getID());

    }

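    /**
     * Unions all inputs of a GroupBy into a single DataSet.
     */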
    private DataSet<Tuple> prepareGroupByInput(List<DataSet<Tuple>> inputs, FlowNode node) {

        DataSet<Tuple> merged = null;

        for (int i = 0; i < inputs.size(); i++) {

            // get Flink DataSet
            DataSet<Tuple> input = inputs.get(i);

            if (merged == null) {
                merged = input;
            } else {
                merged = merged.union(input);
            }
        }

        return merged;
    }

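    /**
     * Translates a GroupBy node into a Flink groupBy / sortGroup / reduceGroup chain.
     * Key-less (global) grouping and reversed sort order are delegated to dedicated methods.
     */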
    private DataSet<Tuple> translateGroupBy(DataSet<Tuple> input, FlowNode node, int dop) {

        GroupBy groupBy = (GroupBy) node.getSourceElements().iterator().next();

        Scope outScope = getOutScope(node);
        List<Scope> inScopes = getInputScopes(node, groupBy);

        Fields outFields;
        if (outScope.isEvery()) {
            outFields = outScope.getOutGroupingFields();
        } else {
            outFields = outScope.getOutValuesFields();
        }
        registerKryoTypes(outFields);

        // get input scope
        Scope inScope = inScopes.get(0);

        // get grouping keys
        Fields groupKeyFields = groupBy.getKeySelectors().get(inScope.getName());
        // get group sorting keys
        Fields sortKeyFields = groupBy.getSortingSelectors().get(inScope.getName());

        String[] groupKeys = registerKeyFields(input, groupKeyFields);
        String[] sortKeys = null;
        if (sortKeyFields != null) {
            sortKeys = registerKeyFields(input, sortKeyFields);
        }
        Order sortOrder = groupBy.isSortReversed() ? Order.DESCENDING : Order.ASCENDING;

        if (sortOrder == Order.DESCENDING) {
            // translate groupBy with inverse sort order
            return translateInverseSortedGroupBy(input, node, dop, groupKeys, sortKeys, outFields);
        } else if (groupKeys == null || groupKeys.length == 0) {
            // translate key-less (global) groupBy
            return translateGlobalGroupBy(input, node, dop, sortKeys, sortOrder, outFields);
        } else {

            UnsortedGrouping<Tuple> grouping = input.groupBy(groupKeys);

            if (sortKeys != null && sortKeys.length > 0) {
                // translate groupBy with group sorting

                SortedGrouping<Tuple> sortedGrouping = grouping.sortGroup(sortKeys[0], Order.ASCENDING);
                for (int i = 1; i < sortKeys.length; i++) {
                    sortedGrouping = sortedGrouping.sortGroup(sortKeys[i], Order.ASCENDING);
                }

                return sortedGrouping.reduceGroup(new GroupByReducer(node)).returns(new TupleTypeInfo(outFields))
                        .withParameters(this.getFlinkNodeConfig(node)).setParallelism(dop)
                        .name("reduce-" + node.getID());
            } else {
                // translate groupBy without group sorting

                return grouping.reduceGroup(new GroupByReducer(node)).returns(new TupleTypeInfo(outFields))
                        .withParameters(this.getFlinkNodeConfig(node)).setParallelism(dop)
                        .name("reduce-" + node.getID());
            }
        }

    }

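    /**
     * Translates a key-less GroupBy by optionally sorting the data with parallelism 1 and
     * reducing all tuples in a single group.
     */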
    private DataSet<Tuple> translateGlobalGroupBy(DataSet<Tuple> input, FlowNode node, int dop, String[] sortKeys,
            Order sortOrder, Fields outFields) {

        DataSet<Tuple> result = input;

        // sort on sorting keys if necessary
        if (sortKeys != null && sortKeys.length > 0) {

            result = result.sortPartition(sortKeys[0], sortOrder).setParallelism(1).name("reduce-" + node.getID());
            for (int i = 1; i < sortKeys.length; i++) {
                result = result.sortPartition(sortKeys[i], sortOrder).setParallelism(1);
            }
        }

        // group all data; a key-less (global) reduce must run with parallelism 1
        return result.reduceGroup(new GroupByReducer(node)).returns(new TupleTypeInfo(outFields))
                .withParameters(this.getFlinkNodeConfig(node)).setParallelism(1).name("reduce-" + node.getID());

    }

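    /**
     * Translates a GroupBy with reversed sort order by hash partitioning on the grouping
     * keys, sorting each partition in descending order, and applying a grouped reduce.
     */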
    private DataSet<Tuple> translateInverseSortedGroupBy(DataSet<Tuple> input, FlowNode node, int dop,
            String[] groupKeys, String[] sortKeys, Fields outFields) {

        DataSet<Tuple> result = input;

        // hash partition and sort on grouping keys if necessary
        if (groupKeys != null && groupKeys.length > 0) {
            // hash partition
            result = result.partitionByHash(groupKeys).setParallelism(dop).name("reduce-" + node.getID());

            // sort on grouping keys
            result = result.sortPartition(groupKeys[0], Order.DESCENDING).setParallelism(dop)
                    .name("reduce-" + node.getID());
            for (int i = 1; i < groupKeys.length; i++) {
                result = result.sortPartition(groupKeys[i], Order.DESCENDING).setParallelism(dop)
                        .name("reduce-" + node.getID());
            }
        }

        // sort on sorting keys if necessary
        if (sortKeys != null && sortKeys.length > 0) {

            result = result.sortPartition(sortKeys[0], Order.DESCENDING).setParallelism(dop)
                    .name("reduce-" + node.getID());
            for (int i = 1; i < sortKeys.length; i++) {
                result = result.sortPartition(sortKeys[i], Order.DESCENDING).setParallelism(dop)
                        .name("reduce-" + node.getID());
            }
        }

        return result.groupBy(groupKeys).reduceGroup(new GroupByReducer(node)).returns(new TupleTypeInfo(outFields))
                .withParameters(this.getFlinkNodeConfig(node)).setParallelism(dop).name("reduce-" + node.getID());
    }

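    /**
     * Translates a Merge node by unioning all inputs and forwarding them through an
     * identity mapper with the maximum input parallelism.
     */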
    private DataSet<Tuple> translateMerge(List<DataSet<Tuple>> inputs, FlowNode node) {

        DataSet<Tuple> unioned = null;
        TypeInformation<Tuple> type = null;

        int maxDop = -1;

        for (DataSet<Tuple> input : inputs) {
            maxDop = Math.max(maxDop, ((Operator) input).getParallelism());
            if (unioned == null) {
                unioned = input;
                type = input.getType();
            } else {
                unioned = unioned.union(input);
            }
        }
        return unioned.map(new IdMapper()).returns(type).setParallelism(maxDop);

    }

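    /**
     * Prepares the inputs of a CoGroup. Depending on the Joiner and the key fields, the
     * inputs are combined with full outer joins, a Cartesian product, or a tagged union
     * for a BufferJoin.
     */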
    private DataSet<?> prepareCoGroupInput(List<DataSet<Tuple>> inputs, FlowNode node, int dop) {

        CoGroup coGroup = (CoGroup) getSingle(node.getSinkElements());

        Joiner joiner = coGroup.getJoiner();

        int numJoinInputs = coGroup.isSelfJoin() ? coGroup.getNumSelfJoins() + 1 : inputs.size();

        Fields[] inputFields = new Fields[numJoinInputs];
        Fields[] keyFields = new Fields[numJoinInputs];
        String[][] flinkKeys = new String[numJoinInputs][];
        List<DataSet<Tuple>> joinInputs = computeSpliceInputsFieldsKeys(coGroup, node, inputs, inputFields,
                keyFields, flinkKeys);

        if (joiner.getClass().equals(InnerJoin.class)) {
            if (!keyFields[0].isNone()) {
                return prepareFullOuterCoGroupInput(joinInputs, node, inputFields, keyFields, flinkKeys, dop);
            } else {
                // Cartesian product
                return prepareInnerCrossInput(joinInputs, node, inputFields, dop);
            }
        } else if (joiner.getClass().equals(BufferJoin.class)) {
            return prepareBufferCoGroupInput(joinInputs, node, inputFields, keyFields, flinkKeys, dop);
        } else {
            return prepareFullOuterCoGroupInput(joinInputs, node, inputFields, keyFields, flinkKeys, dop);
        }

    }

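    /**
     * Chains sort-merge full outer joins over all CoGroup inputs and collects the joined
     * tuples as (key, Tuple[]) pairs for the subsequent grouped reduce.
     */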
    private DataSet<Tuple2<Tuple, Tuple[]>> prepareFullOuterCoGroupInput(List<DataSet<Tuple>> inputs, FlowNode node,
            Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys, int dop) {

        int numJoinInputs = inputs.size();

        TupleTypeInfo keysTypeInfo = inputFields[0].isDefined()
                ? new TupleTypeInfo(inputFields[0].select(keyFields[0]))
                : new TupleTypeInfo(Fields.UNKNOWN);
        keysTypeInfo.registerKeyFields(keyFields[0]);

        TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                keysTypeInfo, new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, 2)));

        String[] listKeys = new String[flinkKeys[0].length];
        String[] listKeysFwd = new String[flinkKeys[0].length];

        for (int i = 0; i < flinkKeys[0].length; i++) {
            listKeys[i] = "f0." + i;
            listKeysFwd[i] = flinkKeys[0][i] + " -> " + listKeys[i];
        }

        // first outer join with CoGroup
        DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
                .fullOuterJoin(inputs.get(1), JoinHint.REPARTITION_SORT_MERGE).where(flinkKeys[0])
                .equalTo(flinkKeys[1])
                .with(new TupleOuterJoiner(numJoinInputs, inputFields[0], keyFields[0], inputFields[1],
                        keyFields[1]))
                .returns(tupleJoinListsTypeInfo).withForwardedFieldsFirst(listKeysFwd).setParallelism(dop)
                .name("coGroup-" + node.getID());

        // further outer joins with CoGroup
        for (int i = 2; i < inputs.size(); i++) {

            tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(keysTypeInfo,
                    new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, i + 1)));

            tupleJoinLists = tupleJoinLists.fullOuterJoin(inputs.get(i), JoinHint.REPARTITION_SORT_MERGE)
                    .where(listKeys).equalTo(flinkKeys[i])
                    .with(new TupleAppendOuterJoiner(i, numJoinInputs, inputFields[i], keyFields[i]))
                    .returns(tupleJoinListsTypeInfo).withForwardedFieldsFirst(listKeys).setParallelism(dop)
                    .name("coGroup-" + node.getID());
        }

        return tupleJoinLists;

    }

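    /**
     * Builds the Cartesian product of all CoGroup inputs for a key-less InnerJoin by
     * chaining crossWithTiny operators.
     */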
    private DataSet<Tuple2<Tuple, Tuple[]>> prepareInnerCrossInput(List<DataSet<Tuple>> inputs, FlowNode node,
            Fields[] inputFields, int dop) {

        int numJoinInputs = inputs.size();

        TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                new TupleTypeInfo(Fields.UNKNOWN),
                new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, 1)));

        int mapDop = ((Operator) inputs.get(0)).getParallelism();

        // prepare tuple list for join
        DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
                .map(new JoinPrepareMapper(numJoinInputs, null, null)).returns(tupleJoinListsTypeInfo)
                .setParallelism(mapDop).name("coGroup-" + node.getID());

        for (int i = 1; i < inputs.size(); i++) {

            tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                    new TupleTypeInfo(Fields.UNKNOWN),
                    new TupleArrayTypeInfo(numJoinInputs, Arrays.copyOf(inputFields, i + 1)));

            tupleJoinLists = tupleJoinLists.crossWithTiny(inputs.get(i)).with(new TupleAppendCrosser(i))
                    .returns(tupleJoinListsTypeInfo).setParallelism(dop).name("coGroup-" + node.getID());
        }

        return tupleJoinLists;
    }

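    /**
     * Prepares the inputs of a BufferJoin CoGroup by extracting the key of each tuple,
     * tagging it with its input position, and unioning all inputs.
     */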
    private DataSet<Tuple3<Tuple, Integer, Tuple>> prepareBufferCoGroupInput(List<DataSet<Tuple>> inputs,
            FlowNode node, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys, int dop) {

        DataSet<Tuple3<Tuple, Integer, Tuple>> coGroupInput = null;

        for (int i = 0; i < inputs.size(); i++) {

            // get Flink DataSet
            DataSet<Tuple> input = inputs.get(i);

            // get keys
            int[] keyPos = inputFields[i].getPos(keyFields[i]);

            if (keyFields[i].isNone()) {
                // set default key
                keyFields[i] = new Fields("defaultKey");
            }

            TupleTypeInfo keysTypeInfo = inputFields[i].isDefined()
                    ? new TupleTypeInfo(inputFields[i].select(keyFields[i]))
                    : new TupleTypeInfo(Fields.UNKNOWN);

            TypeInformation<Tuple3<Tuple, Integer, Tuple>> keyedType = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                    keysTypeInfo, BasicTypeInfo.INT_TYPE_INFO, new TupleTypeInfo(inputFields[i]));

            int inputDop = ((Operator) input).getParallelism();

            // add mapper
            DataSet<Tuple3<Tuple, Integer, Tuple>> keyedInput = input.map(new BufferJoinKeyExtractor(i, keyPos))
                    .returns(keyedType).setParallelism(inputDop).name("coGroup-" + node.getID());

            // add to groupByInput
            if (coGroupInput == null) {
                coGroupInput = keyedInput;
            } else {
                coGroupInput = coGroupInput.union(keyedInput);
            }
        }

        return coGroupInput;
    }

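    /**
     * Translates a CoGroup node into a grouped reduce over the prepared join input.
     * Key-less CoGroups are reduced with parallelism 1; BufferJoins are reduced with a
     * CoGroupBufferReducer over the tagged union.
     */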
    private DataSet<Tuple> translateCoGroup(DataSet<?> input, FlowNode node, int dop) {

        CoGroup coGroup = (CoGroup) getSingle(node.getSourceElements());

        // get out fields of node
        Scope outScope = getOutScope(node);
        Fields outFields;
        if (outScope.isEvery()) {
            outFields = outScope.getOutGroupingFields();
        } else {
            outFields = outScope.getOutValuesFields();
        }
        registerKryoTypes(outFields);

        // get key and value fields of inputs
        List<Scope> inScopes = getInputScopes(node, coGroup);
        Fields keyFields = coGroup.getKeySelectors().get(inScopes.get(0).getName());

        Joiner joiner = coGroup.getJoiner();

        if (!(joiner instanceof BufferJoin)) {
            if (keyFields != Fields.NONE) {

                String[] groupingKeys = new String[keyFields.size()];
                for (int i = 0; i < groupingKeys.length; i++) {
                    groupingKeys[i] = "f0." + i;
                }

                DataSet<Tuple> joinResult = ((DataSet<Tuple2<Tuple, Tuple[]>>) input).groupBy(groupingKeys)
                        .reduceGroup(new CoGroupReducer(node)).withParameters(this.getFlinkNodeConfig(node))
                        .setParallelism(dop).returns(new TupleTypeInfo(outFields)).name("cogroup-" + node.getID());

                return joinResult;
            } else {
                DataSet<Tuple> joinResult = ((DataSet<Tuple2<Tuple, Tuple[]>>) input)
                        .reduceGroup(new CoGroupReducer(node)).withParameters(this.getFlinkNodeConfig(node))
                        .setParallelism(1).returns(new TupleTypeInfo(outFields)).name("cogroup-" + node.getID());

                return joinResult;
            }
        } else {
            // Buffer Join
            if (keyFields != Fields.NONE) {

                return ((DataSet<Tuple3<Tuple, Integer, Tuple>>) input).groupBy("f0.*")
                        .sortGroup(1, Order.DESCENDING).reduceGroup(new CoGroupBufferReducer(node))
                        .withParameters(this.getFlinkNodeConfig(node)).setParallelism(dop)
                        .returns(new TupleTypeInfo(outFields)).name("coGroup-" + node.getID());
            } else {
                return ((DataSet<Tuple3<Tuple, Integer, Tuple>>) input).sortPartition(1, Order.DESCENDING)
                        .setParallelism(1).reduceGroup(new CoGroupBufferReducer(node))
                        .withParameters(this.getFlinkNodeConfig(node)).setParallelism(1)
                        .returns(new TupleTypeInfo(outFields)).name("coGroup-" + node.getID());
            }
        }
    }

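    /**
     * Translates a HashJoin node. Scalding's WrappedJoiner is unwrapped via reflection if
     * necessary. Supported cases are key-less joins (Cartesian products), InnerJoin, and
     * LeftJoin.
     */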
    private DataSet<Tuple> translateHashJoin(List<DataSet<Tuple>> inputs, FlowNode node) {

        HashJoin hashJoin = (HashJoin) getCommonSuccessor(node.getSourceElements(), node);
        Joiner joiner = hashJoin.getJoiner();

        // check if joiner is a Scalding WrappedJoiner and
        //   try to extract the joiner which is wrapped inside
        if (joiner.getClass().getName().equals("com.twitter.scalding.WrappedJoiner")) {
            try {
                Field joinerField = joiner.getClass().getDeclaredField("joiner");
                joinerField.setAccessible(true);
                joiner = (Joiner) joinerField.get(joiner);
            } catch (NoSuchFieldException | IllegalAccessException nsfe) {
                LOG.warn("Could not extract joiner from Scalding's WrappedJoiner. "
                        + "Will continue without extracting joiner.", nsfe);
            }
        }

        int numJoinInputs = hashJoin.isSelfJoin() ? hashJoin.getNumSelfJoins() + 1 : inputs.size();

        Fields[] inputFields = new Fields[numJoinInputs];
        Fields[] keyFields = new Fields[numJoinInputs];
        String[][] flinkKeys = new String[numJoinInputs][];

        List<DataSet<Tuple>> joinInputs = computeSpliceInputsFieldsKeys(hashJoin, node, inputs, inputFields,
                keyFields, flinkKeys);

        if (keyFields[0].isNone()) {
            // Cartesian product
            return translateInnerCrossProduct(node, joinInputs);
        } else if (joiner.getClass().equals(InnerJoin.class)) {
            // inner join with keys
            return translateInnerHashJoin(node, joinInputs, inputFields, keyFields, flinkKeys);
        } else if (joiner.getClass().equals(LeftJoin.class)) {
            return translateLeftHashJoin(node, joinInputs, inputFields, keyFields, flinkKeys);
        } else {
            throw new FlowException(
                    "HashJoin only supports InnerJoin and LeftJoin but got " + joiner.getClass().getName());
        }
    }

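    /**
     * Translates an inner HashJoin into broadcast hash joins. Binary joins are translated
     * directly; n-ary joins chain broadcast joins that append the build-side tuples before
     * the final join applies the actual joiner.
     */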
    private DataSet<Tuple> translateInnerHashJoin(FlowNode node, List<DataSet<Tuple>> inputs, Fields[] inputFields,
            Fields[] keyFields, String[][] flinkKeys) {

        int numJoinInputs = inputs.size();

        // get out fields of node
        Scope outScope = getOutScope(node);
        Fields outFields;
        if (outScope.isEvery()) {
            outFields = outScope.getOutGroupingFields();
        } else {
            outFields = outScope.getOutValuesFields();
        }
        registerKryoTypes(outFields);

        int probeSideDOP = ((Operator) inputs.get(0)).getParallelism();

        if (numJoinInputs == 2) {
            // binary join

            return inputs.get(0).join(inputs.get(1), JoinHint.BROADCAST_HASH_SECOND).where(flinkKeys[0])
                    .equalTo(flinkKeys[1]).with(new BinaryHashJoinJoiner(node, inputFields[0], keyFields[0]))
                    .withParameters(this.getFlinkNodeConfig(node)).setParallelism(probeSideDOP)
                    .returns(new TupleTypeInfo(outFields)).name("hashjoin-" + node.getID());

        } else {
            // nary join

            TupleTypeInfo keysTypeInfo = inputFields[0].isDefined()
                    ? new TupleTypeInfo(inputFields[0].select(keyFields[0]))
                    : new TupleTypeInfo(Fields.UNKNOWN);
            keysTypeInfo.registerKeyFields(keyFields[0]);

            TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                    keysTypeInfo, new TupleArrayTypeInfo(numJoinInputs - 1, Arrays.copyOf(inputFields, 1)));

            int mapDop = ((Operator) inputs.get(0)).getParallelism();

            // prepare tuple list for join
            DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
                    .map(new JoinPrepareMapper(numJoinInputs - 1, inputFields[0], keyFields[0]))
                    .returns(tupleJoinListsTypeInfo).setParallelism(mapDop).name("hashjoin-" + node.getID());

            for (int i = 0; i < flinkKeys[0].length; i++) {
                flinkKeys[0][i] = "f0." + i;
            }

            // join all inputs except last
            for (int i = 1; i < inputs.size() - 1; i++) {

                tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(keysTypeInfo,
                        new TupleArrayTypeInfo(numJoinInputs - 1, Arrays.copyOf(inputFields, i + 1)));

                tupleJoinLists = tupleJoinLists.join(inputs.get(i), JoinHint.BROADCAST_HASH_SECOND)
                        .where(flinkKeys[0]).equalTo(flinkKeys[i]).with(new TupleAppendJoiner(i))
                        .returns(tupleJoinListsTypeInfo).withForwardedFieldsFirst(flinkKeys[0])
                        .setParallelism(probeSideDOP).name("hashjoin-" + node.getID());
            }

            // join last input
            return tupleJoinLists.join(inputs.get(numJoinInputs - 1), JoinHint.BROADCAST_HASH_SECOND)
                    .where(flinkKeys[0]).equalTo(flinkKeys[numJoinInputs - 1])
                    .with(new NaryHashJoinJoiner(node, numJoinInputs)).withParameters(this.getFlinkNodeConfig(node))
                    .setParallelism(probeSideDOP).returns(new TupleTypeInfo(outFields))
                    .name("hashjoin-" + node.getID());
        }
    }

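    /**
     * Translates a left outer HashJoin into broadcast hash joins, analogous to
     * translateInnerHashJoin but with a left outer join as the final (or only) join.
     */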
    private DataSet<Tuple> translateLeftHashJoin(FlowNode node, List<DataSet<Tuple>> inputs, Fields[] inputFields,
            Fields[] keyFields, String[][] flinkKeys) {

        int numJoinInputs = inputs.size();

        // get out fields of node
        Scope outScope = getOutScope(node);
        Fields outFields;
        if (outScope.isEvery()) {
            outFields = outScope.getOutGroupingFields();
        } else {
            outFields = outScope.getOutValuesFields();
        }
        registerKryoTypes(outFields);

        int probeSideDOP = ((Operator) inputs.get(0)).getParallelism();

        if (numJoinInputs == 2) {
            // binary join

            return inputs.get(0).leftOuterJoin(inputs.get(1), JoinHint.BROADCAST_HASH_SECOND).where(flinkKeys[0])
                    .equalTo(flinkKeys[1]).with(new BinaryHashJoinJoiner(node, inputFields[0], keyFields[0]))
                    .withParameters(this.getFlinkNodeConfig(node)).setParallelism(probeSideDOP)
                    .returns(new TupleTypeInfo(outFields)).name("hashjoin-" + node.getID());

        } else {
            // nary join

            TupleTypeInfo keysTypeInfo = inputFields[0].isDefined()
                    ? new TupleTypeInfo(inputFields[0].select(keyFields[0]))
                    : new TupleTypeInfo(Fields.UNKNOWN);
            keysTypeInfo.registerKeyFields(keyFields[0]);

            TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                    keysTypeInfo, new TupleArrayTypeInfo(numJoinInputs - 1, Arrays.copyOf(inputFields, 1)));

            // prepare tuple list for join
            DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
                    .map(new JoinPrepareMapper(numJoinInputs - 1, inputFields[0], keyFields[0]))
                    .returns(tupleJoinListsTypeInfo).setParallelism(probeSideDOP).name("hashjoin-" + node.getID());

            for (int i = 0; i < flinkKeys[0].length; i++) {
                flinkKeys[0][i] = "f0." + i;
            }

            // join all inputs except last
            for (int i = 1; i < inputs.size() - 1; i++) {

                tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(keysTypeInfo,
                        new TupleArrayTypeInfo(numJoinInputs - 1, Arrays.copyOf(inputFields, i + 1)));

                tupleJoinLists = tupleJoinLists.join(inputs.get(i), JoinHint.BROADCAST_HASH_SECOND)
                        .where(flinkKeys[0]).equalTo(flinkKeys[i]).with(new TupleAppendJoiner(i))
                        .returns(tupleJoinListsTypeInfo).withForwardedFieldsFirst(flinkKeys[0])
                        .setParallelism(probeSideDOP).name("hashjoin-" + node.getID());
            }

            // join last input
            return tupleJoinLists.leftOuterJoin(inputs.get(numJoinInputs - 1), JoinHint.BROADCAST_HASH_SECOND)
                    .where(flinkKeys[0]).equalTo(flinkKeys[numJoinInputs - 1])
                    .with(new NaryHashJoinJoiner(node, numJoinInputs)).withParameters(this.getFlinkNodeConfig(node))
                    .setParallelism(probeSideDOP).returns(new TupleTypeInfo(outFields))
                    .name("hashjoin-" + node.getID());
        }
    }

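    /**
     * Translates a key-less HashJoin into a Cartesian product by chaining crossWithTiny
     * operators and applying the join logic in a final mapPartition.
     */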
    private DataSet<Tuple> translateInnerCrossProduct(FlowNode node, List<DataSet<Tuple>> inputs) {

        int numJoinInputs = inputs.size();

        // get out fields of node
        Scope outScope = getOutScope(node);
        Fields outFields;
        if (outScope.isEvery()) {
            outFields = outScope.getOutGroupingFields();
        } else {
            outFields = outScope.getOutValuesFields();
        }
        registerKryoTypes(outFields);

        int probeSideDOP = ((Operator) inputs.get(0)).getParallelism();

        TypeInformation<Tuple2<Tuple, Tuple[]>> tupleJoinListsTypeInfo = new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
                new TupleTypeInfo(Fields.UNKNOWN),
                ObjectArrayTypeInfo.getInfoFor(new TupleTypeInfo(Fields.UNKNOWN)));

        // prepare tuple list for join
        DataSet<Tuple2<Tuple, Tuple[]>> tupleJoinLists = inputs.get(0)
                .map(new JoinPrepareMapper(numJoinInputs, null, null)).returns(tupleJoinListsTypeInfo)
                .setParallelism(probeSideDOP).name("hashjoin-" + node.getID());

        for (int i = 1; i < inputs.size(); i++) {
            tupleJoinLists = tupleJoinLists.crossWithTiny(inputs.get(i)).with(new TupleAppendCrosser(i))
                    .returns(tupleJoinListsTypeInfo).setParallelism(probeSideDOP).name("hashjoin-" + node.getID());
        }

        return tupleJoinLists.mapPartition(new HashJoinMapper(node)).withParameters(this.getFlinkNodeConfig(node))
                .setParallelism(probeSideDOP).returns(new TupleTypeInfo(outFields))
                .name("hashjoin-" + node.getID());

    }

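    /**
     * Collects the input fields, key fields, and Flink key names for each input of a
     * Splice (CoGroup or HashJoin). Self joins are expanded by duplicating the single input.
     */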
    private List<DataSet<Tuple>> computeSpliceInputsFieldsKeys(Splice splice, FlowNode node,
            List<DataSet<Tuple>> inputs, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys) {

        int numJoinInputs = splice.isSelfJoin() ? splice.getNumSelfJoins() + 1 : inputs.size();
        List<Scope> inScopes = getInputScopes(node, splice);
        List<DataSet<Tuple>> inputs2;

        // collect key and value fields of inputs
        if (!splice.isSelfJoin()) {
            // regular join with different inputs

            for (int i = 0; i < numJoinInputs; i++) {
                // get input scope
                Scope inScope = inScopes.get(i);

                // get join key fields
                inputFields[i] = ((TupleTypeInfo) inputs.get(i).getType()).getSchema();
                keyFields[i] = splice.getKeySelectors().get(inScope.getName());
                flinkKeys[i] = registerKeyFields(inputs.get(i), keyFields[i]);
            }

            inputs2 = inputs;
        } else {
            // self join

            Scope inScope = inScopes.get(0);
            // get join key fields
            inputFields[0] = ((TupleTypeInfo) inputs.get(0).getType()).getSchema();
            keyFields[0] = splice.getKeySelectors().get(inScope.getName());
            flinkKeys[0] = registerKeyFields(inputs.get(0), keyFields[0]);

            for (int i = 1; i < numJoinInputs; i++) {
                inputFields[i] = inputFields[0];
                keyFields[i] = keyFields[0];
                flinkKeys[i] = Arrays.copyOf(flinkKeys[0], flinkKeys[0].length);
            }

            // duplicate self join input to treat it like a regular join
            inputs2 = new ArrayList<>(numJoinInputs);
            for (int i = 0; i < numJoinInputs; i++) {
                inputs2.add(inputs.get(0));
            }
        }

        return inputs2;
    }

    private List<Scope> getInputScopes(FlowNode node, Splice splice) {

        Pipe[] inputs = splice.getPrevious();
        List<Scope> inScopes = new ArrayList<>(inputs.length);
        for (Pipe input : inputs) {
            boolean found = false;
            for (Scope inScope : node.getPreviousScopes(splice)) {
                if (inScope.getName().equals(input.getName())) {
                    inScopes.add(inScope);
                    found = true;
                    break;
                }
            }
            if (!found) {
                throw new RuntimeException("Input scope was not found");
            }
        }

        return inScopes;
    }

    private FlowElement[] getNodeInputsInOrder(FlowNode node, Splice splice) {

        Map<String, Integer> posMap = splice.getPipePos();
        FlowElement[] spliceInputs = new FlowElement[posMap.size()];
        ElementGraph eg = node.getElementGraph();

        for (FlowElement nodeSource : getSources(node)) {
            int idx = posMap.get(eg.getEdge(nodeSource, splice).getName());
            spliceInputs[idx] = nodeSource;
        }

        return spliceInputs;
    }

    private Set<FlowElement> getSources(FlowNode node) {
        return node.getSourceElements();
    }

    private Set<FlowElement> getSinks(FlowNode node) {
        return node.getSinkElements();
    }

    private Set<FlowElement> getInnerElements(FlowNode node) {
        Set<FlowElement> inner = new HashSet<>(node.getElementGraph().vertexSet());
        inner.removeAll(getSources(node));
        inner.removeAll(getSinks(node));
        Set<FlowElement> toRemove = new HashSet<>();
        for (FlowElement e : inner) {
            if (e instanceof Extent) {
                toRemove.add(e);
            }
        }
        inner.removeAll(toRemove);
        return inner;
    }

    private Scope getOutScope(FlowNode node) {

        Set<FlowElement> nodeSinks = node.getSinkElements();
        if (nodeSinks.size() != 1) {
            throw new RuntimeException("Only nodes with one output supported right now");
        }
        FlowElement sink = nodeSinks.iterator().next();

        Collection<Scope> outScopes = (Collection<Scope>) node.getPreviousScopes(sink);
        if (outScopes.size() != 1) {
            throw new RuntimeException("Only one incoming scope for last node of mapper allowed");
        }
        return outScopes.iterator().next();
    }

    private boolean allOfType(Set<FlowElement> set, Class<? extends FlowElement> type) {

        for (FlowElement e : set) {
            if (!(type.isInstance(e))) {
                return false;
            }
        }
        return true;
    }

    private FlowElement getCommonSuccessor(Set<FlowElement> set, FlowNode node) {

        ElementGraph graph = node.getElementGraph();
        FlowElement successor = null;
        for (FlowElement e : set) {
            List<FlowElement> successors = graph.successorListOf(e);
            if (successors.size() > 1) {
                return null;
            } else {
                if (successor == null) {
                    successor = successors.get(0);
                } else if (successor != successors.get(0)) {
                    return null;
                }
            }
        }
        return successor;
    }

    private <X> X getSingle(Set<X> set) {
        if (set.size() != 1) {
            throw new RuntimeException("Set size > 1");
        }
        return set.iterator().next();
    }

    private String[] registerKeyFields(DataSet<Tuple> input, Fields keyFields) {
        return ((TupleTypeInfo) input.getType()).registerKeyFields(keyFields);
    }

    private void registerKryoTypes(Fields fields) {

        if (fields.hasTypes()) {
            Class[] fieldTypeClasses = fields.getTypesClasses();
            for (Class fieldTypeClass : fieldTypeClasses) {
                if (!fieldTypeClass.isPrimitive() && !fieldTypeClass.equals(String.class)
                        && !Writable.class.isAssignableFrom(fieldTypeClass)) {
                    // register type if it is neither a primitive, String, or Writable

                    env.getConfig().registerKryoType(fieldTypeClass);
                }
            }
        }

    }

    private org.apache.flink.configuration.Configuration getFlinkNodeConfig(FlowNode node) {
        return FlinkConfigConverter.toFlinkConfig(this.getNodeConfig(node));
    }

    private Configuration getNodeConfig(FlowNode node) {

        Configuration nodeConfig = HadoopUtil.copyConfiguration(this.getConfig());
        ConfigurationSetter configSetter = new ConfigurationSetter(nodeConfig);
        this.initConfFromNodeConfigDef(node.getElementGraph(), configSetter);
        this.initConfFromStepConfigDef(configSetter);
        nodeConfig.set("cascading.flow.node.num", Integer.toString(node.getOrdinal()));

        return nodeConfig;
    }

    private static class ConfigurationSetter implements ConfigDef.Setter {
        private final Configuration conf;

        public ConfigurationSetter(Configuration conf) {
            this.conf = conf;
        }

        @Override
        public String set(String key, String value) {
            String oldValue = get(key);
            conf.set(key, value);

            return oldValue;
        }

        @Override
        public String update(String key, String value) {
            String oldValue = get(key);

            if (oldValue == null) {
                conf.set(key, value);
            } else if (!oldValue.contains(value)) {
                conf.set(key, oldValue + "," + value);
            }

            return oldValue;
        }

        @Override
        public String get(String key) {
            String value = conf.get(key);

            if (value == null || value.isEmpty()) {
                return null;
            }

            return value;
        }
    }

}