cascading.flow.hadoop.stream.graph.HadoopMapStreamGraph.java Source code


Introduction

Here is the source code for cascading.flow.hadoop.stream.graph.HadoopMapStreamGraph.java. This class builds the map-side stream graph for a Hadoop flow node: the single source Tap passed to the constructor is streamed as the head of the graph, while any other source Taps found in the node's element graph are accumulated (for example, the non-streamed side of a HashJoin), each with its own deserialized configuration.
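
The class is easiest to understand in context: a pipe assembly with a HashJoin is the typical case where one source is streamed and another is accumulated, which is exactly the split buildGraph() below deals with. The following is a minimal sketch using the public Cascading Hadoop API (Hfs, TextDelimited, HashJoin, HadoopFlowConnector); the paths, field names, and the SketchHashJoinFlow class name are made up for illustration and are not part of the listed source.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class SketchHashJoinFlow {
    public static void main(String[] args) {
        // hypothetical tab-delimited inputs and output
        Tap lhsTap = new Hfs(new TextDelimited(new Fields("id", "name"), "\t"), "input/lhs");
        Tap rhsTap = new Hfs(new TextDelimited(new Fields("empid", "dept"), "\t"), "input/rhs");
        Tap sinkTap = new Hfs(new TextDelimited(Fields.ALL, "\t"), "output/joined");

        Pipe lhs = new Pipe("lhs"); // streamed side: read record by record in the mapper
        Pipe rhs = new Pipe("rhs"); // accumulated side: loaded map side before streaming begins

        // join lhs.id == rhs.empid; HashJoin runs map side, no reduce phase required
        Pipe joined = new HashJoin(lhs, new Fields("id"), rhs, new Fields("empid"));

        FlowDef flowDef = FlowDef.flowDef()
                .addSource(lhs, lhsTap)
                .addSource(rhs, rhsTap)
                .addTailSink(joined, sinkTap);

        Flow flow = new HadoopFlowConnector(new Properties()).connect(flowDef);
        flow.complete();
    }
}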

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.stream.graph;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import cascading.flow.FlowException;
import cascading.flow.FlowNode;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.flow.hadoop.stream.HadoopMemoryJoinGate;
import cascading.flow.hadoop.stream.element.HadoopCoGroupGate;
import cascading.flow.hadoop.stream.element.HadoopGroupByGate;
import cascading.flow.hadoop.stream.element.HadoopSinkStage;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.graph.ElementGraphs;
import cascading.flow.stream.duct.Gate;
import cascading.flow.stream.element.GroupingSpliceGate;
import cascading.flow.stream.element.SinkStage;
import cascading.flow.stream.element.SourceStage;
import cascading.flow.stream.graph.IORole;
import cascading.flow.stream.graph.NodeStreamGraph;
import cascading.pipe.CoGroup;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.tap.Tap;
import org.apache.hadoop.mapred.JobConf;

/**
 * HadoopMapStreamGraph assembles the stream graph executed on the map side of a
 * Hadoop node. The designated source Tap is streamed as the head of the graph,
 * while all other source Taps found in the element graph are accumulated, each
 * using its own configuration deserialized from the JobConf.
 */
public class HadoopMapStreamGraph extends NodeStreamGraph {
    private final Tap source;
    private SourceStage streamedHead;

    public HadoopMapStreamGraph(HadoopFlowProcess flowProcess, FlowNode node, Tap source) {
        super(flowProcess, node, source);
        this.source = source;

        buildGraph();

        setTraps();
        setScopes();

        printGraph(node.getID(), "map", flowProcess.getCurrentSliceNum());
        bind();
    }

    public SourceStage getStreamedHead() {
        return streamedHead;
    }

    protected void buildGraph() {
        streamedHead = handleHead(this.source, flowProcess);

        Set<Tap> tributaries = ElementGraphs.findSources(elementGraph, Tap.class);

        tributaries.remove(this.source); // we cannot stream and accumulate the same source

        // accumulated paths
        for (Object source : tributaries) {
            HadoopFlowProcess hadoopProcess = (HadoopFlowProcess) flowProcess;
            JobConf conf = hadoopProcess.getJobConf();

            // allows client side config to be used cluster side
            String property = conf.getRaw("cascading.node.accumulated.source.conf." + Tap.id((Tap) source));

            if (property == null)
                throw new IllegalStateException(
                        "accumulated source conf property missing for: " + ((Tap) source).getIdentifier());

            conf = getSourceConf(hadoopProcess, conf, property);
            flowProcess = new HadoopFlowProcess(hadoopProcess, conf);

            handleHead((Tap) source, flowProcess);
        }
    }

    private JobConf getSourceConf(HadoopFlowProcess flowProcess, JobConf conf, String property) {
        Map<String, String> priorConf;
        try {
            priorConf = (Map<String, String>) HadoopUtil.deserializeBase64(property, conf, HashMap.class, true);
        } catch (IOException exception) {
            throw new FlowException("unable to deserialize properties", exception);
        }

        return flowProcess.mergeMapIntoConfig(conf, priorConf);
    }

    private SourceStage handleHead(Tap source, FlowProcess flowProcess) {
        SourceStage sourceDuct = new SourceStage(flowProcess, source);

        addHead(sourceDuct);

        handleDuct(source, sourceDuct);

        return sourceDuct;
    }

    @Override
    protected SinkStage createSinkStage(Tap element) {
        return new HadoopSinkStage(flowProcess, element);
    }

    @Override
    protected Gate createCoGroupGate(CoGroup element, IORole role) {
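        // the role parameter is ignored here: in the map-side graph a CoGroup always acts as a sink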
        return new HadoopCoGroupGate(flowProcess, element, IORole.sink);
    }

    @Override
    protected Gate createGroupByGate(GroupBy element, IORole role) {
        return new HadoopGroupByGate(flowProcess, element, role);
    }

    @Override
    protected GroupingSpliceGate createNonBlockingJoinGate(HashJoin join) {
        return new HadoopMemoryJoinGate(flowProcess, join); // does not use a latch
    }
}
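
The accumulated-source branch above relies on a per-Tap JobConf property, cascading.node.accumulated.source.conf.<tap id>, that getSourceConf() decodes back into a Map and merges into the current configuration. The sketch below illustrates that kind of round trip with plain Java and Hadoop APIs only; it deliberately does not use Cascading's HadoopUtil helpers, and the class and method names (AccumulatedSourceConfSketch, packSourceConf, unpackSourceConf) are hypothetical.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.mapred.JobConf;

public class AccumulatedSourceConfSketch {
    // property prefix mirroring the one read in buildGraph() above
    static final String PREFIX = "cascading.node.accumulated.source.conf.";

    // client side: pack per-source overrides into a single Base64-encoded property
    static void packSourceConf(JobConf conf, String tapId, Map<String, String> overrides) throws IOException {
        Properties props = new Properties();
        props.putAll(overrides);

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        props.store(bytes, null);

        conf.set(PREFIX + tapId, Base64.getEncoder().encodeToString(bytes.toByteArray()));
    }

    // cluster side: the inverse of packSourceConf, analogous in spirit to getSourceConf() above
    static Map<String, String> unpackSourceConf(JobConf conf, String tapId) throws IOException {
        String property = conf.getRaw(PREFIX + tapId);

        if (property == null)
            throw new IllegalStateException("accumulated source conf property missing for: " + tapId);

        Properties props = new Properties();
        props.load(new ByteArrayInputStream(Base64.getDecoder().decode(property)));

        Map<String, String> result = new HashMap<>();
        for (String name : props.stringPropertyNames())
            result.put(name, props.getProperty(name));

        return result;
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(false);

        Map<String, String> overrides = new HashMap<>();
        overrides.put("mapred.input.dir", "hdfs://example/path/lhs"); // hypothetical override

        packSourceConf(conf, "SOURCE-TAP-ID", overrides);
        System.out.println(unpackSourceConf(conf, "SOURCE-TAP-ID"));
    }
}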