io.dstream.tez.TezDAGBuilder.java Source code

Java tutorial

Introduction

Here is the source code for io.dstream.tez.TezDAGBuilder.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.dstream.tez;

import java.net.URI;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.stream.Stream;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.dstream.SerializableStreamAssets.SerSupplier;
import io.dstream.support.SourceSupplier;
import io.dstream.support.UriSourceSupplier;
import io.dstream.tez.io.KeyWritable;
import io.dstream.tez.io.TezDelegatingPartitioner;
import io.dstream.tez.io.ValueWritable;
import io.dstream.tez.utils.HdfsSerializerUtils;

/**
 * Incrementally assembles a Tez {@link DAG} from {@link TaskDescriptor}s.
 * <p>
 * Every {@link #addTask(TaskDescriptor)} call contributes one {@link Vertex}
 * (executed by {@link TezTaskProcessor}) and connects it either to a data
 * source (first task of a URI based source) or to the previously added vertex.
 * {@link #addDataSink(String)} attaches the output to the most recently added
 * vertex and {@link #build()} hands back the executor created for the DAG.
 * <p>
 * Instances keep mutable, unsynchronized state ({@code lastVertex},
 * {@code inputOrderCounter}).
 */
public class TezDAGBuilder {

    private static final Logger logger = LoggerFactory.getLogger(TezDAGBuilder.class);

    private final DAG dag;

    private final ExecutionContextAwareTezClient tezClient;

    private final OrderedPartitionedKVEdgeConfig edgeConf;

    private final TezDagExecutor dagExecutor;

    /** Vertex added by the latest {@code addTask} call; reset to null by {@code addDataSink}. */
    private Vertex lastVertex;

    /** Running counter used as the numeric prefix of vertex and data source names. */
    private int inputOrderCounter;

    /**
     * Creates a builder backed by a new DAG named
     * {@code executionName + "_" + System.currentTimeMillis()}.
     *
     * @param executionName prefix of the DAG name
     * @param tezClient client supplying local resources, configuration and file system
     * @param executionConfig not read by this constructor
     */
    public TezDAGBuilder(String executionName, ExecutionContextAwareTezClient tezClient,
            Properties executionConfig) {
        this.dag = DAG.create(executionName + "_" + System.currentTimeMillis());
        this.tezClient = tezClient;

        // TODO need to figure out when and why would the Edge be different and
        // how to configure it
        this.edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(KeyWritable.class.getName(),
                ValueWritable.class.getName(), TezDelegatingPartitioner.class.getName(), null).build();
        this.dagExecutor = new TezDagExecutor(this.tezClient, this.dag);
    }

    /**
     * Adds a vertex for the given task to the DAG and wires it up.
     * <ul>
     * <li>A task with id 0 whose source is a {@link UriSourceSupplier} gets a
     * data source built from the supplied URIs.</li>
     * <li>A task with any other id gets an edge from the previously added
     * vertex.</li>
     * <li>For every dependent task chain, the chain's tasks are added
     * recursively and an edge is created from the last task of the chain to
     * this task's vertex.</li>
     * </ul>
     * After the call this task's vertex is the "last vertex" of the builder.
     *
     * @param taskDescriptor description of the task to add
     */
    public void addTask(TaskDescriptor taskDescriptor) {
        boolean firstTask = taskDescriptor.getId() == 0;
        if (firstTask) {
            this.determineInputFormatClass(taskDescriptor);
        }
        UserPayload payload = this.createPayloadFromTaskSerPath(Task.build(taskDescriptor), this.dag.getName());
        ProcessorDescriptor pd = ProcessorDescriptor.create(TezTaskProcessor.class.getName())
                .setUserPayload(payload);
        SerSupplier<?> sourceSupplier = taskDescriptor.getSourceSupplier();

        Vertex vertex = this.createVertex(taskDescriptor, pd);

        this.dag.addVertex(vertex);

        if (firstTask) {
            if (sourceSupplier instanceof UriSourceSupplier) {
                // A Stream can be traversed only once, so materialize the URIs up
                // front: they are needed both for the input path and for the
                // (human readable) data source name.
                URI[] uris = ((UriSourceSupplier) sourceSupplier).get().toArray(URI[]::new);
                DataSourceDescriptor dataSource = this
                        .buildDataSourceDescriptorFromUris(taskDescriptor.getInputFormatClass(), Arrays.stream(uris));
                vertex.addDataSource(
                        this.inputOrderCounter++ + ":" + vertex.getName() + "_INPUT_" + Arrays.asList(uris),
                        dataSource);
            }
        } else {
            this.addEdge(vertex);
        }

        List<List<TaskDescriptor>> dependentTasksChains = taskDescriptor.getDependentTasksChains();
        if (dependentTasksChains != null) {
            for (List<TaskDescriptor> dependentTasks : dependentTasksChains) {
                // Each recursive addTask moves lastVertex forward, so after the
                // chain is added lastVertex is the chain's final vertex.
                dependentTasks.forEach(this::addTask);
                this.addEdge(vertex);
            }
        }

        logger.debug("Created Vertex: {}", vertex);
        this.lastVertex = vertex;
    }

    /**
     * Creates the vertex for a task, named
     * {@code <counter>:<taskName>_<operationName>}, and registers the client's
     * local resources with it. The first task (id 0) of a URI based source is
     * created without explicit parallelism (presumably so it is derived from
     * the data source - confirm); every other vertex uses the parallelism of
     * the task descriptor.
     */
    private Vertex createVertex(TaskDescriptor taskDescriptor, ProcessorDescriptor pd) {
        String baseName = taskDescriptor.getName() + "_" + taskDescriptor.getOperationName();
        boolean uriSourcedFirstTask = taskDescriptor.getId() == 0
                && taskDescriptor.getSourceSupplier() instanceof UriSourceSupplier;
        String vertexName = this.inputOrderCounter++ + ":" + baseName;

        Vertex vertex;
        if (uriSourcedFirstTask) {
            vertex = Vertex.create(vertexName, pd);
        } else {
            vertex = Vertex.create(vertexName, pd, taskDescriptor.getParallelism());
        }
        vertex.addTaskLocalFiles(this.tezClient.getLocalResources());
        return vertex;
    }

    /**
     * Adds an edge from the last added vertex to the given one, using the
     * default edge property of this builder's edge configuration.
     *
     * @param vertex destination vertex of the edge
     */
    private void addEdge(Vertex vertex) {
        Edge edge = Edge.create(this.lastVertex, vertex, this.edgeConf.createDefaultEdgeProperty());
        this.dag.addEdge(edge);
    }

    /**
     * Attaches a {@link SequenceFileOutputFormat} based data sink, named
     * {@code <clientName>_OUTPUT}, to the last added vertex and then clears the
     * last vertex. At least one task must have been added since the previous
     * call, since the sink is added to that vertex.
     *
     * @param outputPath path the sink writes to
     */
    public void addDataSink(String outputPath) {
        this.createDataSink(this.lastVertex, this.tezClient.getClientName() + "_OUTPUT", KeyWritable.class,
                ValueWritable.class, SequenceFileOutputFormat.class, outputPath);

        this.lastVertex = null;
    }

    /**
     * Returns the executor created for this builder's DAG.
     *
     * @return the {@link TezDagExecutor} wrapping the client and the DAG
     */
    public Runnable build() {
        return this.dagExecutor;
    }

    /**
     * Builds an {@link MRInput} based data source (split grouping disabled)
     * whose input path is the comma separated list of the URIs' paths.
     *
     * @throws java.util.NoSuchElementException if {@code sources} is empty
     */
    private DataSourceDescriptor buildDataSourceDescriptorFromUris(Class<?> inputFormatClass, Stream<URI> sources) {
        String inputPath = sources.map(URI::getPath).reduce((a, b) -> a + "," + b)
                .orElseThrow(() -> new java.util.NoSuchElementException(
                        "No source URIs were supplied; at least one is required to build the input path"));
        return MRInput.createConfigBuilder(this.tezClient.getTezConfiguration(), inputFormatClass, inputPath)
                .groupSplits(false).build();
    }

    /**
     * Serializes the task to {@code <dagName>/tasks/<taskId>_<taskName>.ser} on
     * the client's file system and returns a payload carrying the resulting
     * path as a string.
     */
    private UserPayload createPayloadFromTaskSerPath(Task task, String dagName) {
        org.apache.hadoop.fs.Path mapTaskPath = HdfsSerializerUtils.serialize(task, this.tezClient.getFileSystem(),
                new org.apache.hadoop.fs.Path(dagName + "/tasks/" + task.getId() + "_" + task.getName() + ".ser"));
        // NOTE(review): getBytes() encodes with the platform default charset;
        // confirm the consumer of this payload decodes it the same way.
        return UserPayload.create(ByteBuffer.wrap(mapTaskPath.toString().getBytes()));
    }

    /**
     * Adds an {@link MROutput} based data sink with the given name to the
     * vertex, configured with the given key/value classes, output format and
     * output path.
     */
    private void createDataSink(Vertex vertex, String name, Class<? extends Writable> keyClass,
            Class<? extends Writable> valueClass, Class<?> outputFormatClass, String outputPath) {
        JobConf dsConfig = this.buildJobConf(keyClass, valueClass);
        DataSinkDescriptor dataSink = MROutput.createConfigBuilder(dsConfig, outputFormatClass, outputPath).build();
        vertex.addDataSink(name, dataSink);
    }

    /**
     * Creates a {@link JobConf} based on the client's Tez configuration with
     * the output key and value classes set.
     */
    private JobConf buildJobConf(Class<? extends Writable> keyClass, Class<? extends Writable> valueClass) {
        JobConf jobConf = new JobConf(this.tezClient.getTezConfiguration());
        jobConf.setOutputKeyClass(keyClass);
        jobConf.setOutputValueClass(valueClass);
        return jobConf;
    }

    /**
     * Sets the input format class on the first task when its source is a
     * {@link UriSourceSupplier}: {@link TextInputFormat} if the source element
     * type is {@code String} or one of its supertypes. Tasks with any other
     * kind of source are left untouched (non URI sources are not rejected
     * here).
     *
     * @throws IllegalArgumentException if the source is URI based but no input
     *         format is known for its element type
     */
    private void determineInputFormatClass(TaskDescriptor firstTask) {
        if (!(firstTask.getSourceSupplier() instanceof UriSourceSupplier)) {
            return;
        }
        Class<?> sourceElementType = firstTask.getSourceElementType();
        if (sourceElementType.isAssignableFrom(String.class)) {
            firstTask.setInputFormatClass(TextInputFormat.class);
        } else {
            // TODO design a configurable component to handle other standard
            // and custom input types
            throw new IllegalArgumentException(
                    "Failed to determine Input Format class for source item type " + sourceElementType);
        }
    }
}