/**
 * Copyright 2015 Cerner Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.apps.spark.spi.streaming;

import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import org.apache.avro.Schema;
import org.apache.avro.specific.SpecificData;
import org.apache.avro.specific.SpecificRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.kitesdk.apps.AppContext;
import org.kitesdk.apps.AppException;
import org.kitesdk.apps.DataIn;
import org.kitesdk.apps.DataOut;
import org.kitesdk.apps.spark.AbstractStreamingSparkJob;
import org.kitesdk.apps.spark.SparkJobContext;
import org.kitesdk.apps.spark.kafka.KafkaOutput;
import org.kitesdk.apps.spark.spi.kryo.KryoAvroRegistrator;
import org.kitesdk.apps.spi.jobs.JobReflection;
import org.kitesdk.apps.spi.jobs.StreamingJobManager;
import org.kitesdk.apps.spi.oozie.ShareLibs;
import org.kitesdk.apps.streaming.StreamDescription;
import org.kitesdk.apps.streaming.StreamingJob;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.View;
import org.kitesdk.spark.backport.launcher.SparkLauncher;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Manages the lifecycle of a Spark Streaming job: installing its
 * description to HDFS, launching it on a YARN cluster via spark-submit,
 * and running it in the local process for tests.
 */
public class SparkStreamingJobManager implements StreamingJobManager<AbstractStreamingSparkJob> {

  private final StreamDescription description;

  private final AbstractStreamingSparkJob job;

  private final Method runMethod;

  private final SparkJobContext sparkJobContext;

  private final AppContext appContext;

  public SparkStreamingJobManager(StreamDescription description,
                                  AbstractStreamingSparkJob job,
                                  Method runMethod,
                                  AppContext context) {
    this.description = description;
    this.job = job;
    this.runMethod = runMethod;
    this.appContext = context;
    this.sparkJobContext = new SparkJobContext(description, job, context);
  }

  public static Path jobDescriptionFile(Path appRoot, String jobName) {
    return new Path(appRoot, "streaming/" + jobName + ".json");
  }

  public static StreamDescription loadDescription(FileSystem fs, Path appRoot, String jobName) {
    Path streamingJobPath = jobDescriptionFile(appRoot, jobName);

    StringBuilder builder = new StringBuilder();

    InputStream input = null;

    try {
      input = fs.open(streamingJobPath);

      InputStreamReader streamReader = new InputStreamReader(input);
      BufferedReader reader = new BufferedReader(streamReader);

      String line;
      while ((line = reader.readLine()) != null) {
        builder.append(line);
      }
    } catch (IOException e) {
      throw new AppException(e);
    } finally {
      Closeables.closeQuietly(input);
    }

    return StreamDescription.parseJson(builder.toString());
  }
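  // Example: for an application rooted at /apps/example (an illustrative
  // path), the description of a job named "my-job" resolves to:
  //
  //   jobDescriptionFile(new Path("/apps/example"), "my-job")
  //     => /apps/example/streaming/my-job.json
  //
  // loadDescription reads that file back and parses it with
  // StreamDescription.parseJson.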
  private static void writeDescription(FileSystem fs, Path appRoot, StreamDescription description) {
    Path streamingJobPath = jobDescriptionFile(appRoot, description.getJobName());

    try {
      fs.mkdirs(streamingJobPath.getParent());
    } catch (IOException e) {
      throw new AppException(e);
    }

    OutputStream output = null;

    try {
      output = fs.append(streamingJobPath);

      OutputStreamWriter writer = new OutputStreamWriter(output);
      writer.write(description.toString());

      // Flush so the buffered description actually reaches the stream
      // before it is closed below.
      writer.flush();
    } catch (IOException e) {
      throw new AppException(e);
    } finally {
      Closeables.closeQuietly(output);
    }
  }

  public static SparkStreamingJobManager create(StreamDescription description, AppContext context) {
    AbstractStreamingSparkJob job;

    try {
      job = (AbstractStreamingSparkJob) description.getJobClass().newInstance();
    } catch (InstantiationException e) {
      throw new AppException(e);
    } catch (IllegalAccessException e) {
      throw new AppException(e);
    }

    Method runMethod = JobReflection.resolveRunMethod(job.getClass());

    return new SparkStreamingJobManager(description, job, runMethod, context);
  }

  private static List<File> getLibraryJars() {

    // The current implementation assumes that library JARs are in the
    // same directory as the containing JAR, so locate that directory
    // and include its contents in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(StreamingJob.class);

    String containingJar = jobConf.getJar();

    if (containingJar == null)
      return Collections.emptyList();

    File file = new File(containingJar).getParentFile();

    File[] jarFiles = file.listFiles();

    // listFiles returns null if the path is not a readable directory.
    if (jarFiles == null)
      return Collections.emptyList();

    return Arrays.asList(jarFiles);
  }

  @Override
  public void install(FileSystem fs, Path appRoot) {
    Path descriptionFile = jobDescriptionFile(appRoot, job.getName());

    try {
      OutputStream output = fs.create(descriptionFile);

      try {
        OutputStreamWriter writer = new OutputStreamWriter(output);
        writer.append(description.toString());
        writer.flush();
      } finally {
        output.close();
      }
    } catch (IOException e) {
      throw new AppException(e);
    }
  }

  @Override
  public void start(FileSystem fs, Path appRoot) {
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(SparkStreamingJobMain.class);

    String containingJar = jobConf.getJar();

    Path libPath = new Path(appRoot, "lib");

    Path jarPath = new Path(libPath, new File(containingJar).getName());
    jarPath = fs.makeQualified(jarPath);

    SparkLauncher launcher = new SparkLauncher();

    launcher.setMainClass(SparkStreamingJobMain.class.getName());
    launcher.setAppResource(jarPath.toString());
    launcher.setMaster("yarn-cluster");

    try {
      // Add the library JARs from HDFS so we don't need to reload
      // them separately into Spark.
      FileStatus[] libJars = fs.listStatus(libPath);

      for (FileStatus jar : libJars) {
        launcher.addJar(jar.getPath().toString());
      }

      // Add the sharelib JARs, since they are not visible to Spark otherwise.
      List<Path> shareLibJars = ShareLibs.jars(sparkJobContext.getHadoopConf(), "hive2");

      for (Path sharelibJar : shareLibJars) {
        launcher.addJar(fs.makeQualified(sharelibJar).toString());
      }
    } catch (IOException e) {
      throw new AppException(e);
    }

    launcher.addAppArgs(appRoot.toString(), description.getJobName());

    // Explicitly set the metastore URI so it is usable in the job.
    launcher.setConf("spark.hadoop.hive.metastore.uris",
        sparkJobContext.getHadoopConf().get("hive.metastore.uris"));

    // Register the job's Avro classes with Kryo by passing them as a
    // system property read by KryoAvroRegistrator on the driver and
    // the executors.
    List<Schema> schemas = JobReflection.getSchemas(job);

    StringBuilder avroClassesArg = new StringBuilder();

    avroClassesArg.append("-D")
        .append(KryoAvroRegistrator.KITE_AVRO_CLASSES)
        .append("=");

    boolean first = true;

    for (Schema schema : schemas) {
      if (!first) {
        avroClassesArg.append(",");
      }

      avroClassesArg.append(SpecificData.get().getClass(schema).getName());

      first = false;
    }

    launcher.setConf("spark.driver.extraJavaOptions", avroClassesArg.toString());
    launcher.setConf("spark.executor.extraJavaOptions", avroClassesArg.toString());

    try {
      Process process = launcher.launch();

      // Redirect the spark-submit output to be visible to the reader.
      Thread stdoutThread = writeOutput(process.getInputStream(), System.out);
      Thread stderrThread = writeOutput(process.getErrorStream(), System.err);

      int result = process.waitFor();

      stdoutThread.join();
      stderrThread.join();

      if (result != 0) {
        throw new AppException("spark-submit returned error status: " + result);
      }
    } catch (IOException e) {
      throw new AppException(e);
    } catch (InterruptedException e) {
      throw new AppException(e);
    }
  }
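  // For reference, the SparkLauncher configuration in start() above builds
  // something roughly equivalent to the spark-submit invocation sketched
  // below. Paths, the job name, and the expanded JAR list are illustrative;
  // <KITE_AVRO_CLASSES> stands for the property name defined by
  // KryoAvroRegistrator.KITE_AVRO_CLASSES.
  //
  //   spark-submit \
  //     --master yarn-cluster \
  //     --class org.kitesdk.apps.spark.spi.streaming.SparkStreamingJobMain \
  //     --jars <lib and sharelib JARs> \
  //     --conf spark.hadoop.hive.metastore.uris=<metastore URI> \
  //     --conf "spark.driver.extraJavaOptions=-D<KITE_AVRO_CLASSES>=<classes>" \
  //     --conf "spark.executor.extraJavaOptions=-D<KITE_AVRO_CLASSES>=<classes>" \
  //     hdfs:///apps/example/lib/app.jar /apps/example my-job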
  /**
   * Copies the given input stream to the target print stream, line by
   * line, on a daemon thread, returning the started thread.
   */
  private static Thread writeOutput(final InputStream stream, final PrintStream target) {
    Thread thread = new Thread("spark-submit-output-redirect") {

      public void run() {
        Scanner scanner = new Scanner(stream);

        while (scanner.hasNextLine()) {
          target.println(scanner.nextLine());
        }
      }
    };

    thread.setDaemon(true);
    thread.start();

    return thread;
  }

  private static boolean isStream(Class sourceType) {

    // Only DStreams and Kafka outputs can currently be used to stream data.
    return JavaDStream.class.isAssignableFrom(sourceType) ||
        KafkaOutput.class.isAssignableFrom(sourceType);
  }

  private JavaDStream load(Map<String, String> inputSettings, StreamDescription description, DataIn input) {
    StreamDescription.Stream stream = description.getStreams().get(input.name());

    // Currently Kafka is the only supported stream type. Future
    // enhancements may determine a different loader based on
    // properties provided by the caller.
    SparkKafkaStreamLoader loader = new SparkKafkaStreamLoader();

    if (input.type() == null) {
      throw new AppException("Job " + description.getJobClass().getName() +
          " must specify a type for input " + input.name());
    }

    if (SpecificRecord.class.isAssignableFrom(input.type())) {
      Schema schema = SpecificData.get().getSchema(input.type());

      return loader.load(schema, inputSettings, sparkJobContext);
    } else {
      throw new UnsupportedOperationException(
          "Current implementation only supports specific types in streams.");
    }
  }
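  // A sketch of the kind of job this manager reflects over, assuming the
  // usual kite-apps annotation style: the run method's @DataIn/@DataOut
  // annotations (read via input.name() and input.type() above) bind stream
  // names to Avro specific-record types. The class and record names here
  // are hypothetical.
  //
  //   public class ExampleStreamingJob extends AbstractStreamingSparkJob {
  //
  //     public void run(
  //         @DataIn(name = "event.stream", type = ExampleEvent.class)
  //         JavaDStream<ExampleEvent> events,
  //         @DataOut(name = "event.output", type = ExampleEvent.class)
  //         KafkaOutput output) {
  //       // Transform the incoming events and write them to the output.
  //     }
  //   }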
  /**
   * Runs the job in the local process. This is generally used for unit tests.
   */
  public void run() {
    Map<String, Class> sourceTypes = JobReflection.getTypes(runMethod);

    Map<String, Object> parameters = Maps.newHashMap();

    for (DataIn input : JobReflection.getInputs(runMethod).values()) {
      if (isStream(sourceTypes.get(input.name()))) {
        Map<String, String> inputSettings = sparkJobContext.getInputSettings(input.name());

        JavaDStream stream = load(inputSettings, description, input);

        parameters.put(input.name(), stream);
      } else {
        throw new UnsupportedOperationException(
            "Non-stream inputs are not yet supported in stream operations.");
      }
    }

    for (DataOut output : JobReflection.getOutputs(runMethod).values()) {
      if (isStream(sourceTypes.get(output.name()))) {
        Map<String, String> outputSettings = sparkJobContext.getOutputSettings(output.name());

        Schema schema = SpecificData.get().getSchema(output.type());

        parameters.put(output.name(), new KafkaOutput(schema, outputSettings));
      } else {
        URI uri = description.getViewUris().get(output.name());

        if (uri == null)
          throw new AppException("No URI defined for output: " + output.name());

        View view = Datasets.load(uri);

        parameters.put(output.name(), view);
      }
    }

    Object[] args = JobReflection.getArgs(runMethod, parameters);

    job.setJobContext(sparkJobContext);

    // Run the job itself.
    try {
      runMethod.invoke(job, args);
    } catch (IllegalAccessException e) {
      throw new AppException(e);
    } catch (InvocationTargetException e) {
      throw new AppException(e);
    }
  }
}
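// Example lifecycle (a minimal sketch; the app root path is illustrative,
// and start() assumes the application JARs have already been deployed
// under <appRoot>/lib, which it lists to build the Spark classpath):
//
//   FileSystem fs = FileSystem.get(new Configuration());
//   Path appRoot = new Path("/apps/example");
//
//   SparkStreamingJobManager manager =
//       SparkStreamingJobManager.create(description, appContext);
//
//   manager.install(fs, appRoot);   // write the job description to HDFS
//   manager.start(fs, appRoot);     // launch on YARN via spark-submit
//
//   // ...or, in unit tests, run the job in the local process:
//   manager.run();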