org.kitesdk.apps.spark.spi.scheduled.SparkJobManager.java Source code

Introduction

Here is the source code for org.kitesdk.apps.spark.spi.scheduled.SparkJobManager.java.

Source

/**
 * Copyright 2015 Cerner Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.apps.spark.spi.scheduled;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.codehaus.plexus.util.xml.XMLWriter;
import org.joda.time.Instant;
import org.kitesdk.apps.AppContext;
import org.kitesdk.apps.AppException;
import org.kitesdk.apps.JobContext;
import org.kitesdk.apps.scheduled.SchedulableJob;
import org.kitesdk.apps.scheduled.Schedule;
import org.kitesdk.apps.spark.SparkJobContext;
import org.kitesdk.apps.spi.jobs.JobReflection;
import org.kitesdk.apps.spi.jobs.SchedulableJobManager;
import org.kitesdk.apps.spi.oozie.OozieScheduling;
import org.kitesdk.apps.spi.oozie.ShareLibs;
import org.kitesdk.apps.spark.AbstractSchedulableSparkJob;
import org.kitesdk.data.View;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import static org.kitesdk.apps.spi.oozie.OozieScheduling.element;
import static org.kitesdk.apps.spi.oozie.OozieScheduling.property;

/**
 * Spark job manager.
 */
class SparkJobManager extends SchedulableJobManager {

    private volatile SparkJobContext sparkJobContext;

    public static SparkJobManager create(Class<? extends AbstractSchedulableSparkJob> jobClass,
            AppContext context) {

        AbstractSchedulableSparkJob job;

        try {
            job = jobClass.newInstance();
        } catch (InstantiationException e) {
            throw new AppException(e);
        } catch (IllegalAccessException e) {
            throw new AppException(e);
        }

        Method runMethod = JobReflection.resolveRunMethod(job.getClass());

        return new SparkJobManager(job, runMethod, context);
    }

    SparkJobManager(SchedulableJob job, Method runMethod, AppContext context) {
        super(job, runMethod, context);
    }

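    /**
     * Lazily creates the job's SparkJobContext. The field is volatile so a
     * published context is visible across threads, although concurrent first
     * calls could each briefly construct an instance.
     */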
    @Override
    public JobContext getJobContext() {

        if (sparkJobContext == null) {
            sparkJobContext = new SparkJobContext(job, context);
        }

        return sparkJobContext;
    }

    @Override
    public void run(Instant nominalTime, Map<String, View> views) {

        try {
            job.setNominalTime(nominalTime);
            job.setJobContext(getJobContext());

            Object[] args = JobReflection.getArgs(runMethod, views);

            runMethod.invoke(job, args);
        } catch (IllegalAccessException e) {
            throw new AppException(e);
        } catch (InvocationTargetException e) {
            throw new AppException(e);
        }

        signalOutputViews(views);
    }

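    /**
     * Writes the Oozie {@code <spark>} action (schema uri:oozie:spark-action:0.1)
     * for the schedule: a yarn-cluster master, SparkScheduledJobMain as the main
     * class, the JAR containing the job, the spark-opts built from the job
     * settings, and the job class name as the program argument.
     */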
    @Override
    public void writeOozieActionBlock(XMLWriter writer, Schedule schedule) {

        writer.startElement("spark");
        writer.addAttribute("xmlns", "uri:oozie:spark-action:0.1");
        element(writer, "job-tracker", "${jobTracker}");
        element(writer, "name-node", "${nameNode}");

        // TODO: the job-xml should probably be job-specific configuration.
        // element(writer, "job-xml", "${appConfigPath}");

        // Make the nominal time visible to the workflow action.
        writer.startElement("configuration");

        // Use the spark and hive sharelibs since many actions use both.
        property(writer, "oozie.action.sharelib.for.spark", "spark,hive2");
        property(writer, "kiteAppRoot", "${kiteAppRoot}");

        OozieScheduling.writeJobConfiguration(writer, schedule, context.getHadoopConf());

        writer.endElement(); // configuration

        element(writer, "master", "yarn-cluster");
        element(writer, "name", schedule.getName());
        element(writer, "class", SparkScheduledJobMain.class.getCanonicalName());

        JobConf jobConf = new JobConf();
        jobConf.setJarByClass(schedule.getJobClass());
        String containingJar = jobConf.getJar();

        String jarName = containingJar != null ? "${kiteAppRoot}/lib/" + new File(containingJar).getName() : "";

        element(writer, "jar", jarName);
        element(writer, "spark-opts", getSparkConfString(schedule));
        element(writer, "arg", schedule.getJobClass().getName());

        writer.endElement(); // spark
    }

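    /**
     * Builds the spark-opts string for the action. Each job setting becomes a
     * {@code --conf spark.hadoop.*} entry so it reaches the Hadoop configuration
     * inside Spark, followed by the comma-separated {@code --jars} list, e.g.
     * {@code --conf spark.hadoop.some.key=value --jars a.jar,b.jar}.
     */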
    private String getSparkConfString(Schedule schedule) {

        // Pass the job settings as Hadoop settings to be used by the underlying
        // system.
        Map<String, String> settings = OozieScheduling.getJobSettings(schedule, context.getHadoopConf());

        StringBuilder builder = new StringBuilder();

        for (Map.Entry<String, String> setting : settings.entrySet()) {

            builder.append("--conf ").append("spark.hadoop.").append(setting.getKey()).append("=")
                    .append(setting.getValue()).append(" ");
        }

        // Include the comma-separated JARs to be used.
        builder.append("--jars ").append(getJarString(schedule.getJobClass()));

        return builder.toString();
    }

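    /**
     * Builds the comma-separated JAR list: every *.jar file found next to the
     * application JAR, referenced under ${kiteAppRoot}/lib, plus the hive2
     * sharelib JARs added explicitly to work around OOZIE-2277.
     */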
    private String getJarString() {

        List<File> libJars = getLibraryJars();

        StringBuilder builder = new StringBuilder();

        boolean first = true;

        for (File jarFile : libJars) {
            if (jarFile.getName().endsWith(".jar")) {

                if (!first)
                    builder.append(",");

                builder.append("${kiteAppRoot}/lib/");
                builder.append(jarFile.getName());

                first = false;
            }
        }

        try {

            // Add sharelib JARs explicitly to the action as a workaround
            // for https://issues.apache.org/jira/browse/OOZIE-2277

            // No need to add the Spark shared library here because
            // it is automatically included in the spark action.
            List<Path> jars = ShareLibs.jars(context.getHadoopConf(), "hive2");

            for (Path jar : jars) {

                builder.append(",");
                builder.append(jar.toString());
            }

        } catch (IOException e) {

            throw new AppException(e);
        }

        return builder.toString();
    }

    private static List<File> getLibraryJars() {

        // The current implementation assumes all of the application's library
        // JARs sit in the same directory as the JAR containing this class,
        // so locate that directory and include its contents.

        // This is ugly, using the jobConf logic to identify the containing
        // JAR. There should be a better way to do this.
        JobConf jobConf = new JobConf();
        jobConf.setJarByClass(SchedulableJob.class);
        String containingJar = jobConf.getJar();

        if (containingJar == null)
            return Collections.emptyList();

        File dir = new File(containingJar).getParentFile();

        // listFiles returns null if the path is not a readable directory.
        File[] jarFiles = dir.listFiles();

        if (jarFiles == null)
            return Collections.emptyList();

        return Arrays.asList(jarFiles);
    }

}
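
Example usage

For context, here is a minimal sketch of how the framework might drive this package-private manager for a single nominal time. MySparkJob, its no-argument run method, and the pre-built AppContext are illustrative assumptions rather than part of the listing above: real kite-apps jobs typically declare their input and output views as parameters of run, and AbstractSchedulableSparkJob may require more than is shown here.

// A hypothetical driver, placed in the same package because SparkJobManager
// is package-private. Everything named *Example* or MySparkJob below is
// invented for illustration.
package org.kitesdk.apps.spark.spi.scheduled;

import org.joda.time.Instant;
import org.kitesdk.apps.AppContext;
import org.kitesdk.apps.spark.AbstractSchedulableSparkJob;
import org.kitesdk.data.View;

import java.util.Collections;
import java.util.Map;

// Assumed: a no-argument run method is enough for JobReflection to resolve,
// and AbstractSchedulableSparkJob imposes no further abstract methods.
class MySparkJob extends AbstractSchedulableSparkJob {

    public void run() {
        // Spark work would go here.
    }
}

class SparkJobManagerExample {

    // Runs the job once with no bound views; the manager sets the nominal
    // time and job context on the job before invoking its run method.
    static void runOnce(AppContext appContext) {
        SparkJobManager manager = SparkJobManager.create(MySparkJob.class, appContext);

        Map<String, View> views = Collections.emptyMap();
        manager.run(new Instant(), views); // nominal time = now
    }
}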