co.cask.cdap.internal.app.runtime.spark.AbstractSparkContext.java Source code


Introduction

Here is the source code for co.cask.cdap.internal.app.runtime.spark.AbstractSparkContext.java. This abstract class is the common base of CDAP's JavaSparkContext and ScalaSparkContext: it loads the Hadoop Configuration packaged in the job jar, builds the SparkConf for the program run, exposes the program specification and runtime arguments, and prepares the Configuration used to read from and write to CDAP Datasets.

Source

/*
 * Copyright © 2014 Cask Data, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.internal.app.runtime.spark;

import co.cask.cdap.api.data.batch.BatchReadable;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.spark.SparkContext;
import co.cask.cdap.api.spark.SparkSpecification;
import co.cask.cdap.app.runtime.Arguments;
import co.cask.cdap.internal.app.runtime.batch.dataset.DataSetInputFormat;
import co.cask.cdap.internal.app.runtime.batch.dataset.DataSetOutputFormat;
import co.cask.cdap.internal.app.runtime.spark.dataset.SparkDatasetInputFormat;
import co.cask.cdap.internal.app.runtime.spark.dataset.SparkDatasetOutputFormat;
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URL;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * An abstract class which implements {@link SparkContext} and provides a concrete implementation of the
 * functionality common to {@link JavaSparkContext} and {@link ScalaSparkContext}.
 */
abstract class AbstractSparkContext implements SparkContext {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractSparkContext.class);
    private static final Pattern SPACES = Pattern.compile("\\s+");
    private static final String[] NO_ARGS = {};

    private final Configuration hConf;
    private final long logicalStartTime;
    private final SparkSpecification spec;
    private final Arguments runtimeArguments;
    final BasicSparkContext basicSparkContext;
    private final SparkConf sparkConf;

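    /**
     * Creates the context. Loads the Hadoop {@link Configuration} from the job jar, obtains the
     * {@link BasicSparkContext} for the current program run from a {@link SparkContextProvider},
     * and initializes the {@link SparkConf}.
     */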
    public AbstractSparkContext() {
        hConf = loadHConf();
        // Create an instance of BasicSparkContext from the Hadoop Configuration file which was just loaded
        SparkContextProvider sparkContextProvider = new SparkContextProvider(hConf);
        basicSparkContext = sparkContextProvider.get();
        this.logicalStartTime = basicSparkContext.getLogicalStartTime();
        this.spec = basicSparkContext.getSpecification();
        this.runtimeArguments = basicSparkContext.getRuntimeArgs();
        this.sparkConf = initializeSparkConf();
    }

    /**
     * Initializes the {@link SparkConf}, setting the application name to the CDAP program name.
     *
     * @return the initialized {@link SparkConf}
     */
    private SparkConf initializeSparkConf() {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName(basicSparkContext.getProgramName());
        return sparkConf;
    }

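    /**
     * Returns the {@link SparkConf} initialized for this program run.
     */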
    public SparkConf getSparkConf() {
        return sparkConf;
    }

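    /**
     * Returns the Hadoop {@link Configuration} loaded from the job jar.
     */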
    Configuration getHConf() {
        return hConf;
    }

    /**
     * Loads the Hadoop {@link Configuration} packaged in the job jar and adds it as a resource.
     * This configuration is needed to read/write a {@link Dataset} using {@link DataSetInputFormat}/{@link
     * DataSetOutputFormat} through {@link JavaSparkContext#readFromDataset(String, Class, Class)} or
     * {@link ScalaSparkContext#readFromDataset(String, Class, Class)}.
     * This method requires that the hConf.xml file containing the {@link Configuration} is present in the job jar.
     *
     * @return the loaded {@link Configuration}
     */
    private Configuration loadHConf() {
        // TODO: Inject through Guice in Distributed mode, see CDAP-3
        Configuration hConf = new Configuration();
        hConf.clear();

        URL url = Thread.currentThread().getContextClassLoader()
                .getResource(SparkRuntimeService.SPARK_HCONF_FILENAME);
        if (url == null) {
            LOG.error("Unable to find Hadoop Configuration file {} in the submitted jar.",
                    SparkRuntimeService.SPARK_HCONF_FILENAME);
            throw new RuntimeException(
                    "Hadoop Configuration file not found in the supplied jar. Please include Hadoop "
                            + "Configuration file with name " + SparkRuntimeService.SPARK_HCONF_FILENAME);
        }
        hConf.addResource(url);
        return hConf;
    }

    /**
     * Sets the input {@link Dataset} with splits in the {@link Configuration}
     *
     * @param datasetName the name of the {@link Dataset} to read from
     * @return updated {@link Configuration}
     * @throws IllegalArgumentException if the {@link Dataset} to read is not {@link BatchReadable}
     */
    Configuration setInputDataset(String datasetName) {
        Configuration hConf = new Configuration(getHConf());
        Dataset dataset = basicSparkContext.getDataSet(datasetName);
        List<Split> inputSplits;
        if (dataset instanceof BatchReadable) {
            BatchReadable curDataset = (BatchReadable) dataset;
            inputSplits = curDataset.getSplits();
        } else {
            throw new IllegalArgumentException("Failed to read dataset " + datasetName
                    + ". The dataset does not implement BatchReadable");
        }
        hConf.setClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SparkDatasetInputFormat.class, InputFormat.class);
        hConf.set(SparkDatasetInputFormat.HCONF_ATTR_INPUT_DATASET, datasetName);
        hConf.set(SparkContextConfig.HCONF_ATTR_INPUT_SPLIT_CLASS, inputSplits.get(0).getClass().getName());
        hConf.set(SparkContextConfig.HCONF_ATTR_INPUT_SPLITS, new Gson().toJson(inputSplits));
        return hConf;
    }

    /**
     * Sets the output {@link Dataset} with splits in the {@link Configuration}
     *
     * @param datasetName the name of the {@link Dataset} to write to
     * @return updated {@link Configuration}
     */
    Configuration setOutputDataset(String datasetName) {
        Configuration hConf = new Configuration(getHConf());
        hConf.set(SparkDatasetOutputFormat.HCONF_ATTR_OUTPUT_DATASET, datasetName);
        hConf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, SparkDatasetOutputFormat.class, OutputFormat.class);
        return hConf;
    }

    /**
     * Returns the value of the given argument key as a String[]. The value is trimmed and split on whitespace.
     *
     * @param argsKey {@link String} which is the key for the argument
     * @return a String[] containing the arguments, indexed by the position in which they were supplied
     */
    @Override
    public String[] getRuntimeArguments(String argsKey) {
        if (runtimeArguments.hasOption(argsKey)) {
            return SPACES.split(runtimeArguments.getOption(argsKey).trim());
        } else {
            LOG.warn("Argument with key {} not found in Runtime Arguments", argsKey);
            return NO_ARGS;
        }
    }
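
    // Illustration: with a runtime argument "input.files" set to " users.txt  purchases.txt ",
    // getRuntimeArguments("input.files") returns {"users.txt", "purchases.txt"}: the value is
    // trimmed and then split on any run of whitespace.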

    @Override
    public SparkSpecification getSpecification() {
        return spec;
    }

    @Override
    public long getLogicalStartTime() {
        return logicalStartTime;
    }

    @Override
    public Map<String, String> getRuntimeArguments() {
        ImmutableMap.Builder<String, String> arguments = ImmutableMap.builder();
        for (Map.Entry<String, String> runtimeArgument : runtimeArguments) {
            arguments.put(runtimeArgument);
        }
        return arguments.build();
    }

}
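
Example

The most involved step above is setInputDataset(String), which stores both the JSON-serialized input splits and the concrete split class name in the Hadoop Configuration. The short, standalone sketch below illustrates why both values are useful: Gson serializes the splits to JSON easily, but a reader that only knows the Split interface needs the recorded class name to get the concrete type back. Everything here is illustrative only: TextFileSplit and the "example.*" keys are hypothetical stand-ins, not CDAP classes or the HCONF_ATTR_* constants used above.

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;

import java.util.Arrays;
import java.util.List;

/**
 * Standalone sketch of the split round-trip that setInputDataset() prepares.
 * TextFileSplit and the "example.*" keys are hypothetical, not part of CDAP.
 */
public class SplitRoundTripSketch {

    // Hypothetical split; a real CDAP dataset returns implementations of
    // co.cask.cdap.api.data.batch.Split from BatchReadable.getSplits().
    public static class TextFileSplit {
        String path;
        long start;
        long length;

        TextFileSplit() {
            // no-arg constructor for Gson
        }

        TextFileSplit(String path, long start, long length) {
            this.path = path;
            this.start = start;
            this.length = length;
        }
    }

    public static void main(String[] args) {
        Gson gson = new Gson();
        List<TextFileSplit> splits = Arrays.asList(
                new TextFileSplit("part-0", 0, 1024),
                new TextFileSplit("part-1", 1024, 2048));

        // Write side, mirroring setInputDataset(): store the JSON plus the concrete
        // split class name, because the JSON alone does not identify the type.
        Configuration conf = new Configuration();
        conf.set("example.input.splits", gson.toJson(splits));
        conf.set("example.input.split.class", splits.get(0).getClass().getName());

        // Read side: here the concrete type is known at compile time, so a TypeToken is
        // enough. A consumer that only knows the Split interface (such as the input format
        // that reads this Configuration) would instead load the class recorded under
        // "example.input.split.class" and build the list type from it.
        List<TextFileSplit> restored = gson.fromJson(
                conf.get("example.input.splits"),
                new TypeToken<List<TextFileSplit>>() { }.getType());
        System.out.println("Restored " + restored.size() + " splits, first path = " + restored.get(0).path);
    }
}

Recording the class name alongside the JSON is presumably what lets the read side, which only sees the Split interface, reconstruct the same concrete objects; that is the role of HCONF_ATTR_INPUT_SPLIT_CLASS next to HCONF_ATTR_INPUT_SPLITS in setInputDataset() above.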