ai.grakn.graph.internal.computer.GraknSparkComputer.java Source code

Introduction

Here is the source code for ai.grakn.graph.internal.computer.GraknSparkComputer.java
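
The class below is Grakn's customised Spark graph computer, built on TinkerPop's Hadoop/Spark machinery. The file's own main method shows the intended entry point: open a HadoopGraph from a properties file, attach a vertex program, and submit. A minimal usage sketch along the lines of that main method (the properties-file path here is a placeholder):

    final FileConfiguration configuration = new PropertiesConfiguration("conf/hadoop-graph.properties");
    final HadoopGraph graph = HadoopGraph.open(configuration);
    final ComputerResult result = new GraknSparkComputer(graph)
            .program(VertexProgram.createVertexProgram(graph, configuration))
            .submit()
            .get();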

Source

/*
 * Grakn - A Distributed Semantic Database
 * Copyright (C) 2016  Grakn Labs Limited
 *
 * Grakn is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Grakn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Grakn. If not, see <http://www.gnu.org/licenses/gpl.txt>.
 */

package ai.grakn.graph.internal.computer;

import org.apache.commons.configuration.ConfigurationUtils;
import org.apache.commons.configuration.FileConfiguration;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.storage.StorageLevel;
import org.apache.tinkerpop.gremlin.hadoop.Constants;
import org.apache.tinkerpop.gremlin.hadoop.process.computer.AbstractHadoopGraphComputer;
import org.apache.tinkerpop.gremlin.hadoop.process.computer.util.ComputerSubmissionHelper;
import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopConfiguration;
import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph;
import org.apache.tinkerpop.gremlin.hadoop.structure.io.FileSystemStorage;
import org.apache.tinkerpop.gremlin.hadoop.structure.io.VertexWritable;
import org.apache.tinkerpop.gremlin.hadoop.structure.util.ConfUtil;
import org.apache.tinkerpop.gremlin.process.computer.ComputerResult;
import org.apache.tinkerpop.gremlin.process.computer.GraphComputer;
import org.apache.tinkerpop.gremlin.process.computer.MapReduce;
import org.apache.tinkerpop.gremlin.process.computer.Memory;
import org.apache.tinkerpop.gremlin.process.computer.VertexProgram;
import org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult;
import org.apache.tinkerpop.gremlin.process.computer.util.MapMemory;
import org.apache.tinkerpop.gremlin.spark.process.computer.payload.ViewIncomingPayload;
import org.apache.tinkerpop.gremlin.spark.structure.Spark;
import org.apache.tinkerpop.gremlin.spark.structure.io.InputFormatRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.InputOutputHelper;
import org.apache.tinkerpop.gremlin.spark.structure.io.InputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.OutputFormatRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.OutputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.PersistedInputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.SparkContextStorage;
import org.apache.tinkerpop.gremlin.structure.io.Storage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Stream;

/**
 * <p>
 * This is a modified version of the TinkerPop SparkGraphComputer.
 * Its behaviour is changed so that it does not destroy the graph RDD after every job:
 * the loaded RDD and its Spark context are cached in a static holder and reused
 * across submissions.
 * </p>
 *
 * @author Jason Liu
 * @author Marko A. Rodriguez
 */
public final class GraknSparkComputer extends AbstractHadoopGraphComputer {

    private static final Logger LOGGER = LoggerFactory.getLogger(GraknSparkComputer.class);

    private final org.apache.commons.configuration.Configuration sparkConfiguration;
    private boolean workersSet = false;

    private static GraknGraphRDD graknGraphRDD = null;

    private org.apache.commons.configuration.Configuration apacheConfiguration = null;
    private Configuration hadoopConfiguration = null;

    private String jobGroupId = null;

    public GraknSparkComputer(final HadoopGraph hadoopGraph) {
        super(hadoopGraph);
        this.sparkConfiguration = new HadoopConfiguration();
        ConfigurationUtils.copy(this.hadoopGraph.configuration(), this.sparkConfiguration);

        this.apacheConfiguration = new HadoopConfiguration(this.sparkConfiguration);
        apacheConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_OUTPUT_FORMAT_HAS_EDGES, false);
        hadoopConfiguration = ConfUtil.makeHadoopConfiguration(apacheConfiguration);

        if (hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_INPUT_RDD, null) == null
                && hadoopConfiguration.get(Constants.GREMLIN_HADOOP_GRAPH_INPUT_FORMAT, null) != null
                && FileInputFormat.class.isAssignableFrom(hadoopConfiguration
                        .getClass(Constants.GREMLIN_HADOOP_GRAPH_INPUT_FORMAT, InputFormat.class))) {
            try {
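                // resolve the configured input location to a fully-qualified path and
                // record it in both the apache and hadoop configurations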
                final String inputLocation = FileSystem.get(hadoopConfiguration)
                        .getFileStatus(new Path(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION)))
                        .getPath().toString();
                apacheConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, inputLocation);
                hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, inputLocation);
            } catch (final IOException e) {
                throw new IllegalStateException(e.getMessage(), e);
            }
        }
    }

    @Override
    public GraphComputer workers(final int workers) {
        super.workers(workers);
        if (this.sparkConfiguration.containsKey(SparkLauncher.SPARK_MASTER)
                && this.sparkConfiguration.getString(SparkLauncher.SPARK_MASTER).startsWith("local")) {
            this.sparkConfiguration.setProperty(SparkLauncher.SPARK_MASTER, "local[" + this.workers + "]");
        }
        this.workersSet = true;
        return this;
    }

    @Override
    public Future<ComputerResult> submit() {
        this.validateStatePriorToExecution();

        return ComputerSubmissionHelper.runWithBackgroundThread(this::submitWithExecutor, "SparkSubmitter");
    }

    public void cancelJobs() {
        if (jobGroupId != null && graknGraphRDD != null && graknGraphRDD.sparkContext != null) {
            graknGraphRDD.sparkContext.cancelJobGroup(jobGroupId);
        }
    }

    private Future<ComputerResult> submitWithExecutor(Executor exec) {
        getGraphRDD(this);
        jobGroupId = Integer.toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE));
        String jobDescription = this.vertexProgram == null ? this.mapReducers.toString()
                : this.vertexProgram + "+" + this.mapReducers;

        this.sparkConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION,
                this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + jobGroupId);
        this.apacheConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION,
                this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));
        this.hadoopConfiguration.set(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION,
                this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));

        // create the completable future
        return CompletableFuture.supplyAsync(() -> {
            graknGraphRDD.sparkContext.setJobGroup(jobGroupId, jobDescription);
            final long startTime = System.currentTimeMillis();

            GraknSparkMemory memory = null;
            JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
            JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;

            ////////////////////////////////
            // process the vertex program //
            ////////////////////////////////
            if (null != this.vertexProgram) {
                // set up the vertex program and wire up configurations
                this.mapReducers.addAll(this.vertexProgram.getMapReducers());
                memory = new GraknSparkMemory(this.vertexProgram, this.mapReducers, graknGraphRDD.sparkContext);
                this.vertexProgram.setup(memory);
                memory.broadcastMemory(graknGraphRDD.sparkContext);
                final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                this.vertexProgram.storeState(vertexProgramConfiguration);
                ConfigurationUtils.copy(vertexProgramConfiguration, apacheConfiguration);
                ConfUtil.mergeApacheIntoHadoopConfiguration(vertexProgramConfiguration, hadoopConfiguration);
                // execute the vertex program
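                // each pass of the loop below is one iteration: run the program against the
                // graph RDD, then either terminate or broadcast the updated memory and repeat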
                while (true) {
                    memory.setInTask(true);
                    viewIncomingRDD = GraknSparkExecutor.executeVertexProgramIteration(graknGraphRDD.loadedGraphRDD,
                            viewIncomingRDD, memory, vertexProgramConfiguration);
                    memory.setInTask(false);
                    if (this.vertexProgram.terminate(memory)) {
                        break;
                    } else {
                        memory.incrIteration();
                        memory.broadcastMemory(graknGraphRDD.sparkContext);
                    }
                }
                // write the computed graph to the respective output (rdd or output format)
                final String[] elementComputeKeys = this.vertexProgram.getElementComputeKeys()
                        .toArray(new String[this.vertexProgram.getElementComputeKeys().size()]);
                computedGraphRDD = GraknSparkExecutor.prepareFinalGraphRDD(graknGraphRDD.loadedGraphRDD,
                        viewIncomingRDD, elementComputeKeys);
                if ((hadoopConfiguration.get(Constants.GREMLIN_HADOOP_GRAPH_OUTPUT_FORMAT, null) != null
                        || hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_OUTPUT_RDD, null) != null)
                        && !this.persist.equals(Persist.NOTHING)) {
                    try {
                        hadoopConfiguration
                                .getClass(Constants.GREMLIN_SPARK_GRAPH_OUTPUT_RDD, OutputFormatRDD.class,
                                        OutputRDD.class)
                                .newInstance().writeGraphRDD(apacheConfiguration, computedGraphRDD);
                    } catch (final InstantiationException | IllegalAccessException e) {
                        throw new IllegalStateException(e.getMessage(), e);
                    }
                }
            }

            final boolean computedGraphCreated = computedGraphRDD != null;
            if (!computedGraphCreated) {
                computedGraphRDD = graknGraphRDD.loadedGraphRDD;
            }

            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);

            //////////////////////////////
            // process the map reducers //
            //////////////////////////////
            if (!this.mapReducers.isEmpty()) {
                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(apacheConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = GraknSparkExecutor.executeMap(computedGraphRDD, mapReduce,
                            newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE)
                            ? GraknSparkExecutor.executeCombine(mapRDD, newApacheConfiguration)
                            : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE)
                            ? GraknSparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration)
                            : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    try {
                        mapReduce.addResultToMemory(finalMemory,
                                hadoopConfiguration
                                        .getClass(Constants.GREMLIN_SPARK_GRAPH_OUTPUT_RDD, OutputFormatRDD.class,
                                                OutputRDD.class)
                                        .newInstance()
                                        .writeMemoryRDD(apacheConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                    } catch (final InstantiationException | IllegalAccessException e) {
                        throw new IllegalStateException(e.getMessage(), e);
                    }
                }
            }

            // unpersist the computed graph if it will not be used again (no PersistedOutputRDD)
            if (!graknGraphRDD.outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) {
                computedGraphRDD.unpersist();
            }
            // delete any file system or rdd data if persist nothing
            String outputPath = sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
            if (null != outputPath && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (graknGraphRDD.outputToHDFS) {
                    graknGraphRDD.fileSystemStorage.rm(outputPath);
                }
                if (graknGraphRDD.outputToSpark) {
                    graknGraphRDD.sparkContextStorage.rm(outputPath);
                }
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            return new DefaultComputerResult(
                    InputOutputHelper.getOutputGraph(apacheConfiguration, this.resultGraph, this.persist),
                    finalMemory.asImmutable());
        }, exec);
    }

    /////////////////

    private static void loadJars(final JavaSparkContext sparkContext, final Configuration hadoopConfiguration) {
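        // ship the hadoop-gremlin jar directories (taken from the system property or, failing
        // that, the environment variable) to the cluster so the workers can resolve them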
        if (hadoopConfiguration.getBoolean(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, true)) {
            final String hadoopGremlinLocalLibs = null == System.getProperty(Constants.HADOOP_GREMLIN_LIBS)
                    ? System.getenv(Constants.HADOOP_GREMLIN_LIBS)
                    : System.getProperty(Constants.HADOOP_GREMLIN_LIBS);
            if (null == hadoopGremlinLocalLibs) {
                LOGGER.warn(Constants.HADOOP_GREMLIN_LIBS + " is not set -- proceeding regardless");
            } else {
                final String[] paths = hadoopGremlinLocalLibs.split(":");
                for (final String path : paths) {
                    final File file = new File(path);
                    if (file.exists()) {
                        Stream.of(file.listFiles()).filter(f -> f.getName().endsWith(Constants.DOT_JAR))
                                .forEach(f -> sparkContext.addJar(f.getAbsolutePath()));
                    } else {
                        LOGGER.warn(path + " does not reference a valid directory -- proceeding regardless");
                    }
                }
            }
        }
    }

    /**
     * When using a persistent context, the running context's configuration will override a
     * passed-in configuration. Spark allows us to override these inherited properties via
     * SparkContext.setLocalProperty.
     */
    private static void updateLocalConfiguration(final JavaSparkContext sparkContext,
            final SparkConf sparkConfiguration) {
        /*
         * While we could enumerate the entire SparkConfiguration and copy it into the
         * thread-local properties of the Spark context, doing so could have adverse effects
         * with future versions of Spark. Since the API for setting multiple local properties
         * at once is private, we only set those properties we know affect SparkGraphComputer
         * execution, rather than applying the entire configuration.
         */
        final String[] validPropertyNames = { "spark.job.description", "spark.jobGroup.id",
                "spark.job.interruptOnCancel", "spark.scheduler.pool" };

        for (String propertyName : validPropertyNames) {
            if (sparkConfiguration.contains(propertyName)) {
                String propertyValue = sparkConfiguration.get(propertyName);
                LOGGER.info("Setting Thread Local SparkContext Property - " + propertyName + " : " + propertyValue);

                sparkContext.setLocalProperty(propertyName, sparkConfiguration.get(propertyName));
            }
        }
    }

    public static void main(final String[] args) throws Exception {
        final FileConfiguration configuration = new PropertiesConfiguration(args[0]);
        new GraknSparkComputer(HadoopGraph.open(configuration))
                .program(VertexProgram.createVertexProgram(HadoopGraph.open(configuration), configuration)).submit()
                .get();
    }

    private static synchronized void getGraphRDD(GraknSparkComputer graknSparkComputer) {
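        // recreate the cached graph RDD only when it is absent, a graph commit has
        // invalidated it, or its Spark context has been lost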
        if (graknGraphRDD == null || GraknGraphRDD.commit || graknGraphRDD.sparkContext == null) {
            LOGGER.info("Creating a new Grakn Graph RDD");
            graknGraphRDD = new GraknGraphRDD(graknSparkComputer);
        }
    }

    public static void refresh() {
        if (!GraknGraphRDD.commit) {
            setCommitFlag();
            LOGGER.debug("Graph commit flag set!!!");
        }
    }

    private static synchronized void setCommitFlag() {
        if (!GraknGraphRDD.commit) {
            GraknGraphRDD.commit = true;
        }
    }

    public static synchronized void clear() {
        if (graknGraphRDD != null) {
            graknGraphRDD.loadedGraphRDD = null;
            graknGraphRDD = null;
        }
        Spark.close();
    }

    private static class GraknGraphRDD {
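        // Everything that should survive between job submissions lives here: the Spark
        // context, the storage handles, and the loaded (partitioned, persisted) graph RDD.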

        private static boolean commit = false;

        private Storage fileSystemStorage;
        private Storage sparkContextStorage;

        private boolean outputToHDFS;
        private boolean outputToSpark;

        private String outputLocation;

        private SparkConf sparkConf;
        private JavaSparkContext sparkContext;

        private JavaPairRDD<Object, VertexWritable> loadedGraphRDD;

        private boolean inputFromSpark;

        private GraknGraphRDD(GraknSparkComputer graknSparkComputer) {

            fileSystemStorage = FileSystemStorage.open(graknSparkComputer.hadoopConfiguration);
            sparkContextStorage = SparkContextStorage.open(graknSparkComputer.apacheConfiguration);

            inputFromSpark = PersistedInputRDD.class.isAssignableFrom(graknSparkComputer.hadoopConfiguration
                    .getClass(Constants.GREMLIN_SPARK_GRAPH_INPUT_RDD, Object.class));
            outputToHDFS = FileOutputFormat.class.isAssignableFrom(graknSparkComputer.hadoopConfiguration
                    .getClass(Constants.GREMLIN_HADOOP_GRAPH_OUTPUT_FORMAT, Object.class));
            outputToSpark = PersistedOutputRDD.class.isAssignableFrom(graknSparkComputer.hadoopConfiguration
                    .getClass(Constants.GREMLIN_SPARK_GRAPH_OUTPUT_RDD, Object.class));

            // delete output location
            outputLocation = graknSparkComputer.hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION,
                    null);
            if (null != outputLocation) {
                if (outputToHDFS && fileSystemStorage.exists(outputLocation)) {
                    fileSystemStorage.rm(outputLocation);
                }
                if (outputToSpark && sparkContextStorage.exists(outputLocation)) {
                    sparkContextStorage.rm(outputLocation);
                }
            }

            // wire up a spark context
            sparkConf = new SparkConf();
            sparkConf.setAppName(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX);

            // create the spark configuration from the graph computer configuration
            graknSparkComputer.hadoopConfiguration
                    .forEach(entry -> sparkConf.set(entry.getKey(), entry.getValue()));

            sparkContext = new JavaSparkContext(SparkContext.getOrCreate(sparkConf));
            loadJars(sparkContext, graknSparkComputer.hadoopConfiguration);
            Spark.create(sparkContext.sc()); // this is the context RDD holder that prevents GC
            updateLocalConfiguration(sparkContext, sparkConf);

            boolean partitioned = false;
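            // read the graph RDD, then guarantee it has a partitioner: reuse an existing one
            // if present, otherwise hash-partition across the worker count (or, if workers
            // were not set, the current number of partitions)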
            try {
                loadedGraphRDD = graknSparkComputer.hadoopConfiguration
                        .getClass(Constants.GREMLIN_SPARK_GRAPH_INPUT_RDD, InputFormatRDD.class, InputRDD.class)
                        .newInstance().readGraphRDD(graknSparkComputer.apacheConfiguration, sparkContext);

                if (loadedGraphRDD.partitioner().isPresent()) {
                    LOGGER.info("Using the existing partitioner associated with the loaded graphRDD: "
                            + loadedGraphRDD.partitioner().get());
                } else {
                    loadedGraphRDD = loadedGraphRDD.partitionBy(
                            new HashPartitioner(graknSparkComputer.workersSet ? graknSparkComputer.workers
                                    : loadedGraphRDD.partitions().size()));
                    partitioned = true;
                }
                assert loadedGraphRDD.partitioner().isPresent();
                if (graknSparkComputer.workersSet) {
                    // ensures that the loaded graphRDD does not have more partitions than workers
                    if (loadedGraphRDD.partitions().size() > graknSparkComputer.workers) {
                        loadedGraphRDD = loadedGraphRDD.coalesce(graknSparkComputer.workers);
                    } else if (loadedGraphRDD.partitions().size() < graknSparkComputer.workers) {
                        loadedGraphRDD = loadedGraphRDD.repartition(graknSparkComputer.workers);
                    }
                }
                if (!inputFromSpark || partitioned) {
                    loadedGraphRDD = loadedGraphRDD
                            .persist(StorageLevel.fromString(graknSparkComputer.hadoopConfiguration
                                    .get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_AND_DISK_SER")));
                }
                GraknGraphRDD.commit = false;
            } catch (final InstantiationException | IllegalAccessException e) {
                throw new IllegalStateException(e.getMessage(), e);
            }
        }
    }
}
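
Since the graph RDD and its Spark context are cached in a static field, callers are expected to manage their lifecycle explicitly. A minimal sketch of that flow, assuming the computer is driven from Grakn's analytics code:

    // after the underlying graph has been committed, mark the cached RDD as stale so the
    // next submission reloads it
    GraknSparkComputer.refresh();

    // when analytics are finished, drop the cached RDD and close the shared Spark context
    GraknSparkComputer.clear();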