com.linkedin.cubert.plan.physical.JobExecutor.java Source code

Introduction

Here is the source code for com.linkedin.cubert.plan.physical.JobExecutor.java. The class parses the JSON physical plan of a single map-reduce job, configures the corresponding Hadoop Job (inputs, output, shuffle, library jars, cached files, and cached indexes), and submits it for execution.
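
A minimal usage sketch, assuming the plan JSON for one job is already available on disk; the driver class RunSingleJob and the file name plan.json are hypothetical, and only the constructor and run() signatures from the listing below are relied on.

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import com.linkedin.cubert.plan.physical.JobExecutor;

public class RunSingleJob {
    public static void main(String[] args) throws Exception {
        // Load the physical plan of a single map-reduce job (file name is hypothetical).
        String planJson = new String(Files.readAllBytes(Paths.get("plan.json")),
                                     StandardCharsets.UTF_8);

        // profileMode=false leaves Cubert's operator profiling flag switched off.
        JobExecutor executor = new JobExecutor(planJson, false);

        // Submits the job and waits for completion; run() prints progress when
        // verbose=true and throws an exception if the job fails.
        boolean success = executor.run(true);
        System.out.println("Job succeeded: " + success);
    }
}

Note that run() throws on failure, so the returned flag is true whenever the call returns normally.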

Source

/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.plan.physical;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.ObjectNode;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.Index;
import com.linkedin.cubert.io.ConfigurationDiff;
import com.linkedin.cubert.io.CubertInputFormat;
import com.linkedin.cubert.io.SerializerUtils;
import com.linkedin.cubert.io.Storage;
import com.linkedin.cubert.io.StorageFactory;
import com.linkedin.cubert.utils.ExecutionConfig;
import com.linkedin.cubert.utils.FileSystemUtils;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.print;

/**
 * Parses and executes the physical plan of a single Map-Reduce job.
 * 
 * @author Maneesh Varshney
 */
public class JobExecutor {
    protected final JsonNode root;
    protected static final ArrayNode singletonArray = new ObjectMapper().createArrayNode();
    protected final Job job;
    protected final Configuration conf;
    protected final ConfigurationDiff confDiff;
    protected final FileSystem fs;
    private final ObjectMapper mapper;
    private final Path tmpDir;

    // Map from TEE target folder to the list of file name prefixes written into it
    private final Map<String, List<String>> teeFiles = new HashMap<String, List<String>>();
    private int teeFilePrefixCounter = 0;

    private boolean profileMode;

    public JobExecutor(String json, boolean profileMode)
            throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        this.job = new Job();
        this.conf = job.getConfiguration();
        this.confDiff = new ConfigurationDiff(conf);
        this.fs = FileSystem.get(conf);
        this.profileMode = profileMode;

        // Turn on the symlink feature
        DistributedCache.createSymlink(conf);

        job.setJarByClass(JobExecutor.class);
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            conf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }

        mapper = new ObjectMapper();
        this.root = mapper.readValue(json, JsonNode.class);

        if (root.has("tmpDir")) {
            tmpDir = new Path(getText(root, "tmpDir"));
        } else {
            tmpDir = new Path(fs.getHomeDirectory(), "tmp/" + UUID.randomUUID().toString());
        }

        try {
            configureJob();
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }

    }

    public void printCubertConfProperties() {
        Iterator<Entry<String, String>> it = conf.iterator();
        while (it.hasNext()) {
            Entry<String, String> entry = it.next();
            if (entry.getKey().startsWith("cubert")) {
                print.f("%s => %s", entry.getKey(), entry.getValue());
            }
        }
    }

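    /**
     * Submits the configured job and waits for completion. The temporary
     * directory is always deleted; if the job fails an InterruptedException is
     * thrown, otherwise TEE files are relocated, post-job hooks run, and
     * completion tasks execute before returning true.
     */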
    public boolean run(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException {
        // Logger.getLogger("org.apache.hadoop.mapred.MapTask").setLevel(Level.WARN);
        // Logger.getLogger("org.apache.hadoop.mapred.Task").setLevel(Level.WARN);
        // Logger.getLogger("org.apache.hadoop.mapred.Merger").setLevel(Level.WARN);
        // Logger.getLogger("org.apache.hadoop.mapred.LocalJobRunner").setLevel(Level.WARN);
        // Logger.getLogger("org.apache.hadoop.filecache.TrackerDistributedCacheManager")
        // .setLevel(Level.WARN);

        boolean retval = false;
        try {
            retval = job.waitForCompletion(verbose);
        } finally {
            fs.delete(tmpDir, true);
        }

        if (!retval) {
            throw new InterruptedException("Job " + getText(root, "name") + " failed!");
        }

        moveTeeFiles();
        postJobHooks();
        doCompletionTasks();

        return retval;
    }

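    /**
     * Translates the plan JSON into the Hadoop job configuration: job name,
     * library jars, hadoopConf overrides, the profiling flag, cached files and
     * indexes, TEE prefixes, pre-job hooks, the reducer count, map inputs, the
     * output, and (when present) the shuffle and reduce stages. Map-output and
     * file-output compression are enabled unless already configured.
     */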
    protected void configureJob() throws IOException, ClassNotFoundException, URISyntaxException,
            InstantiationException, IllegalAccessException {
        setJobName();
        setLibjars();
        setHadoopConf();
        setPerfProfile();
        serializeExecutionConfig();
        cacheFiles();
        cacheIndex();
        prepareTeePaths();
        preJobHooks();

        int numReducers = root.get("reducers").getIntValue();
        job.setNumReduceTasks(numReducers);

        for (JsonNode map : root.path("map")) {
            setInput(((ObjectNode) map).get("input"));
        }
        setOutput();
        conf.set(CubertStrings.JSON_OUTPUT, root.get("output").toString());
        if (root.has("metadata"))
            conf.set(CubertStrings.JSON_METADATA, root.get("metadata").toString());

        conf.set(CubertStrings.JSON_MAP_OPERATOR_LIST, root.get("map").toString());
        job.setMapperClass(CubertMapper.class);

        if (root.has("shuffle") && !root.get("shuffle").isNull()) {
            setShuffle();
            conf.set(CubertStrings.JSON_SHUFFLE, root.get("shuffle").toString());

            conf.set(CubertStrings.JSON_REDUCE_OPERATOR_LIST, root.get("reduce").toString());
            job.setReducerClass(CubertReducer.class);

        }

        if (conf.get("mapreduce.map.output.compress") == null)
            conf.set("mapreduce.map.output.compress", "true");

        if (conf.get("mapreduce.output.fileoutputformat.compress") == null)
            conf.set("mapreduce.output.fileoutputformat.compress", "true");
    }

    private void serializeExecutionConfig() throws IOException {
        ExecutionConfig.writeConf(getConf());
    }

    private void preJobHooks() {
        ArrayNode preHooks = (ArrayNode) root.get("preJobHooks");
        if (preHooks != null)
            processJobCommands(preHooks);

    }

    private void postJobHooks() {
        ArrayNode postHooks = (ArrayNode) root.get("postJobHooks");
        if (postHooks != null)
            processJobCommands(postHooks);

    }

    private void doCompletionTasks() throws IOException {
        if (root.has("onCompletion") && !root.get("onCompletion").isNull())
            CompletionTasks.doCompletionTasks(root.get("onCompletion"));
    }

    private void processJobCommands(ArrayNode commands) {
        for (int i = 0; i < commands.size(); i++) {
            execJobCommand(commands.get(i));
        }

    }

    private void execJobCommand(JsonNode jsonNode) {
        // No-op placeholder: individual job command hooks are not handled here.
    }

    protected void setJobName() {
        job.setJobName(getText(root, "name"));
    }

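    /**
     * Adds every jar listed under "libjars" to the distributed-cache classpath,
     * copying jars that exist on the local file system into the job's temporary
     * directory first.
     */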
    protected void setLibjars() throws IOException {
        if (!root.has("libjars"))
            return;

        FileSystem localFs = FileSystem.getLocal(conf);

        for (JsonNode node : asArray(root, "libjars")) {
            Path path = new Path(node.getTextValue());

            if (localFs.exists(path)) {
                Path dstPath = new Path(tmpDir, path.getName());
                fs.copyFromLocalFile(path, dstPath);

                path = dstPath;
            }

            DistributedCache.addFileToClassPath(path, conf, fs);

        }
    }

    protected void setHadoopConf() {
        if (!root.has("hadoopConf"))
            return;

        JsonNode node = get(root, "hadoopConf");
        Iterator<String> it = node.getFieldNames();
        while (it.hasNext()) {
            String name = it.next();
            String value = getText(node, name);

            conf.set(name, value);
        }
    }

    protected void setPerfProfile() {
        conf.set(CubertStrings.PROFILE_MODE, profileMode ? "true" : "false");
    }

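    /**
     * Registers each URI listed under "cachedFiles" with the distributed cache.
     */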
    protected void cacheFiles() throws URISyntaxException, IOException {
        if (!root.has("cachedFiles") || root.get("cachedFiles").isNull() || root.get("cachedFiles").size() == 0)
            return;

        for (JsonNode cachedFile : root.path("cachedFiles")) {
            URI uri = new URI(cachedFile.getTextValue());
            print.f("CACHING file %s", uri);
            DistributedCache.addCacheFile(uri, conf);
        }
    }

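    /**
     * Serializes each index listed under "cacheIndex" to a temporary file,
     * ships it through the distributed cache under a symlink named after the
     * index, and records the file name in the configuration using the
     * JSON_CACHE_INDEX_PREFIX key.
     */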
    protected void cacheIndex() throws IOException, InstantiationException, IllegalAccessException,
            ClassNotFoundException, URISyntaxException {
        if (!root.has("cacheIndex"))
            return;

        for (JsonNode indexNode : root.path("cacheIndex")) {
            // extract the index from the relation at the location given by "path";
            // it is cached under the name given by "name"
            Index indexToCache = Index.extractFromRelation(conf, getText(indexNode, "path"));

            String indexName = JsonUtils.getText(indexNode, "name");

            Path indexPath = new Path(tmpDir, UUID.randomUUID().toString());
            SerializerUtils.serializeToFile(conf, indexPath, indexToCache);

            DistributedCache.addCacheFile(new URI(indexPath.toString() + "#" + indexName), conf);

            // tmpFiles.add(indexPath);

            conf.set(CubertStrings.JSON_CACHE_INDEX_PREFIX + indexName, indexPath.getName());

            print.f("Caching index at path [%s] as [%s]", getText(indexNode, "path"), indexPath.toString());
        }

    }

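    /**
     * Configures one map input: resolves the input paths, applies the
     * storage-specific settings inside a configuration diff, validates the
     * split size when combined input is requested, and registers the paths
     * with FileInputFormat.
     */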
    protected void setInput(JsonNode input) throws IOException, ClassNotFoundException {
        JsonNode params = input.get("params");
        if (params == null)
            params = mapper.createObjectNode();

        // RelationType type = RelationType.valueOf(getText(input, "type"));
        List<Path> paths = FileSystemUtils.getPaths(fs, input.get("path"));

        if (paths.isEmpty()) {
            throw new IOException("No input paths are defined");
        }

        job.setInputFormatClass(CubertInputFormat.class);

        // storage specific configuration
        confDiff.startDiff();

        Storage storage = StorageFactory.get(getText(input, "type"));
        storage.prepareInput(job, conf, params, paths);

        if (params.has("combined") && Boolean.parseBoolean(getText(params, "combined"))) {
            conf.setBoolean(CubertStrings.COMBINED_INPUT, true);

            long originalMaxCombinedSplitSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", -1);

            if (originalMaxCombinedSplitSize == -1) {
                throw new IllegalStateException(
                        "CONFIG ERROR: property mapreduce.input.fileinputformat.split.maxsize is not set when using combined input format");
            }
        }

        // add input paths to the job
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[] {}));

        confDiff.endDiff();
    }

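    /**
     * Configures the job output: sets the output path (deleting it first when
     * "overwrite" is requested) and delegates the rest to the Storage
     * implementation for the declared output type.
     */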
    protected void setOutput() throws IOException {
        JsonNode output = get(root, "output");
        JsonNode params = output.get("params");
        if (params == null)
            params = mapper.createObjectNode();

        Path outputPath = new Path(getText(output, "path"));
        FileOutputFormat.setOutputPath(job, outputPath);

        if (params.has("overwrite") && Boolean.parseBoolean(getText(params, "overwrite"))) {
            fs.delete(outputPath, true);
        }

        BlockSchema schema = new BlockSchema(output.get("schema"));

        Storage storage = StorageFactory.get(getText(output, "type"));
        storage.prepareOutput(job, conf, params, schema, outputPath);
    }

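    /**
     * Configures the shuffle stage: installs the Cubert partitioner, applies
     * storage-specific shuffle settings, and enables the combiner when the
     * shuffle declares aggregates.
     */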
    protected void setShuffle() {
        job.setPartitionerClass(CubertPartitioner.class);

        JsonNode shuffle = get(root, "shuffle");
        Storage storage = StorageFactory.get(getText(shuffle, "type"));
        storage.prepareOutput(job, conf, null, null, null);

        if (shuffle.has("aggregates")) {
            job.setCombinerClass(CubertCombiner.class);
        }
    }

    protected void setNumReducers(int numReducers) {
        job.setNumReduceTasks(numReducers);

    }

    protected void setCompression(Class<? extends CompressionCodec> codecClass) {
        if (codecClass != null) {
            conf.setBoolean("mapred.output.compress", true);
            conf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
        }

    }

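    /**
     * Walks the map (and, when present, reduce) operator lists and assigns a
     * unique file-name prefix to every TEE operator, recording the prefixes per
     * target folder so the files can be relocated after the job completes.
     */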
    private void prepareTeePaths() {
        for (JsonNode mapNode : root.path("map")) {
            prepareTeePaths(mapNode.get("operators"));
        }

        if (root.has("shuffle") && !root.get("shuffle").isNull()) {
            prepareTeePaths(root.get("reduce"));
        }
    }

    private void prepareTeePaths(JsonNode operators) {
        for (JsonNode operatorNode : operators) {
            String name = operatorNode.get("operator").getTextValue();
            if (name.equals("TEE")) {
                String path = operatorNode.get("path").getTextValue();

                String teePrefix = String.format("tee-%04d", teeFilePrefixCounter++);
                ((ObjectNode) operatorNode).put("prefix", teePrefix);
                List<String> prefixList = teeFiles.get(path);
                if (prefixList == null) {
                    prefixList = new ArrayList<String>();
                    teeFiles.put(path, prefixList);
                }
                prefixList.add(teePrefix);
            }
        }
    }

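    /**
     * Moves the files written by TEE operators from the job output directory
     * into their target folders, recreating each target folder from scratch.
     */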
    private void moveTeeFiles() throws IOException {
        if (teeFiles.size() == 0)
            return;

        Path outputDir = new Path(root.get("output").get("path").getTextValue());

        for (String dir : teeFiles.keySet()) {
            // delete the old directory
            Path teeDir = new Path(dir);
            if (fs.exists(teeDir))
                fs.delete(teeDir, true);

            fs.mkdirs(teeDir);

            for (String prefix : teeFiles.get(dir)) {
                Path globPath = new Path(outputDir, prefix + "*");
                FileStatus[] fileStatusList = fs.globStatus(globPath);
                for (FileStatus fileStatus : fileStatusList) {
                    fs.rename(fileStatus.getPath(), teeDir);
                }
            }

        }
    }

    protected Job getJob() {
        return job;
    }

    protected Configuration getConf() {
        return conf;
    }

    public static JsonNode get(JsonNode node, String property) {
        JsonNode val = node.get(property);
        if (val == null) {
            throw new IllegalArgumentException("Property " + property + " is not defined in " + node);
        }
        return val;
    }

    public static String getText(JsonNode node, String property, String defaultValue) {
        if (!node.has(property))
            return defaultValue;
        return get(node, property).getTextValue();
    }

    public static String getText(JsonNode node, String property) {
        return get(node, property).getTextValue();
    }

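    /**
     * Returns the named property as an array node, wrapping a non-array value
     * in the shared singleton array node; because that singleton is reused,
     * callers should consume the result before the next call.
     */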
    public static JsonNode asArray(JsonNode node, String property) {
        JsonNode n = node.get(property);
        if (n.isArray())
            return node.path(property);
        else {
            singletonArray.removeAll();
            singletonArray.add(n);
            return singletonArray;
        }
    }
}