cascading.flow.tez.util.TezUtil.java Source code

Introduction

Here is the source code for cascading.flow.tez.util.TezUtil.java, a utility class from the Cascading project that bridges Cascading's Hadoop-oriented runtime with the Apache Tez and YARN APIs.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.tez.util;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import cascading.CascadingException;
import cascading.flow.FlowException;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.hadoop.io.MultiInputSplit;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.lib.MRReader;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.runtime.api.AbstractLogicalInput;
import org.apache.tez.runtime.api.AbstractLogicalOutput;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.LogicalOutput;
import org.apache.tez.runtime.api.MergedLogicalInput;
import org.apache.tez.runtime.api.ProcessorContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.getCommonPaths;
import static org.apache.hadoop.yarn.api.ApplicationConstants.CLASS_PATH_SEPARATOR;
import static org.apache.hadoop.yarn.api.ApplicationConstants.Environment.CLASSPATH;
import static org.apache.hadoop.yarn.api.ApplicationConstants.Environment.PWD;
import static org.apache.tez.common.TezUtils.createConfFromByteString;
import static org.apache.tez.common.TezUtils.createConfFromUserPayload;
import static org.apache.tez.mapreduce.hadoop.MRInputHelpers.parseMRInputPayload;

/**
 * Utility methods for bridging Cascading runtime code with Apache Tez:
 * configuration conversion, user-payload unpacking, and YARN LocalResource
 * and classpath management.
 */
public class TezUtil {
    private static final Logger LOG = LoggerFactory.getLogger(TezUtil.class);

    /**
     * Localizes all new JobConf creation to this single call site.
     *
     * @param configuration the Configuration to wrap
     * @return a new JobConf backed by the given Configuration
     */
    public static JobConf asJobConf(Configuration configuration) {
        return new JobConf(configuration);
    }

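    /**
     * Copies all String key/value pairs from the given properties map into a new
     * TezConfiguration, optionally layered over the given defaults. Null values,
     * Class values, and nested TezConfiguration values are skipped.
     */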
    public static TezConfiguration createTezConf(Map<Object, Object> properties, TezConfiguration defaultJobconf) {
        TezConfiguration jobConf = defaultJobconf == null ? new TezConfiguration()
                : new TezConfiguration(defaultJobconf);

        if (properties == null)
            return jobConf;

        Set<Object> keys = new HashSet<Object>(properties.keySet());

        // stringPropertyNames() only returns entries whose key and value are both Strings, so keep the original keys as well
        if (properties instanceof Properties)
            keys.addAll(((Properties) properties).stringPropertyNames());

        for (Object key : keys) {
            Object value = properties.get(key);

            if (value == null && properties instanceof Properties && key instanceof String)
                value = ((Properties) properties).getProperty((String) key);

            if (value == null) // don't stuff null values
                continue;

            // don't let these objects pass, even though toString is called below.
            if (value instanceof Class || value instanceof TezConfiguration)
                continue;

            jobConf.set(key.toString(), value.toString());
        }

        return jobConf;
    }

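    /**
     * Returns the current UserGroupInformation, rethrowing any IOException as an
     * unchecked CascadingException.
     */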
    public static UserGroupInformation getCurrentUser() {
        try {
            return UserGroupInformation.getCurrentUser();
        } catch (IOException exception) {
            throw new CascadingException("unable to get current user", exception);
        }
    }

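    /**
     * Returns the source node id stamped into the configuration under the
     * "cascading.node.source" key, failing fast if it is absent.
     */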
    public static String getEdgeSourceID(LogicalInput input, Configuration configuration) {
        String id = configuration.get("cascading.node.source");

        if (id == null)
            throw new IllegalStateException("no source id found: " + input.getClass().getName());

        return id;
    }

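    /**
     * Returns the sink node id stamped into the configuration under the
     * "cascading.node.sink" key, failing fast if it is absent.
     */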
    public static String getEdgeSinkID(LogicalOutput output, Configuration configuration) {
        String id = configuration.get("cascading.node.sink");

        if (id == null)
            throw new IllegalStateException("no sink id found: " + output.getClass().getName());

        return id;
    }

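    /**
     * Extracts the Configuration packed into a LogicalInput's user payload,
     * first unwrapping a MergedLogicalInput to its first constituent input.
     */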
    public static Configuration getInputConfiguration(LogicalInput input) {
        try {
            if (input instanceof MergedLogicalInput)
                input = (LogicalInput) Util.getFirst(((MergedLogicalInput) input).getInputs());

            if (input instanceof MRInput)
                return createConfFromByteString(parseMRInputPayload(((MRInput) input).getContext().getUserPayload())
                        .getConfigurationBytes());

            if (input instanceof AbstractLogicalInput)
                return createConfFromUserPayload(((AbstractLogicalInput) input).getContext().getUserPayload());
        } catch (IOException exception) {
            throw new FlowException("unable to unpack payload", exception);
        }

        throw new IllegalStateException("unknown input type: " + input.getClass().getName());
    }

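    /**
     * Extracts the Configuration packed into a LogicalOutput's user payload.
     */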
    public static Configuration getOutputConfiguration(LogicalOutput output) {
        try {
            if (output instanceof MROutput)
                return TezUtils.createConfFromUserPayload(((MROutput) output).getContext().getUserPayload());

            if (output instanceof AbstractLogicalOutput)
                return createConfFromUserPayload(((AbstractLogicalOutput) output).getContext().getUserPayload());
        } catch (IOException exception) {
            throw new FlowException("unable to unpack payload", exception);
        }

        throw new IllegalStateException("unknown input type: " + output.getClass().getName());
    }

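    /**
     * When the current split is a FileSplit (new or old MapReduce API), stores
     * its path into the configuration under MultiInputSplit.CASCADING_SOURCE_PATH.
     */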
    public static void setSourcePathForSplit(MRInput input, MRReader reader, Configuration configuration) {
        Path path = null;

        if (Util.returnInstanceFieldIfExistsSafe(input, "useNewApi")) {
            org.apache.hadoop.mapreduce.InputSplit newInputSplit = (org.apache.hadoop.mapreduce.InputSplit) reader
                    .getSplit();

            if (newInputSplit instanceof org.apache.hadoop.mapreduce.lib.input.FileSplit)
                path = ((org.apache.hadoop.mapreduce.lib.input.FileSplit) newInputSplit).getPath();
        } else {
            org.apache.hadoop.mapred.InputSplit oldInputSplit = (org.apache.hadoop.mapred.InputSplit) reader
                    .getSplit();

            if (oldInputSplit instanceof org.apache.hadoop.mapred.FileSplit)
                path = ((org.apache.hadoop.mapred.FileSplit) oldInputSplit).getPath();
        }

        if (path != null)
            configuration.set(MultiInputSplit.CASCADING_SOURCE_PATH, path.toString());
    }

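    /**
     * Resolves the given classpath entries into local and remote paths, registers
     * each as a YARN LocalResource of the given type (preferring the remote copy
     * when one exists), and returns the local-to-remote path mapping produced by
     * getCommonPaths.
     */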
    public static Map<Path, Path> addToClassPath(Configuration config, String stagingRoot, String resourceSubPath,
            Collection<String> classpath, LocalResourceType resourceType, Map<String, LocalResource> localResources,
            Map<String, String> environment) {
        if (classpath == null)
            return null;

        // map the given names to fully qualified local and remote paths
        Map<String, Path> localPaths = new HashMap<>();
        Map<String, Path> remotePaths = new HashMap<>();

        HadoopUtil.resolvePaths(config, classpath, stagingRoot, resourceSubPath, localPaths, remotePaths);

        try {
            LocalFileSystem localFS = HadoopUtil.getLocalFS(config);

            for (String fileName : localPaths.keySet()) {
                Path artifact = localPaths.get(fileName);
                Path remotePath = remotePaths.get(fileName);

                if (remotePath == null)
                    remotePath = artifact;

                addResource(localResources, environment, fileName, localFS.getFileStatus(artifact), remotePath,
                        resourceType);
            }

            FileSystem defaultFS = HadoopUtil.getDefaultFS(config);

            for (String fileName : remotePaths.keySet()) {
                Path artifact = remotePaths.get(fileName);
                Path localPath = localPaths.get(fileName);

                if (localPath != null)
                    continue;

                addResource(localResources, environment, fileName, defaultFS.getFileStatus(artifact), artifact,
                        resourceType);
            }
        } catch (IOException exception) {
            throw new FlowException("unable to set remote resource paths", exception);
        }

        return getCommonPaths(localPaths, remotePaths);
    }

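    /**
     * Builds an application-visibility LocalResource for the given file status
     * and path. For PATTERN type resources, also sets the archive pattern and
     * appends the expanded archive's root, lib, and classes directories to the
     * CLASSPATH variable in the given environment map.
     */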
    protected static void addResource(Map<String, LocalResource> localResources, Map<String, String> environment,
            String fileName, FileStatus stats, Path fullPath, LocalResourceType type) throws IOException {
        if (localResources.containsKey(fileName))
            throw new FlowException("duplicate filename added to classpath resources: " + fileName);

        URL yarnUrlFromPath = ConverterUtils.getYarnUrlFromPath(fullPath);
        long len = stats.getLen();
        long modificationTime = stats.getModificationTime();

        LocalResource resource = LocalResource.newInstance(yarnUrlFromPath, type,
                LocalResourceVisibility.APPLICATION, len, modificationTime);

        if (type == LocalResourceType.PATTERN) {
            // todo: parametrize this for dynamic inclusion below
            String pattern = "(?:classes/|lib/).*";

            resource.setPattern(pattern);

            if (environment != null) {
                String current = "";

                current += PWD.$$() + File.separator + fileName + File.separator + "*" + CLASS_PATH_SEPARATOR;
                current += PWD.$$() + File.separator + fileName + File.separator + "lib" + File.separator + "*"
                        + CLASS_PATH_SEPARATOR;
                current += PWD.$$() + File.separator + fileName + File.separator + "classes" + File.separator + "*"
                        + CLASS_PATH_SEPARATOR;

                String classPath = environment.get(CLASSPATH.name());

                if (classPath == null)
                    classPath = "";
                else if (!classPath.startsWith(CLASS_PATH_SEPARATOR))
                    classPath += CLASS_PATH_SEPARATOR;

                classPath += current;

                LOG.info("adding to cluster side classpath: {} ", classPath);

                environment.put(CLASSPATH.name(), classPath);
            }
        }

        localResources.put(fileName, resource);
    }

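    /**
     * Derives a mock MapReduce TaskAttemptID from the Tez ProcessorContext and
     * stores the MapReduce task attempt id, task id, is-map flag, and partition
     * into the given configuration.
     */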
    public static void setMRProperties(ProcessorContext context, Configuration config, boolean isMapperOutput) {
        TaskAttemptID taskAttemptId = org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl
                .createMockTaskAttemptID(context.getApplicationId().getClusterTimestamp(),
                        context.getTaskVertexIndex(), context.getApplicationId().getId(), context.getTaskIndex(),
                        context.getTaskAttemptNumber(), isMapperOutput);

        config.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
        config.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
        config.setBoolean(JobContext.TASK_ISMAP, isMapperOutput);
        config.setInt(JobContext.TASK_PARTITION, taskAttemptId.getTaskID().getId());
    }
}
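
Example

A minimal usage sketch for the configuration helpers above. The class name TezUtilExample and the queue value are illustrative, not part of the class; it assumes the Cascading Tez, Hadoop, and Tez jars are on the classpath.

import java.util.Properties;

import cascading.flow.tez.util.TezUtil;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.tez.dag.api.TezConfiguration;

public class TezUtilExample {
    public static void main(String[] args) {
        // Hypothetical properties; in a real flow these typically come from
        // the FlowConnector properties map.
        Properties properties = new Properties();
        properties.setProperty("tez.queue.name", "default");

        // Copy the String properties onto a fresh TezConfiguration.
        TezConfiguration tezConf = TezUtil.createTezConf(properties, null);

        // Wrap the result for APIs that still expect a JobConf.
        JobConf jobConf = TezUtil.asJobConf(tezConf);
        System.out.println(jobConf.get("tez.queue.name")); // prints "default"

        // Resolve the current user; IOException surfaces as CascadingException.
        UserGroupInformation user = TezUtil.getCurrentUser();
        System.out.println(user.getShortUserName());
    }
}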