com.ibm.bi.dml.yarn.ropt.YarnClusterAnalyzer.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.yarn.ropt.YarnClusterAnalyzer.java, a central place for analyzing and obtaining static infrastructure properties such as memory and the number of logical processors, used by the DML YARN resource optimizer.

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.yarn.ropt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;

import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.matrix.mapred.MRConfigurationNames;

/**
 * Central place for analyzing and obtaining static infrastructure properties
 * such as memory and number of logical processors.
 */
public class YarnClusterAnalyzer {

    public static final long DEFAULT_JVM_SIZE = 512 * 1024 * 1024; //fallback JVM heap size if no -Xmx is found
    public static final int CPU_HYPER_FACTOR = 1; //factor >1 would allow over-subscription of physical cores

    //static local master node properties
    public static int _localPar = -1;
    public static long _localJVMMaxMem = -1;

    //default hadoop cluster properties
    public static int _remotePar = -1;
    //public static int  _remoteParMap    = -1;
    //public static int  _remoteParReduce = -1;
    public static long _remoteJVMMaxMemMap = -1;
    public static long _remoteJVMMaxMemReduce = -1;
    public static long _remoteMRSortMem = -1;
    public static boolean _localJT = false;
    public static long _blocksize = -1;

    // Map from StatementBlock.ID to remote JVM max heap size (in bytes)
    // Encodes per-job MR memory settings from the execution plan; if a job is not found here, the default MR settings (_remoteJVMMaxMemMap/_remoteJVMMaxMemReduce) apply
    public static HashMap<Long, Long> remoteJVMMaxMemPlan = new HashMap<Long, Long>();
    public static HashSet<Long> probedSb = new HashSet<Long>();

    public static List<Long> nodesMaxPhySorted = null; // Maximum physical memory per node in bytes, sorted in descending order
    public static List<Double> nodesMaxBudgetSorted = null; // Maximum memory budget per node in bytes, derived from the physical sizes, same order
    public static int minimumMRContainerPhyMB = -1; // Suggested minimum physical memory per MR container in MB
    public static long mrAMPhy = -1; // Default physical memory size of the MR AppMaster in bytes

    public static long clusterTotalMem = -1;
    public static int clusterTotalNodes = -1;
    public static int clusterTotalCores = -1;
    public static long minimalPhyAllocate = -1;
    public static long maximumPhyAllocate = -1;

    //client for resource utilization updates
    private static YarnClient _client = null;

    //static initialization, called for each JVM (on each node)
    static {
        //analyze local node properties
        analyzeLocalMachine();

        //analyze remote Hadoop cluster properties
        //analyzeYarnCluster(true); //note: due to overhead - analyze on-demand
    }
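    ///////
    //lazily initialized getters (each triggers an on-demand cluster analysis)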

    public static List<Long> getNodesMaxPhySorted() {
        if (nodesMaxPhySorted == null)
            analyzeYarnCluster(true);
        return nodesMaxPhySorted;
    }

    public static List<Double> getNodesMaxBudgetSorted() {
        if (nodesMaxBudgetSorted == null)
            analyzeYarnCluster(true);
        return nodesMaxBudgetSorted;
    }

    public static long getMRAMPhy() {
        if (mrAMPhy == -1)
            analyzeYarnCluster(true);
        return mrAMPhy;
    }

    public static long getClusterTotalMem() {
        if (clusterTotalMem == -1)
            analyzeYarnCluster(true);
        return clusterTotalMem;
    }

    public static long getMaxPhyAllocate() {
        if (maximumPhyAllocate == -1)
            analyzeYarnCluster(true);
        return maximumPhyAllocate;
    }

    public static int getMinMRContainerPhyMB() {
        if (minimumMRContainerPhyMB == -1)
            analyzeYarnCluster(true);
        return minimumMRContainerPhyMB;
    }

    ///////
    //methods for obtaining parallelism properties

    /**
     * Gets the number of logical processors of the current node,
     * including hyper-threading if enabled.
     * 
     * @return number of logical processors
     */
    public static int getLocalParallelism() {
        return _localPar;
    }

    /**
     * Gets the number of cluster nodes (number of tasktrackers). If multiple
     * tasktrackers are started per node, each tasktracker is viewed as an
     * individual node.
     * 
     * @return number of cluster nodes
     */
    public static int getRemoteParallelNodes() {
        if (_remotePar == -1)
            analyzeYarnCluster(true);

        return _remotePar;
    }

    /**
     * Gets the total number of available map slots.
     * 
     * @param jobLookupId job lookup id for plan-specific memory settings
     * @return number of available map slots
     */
    public static int getRemoteParallelMapTasks(long jobLookupId) {
        if (clusterTotalCores == -1)
            analyzeYarnCluster(true);
        int ret = getRemoteParallelTasksGivenMem(getRemoteMaxMemoryMap(jobLookupId));
        //System.out.print("  jvm size " + OptimizerUtils.toMB(getRemoteMaxMemory(jobLookupId)) + " -> " + ret + " map tasks");
        if (ret >= clusterTotalCores * CPU_HYPER_FACTOR)
            ret = clusterTotalCores * CPU_HYPER_FACTOR;

        //System.out.println(jobLookupId + " got " + ret + " jvm = " + OptimizerUtils.toMB(getRemoteMaxMemoryMap(jobLookupId)));
        return ret;
    }

    /**
     * Gets the total number of available reduce slots.
     * 
     * @param jobLookupId job lookup id for plan-specific memory settings
     * @return number of available reduce slots
     */
    public static int getRemoteParallelReduceTasks(long jobLookupId) {
        if (clusterTotalCores == -1)
            analyzeYarnCluster(true);
        int ret = getRemoteParallelTasksGivenMem(getRemoteMaxMemoryReduce(jobLookupId));
        if (ret >= clusterTotalCores * CPU_HYPER_FACTOR)
            ret = clusterTotalCores * CPU_HYPER_FACTOR;
        return ret;
    }

    /**
     * Rounds a physical memory request up to the next multiple of the
     * scheduler's minimum allocation; the rounded value is capped at the
     * maximum allocation, and requests above the maximum throw a RuntimeException.
     */
    public static long getYarnPhyAllocate(long requestPhy) {
        if (minimalPhyAllocate == -1)
            analyzeYarnCluster(true);
        if (requestPhy > maximumPhyAllocate)
            throw new RuntimeException("Requested " + OptimizerUtils.toMB(requestPhy)
                    + "MB, while the maximum yarn allocation is " + OptimizerUtils.toMB(maximumPhyAllocate) + "MB");

        long ret = (long) Math.ceil((double) requestPhy / minimalPhyAllocate);
        ret = ret * minimalPhyAllocate;
        if (ret > maximumPhyAllocate)
            ret = maximumPhyAllocate;
        return ret;
    }

    /**
     * Gets the total number of parallel tasks that fit into the cluster,
     * given the maximum JVM memory size per task.
     * 
     * @param remoteTaskJvmMemory max JVM heap size per task in bytes
     * @return number of parallel tasks, or -1 if the memory plan does not fit
     */
    public static int getRemoteParallelTasksGivenMem(long remoteTaskJvmMemory) {
        long taskPhy = getYarnPhyAllocate(ResourceOptimizer.jvmToPhy(remoteTaskJvmMemory, false));
        long cpPhy = getYarnPhyAllocate(ResourceOptimizer.jvmToPhy(getLocalMaxMemory(), false));
        long mrAMPhy = getYarnPhyAllocate(getMRAMPhy());

        if (nodesMaxPhySorted == null)
            analyzeYarnCluster(true);

        if (nodesMaxPhySorted.isEmpty())
            return -1;
        if (nodesMaxPhySorted.size() == 1) {
            long tmp = nodesMaxPhySorted.get(0) - cpPhy - mrAMPhy;
            if (tmp < 0)
                return -1;
            return (int) (tmp / taskPhy);
        }
        // At least have two nodes
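        // Reserve the CP (client) JVM on the largest node; the MR AppMaster is
        // placed on whichever of the two largest nodes has more remaining memory.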
        long first = nodesMaxPhySorted.get(0) - cpPhy;
        long second = nodesMaxPhySorted.get(1);

        if (first >= second)
            first -= mrAMPhy;
        else
            second -= mrAMPhy;
        if (first < 0 || second < 0)
            return -1;
        long taskCount = first / taskPhy + second / taskPhy;
        int counter = 0;
        for (Long node : nodesMaxPhySorted) {
            if (counter++ < 2)
                continue; // skip first two nodes
            taskCount += node / taskPhy;
        }

        //System.out.println(OptimizerUtils.toMB(cpPhy) + " " + OptimizerUtils.toMB(mrAMPhy) + " " + OptimizerUtils.toMB(taskPhy) + " " + OptimizerUtils.toMB(nodesMaxPhySorted.get(1)));
        return (int) taskCount;
    }

    /**
     * Checks whether the current memory plan is feasible: without MR jobs, the
     * largest node must fit the CP allocation; with MR jobs, at least one task
     * container must be schedulable in addition to the CP and AppMaster allocations.
     */
    public static boolean checkValidMemPlan(boolean hasMRJob) {
        if (nodesMaxPhySorted == null)
            analyzeYarnCluster(true);
        if (!hasMRJob)
            return nodesMaxPhySorted
                    .get(0) >= getYarnPhyAllocate(ResourceOptimizer.jvmToPhy(getLocalMaxMemory(), false));
        return getRemoteParallelTasksGivenMem(getMaximumRemoteMaxMemory()) > 0;
    }

    ///////
    //methods for obtaining memory properties

    /**
     * Gets the maximum memory [in bytes] of the current JVM.
     * 
     * @return max memory of the current JVM in bytes
     */
    public static long getLocalMaxMemory() {
        return _localJVMMaxMem;
    }

    /**
     * Sets the maximum memory [in bytes] assumed for the current JVM.
     * 
     * @param localMem max memory in bytes
     */
    public static void setLocalMaxMemory(long localMem) {
        _localJVMMaxMem = localMem;
    }

    /**
     * Gets the maximum memory [in bytes] across all given hadoop task memory
     * settings (map, reduce, and plan-specific).
     * 
     * @return max task memory in bytes
     */
    public static long getMaximumRemoteMaxMemory() {
        if (_remoteJVMMaxMemMap == -1)
            analyzeYarnCluster(true);

        long ret = (_remoteJVMMaxMemMap > _remoteJVMMaxMemReduce) ? _remoteJVMMaxMemMap : _remoteJVMMaxMemReduce;
        for (Map.Entry<Long, Long> entry : remoteJVMMaxMemPlan.entrySet()) {
            if (ret < entry.getValue())
                ret = entry.getValue();
        }
        return ret;
    }

    /**
     * Gets the maximum memory [in bytes] of a hadoop map task JVM.
     * 
     * @param jobLookupId job lookup id for plan-specific memory settings
     * @return max map task memory in bytes
     */
    public static long getRemoteMaxMemoryMap(long jobLookupId) {
        if (_remoteJVMMaxMemMap == -1)
            analyzeYarnCluster(true);

        long ret = getSpecifiedRemoteMaxMemory(jobLookupId);
        if (ret == -1)
            ret = _remoteJVMMaxMemMap;
        return ret;
    }

    /**
     * Gets the maximum memory [in bytes] of a hadoop reduce task JVM.
     * 
     * @param jobLookupId job lookup id for plan-specific memory settings
     * @return max reduce task memory in bytes
     */
    public static long getRemoteMaxMemoryReduce(long jobLookupId) {
        if (_remoteJVMMaxMemReduce == -1)
            analyzeYarnCluster(true);

        long ret = getSpecifiedRemoteMaxMemory(jobLookupId);
        if (ret == -1)
            ret = _remoteJVMMaxMemReduce;
        return ret;
    }

    /**
     * Gets the plan-specified maximum memory [in bytes] of a hadoop task JVM
     * for the given job, falling back to the plan-wide default (key -1).
     * 
     * @param jobLookupId job lookup id
     * @return max task memory in bytes, or -1 if no setting is specified
     */
    public static long getSpecifiedRemoteMaxMemory(long jobLookupId) {
        probedSb.add(jobLookupId);

        // Look up specified MR job setting
        Long ret = remoteJVMMaxMemPlan.get(jobLookupId);
        if (ret != null)
            return ret;

        // Look up specified default MR setting
        ret = remoteJVMMaxMemPlan.get((long) -1);
        if (ret != null)
            return ret;

        // No specified setting found
        return -1;
    }

    /**
     * Installs a new per-statement-block memory plan, converting each memory
     * budget into an equivalent JVM heap size.
     * 
     * @param budgetMRPlan map from StatementBlock ID to memory budget in bytes
     */
    public static void setRemoteMaxMemPlan(HashMap<Long, Double> budgetMRPlan) {
        remoteJVMMaxMemPlan.clear();
        for (Map.Entry<Long, Double> entry : budgetMRPlan.entrySet()) {
            long mapJvm = ResourceOptimizer.budgetToJvm(entry.getValue());
            remoteJVMMaxMemPlan.put(entry.getKey(), mapJvm);
        }
    }

    public static void resetSBProbedSet() {
        probedSb.clear();
    }

    public static HashSet<Long> getSBProbedSet() {
        return probedSb;
    }

    public static void printProbedSet(String message) {
        ArrayList<Long> probed = new ArrayList<Long>(probedSb);
        Collections.sort(probed);
        System.out.print(message);
        for (Long id : probed)
            System.out.print(id + ",");
        System.out.println();
    }

    /**
     * Gets the maximum memory requirement [in bytes] of a given hadoop job.
     * 
     * @param conf
     * @return
     */
    /*public static long getRemoteMaxMemory( JobConf job )
    {
       return (1024*1024) * Math.max(
                       job.getMemoryForMapTask(),
                       job.getMemoryForReduceTask() );         
    }*/

    /**
     * Gets the maximum sort buffer memory requirement [in bytes] of a hadoop task.
     * 
     * @return sort buffer memory in bytes
     */
    public static long getRemoteMaxMemorySortBuffer() {
        if (_remoteMRSortMem == -1)
            analyzeYarnCluster(true);

        return _remoteMRSortMem;
    }

    ///////
    //methods for obtaining constraints or respective defaults

    /**
     * Gets the maximum local parallelism constraint.
     * 
     * @return max local parallelism
     */
    public static int getCkMaxCP() {
        //default value (if not specified)
        return getLocalParallelism();
    }

    /**
     * Gets the maximum remote parallelism constraint.
     * 
     * @param jobLookupId job lookup id for plan-specific memory settings
     * @return max remote parallelism
     */
    public static int getCkMaxMR(long jobLookupId) {
        //default value (if not specified)
        return getRemoteParallelMapTasks(jobLookupId);
    }

    /**
     * Gets the maximum memory constraint [in bytes].
     * 
     * @param jobLookupId job lookup id for plan-specific memory settings
     * @return max memory constraint in bytes
     */
    public static long getCmMax(long jobLookupId) {
        //default value (if not specified)
        return Math.min(getLocalMaxMemory(), getRemoteMaxMemoryMap(jobLookupId));
    }

    /**
     * Gets the HDFS blocksize of the used cluster in bytes.
     * 
     * @return HDFS blocksize in bytes
     */
    public static long getHDFSBlockSize() {
        if (_blocksize == -1)
            analyzeYarnCluster(true);

        return _blocksize;
    }

    /**
     * Extracts the maximum JVM heap size in bytes from a Java options string;
     * for example, "-Xmx2g" yields 2147483648. Falls back to DEFAULT_JVM_SIZE
     * if no valid -Xmx argument is found.
     * 
     * @param javaOpts Java options string
     * @return max heap size in bytes
     */
    public static long extractMaxMemoryOpt(String javaOpts) {
        long ret = -1; //mem in bytes

        try {
            StringTokenizer st = new StringTokenizer(javaOpts, " ");
            while (st.hasMoreTokens()) {
                String arg = st.nextToken();
                if (!arg.startsWith("-Xmx")) //search for max mem
                    continue;

                arg = arg.substring(4); //cut off "-Xmx"
                //parse number and unit
                if (arg.endsWith("g") || arg.endsWith("G"))
                    ret = Long.parseLong(arg.substring(0, arg.length() - 1)) * 1024 * 1024 * 1024;
                else if (arg.endsWith("m") || arg.endsWith("M"))
                    ret = Long.parseLong(arg.substring(0, arg.length() - 1)) * 1024 * 1024;
                else if (arg.endsWith("k") || arg.endsWith("K"))
                    ret = Long.parseLong(arg.substring(0, arg.length() - 1)) * 1024;
                else //no unit: value is a plain number of bytes
                    ret = Long.parseLong(arg);
            }

            if (ret < 0) // no argument found
            {
                ret = DEFAULT_JVM_SIZE;
            }
        } catch (Exception ex) {
            //if anything breaks during parsing (e.g., because args not specified correctly)
            ret = DEFAULT_JVM_SIZE;
        }

        return ret;
    }

    /**
     * Replaces the -Xmx argument in the given configuration property with the
     * specified size, rounded down to whole MBs.
     * 
     * @param job job configuration
     * @param key configuration property holding the Java options
     * @param bytes max heap size in bytes
     */
    public static void setMaxMemoryOpt(JobConf job, String key, long bytes) {
        String javaOptsOld = job.get(key);
        String javaOptsNew = null;

        //StringTokenizer st = new StringTokenizer( javaOptsOld, " " );
        String[] tokens = javaOptsOld.split(" "); //account also for no ' '
        StringBuilder sb = new StringBuilder();
        for (String arg : tokens) {
            if (arg.startsWith("-Xmx")) //search for max mem
            {
                sb.append("-Xmx");
                sb.append((bytes / (1024 * 1024)));
                sb.append("M");
            } else
                sb.append(arg);

            sb.append(" ");
        }
        javaOptsNew = sb.toString().trim();
        job.set(key, javaOptsNew);
    }

    ///////
    //internal methods for analysis

    /**
     * Analyzes properties of local machine and JVM.
     */
    private static void analyzeLocalMachine() {
        _localPar = Runtime.getRuntime().availableProcessors();
        _localJVMMaxMem = Runtime.getRuntime().maxMemory();
    }

    public static void analyzeYarnCluster(boolean verbose) {
        YarnConfiguration conf = new YarnConfiguration();
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();
        analyzeYarnCluster(yarnClient, conf, verbose);
    }

    public static long getMinAllocationBytes() {
        if (minimalPhyAllocate < 0)
            analyzeYarnCluster(false);
        return minimalPhyAllocate;
    }

    public static long getMaxAllocationBytes() {
        if (maximumPhyAllocate < 0)
            analyzeYarnCluster(false);
        return maximumPhyAllocate;
    }

    public static long getNumCores() {
        if (clusterTotalCores < 0)
            analyzeYarnCluster(false);
        return clusterTotalCores;
    }

    public static long getNumNodes() {
        if (clusterTotalNodes < 0)
            analyzeYarnCluster(false);
        return clusterTotalNodes;
    }

    public static YarnClusterConfig getClusterConfig() {
        YarnClusterConfig cc = new YarnClusterConfig();
        cc.setMinAllocationMB(getMinAllocationBytes() / (1024 * 1024));
        cc.setMaxAllocationMB(getMaxAllocationBytes() / (1024 * 1024));
        cc.setNumNodes(getNumNodes());
        cc.setNumCores(getNumCores() * CPU_HYPER_FACTOR);

        return cc;
    }

    /**
     * Gets the current cluster utilization as the maximum of memory and
     * vcore utilization, as a fraction in [0,1].
     * 
     * @return cluster utilization
     * @throws IOException if the node reports cannot be obtained
     */
    public static double getClusterUtilization() throws IOException {
        double util = 0;

        try {
            if (_client == null)
                _client = createYarnClient();
            List<NodeReport> nodesReport = _client.getNodeReports();

            double maxMem = 0;
            double currMem = 0;
            long maxCores = 0;
            long currCores = 0;
            for (NodeReport node : nodesReport) {
                Resource max = node.getCapability();
                Resource used = node.getUsed();
                maxMem += max.getMemory();
                currMem += used.getMemory();
                maxCores += max.getVirtualCores();
                currCores += used.getVirtualCores();
            }

            util = Math.max(Math.min(1, currMem / maxMem), //memory util
                    Math.min(1, (double) currCores / maxCores)); //vcore util    
        } catch (Exception ex) {
            throw new IOException(ex);
        }

        return util;
    }

    /**
     * Analyzes properties of the Yarn cluster and the Hadoop configuration.
     */
    public static void analyzeYarnCluster(YarnClient yarnClient, YarnConfiguration conf, boolean verbose) {
        try {
            List<NodeReport> nodesReport = yarnClient.getNodeReports();
            if (verbose)
                System.out.println("There are " + nodesReport.size() + " nodes in the cluster");
            if (nodesReport.isEmpty())
                throw new YarnException("There are zero available nodes in the yarn cluster");

            nodesMaxPhySorted = new ArrayList<Long>(nodesReport.size());
            clusterTotalMem = 0;
            clusterTotalCores = 0;
            clusterTotalNodes = 0;
            minimumMRContainerPhyMB = -1;
            for (NodeReport node : nodesReport) {
                Resource resource = node.getCapability();
                Resource used = node.getUsed();
                if (used == null)
                    used = Resource.newInstance(0, 0);
                int mb = resource.getMemory();
                int cores = resource.getVirtualCores();
                if (mb <= 0)
                    throw new YarnException("A node has non-positive memory " + mb);

                int myMinMRPhyMB = mb / cores / CPU_HYPER_FACTOR;
                if (minimumMRContainerPhyMB < myMinMRPhyMB)
                    minimumMRContainerPhyMB = myMinMRPhyMB; // minimumMRContainerPhyMB needs to be the largest among the mins

                clusterTotalMem += (long) mb * 1024 * 1024;
                nodesMaxPhySorted.add((long) mb * 1024 * 1024);
                clusterTotalCores += cores;
                clusterTotalNodes++;
                if (verbose)
                    System.out.println("\t" + node.getNodeId() + " has " + mb + " MB (" + used.getMemory()
                            + " MB used) memory and " + resource.getVirtualCores() + " (" + used.getVirtualCores()
                            + " used) cores");

            }
            Collections.sort(nodesMaxPhySorted, Collections.reverseOrder());

            nodesMaxBudgetSorted = new ArrayList<Double>(nodesMaxPhySorted.size());
            for (int i = 0; i < nodesMaxPhySorted.size(); i++)
                nodesMaxBudgetSorted.add(ResourceOptimizer.phyToBudget(nodesMaxPhySorted.get(i)));

            _remotePar = nodesReport.size();
            if (_remotePar == 0)
                throw new YarnException("There are no available nodes in the yarn cluster");

            // Now get the default cluster settings
            _remoteMRSortMem = (1024 * 1024) * conf.getLong("io.sort.mb", 100); //100MB

            //handle jvm max mem (map mem budget is relevant for map-side distcache and parfor)
            //(for robustness we probe both: child and map configuration parameters)
            String javaOpts1 = conf.get("mapred.child.java.opts"); //internally mapred/mapreduce synonym
            String javaOpts2 = conf.get("mapreduce.map.java.opts", null); //internally mapred/mapreduce synonym
            String javaOpts3 = conf.get("mapreduce.reduce.java.opts", null); //internally mapred/mapreduce synonym
            if (javaOpts2 != null) //specific value overrides generic
                _remoteJVMMaxMemMap = extractMaxMemoryOpt(javaOpts2);
            else
                _remoteJVMMaxMemMap = extractMaxMemoryOpt(javaOpts1);
            if (javaOpts3 != null) //specific value overrides generic
                _remoteJVMMaxMemReduce = extractMaxMemoryOpt(javaOpts3);
            else
                _remoteJVMMaxMemReduce = extractMaxMemoryOpt(javaOpts1);

            //HDFS blocksize
            String blocksize = conf.get(MRConfigurationNames.DFS_BLOCK_SIZE, "134217728");
            _blocksize = Long.parseLong(blocksize);

            minimalPhyAllocate = (long) 1024 * 1024
                    * conf.getInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB,
                            YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB);
            maximumPhyAllocate = (long) 1024 * 1024
                    * conf.getInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
                            YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB);
            mrAMPhy = (long) conf.getInt("yarn.app.mapreduce.am.resource.mb", 1536) * 1024 * 1024;

        } catch (Exception e) {
            throw new RuntimeException("Unable to analyze yarn cluster ", e);
        }

        /*
         * This is for the AppMaster to query available resources in the cluster during heartbeat 
         * 
        AMRMClient<ContainerRequest> rmClient = AMRMClient.createAMRMClient();
        rmClient.init(conf);
        rmClient.start();
        AllocateResponse response = rmClient.allocate(0);
        int nodeCount = response.getNumClusterNodes();
        Resource resource = response.getAvailableResources();
        List<NodeReport> nodeUpdate = response.getUpdatedNodes();
            
        LOG.info("This is a " + nodeCount + " node cluster with a total of " +
        resource.getMemory() + " memory and " + resource.getVirtualCores() + " cores");
        LOG.info(nodeUpdate.size() + " updatedNode reports received");
        for (NodeReport node : nodeUpdate) {
           resource = node.getCapability();
           LOG.info(node.getNodeId() + " updated with " + resource.getMemory() + " memory and " + resource.getVirtualCores() + " cores");
        }*/
    }

    /**
     * Creates and starts a YarnClient based on the default YarnConfiguration.
     * 
     * @return started YarnClient
     */
    private static YarnClient createYarnClient() {
        YarnConfiguration conf = new YarnConfiguration();
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();
        return yarnClient;
    }
}
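
Example

A minimal, hypothetical usage sketch (the class name YarnClusterAnalyzerExample and the option strings are illustrative only). It assumes the analyzer and the Hadoop/YARN client libraries are on the classpath and that a YARN ResourceManager is reachable; the cluster-probing getters would otherwise fail with a RuntimeException.

import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.yarn.ropt.YarnClusterAnalyzer;

public class YarnClusterAnalyzerExample {
    public static void main(String[] args) {
        // Local JVM properties are available immediately (set by the static initializer).
        System.out.println("Local parallelism: " + YarnClusterAnalyzer.getLocalParallelism());
        System.out.println("Local max heap (bytes): " + YarnClusterAnalyzer.getLocalMaxMemory());

        // Cluster properties trigger an on-demand Yarn analysis on first access.
        System.out.println("Cluster nodes: " + YarnClusterAnalyzer.getNumNodes());
        System.out.println("Cluster cores: " + YarnClusterAnalyzer.getNumCores());
        System.out.println("Cluster memory (bytes): " + YarnClusterAnalyzer.getClusterTotalMem());

        // -Xmx parsing works standalone, without a cluster.
        long xmx = YarnClusterAnalyzer.extractMaxMemoryOpt("-server -Xmx2g -Xms512m");
        System.out.println("Parsed -Xmx (bytes): " + xmx); // 2147483648

        // Rewriting the -Xmx setting of a job configuration.
        JobConf job = new JobConf();
        job.set("mapreduce.map.java.opts", "-server -Xmx1g");
        YarnClusterAnalyzer.setMaxMemoryOpt(job, "mapreduce.map.java.opts", 2048L * 1024 * 1024);
        System.out.println(job.get("mapreduce.map.java.opts")); // prints: -server -Xmx2048M
    }
}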