gobblin.yarn.YarnService.java Source code

Java tutorial

Introduction

Here is the source code for gobblin.yarn.YarnService.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.yarn;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.util.Records;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
import com.google.common.eventbus.EventBus;
import com.google.common.eventbus.Subscribe;
import com.google.common.io.Closer;
import com.google.common.util.concurrent.AbstractIdleService;

import com.typesafe.config.Config;

import gobblin.configuration.ConfigurationKeys;

import gobblin.cluster.GobblinClusterConfigurationKeys;
import gobblin.cluster.GobblinClusterMetricTagNames;
import gobblin.cluster.GobblinClusterUtils;
import gobblin.cluster.HelixUtils;
import gobblin.metrics.GobblinMetrics;
import gobblin.metrics.Tag;
import gobblin.metrics.event.EventSubmitter;
import gobblin.util.ConfigUtils;
import gobblin.util.ExecutorsUtils;
import gobblin.util.JvmUtils;
import gobblin.cluster.event.ClusterManagerShutdownRequest;
import gobblin.yarn.event.ContainerShutdownRequest;
import gobblin.yarn.event.NewContainerRequest;

/**
 * This class is responsible for all Yarn-related stuffs including ApplicationMaster registration,
 * ApplicationMaster un-registration, Yarn container management, etc.
 *
 * @author Yinan Li
 */
public class YarnService extends AbstractIdleService {

    private static final Logger LOGGER = LoggerFactory.getLogger(YarnService.class);

    private static final Splitter SPLITTER = Splitter.on(',').omitEmptyStrings().trimResults();

    private final String applicationName;
    private final String applicationId;

    private final Config config;

    private final EventBus eventBus;

    private final Configuration yarnConfiguration;
    private final FileSystem fs;

    private final Optional<GobblinMetrics> gobblinMetrics;
    private final Optional<EventSubmitter> eventSubmitter;

    private final AMRMClientAsync<AMRMClient.ContainerRequest> amrmClientAsync;
    private final NMClientAsync nmClientAsync;
    private final ExecutorService containerLaunchExecutor;

    private final int initialContainers;
    private final int requestedContainerMemoryMbs;
    private final int requestedContainerCores;
    private final boolean containerHostAffinityEnabled;

    private final int helixInstanceMaxRetries;

    private final Optional<String> containerJvmArgs;

    private volatile Optional<Resource> maxResourceCapacity = Optional.absent();

    // Security tokens for accessing HDFS
    private final ByteBuffer tokens;

    private final Closer closer = Closer.create();

    private final Object allContainersStopped = new Object();

    // A map from container IDs to pairs of Container instances and Helix participant IDs of the containers
    private final ConcurrentMap<ContainerId, Map.Entry<Container, String>> containerMap = Maps.newConcurrentMap();

    // A generator for an integer ID of a Helix instance (participant)
    private final AtomicInteger helixInstanceIdGenerator = new AtomicInteger(0);

    // A map from Helix instance names to the number times the instances are retried to be started
    private final ConcurrentMap<String, AtomicInteger> helixInstanceRetryCount = Maps.newConcurrentMap();

    // A queue of unused Helix instance names. An unused Helix instance name gets put
    // into the queue if the container running the instance completes. Unused Helix
    // instance names get picked up when replacement containers get allocated.
    private final ConcurrentLinkedQueue<String> unusedHelixInstanceNames = Queues.newConcurrentLinkedQueue();

    private volatile boolean shutdownInProgress = false;

    public YarnService(Config config, String applicationName, String applicationId,
            YarnConfiguration yarnConfiguration, FileSystem fs, EventBus eventBus) throws Exception {
        this.applicationName = applicationName;
        this.applicationId = applicationId;

        this.config = config;

        this.eventBus = eventBus;

        this.gobblinMetrics = config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY)
                ? Optional.of(buildGobblinMetrics())
                : Optional.<GobblinMetrics>absent();

        this.eventSubmitter = config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY)
                ? Optional.of(buildEventSubmitter())
                : Optional.<EventSubmitter>absent();

        this.yarnConfiguration = yarnConfiguration;
        this.fs = fs;

        this.amrmClientAsync = closer
                .register(AMRMClientAsync.createAMRMClientAsync(1000, new AMRMClientCallbackHandler()));
        this.amrmClientAsync.init(this.yarnConfiguration);
        this.nmClientAsync = closer.register(NMClientAsync.createNMClientAsync(new NMClientCallbackHandler()));
        this.nmClientAsync.init(this.yarnConfiguration);

        this.initialContainers = config.getInt(GobblinYarnConfigurationKeys.INITIAL_CONTAINERS_KEY);
        this.requestedContainerMemoryMbs = config.getInt(GobblinYarnConfigurationKeys.CONTAINER_MEMORY_MBS_KEY);
        this.requestedContainerCores = config.getInt(GobblinYarnConfigurationKeys.CONTAINER_CORES_KEY);
        this.containerHostAffinityEnabled = config
                .getBoolean(GobblinYarnConfigurationKeys.CONTAINER_HOST_AFFINITY_ENABLED);

        this.helixInstanceMaxRetries = config.getInt(GobblinYarnConfigurationKeys.HELIX_INSTANCE_MAX_RETRIES);

        this.containerJvmArgs = config.hasPath(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY)
                ? Optional.of(config.getString(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY))
                : Optional.<String>absent();

        this.containerLaunchExecutor = Executors.newFixedThreadPool(10,
                ExecutorsUtils.newThreadFactory(Optional.of(LOGGER), Optional.of("ContainerLaunchExecutor")));

        this.tokens = getSecurityTokens();
    }

    @SuppressWarnings("unused")
    @Subscribe
    public void handleNewContainerRequest(NewContainerRequest newContainerRequest) {
        if (!this.maxResourceCapacity.isPresent()) {
            LOGGER.error(String.format(
                    "Unable to handle new container request as maximum resource capacity is not available: "
                            + "[memory (MBs) requested = %d, vcores requested = %d]",
                    this.requestedContainerMemoryMbs, this.requestedContainerCores));
            return;
        }

        requestContainer(newContainerRequest.getReplacedContainer().transform(new Function<Container, String>() {

            @Override
            public String apply(Container container) {
                return container.getNodeId().getHost();
            }
        }));
    }

    @SuppressWarnings("unused")
    @Subscribe
    public void handleContainerShutdownRequest(ContainerShutdownRequest containerShutdownRequest) {
        for (Container container : containerShutdownRequest.getContainers()) {
            LOGGER.info(
                    String.format("Stopping container %s running on %s", container.getId(), container.getNodeId()));
            this.nmClientAsync.stopContainerAsync(container.getId(), container.getNodeId());
        }
    }

    @Override
    protected void startUp() throws Exception {
        LOGGER.info("Starting the YarnService");

        // Register itself with the EventBus for container-related requests
        this.eventBus.register(this);

        this.amrmClientAsync.start();
        this.nmClientAsync.start();

        // The ApplicationMaster registration response is used to determine the maximum resource capacity of the cluster
        RegisterApplicationMasterResponse response = this.amrmClientAsync
                .registerApplicationMaster(GobblinClusterUtils.getHostname(), -1, "");
        LOGGER.info("ApplicationMaster registration response: " + response);
        this.maxResourceCapacity = Optional.of(response.getMaximumResourceCapability());

        LOGGER.info("Requesting initial containers");
        requestInitialContainers(this.initialContainers);
    }

    @Override
    protected void shutDown() throws IOException {
        LOGGER.info("Stopping the YarnService");

        this.shutdownInProgress = true;

        try {
            ExecutorsUtils.shutdownExecutorService(this.containerLaunchExecutor, Optional.of(LOGGER));

            // Stop the running containers
            for (Map.Entry<Container, String> entry : this.containerMap.values()) {
                LOGGER.info(String.format("Stopping container %s running participant %s", entry.getKey().getId(),
                        entry.getValue()));
                this.nmClientAsync.stopContainerAsync(entry.getKey().getId(), entry.getKey().getNodeId());
            }

            if (!this.containerMap.isEmpty()) {
                synchronized (this.allContainersStopped) {
                    try {
                        // Wait 5 minutes for the containers to stop
                        this.allContainersStopped.wait(5 * 60 * 1000);
                        LOGGER.info("All of the containers have been stopped");
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                    }
                }
            }

            this.amrmClientAsync.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
        } catch (IOException | YarnException e) {
            LOGGER.error("Failed to unregister the ApplicationMaster", e);
        } finally {
            try {
                this.closer.close();
            } finally {
                if (this.gobblinMetrics.isPresent()) {
                    this.gobblinMetrics.get().stopMetricsReporting();
                }
            }
        }
    }

    private GobblinMetrics buildGobblinMetrics() {
        // Create tags list
        ImmutableList.Builder<Tag<?>> tags = new ImmutableList.Builder<>();
        tags.add(new Tag<>(GobblinClusterMetricTagNames.APPLICATION_ID, this.applicationId));
        tags.add(new Tag<>(GobblinClusterMetricTagNames.APPLICATION_NAME, this.applicationName));

        // Intialize Gobblin metrics and start reporters
        GobblinMetrics gobblinMetrics = GobblinMetrics.get(this.applicationId, null, tags.build());
        gobblinMetrics.startMetricReporting(ConfigUtils.configToProperties(config));

        return gobblinMetrics;
    }

    private EventSubmitter buildEventSubmitter() {
        return new EventSubmitter.Builder(this.gobblinMetrics.get().getMetricContext(),
                GobblinYarnEventConstants.EVENT_NAMESPACE).build();
    }

    private void requestInitialContainers(int containersRequested) {
        for (int i = 0; i < containersRequested; i++) {
            requestContainer(Optional.<String>absent());
        }
    }

    private void requestContainer(Optional<String> preferredNode) {
        Priority priority = Records.newRecord(Priority.class);
        priority.setPriority(0);

        Resource capability = Records.newRecord(Resource.class);
        int maxMemoryCapacity = this.maxResourceCapacity.get().getMemory();
        capability
                .setMemory(this.requestedContainerMemoryMbs <= maxMemoryCapacity ? this.requestedContainerMemoryMbs
                        : maxMemoryCapacity);
        int maxCoreCapacity = this.maxResourceCapacity.get().getVirtualCores();
        capability.setVirtualCores(
                this.requestedContainerCores <= maxCoreCapacity ? this.requestedContainerCores : maxCoreCapacity);

        String[] preferredNodes = preferredNode.isPresent() ? new String[] { preferredNode.get() } : null;
        this.amrmClientAsync
                .addContainerRequest(new AMRMClient.ContainerRequest(capability, preferredNodes, null, priority));
    }

    private ContainerLaunchContext newContainerLaunchContext(Container container, String helixInstanceName)
            throws IOException {
        Path appWorkDir = GobblinClusterUtils.getAppWorkDirPath(this.fs, this.applicationName, this.applicationId);
        Path containerWorkDir = new Path(appWorkDir, GobblinYarnConfigurationKeys.CONTAINER_WORK_DIR_NAME);

        Map<String, LocalResource> resourceMap = Maps.newHashMap();

        addContainerLocalResources(new Path(appWorkDir, GobblinYarnConfigurationKeys.LIB_JARS_DIR_NAME),
                resourceMap);
        addContainerLocalResources(new Path(containerWorkDir, GobblinYarnConfigurationKeys.APP_JARS_DIR_NAME),
                resourceMap);
        addContainerLocalResources(new Path(containerWorkDir, GobblinYarnConfigurationKeys.APP_FILES_DIR_NAME),
                resourceMap);

        if (this.config.hasPath(GobblinYarnConfigurationKeys.CONTAINER_FILES_REMOTE_KEY)) {
            addRemoteAppFiles(this.config.getString(GobblinYarnConfigurationKeys.CONTAINER_FILES_REMOTE_KEY),
                    resourceMap);
        }

        ContainerLaunchContext containerLaunchContext = Records.newRecord(ContainerLaunchContext.class);
        containerLaunchContext.setLocalResources(resourceMap);
        containerLaunchContext.setEnvironment(YarnHelixUtils.getEnvironmentVariables(this.yarnConfiguration));
        containerLaunchContext.setCommands(Lists.newArrayList(buildContainerCommand(container, helixInstanceName)));

        if (UserGroupInformation.isSecurityEnabled()) {
            containerLaunchContext.setTokens(this.tokens.duplicate());
        }

        return containerLaunchContext;
    }

    private void addContainerLocalResources(Path destDir, Map<String, LocalResource> resourceMap)
            throws IOException {
        if (!this.fs.exists(destDir)) {
            LOGGER.warn(String.format("Path %s does not exist so no container LocalResource to add", destDir));
            return;
        }

        FileStatus[] statuses = this.fs.listStatus(destDir);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                YarnHelixUtils.addFileAsLocalResource(this.fs, status.getPath(), LocalResourceType.FILE,
                        resourceMap);
            }
        }
    }

    private void addRemoteAppFiles(String hdfsFileList, Map<String, LocalResource> resourceMap) throws IOException {
        for (String hdfsFilePath : SPLITTER.split(hdfsFileList)) {
            Path srcFilePath = new Path(hdfsFilePath);
            YarnHelixUtils.addFileAsLocalResource(srcFilePath.getFileSystem(this.yarnConfiguration), srcFilePath,
                    LocalResourceType.FILE, resourceMap);
        }
    }

    private ByteBuffer getSecurityTokens() throws IOException {
        Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
        Closer closer = Closer.create();
        try {
            DataOutputBuffer dataOutputBuffer = closer.register(new DataOutputBuffer());
            credentials.writeTokenStorageToStream(dataOutputBuffer);

            // Remove the AM->RM token so that containers cannot access it
            Iterator<Token<?>> tokenIterator = credentials.getAllTokens().iterator();
            while (tokenIterator.hasNext()) {
                Token<?> token = tokenIterator.next();
                if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
                    tokenIterator.remove();
                }
            }

            return ByteBuffer.wrap(dataOutputBuffer.getData(), 0, dataOutputBuffer.getLength());
        } catch (Throwable t) {
            throw closer.rethrow(t);
        } finally {
            closer.close();
        }
    }

    private String buildContainerCommand(Container container, String helixInstanceName) {
        String containerProcessName = GobblinYarnTaskRunner.class.getSimpleName();
        return new StringBuilder().append(ApplicationConstants.Environment.JAVA_HOME.$()).append("/bin/java")
                .append(" -Xmx").append(container.getResource().getMemory()).append("M").append(" ")
                .append(JvmUtils.formatJvmArguments(this.containerJvmArgs)).append(" ")
                .append(GobblinYarnTaskRunner.class.getName()).append(" --")
                .append(GobblinClusterConfigurationKeys.APPLICATION_NAME_OPTION_NAME).append(" ")
                .append(this.applicationName).append(" --")
                .append(GobblinClusterConfigurationKeys.HELIX_INSTANCE_NAME_OPTION_NAME).append(" ")
                .append(helixInstanceName).append(" 1>").append(ApplicationConstants.LOG_DIR_EXPANSION_VAR)
                .append(File.separator).append(containerProcessName).append(".").append(ApplicationConstants.STDOUT)
                .append(" 2>").append(ApplicationConstants.LOG_DIR_EXPANSION_VAR).append(File.separator)
                .append(containerProcessName).append(".").append(ApplicationConstants.STDERR).toString();
    }

    /**
     * Check the exit status of a completed container and see if the replacement container
     * should try to be started on the same node. Some exit status indicates a disk or
     * node failure and in such cases the replacement container should try to be started on
     * a different node.
     */
    private boolean shouldStickToTheSameNode(int containerExitStatus) {
        switch (containerExitStatus) {
        case ContainerExitStatus.DISKS_FAILED:
            return false;
        case ContainerExitStatus.ABORTED:
            // Mostly likely this exit status is due to node failures because the
            // application itself will not release containers.
            return false;
        default:
            // Stick to the same node for other cases if host affinity is enabled.
            return this.containerHostAffinityEnabled;
        }
    }

    /**
     * Handle the completion of a container. A new container will be requested to replace the one
     * that just exited. Depending on the exit status and if container host affinity is enabled,
     * the new container may or may not try to be started on the same node.
     *
     * A container completes in either of the following conditions: 1) some error happens in the
     * container and caused the container to exit, 2) the container gets killed due to some reason,
     * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets
     * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
     * A replacement container is needed in all but the last case.
     */
    private void handleContainerCompletion(ContainerStatus containerStatus) {
        Map.Entry<Container, String> completedContainerEntry = this.containerMap
                .remove(containerStatus.getContainerId());
        String completedInstanceName = completedContainerEntry.getValue();

        LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
                containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus()));

        if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
            LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
                    containerStatus.getContainerId(), containerStatus.getDiagnostics()));
        }

        if (this.shutdownInProgress) {
            return;
        }

        int retryCount = this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0))
                .incrementAndGet();

        // Populate event metadata
        Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
        if (this.eventSubmitter.isPresent()) {
            eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
            eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID,
                    completedInstanceName);
            eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
                    retryCount + "");
        }

        if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
            if (this.eventSubmitter.isPresent()) {
                this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                        eventMetadataBuilder.get().build());
            }

            LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
            return;
        }

        // Add the Helix instance name of the completed container to the queue of unused
        // instance names so they can be reused by a replacement container.
        this.unusedHelixInstanceNames.offer(completedInstanceName);

        if (this.eventSubmitter.isPresent()) {
            this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                    eventMetadataBuilder.get().build());
        }

        LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s",
                containerStatus.getContainerId(), completedInstanceName));
        this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus())
                ? Optional.of(completedContainerEntry.getKey())
                : Optional.<Container>absent()));
    }

    private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(
            ContainerStatus containerStatus) {
        ImmutableMap.Builder<String, String> eventMetadataBuilder = new ImmutableMap.Builder<>();
        eventMetadataBuilder.put(GobblinYarnMetricTagNames.CONTAINER_ID,
                containerStatus.getContainerId().toString());
        eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE,
                containerStatus.getState().toString());
        if (ContainerExitStatus.INVALID != containerStatus.getExitStatus()) {
            eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
                    containerStatus.getExitStatus() + "");
        }
        if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
            eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS,
                    containerStatus.getDiagnostics());
        }

        return eventMetadataBuilder;
    }

    /**
     * A custom implementation of {@link AMRMClientAsync.CallbackHandler}.
     */
    private class AMRMClientCallbackHandler implements AMRMClientAsync.CallbackHandler {

        private volatile boolean done = false;

        @Override
        public void onContainersCompleted(List<ContainerStatus> statuses) {
            for (ContainerStatus containerStatus : statuses) {
                handleContainerCompletion(containerStatus);
            }
        }

        @Override
        public void onContainersAllocated(List<Container> containers) {
            for (final Container container : containers) {
                if (eventSubmitter.isPresent()) {
                    eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_ALLOCATION,
                            GobblinYarnMetricTagNames.CONTAINER_ID, container.getId().toString());
                }

                LOGGER.info(String.format("Container %s has been allocated", container.getId()));

                String instanceName = unusedHelixInstanceNames.poll();
                if (Strings.isNullOrEmpty(instanceName)) {
                    // No unused instance name, so generating a new one.
                    instanceName = HelixUtils.getHelixInstanceName(GobblinYarnTaskRunner.class.getSimpleName(),
                            helixInstanceIdGenerator.incrementAndGet());
                }

                final String finalInstanceName = instanceName;
                containerMap.put(container.getId(),
                        new AbstractMap.SimpleImmutableEntry<>(container, finalInstanceName));

                containerLaunchExecutor.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            LOGGER.info("Starting container " + container.getId());

                            nmClientAsync.startContainerAsync(container,
                                    newContainerLaunchContext(container, finalInstanceName));
                        } catch (IOException ioe) {
                            LOGGER.error("Failed to start container " + container.getId(), ioe);
                        }
                    }
                });
            }
        }

        @Override
        public void onShutdownRequest() {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.SHUTDOWN_REQUEST);
            }

            LOGGER.info("Received shutdown request from the ResourceManager");
            this.done = true;
            eventBus.post(new ClusterManagerShutdownRequest());
        }

        @Override
        public void onNodesUpdated(List<NodeReport> updatedNodes) {
            for (NodeReport nodeReport : updatedNodes) {
                LOGGER.info("Received node update report: " + nodeReport);
            }
        }

        @Override
        public float getProgress() {
            return this.done ? 1.0f : 0.0f;
        }

        @Override
        public void onError(Throwable t) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.ERROR,
                        GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION,
                        Throwables.getStackTraceAsString(t));
            }

            LOGGER.error("Received error: " + t, t);
            this.done = true;
            eventBus.post(new ClusterManagerShutdownRequest());
        }
    }

    /**
     * A custom implementation of {@link NMClientAsync.CallbackHandler}.
     */
    private class NMClientCallbackHandler implements NMClientAsync.CallbackHandler {

        @Override
        public void onContainerStarted(ContainerId containerId, Map<String, ByteBuffer> allServiceResponse) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STARTED,
                        GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString());
            }

            LOGGER.info(String.format("Container %s has been started", containerId));
        }

        @Override
        public void onContainerStatusReceived(ContainerId containerId, ContainerStatus containerStatus) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STATUS_RECEIVED,
                        buildContainerStatusEventMetadata(containerStatus).build());
            }

            LOGGER.info(
                    String.format("Received container status for container %s: %s", containerId, containerStatus));
        }

        @Override
        public void onContainerStopped(ContainerId containerId) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STOPPED,
                        GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString());
            }

            LOGGER.info(String.format("Container %s has been stopped", containerId));
            containerMap.remove(containerId);
            if (containerMap.isEmpty()) {
                synchronized (allContainersStopped) {
                    allContainersStopped.notify();
                }
            }
        }

        @Override
        public void onStartContainerError(ContainerId containerId, Throwable t) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_START_ERROR,
                        GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString(),
                        GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION,
                        Throwables.getStackTraceAsString(t));
            }

            LOGGER.error(String.format("Failed to start container %s due to error %s", containerId, t));
            containerMap.remove(containerId);
        }

        @Override
        public void onGetContainerStatusError(ContainerId containerId, Throwable t) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_GET_STATUS_ERROR,
                        GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString(),
                        GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION,
                        Throwables.getStackTraceAsString(t));
            }

            LOGGER.error(String.format("Failed to get status for container %s due to error %s", containerId, t));
        }

        @Override
        public void onStopContainerError(ContainerId containerId, Throwable t) {
            if (eventSubmitter.isPresent()) {
                eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STOP_ERROR,
                        GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString(),
                        GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION,
                        Throwables.getStackTraceAsString(t));
            }

            LOGGER.error(String.format("Failed to stop container %s due to error %s", containerId, t));
        }
    }
}