package org.apache.flink.yarn;

import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.BootstrapTools;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.resourcemanager.JobLeaderIdService;
import org.apache.flink.runtime.resourcemanager.ResourceManager;
import org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;
import org.apache.flink.util.ExceptionUtils;

import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;

/**
 * The YARN implementation of the resource manager. Used when the system is started
 * via the resource framework YARN.
 */
public class YarnResourceManager extends ResourceManager<ResourceID> implements AMRMClientAsync.CallbackHandler {
    /** Logger named after the runtime class of this instance. */
    protected final Logger LOG = LoggerFactory.getLogger(getClass());

    /** The process environment variables. */
    private final Map<String, String> ENV;

    /** The default registration timeout for task executor in seconds. */
    private static final int DEFAULT_TASK_MANAGER_REGISTRATION_DURATION = 300;

    /** The heartbeat interval while the resource master is waiting for containers. */
    private static final int FAST_YARN_HEARTBEAT_INTERVAL_MS = 500;

    /** The default heartbeat interval during regular operation. */
    private static final int DEFAULT_YARN_HEARTBEAT_INTERVAL_MS = 5000;

    /** The default memory of task executor to allocate (in MB). */
    private static final int DEFAULT_TSK_EXECUTOR_MEMORY_SIZE = 1024;

    /**
     * Environment variable name of the final container id used by the YarnResourceManager.
     * Container ID generation may vary across Hadoop versions.
     *
     * <p>NOTE(review): this constant was documented and referenced (when the task executor
     * environment is populated) but its declaration was absent. The value follows the naming
     * pattern of {@link #ENV_FLINK_NODE_ID} - confirm it against the code that reads the variable.
     */
    static final String ENV_FLINK_CONTAINER_ID = "_FLINK_CONTAINER_ID";

    /**
     * Environment variable name of the hostname given by YARN.
     * In the task executor the hostnames given by YARN are used consistently throughout akka.
     */
    static final String ENV_FLINK_NODE_ID = "_FLINK_NODE_ID";

    /** Heartbeat interval (in milliseconds) between this resource manager and the YARN ResourceManager. */
    private final int yarnHeartbeatIntervalMillis;

    /** The Flink configuration this resource manager was created with. */
    private final Configuration flinkConfig;

    /** The YARN configuration, created in the constructor. */
    private final YarnConfiguration yarnConfig;

    /** Client to communicate with the Resource Manager (YARN's master). */
    private AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient;

    /** Client to communicate with the Node manager and launch TaskExecutor processes. */
    private NMClient nodeManagerClient;

    /** The number of containers requested, but not yet granted. */
    private int numPendingContainerRequests;

    /** The priority handed out for each distinct resource profile (filled by generatePriority). */
    private final Map<ResourceProfile, Integer> resourcePriorities = new HashMap<>();

    /**
     * Creates the YARN resource manager and derives the heartbeat interval used towards YARN.
     *
     * <p>The interval is read from {@code ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS} (a value in
     * seconds, defaulting to {@code DEFAULT_YARN_HEARTBEAT_INTERVAL_MS / 1000}) and converted to
     * milliseconds. It is then compared with the value configured under
     * {@code YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS}.
     *
     * <p>NOTE(review): several lines of this constructor appear to be missing (see the inline
     * notes), including its closing brace.
     *
     * @param flinkConfig The Flink configuration, kept for later container launches
     * @param env The process environment variables, stored in {@code ENV}
     * @param rpcService forwarded unchanged to the super constructor
     * @param resourceManagerConfiguration forwarded unchanged to the super constructor
     * @param highAvailabilityServices forwarded unchanged to the super constructor
     * @param slotManagerFactory forwarded unchanged to the super constructor
     * @param metricRegistry forwarded unchanged to the super constructor
     * @param jobLeaderIdService forwarded unchanged to the super constructor
     * @param fatalErrorHandler forwarded unchanged to the super constructor
     */
    public YarnResourceManager(Configuration flinkConfig, Map<String, String> env, RpcService rpcService,
            ResourceManagerConfiguration resourceManagerConfiguration,
            HighAvailabilityServices highAvailabilityServices, SlotManagerFactory slotManagerFactory,
            MetricRegistry metricRegistry, JobLeaderIdService jobLeaderIdService,
            FatalErrorHandler fatalErrorHandler) {
        super(rpcService, resourceManagerConfiguration, highAvailabilityServices, slotManagerFactory,
                metricRegistry, jobLeaderIdService, fatalErrorHandler);
        this.flinkConfig = flinkConfig;
        this.yarnConfig = new YarnConfiguration();
        this.ENV = env;
        // The configuration value is in seconds; the local variable holds milliseconds.
        final int yarnHeartbeatIntervalMS = flinkConfig.getInteger(ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS,
                DEFAULT_YARN_HEARTBEAT_INTERVAL_MS / 1000) * 1000;

        final long yarnExpiryIntervalMS = yarnConfig.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS,
        // NOTE(review): the getLong(...) call above is unterminated - its second argument
        // (presumably a default expiry interval) and the closing ");" are missing.

        if (yarnHeartbeatIntervalMS >= yarnExpiryIntervalMS) {
                    // NOTE(review): the call that consumes the message below (presumably a
                    // warning on LOG) has lost its prefix; only the arguments remain.
                    "The heartbeat interval of the Flink Application master ({}) is greater "
                            + "than YARN's expiry interval ({}). The application is likely to be killed by YARN.",
                    yarnHeartbeatIntervalMS, yarnExpiryIntervalMS);
        // NOTE(review): the closing brace of the if-block above is missing.
        yarnHeartbeatIntervalMillis = yarnHeartbeatIntervalMS;
        numPendingContainerRequests = 0;

    /**
     * Creates the asynchronous YARN resource manager client (with this object as its callback
     * handler), registers the application master using the host and port parsed from
     * {@code getAddress()}, and creates the node manager client.
     *
     * <p>NOTE(review): no init/start calls on either client are visible here and the method has
     * no closing brace - lines appear to be missing; confirm against the intended implementation.
     *
     * @throws ResourceManagerException declared by the signature; no throw site is visible here
     */
    protected void initialize() throws ResourceManagerException {
        resourceManagerClient = AMRMClientAsync.createAMRMClientAsync(yarnHeartbeatIntervalMillis, this);
        try {
            //TODO: change akka address to tcp host and port, the getAddress() interface should return a standard tcp address
            Tuple2<String, Integer> hostPort = parseHostPort(getAddress());
            //TODO: the third parameter should be the webmonitor address
            resourceManagerClient.registerApplicationMaster(hostPort.f0, hostPort.f1, getAddress());
        } catch (Exception e) {
            // NOTE(review): the call that consumes the arguments below (presumably a LOG
            // statement) has lost its prefix, and the closing brace of the catch is missing.
  "registerApplicationMaster fail", e);

        // create the client to communicate with the node managers
        nodeManagerClient = NMClient.createNMClient();

    /**
     * Shuts down the YARN clients, remembering the first failure and rethrowing it at the end via
     * {@code ExceptionUtils.rethrowException}.
     *
     * <p>NOTE(review): both try bodies and the else branch are empty in this file and all closing
     * braces are missing, so no client is actually stopped by the visible code. The statements
     * (presumably stopping each client and recording the secondary failure) need to be restored.
     *
     * @throws Exception the first failure encountered during shutdown
     */
    public void shutDown() throws Exception {
        // shut down all components
        Throwable firstException = null;
        if (resourceManagerClient != null) {
            try {
                // NOTE(review): empty try body - the shutdown call for resourceManagerClient is missing.
            } catch (Throwable t) {
                firstException = t;
        if (nodeManagerClient != null) {
            try {
                // NOTE(review): empty try body - the shutdown call for nodeManagerClient is missing.
            } catch (Throwable t) {
                // Keep only the first failure as the primary exception.
                if (firstException == null) {
                    firstException = t;
                } else {
                    // NOTE(review): empty else branch - the handling of a second failure is missing.
        if (firstException != null) {
            ExceptionUtils.rethrowException(firstException, "Error while shutting down YARN resource manager");

    /**
     * Unregisters the application master from YARN, translating the Flink status with
     * {@code getYarnStatus}. A failure to unregister is logged and not propagated.
     *
     * <p>NOTE(review): the closing braces of the catch block and of the method are missing.
     *
     * @param finalStatus The Flink application status to report
     * @param optionalDiagnostics Diagnostics message passed through to YARN
     */
    protected void shutDownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {

        // first, de-register from YARN
        // NOTE(review): the string literal trailing the statement below is the remainder of a
        // call (presumably a LOG statement) whose prefix is missing.
        FinalApplicationStatus yarnStatus = getYarnStatus(finalStatus);"Unregistering application from the YARN Resource Manager");
        try {
            // The third argument (an empty string here) is the tracking URL slot of the YARN API.
            resourceManagerClient.unregisterApplicationMaster(yarnStatus, optionalDiagnostics, "");
        } catch (Throwable t) {
            LOG.error("Could not unregister the application master.", t);

    public void startNewWorker(ResourceProfile resourceProfile) {
        // Priority for worker containers - priorities are intra-application
        //TODO: set priority according to the resource allocated
        Priority priority = Priority.newInstance(generatePriority(resourceProfile));
        int mem = resourceProfile.getMemoryInMB() < 0 ? DEFAULT_TSK_EXECUTOR_MEMORY_SIZE
                : (int) resourceProfile.getMemoryInMB();
        int vcore = resourceProfile.getCpuCores() < 1 ? 1 : (int) resourceProfile.getCpuCores();
        Resource capability = Resource.newInstance(mem, vcore);
        requestYarnContainer(capability, priority);

    protected ResourceID workerStarted(ResourceID resourceID) {
        return resourceID;

    // AMRMClientAsync CallbackHandler methods
    public float getProgress() {
        // Temporarily need not record the total size of asked and allocated containers
        return 1;

    /**
     * Callback for completed containers: every container with a negative exit status is passed to
     * {@code notifyWorkerFailed}, identified by a {@code ResourceID} built from its container id.
     *
     * <p>NOTE(review): the {@code notifyWorkerFailed(...)} call is truncated after its first
     * argument, and all closing braces are missing.
     *
     * @param list Statuses of the completed containers
     */
    public void onContainersCompleted(List<ContainerStatus> list) {
        for (ContainerStatus container : list) {
            if (container.getExitStatus() < 0) {
                notifyWorkerFailed(new ResourceID(container.getContainerId().toString()),

    /**
     * Callback for newly allocated containers. For each container the pending-request counter is
     * decremented (never below zero), a task executor launch context is created and the container
     * is started through the node manager client. If that fails, the error is logged and a new
     * container with the same resource and priority is requested.
     *
     * <p>NOTE(review): a log call is truncated, the body of the final {@code if} is missing and
     * the closing braces are missing.
     *
     * @param containers The containers granted by YARN
     */
    public void onContainersAllocated(List<Container> containers) {
        for (Container container : containers) {
            numPendingContainerRequests = Math.max(0, numPendingContainerRequests - 1);
            // NOTE(review): the call that consumes the arguments below (presumably a LOG
            // statement) has lost its prefix and its last argument.
  "Received new container: {} - Remaining pending container requests: {}", container.getId(),
            try {
                /** Context information used to start a TaskExecutor Java process */
                ContainerLaunchContext taskExecutorLaunchContext = createTaskExecutorLaunchContext(
                        container.getResource(), container.getId().toString(), container.getNodeId().getHost());
                nodeManagerClient.startContainer(container, taskExecutorLaunchContext);
            } catch (Throwable t) {
                // failed to launch the container, will release the failed one and ask for a new one
                // NOTE(review): no release of the failed container is visible here - confirm.
                LOG.error("Could not start TaskManager in container {},", container, t);
                requestYarnContainer(container.getResource(), container.getPriority());
        // NOTE(review): the body of the check below is missing.
        if (numPendingContainerRequests <= 0) {

    /**
     * Callback invoked when YARN requests a shutdown. A failure during the attempt is logged as a
     * warning and not propagated.
     *
     * <p>NOTE(review): the try body is empty in this file (presumably it triggered the shutdown of
     * this resource manager) and the closing braces are missing.
     */
    public void onShutdownRequest() {
        try {
        } catch (Exception e) {
            LOG.warn("Fail to shutdown the YARN resource manager.", e);

    public void onNodesUpdated(List<NodeReport> list) {
        // We are not interested in node updates

    /**
     * Callback invoked when the asynchronous YARN client reports an error.
     *
     * <p>NOTE(review): only the method header is present in this file; the body and the closing
     * brace are missing, so how the error is handled cannot be determined here.
     *
     * @param error The error reported by the YARN client
     */
    public void onError(Throwable error) {

    //Utility methods
    /**
     * Converts a Flink application status enum to a YARN application status enum.
     *
     * @param status The Flink application status.
     * @return The corresponding YARN application status.
     */
    private FinalApplicationStatus getYarnStatus(ApplicationStatus status) {
        if (status == null) {
            return FinalApplicationStatus.UNDEFINED;
        } else {
            switch (status) {
            case SUCCEEDED:
                return FinalApplicationStatus.SUCCEEDED;
            case FAILED:
                return FinalApplicationStatus.FAILED;
            case CANCELED:
                return FinalApplicationStatus.KILLED;
                return FinalApplicationStatus.UNDEFINED;

    // parse the host and port from akka address, 
    // the akka address is like akka.tcp://flink@$a
    private static Tuple2<String, Integer> parseHostPort(String address) {
        String[] hostPort = address.split("@")[1].split(":");
        String host = hostPort[0];
        String port = hostPort[1].split("/")[0];
        return new Tuple2(host, Integer.valueOf(port));

    /**
     * Adds a container request for the given resource and priority to the YARN resource manager
     * client (node and rack arguments are passed as {@code null}) and increments the counter of
     * pending container requests.
     *
     * <p>NOTE(review): lines appear to be missing here (see inline notes), including the closing
     * brace of the method.
     *
     * @param resource The resource capability to request
     * @param priority The priority of the request
     */
    private void requestYarnContainer(Resource resource, Priority priority) {
        resourceManagerClient.addContainerRequest(new AMRMClient.ContainerRequest(resource, null, null, priority));
        // make sure we transmit the request fast and receive fast news of granted allocations
        // NOTE(review): no statement follows the comment above - presumably the heartbeat
        // interval was adjusted here (FAST_YARN_HEARTBEAT_INTERVAL_MS is otherwise unused); confirm.

        // NOTE(review): the string literal trailing the increment below is the remainder of a
        // call (presumably a LOG statement) whose prefix is missing.
        numPendingContainerRequests++;"Requesting new TaskManager container pending requests: {}", numPendingContainerRequests);

    /**
     * Builds the launch context for a TaskExecutor container: derives the containered task manager
     * parameters from the container memory, generates the task manager configuration, delegates
     * to {@code Utils.createTaskExecutorContext}, and finally adds the container id and host name
     * to the environment of the launch context.
     *
     * <p>NOTE(review): a log call and the default of the registration timeout lookup are
     * truncated, and the closing brace of the method is missing.
     *
     * @param resource The resource granted for the container; its memory sizes the task executor
     * @param containerId The container id, exported under {@code ENV_FLINK_CONTAINER_ID}
     * @param host The host name given by YARN, exported under {@code ENV_FLINK_NODE_ID}
     * @return the launch context for the TaskExecutor process
     * @throws Exception declared by the signature; thrown by the helpers called here
     */
    private ContainerLaunchContext createTaskExecutorLaunchContext(Resource resource, String containerId,
            String host) throws Exception {
        // init the ContainerLaunchContext
        final String currDir = ENV.get(ApplicationConstants.Environment.PWD.key());

        // NOTE(review): the literal 1 is presumably the number of slots - confirm.
        final ContaineredTaskManagerParameters taskManagerParameters = ContaineredTaskManagerParameters
                .create(flinkConfig, resource.getMemory(), 1);
                // NOTE(review): the call that consumes the arguments below (presumably a LOG
                // statement) has lost its prefix and its trailing arguments.
                "TaskExecutor{} will be started with container size {} MB, JVM heap size {} MB, "
                        + "JVM direct memory limit {} MB",
                containerId, taskManagerParameters.taskManagerTotalMemoryMB(),
        // NOTE(review): the getInteger(...) call below is unterminated - its default value
        // (presumably DEFAULT_TASK_MANAGER_REGISTRATION_DURATION, otherwise unused) is missing.
        int timeout = flinkConfig.getInteger(ConfigConstants.TASK_MANAGER_MAX_REGISTRATION_DURATION,
        // The timeout value is interpreted as seconds.
        FiniteDuration teRegistrationTimeout = new FiniteDuration(timeout, TimeUnit.SECONDS);
        // NOTE(review): the meaning of the "", 0 and 1 arguments is defined by BootstrapTools - confirm.
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(flinkConfig, "", 0,
                1, teRegistrationTimeout);
        LOG.debug("TaskManager configuration: {}", taskManagerConfig);

        ContainerLaunchContext taskExecutorLaunchContext = Utils.createTaskExecutorContext(flinkConfig, yarnConfig,
                ENV, taskManagerParameters, taskManagerConfig, currDir, YarnTaskExecutorRunner.class, LOG);

        // set a special environment variable to uniquely identify this container
        taskExecutorLaunchContext.getEnvironment().put(ENV_FLINK_CONTAINER_ID, containerId);
        taskExecutorLaunchContext.getEnvironment().put(ENV_FLINK_NODE_ID, host);
        return taskExecutorLaunchContext;

    /**
     * Generates the priority for the given resource profile.
     * The priority is only used to distinguish requests for different resources.
     *
     * @param resourceProfile The resource profile of a request
     * @return The priority of this resource profile.
     */
    private int generatePriority(ResourceProfile resourceProfile) {
        if (resourcePriorities.containsKey(resourceProfile)) {
            return resourcePriorities.get(resourceProfile);
        } else {
            int priority = resourcePriorities.size();
            resourcePriorities.put(resourceProfile, priority);
            return priority;
