org.apache.twill.yarn.YarnTwillController.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.twill.yarn.YarnTwillController.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.twill.yarn;

import com.google.common.base.Stopwatch;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.Uninterruptibles;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.twill.api.ResourceReport;
import org.apache.twill.api.RunId;
import org.apache.twill.api.TwillController;
import org.apache.twill.api.logging.LogHandler;
import org.apache.twill.internal.AbstractTwillController;
import org.apache.twill.internal.Constants;
import org.apache.twill.internal.ProcessController;
import org.apache.twill.internal.appmaster.ApplicationMasterLiveNodeData;
import org.apache.twill.internal.appmaster.TrackerService;
import org.apache.twill.internal.state.SystemMessages;
import org.apache.twill.internal.yarn.YarnAppClient;
import org.apache.twill.internal.yarn.YarnApplicationReport;
import org.apache.twill.internal.yarn.YarnUtils;
import org.apache.twill.zookeeper.NodeData;
import org.apache.twill.zookeeper.ZKClient;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import javax.annotation.Nullable;

/**
 * A {@link org.apache.twill.api.TwillController} that controls an application running on Hadoop YARN.
 *
 * <p>The controller obtains a {@link ProcessController} through the {@code startUp} callable, polls YARN until
 * the application has reached a running or terminal state, and falls back to polling YARN for the application
 * status whenever the live node in ZooKeeper cannot be accessed.
 */
final class YarnTwillController extends AbstractTwillController implements TwillController {

    private static final Logger LOG = LoggerFactory.getLogger(YarnTwillController.class);

    private final String appName;
    // Produces the process controller of the YARN application; invoked once from doStartUp().
    private final Callable<ProcessController<YarnApplicationReport>> startUp;
    // Maximum time to wait in doStartUp() for the application to reach a running or terminal state.
    private final long startTimeout;
    private final TimeUnit startTimeoutUnit;
    // Latest live node data decoded from ZK; replaced in instanceNodeUpdated(). May be null when the controller
    // was created without existing live node data.
    private volatile ApplicationMasterLiveNodeData amLiveNodeData;
    // Assigned in doStartUp(); null until the startUp callable has returned.
    private ProcessController<YarnApplicationReport> processController;

    // Thread for polling yarn for application status if application got ZK session expire.
    // Only read and written while holding this controller's monitor
    // (see startPollStatus(), stopPollStatus() and the polling runnable).
    private Thread statusPollingThread;

    /**
     * Creates an instance with an existing {@link ApplicationMasterLiveNodeData}.
     *
     * <p>Log collection is enabled only if the live node data carries a Kafka ZK connection string, and no
     * {@link LogHandler} is registered. The process controller is created from the application id recorded
     * in the live node data, and the start timeout is {@link Constants#APPLICATION_MAX_START_SECONDS} seconds.
     *
     * @param appName name of the application
     * @param runId run id of the application
     * @param zkClient ZooKeeper client passed to the parent controller
     * @param amLiveNodeData live node data of the application master; must not be {@code null}
     * @param yarnAppClient client used for creating the {@link ProcessController}
     */
    YarnTwillController(String appName, RunId runId, ZKClient zkClient,
            final ApplicationMasterLiveNodeData amLiveNodeData, final YarnAppClient yarnAppClient) {
        super(appName, runId, zkClient, amLiveNodeData.getKafkaZKConnect() != null,
                Collections.<LogHandler>emptyList());
        this.appName = appName;
        this.amLiveNodeData = amLiveNodeData;
        this.startUp = new Callable<ProcessController<YarnApplicationReport>>() {
            @Override
            public ProcessController<YarnApplicationReport> call() throws Exception {
                return yarnAppClient.createProcessController(YarnUtils
                        .createApplicationId(amLiveNodeData.getAppIdClusterTime(), amLiveNodeData.getAppId()));
            }
        };
        this.startTimeout = Constants.APPLICATION_MAX_START_SECONDS;
        this.startTimeoutUnit = TimeUnit.SECONDS;
    }

    /**
     * Creates an instance whose {@link ProcessController} is obtained from the given callable on start up.
     *
     * @param appName name of the application
     * @param runId run id of the application
     * @param zkClient ZooKeeper client passed to the parent controller
     * @param logCollectionEnabled passed through to the parent controller
     * @param logHandlers passed through to the parent controller
     * @param startUp callable invoked in {@link #doStartUp()} to obtain the {@link ProcessController}
     * @param startTimeout maximum time to wait for the application to reach a running or terminal state
     * @param startTimeoutUnit unit of {@code startTimeout}
     */
    YarnTwillController(String appName, RunId runId, ZKClient zkClient, boolean logCollectionEnabled,
            Iterable<LogHandler> logHandlers, Callable<ProcessController<YarnApplicationReport>> startUp,
            long startTimeout, TimeUnit startTimeoutUnit) {
        super(appName, runId, zkClient, logCollectionEnabled, logHandlers);
        this.appName = appName;
        this.startUp = startUp;
        this.startTimeout = startTimeout;
        this.startTimeoutUnit = startTimeoutUnit;
    }

    /**
     * Sends a message to the application to notify it that the secure store has been updated.
     *
     * @return the future returned by sending the {@link SystemMessages#SECURE_STORE_UPDATED} message
     */
    ListenableFuture<Void> secureStoreUpdated() {
        return sendMessage(SystemMessages.SECURE_STORE_UPDATED, null);
    }

    /**
     * Returns the most recently known {@link ApplicationMasterLiveNodeData}, or {@code null} if none is known.
     */
    @Nullable
    ApplicationMasterLiveNodeData getApplicationMasterLiveNodeData() {
        return amLiveNodeData;
    }

    /**
     * Obtains the process controller and waits, up to the configured start timeout, for the YARN application to
     * reach a running or terminal state. If the application is not in the {@code RUNNING} state afterwards, the
     * controller is shut down.
     *
     * @throws RuntimeException wrapping any failure (including interruption) raised while starting
     */
    @Override
    protected void doStartUp() {
        super.doStartUp();

        // Submit and poll the status of the yarn application
        try {
            processController = startUp.call();

            YarnApplicationReport report = processController.getReport();
            ApplicationId appId = report.getApplicationId();
            LOG.info("Application {} with id {} submitted", appName, appId);

            YarnApplicationState state = report.getYarnApplicationState();
            Stopwatch stopWatch = new Stopwatch().start();

            LOG.debug("Checking yarn application status for {} {}", appName, appId);
            while (!hasRun(state) && stopWatch.elapsedTime(startTimeoutUnit) < startTimeout) {
                // The state was fetched just before the loop condition, so wait first and poll afterwards.
                // This way the loop exits as soon as a running/terminal state is observed.
                TimeUnit.SECONDS.sleep(1);
                report = processController.getReport();
                state = report.getYarnApplicationState();
                LOG.debug("Yarn application status for {} {}: {}", appName, appId, state);
            }
            LOG.info("Yarn application {} {} is in state {}", appName, appId, state);
            if (state != YarnApplicationState.RUNNING) {
                LOG.info("Yarn application {} {} is not in running state. Shutting down controller.", appName,
                        appId);
                forceShutDown();
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller; wrapping the exception alone would lose it.
            Thread.currentThread().interrupt();
            throw Throwables.propagate(e);
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

    /**
     * Stops status polling, waits for the stop message to be processed and then waits for YARN to report a final
     * application status, killing the application if it does not finish within
     * {@link Constants#APPLICATION_MAX_STOP_SECONDS} seconds or if its status cannot be determined.
     *
     * @throws RuntimeException if YARN reports the final status {@code FAILED}
     */
    @Override
    protected synchronized void doShutDown() {
        if (processController == null) {
            LOG.warn("No process controller for application that is not submitted.");
            return;
        }

        // Stop polling if it is running.
        stopPollStatus();

        // Wait for the stop message being processed
        try {
            Uninterruptibles.getUninterruptibly(getStopMessageFuture(), Constants.APPLICATION_MAX_STOP_SECONDS,
                    TimeUnit.SECONDS);
        } catch (Exception e) {
            LOG.error("Failed to wait for stop message being processed.", e);
            // Kill the application through yarn
            kill();
        }

        boolean interrupted = false;
        // Stays KILLED if killing the application or closing the controller fails below.
        FinalApplicationStatus finalStatus = FinalApplicationStatus.KILLED;
        try (ProcessController<YarnApplicationReport> controller = this.processController) {
            FinalApplicationStatus polled;
            try {
                polled = pollFinalStatus(controller);
            } catch (Exception e) {
                interrupted = e instanceof InterruptedException;
                LOG.warn("Exception while waiting for application report: {}", e.getMessage(), e);
                polled = FinalApplicationStatus.UNDEFINED;
            }

            // Application not finished after max stop time (or status unknown), kill the application.
            // This must happen inside the resource scope: a catch clause attached to the try-with-resources
            // statement only runs after the controller has already been closed.
            if (polled == FinalApplicationStatus.UNDEFINED) {
                kill();
                polled = FinalApplicationStatus.KILLED;
            }
            finalStatus = polled;
        } catch (Exception e) {
            LOG.warn("Exception while killing application or closing process controller for {} {}: {}",
                    appName, getRunId(), e.getMessage(), e);
        }

        try {
            super.doShutDown();

            if (finalStatus == FinalApplicationStatus.FAILED) {
                // If we know the app status is failed, throw an exception to make this controller goes into
                // error state. All other final status are not treated as failure as we can't be sure.
                setTerminationStatus(TerminationStatus.FAILED);
                throw new RuntimeException(
                        String.format("Yarn application completed with failure %s, %s.", appName, getRunId()));
            }
            setTerminationStatus(finalStatus == FinalApplicationStatus.SUCCEEDED ? TerminationStatus.SUCCEEDED
                    : TerminationStatus.KILLED);
        } finally {
            if (interrupted) {
                // Restore the interrupt that cut the final status polling short.
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Polls the given controller once per second until the final application status is no longer
     * {@code UNDEFINED} or {@link Constants#APPLICATION_MAX_STOP_SECONDS} seconds have elapsed.
     *
     * @param controller the controller to fetch application reports from
     * @return the last final application status observed; {@code UNDEFINED} if the wait timed out
     * @throws InterruptedException if the calling thread is interrupted while waiting between polls
     */
    private FinalApplicationStatus pollFinalStatus(ProcessController<YarnApplicationReport> controller)
            throws InterruptedException {
        Stopwatch stopWatch = new Stopwatch().start();
        long maxTime = TimeUnit.MILLISECONDS.convert(Constants.APPLICATION_MAX_STOP_SECONDS, TimeUnit.SECONDS);

        YarnApplicationReport report = controller.getReport();
        FinalApplicationStatus finalStatus = report.getFinalApplicationStatus();
        ApplicationId appId = report.getApplicationId();
        while (finalStatus == FinalApplicationStatus.UNDEFINED
                && stopWatch.elapsedTime(TimeUnit.MILLISECONDS) < maxTime) {
            LOG.debug("Yarn application final status for {} {}: {}", appName, appId, finalStatus);
            TimeUnit.SECONDS.sleep(1);
            finalStatus = controller.getReport().getFinalApplicationStatus();
        }
        return finalStatus;
    }

    /**
     * Kills the application by cancelling it through the process controller. Only logs a warning if the
     * application has not been submitted yet.
     */
    @Override
    public void kill() {
        ProcessController<YarnApplicationReport> controller = processController;
        if (controller == null) {
            LOG.warn("No process controller for application that is not submitted.");
            return;
        }

        // The report is needed for logging only; failing to fetch it must not prevent the cancel call.
        ApplicationId appId = null;
        try {
            appId = controller.getReport().getApplicationId();
        } catch (RuntimeException e) {
            LOG.debug("Failed to fetch application report for {} before killing it.", appName, e);
        }
        LOG.info("Killing application {} {}", appName, appId);
        controller.cancel();
    }

    /**
     * Remembers the live node data decoded from the given node data, if decoding yields a non-null result.
     */
    @Override
    protected void instanceNodeUpdated(NodeData nodeData) {
        ApplicationMasterLiveNodeData data = ApplicationMasterLiveNodeDecoder.decode(nodeData);
        if (data != null) {
            amLiveNodeData = data;
        }
    }

    /**
     * Starts polling YARN for the application status when the live node in ZK cannot be accessed.
     *
     * @param cause the reason of the failure; may be {@code null}
     */
    @Override
    protected void instanceNodeFailed(Throwable cause) {
        // Resort to polling from Yarn for the application status.
        if (processController == null) {
            LOG.warn("No process controller for application that is not submitted.");
            return;
        }
        YarnApplicationReport report = processController.getReport();

        // It happens if the application has ZK session expire or the node is deleted due to application termination.
        LOG.info("Failed to access application {} {} live node in ZK, resort to polling. Failure reason: {}",
                appName, report.getApplicationId(), cause == null ? "Unknown" : cause.getMessage());

        startPollStatus(report.getApplicationId());
    }

    /**
     * Starts the daemon thread that polls YARN for the application status, unless one is already registered.
     */
    private synchronized void startPollStatus(ApplicationId appId) {
        if (statusPollingThread == null) {
            statusPollingThread = new Thread(createStatusPollingRunnable(),
                    String.format("%s-%s-yarn-poller", appName, appId));
            statusPollingThread.setDaemon(true);
            statusPollingThread.start();
        }
    }

    /**
     * Interrupts and forgets the status polling thread, if there is one.
     */
    private synchronized void stopPollStatus() {
        if (statusPollingThread != null) {
            statusPollingThread.interrupt();
            statusPollingThread = null;
        }
    }

    /**
     * Creates the task run by the status polling thread. Once per second it checks the application report and
     * the existence of the instance node in ZK, until one of the following happens:
     * <ul>
     *   <li>the application has a final status: the controller is shut down;</li>
     *   <li>the instance node exists again: the node is re-watched and the polling thread is unregistered;</li>
     *   <li>the thread is interrupted: polling just stops.</li>
     * </ul>
     */
    private Runnable createStatusPollingRunnable() {
        return new Runnable() {

            @Override
            public void run() {
                YarnApplicationReport report = processController.getReport();
                ApplicationId appId = report.getApplicationId();
                boolean shutdown = false;
                boolean rewatch = false;

                try {
                    LOG.debug("Polling status from Yarn for {} {}.", appName, appId);
                    while (!Thread.currentThread().isInterrupted()) {
                        if (report.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) {
                            shutdown = true;
                            break;
                        }
                        // Make a sync exists call to instance node and re-watch if the node exists
                        try {
                            // The timeout is arbitrary, as it's just for avoiding block forever
                            Stat stat = zkClient.exists(getInstancePath()).get(5, TimeUnit.SECONDS);
                            if (stat != null) {
                                rewatch = true;
                                break;
                            }
                        } catch (ExecutionException e) {
                            // Ignore the exception, as any exception won't affect the status polling.
                            LOG.debug("Failed in exists call on ZK path {}.", getInstancePath(), e);
                        } catch (TimeoutException e) {
                            LOG.debug("Timeout in exists call on ZK path {}.", getInstancePath(), e);
                        }

                        TimeUnit.SECONDS.sleep(1);
                        report = processController.getReport();
                    }
                } catch (InterruptedException e) {
                    // OK to ignore: this thread does no further blocking work and returns right after.
                    LOG.debug("Status polling thread interrupted for application {} {}", appName, appId);
                }

                LOG.debug("Stop polling status from Yarn for {} {}.", appName, appId);

                if (shutdown) {
                    LOG.info("Yarn application {} {} completed. Shutting down controller.", appName, appId);
                    forceShutDown();
                } else if (rewatch) {
                    LOG.info("Rewatch instance node for {} {} at {}", appName, appId, getInstancePath());
                    synchronized (YarnTwillController.this) {
                        statusPollingThread = null;
                        watchInstanceNode();
                    }
                }
            }
        };
    }

    /**
     * Tells whether the given state is {@code RUNNING} or one of the terminal states
     * ({@code FINISHED}, {@code FAILED}, {@code KILLED}).
     */
    private boolean hasRun(YarnApplicationState state) {
        switch (state) {
        case RUNNING:
        case FINISHED:
        case FAILED:
        case KILLED:
            return true;
        default:
            return false;
        }
    }

    /**
     * Returns the resource report of the application.
     *
     * @return the report, or {@code null} if this controller is not in the running state or no usable tracking
     *         URL is available
     */
    @Override
    public ResourceReport getResourceReport() {
        // Only has resource report if the app is running.
        if (state() != State.RUNNING) {
            return null;
        }
        ResourceReportClient resourcesClient = getResourcesClient();
        return (resourcesClient == null) ? null : resourcesClient.get();
    }

    /**
     * Returns the {@link ResourceReportClient} for fetching resource report from the AM.
     * It first consults the RM for the tracking URL and get the resource report from there.
     *
     * @return the client, or {@code null} if the application report contains no usable tracking URL
     */
    @Nullable
    private ResourceReportClient getResourcesClient() {
        YarnApplicationReport report = processController.getReport();
        List<URL> urls = new ArrayList<>(2);

        // Try getting the report from the proxy tracking URL as well as the original tracking URL directly
        // This is mainly to workaround for unit-test that the proxy tracking URL doesn't work well with local address.
        for (String url : Arrays.asList(report.getTrackingUrl(), report.getOriginalTrackingUrl())) {
            if (url != null && !url.equals("N/A")) {
                try {
                    URL trackingUrl = new URL(url);
                    String path = trackingUrl.getPath();
                    // Drop a trailing slash so that appending the tracker path doesn't produce "//".
                    if (path.endsWith("/")) {
                        path = path.substring(0, path.length() - 1);
                    }
                    urls.add(new URL(trackingUrl.getProtocol(), trackingUrl.getHost(), trackingUrl.getPort(),
                            path + TrackerService.PATH));
                } catch (MalformedURLException e) {
                    LOG.debug("Invalid tracking URL {} from YARN application report for {}:{}", url, appName,
                            getRunId());
                }
            }
        }

        if (urls.isEmpty()) {
            return null;
        }

        return new ResourceReportClient(urls);
    }
}