Example usage for org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED.

Prototype

FinalApplicationStatus FAILED

To view the source code for org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED, click the Source Link.

Document

Application which failed.

Usage

From source file:org.apache.oozie.action.hadoop.TestLauncherMain.java

License:Apache License

/**
 * Feeds three application ids to {@code LauncherMain.checkAndKillChildYarnJobs} while the
 * mocked application report is stubbed to answer UNDEFINED, FAILED and KILLED on consecutive
 * status lookups, and expects exactly one id to come back: the first one supplied.
 */
@Test
public void testKillChildYarnJobs() throws Exception {
    final YarnClient yarnClient = Mockito.mock(YarnClient.class);
    final ApplicationReport report = Mockito.mock(ApplicationReport.class);
    Mockito.when(yarnClient.getApplicationReport(Mockito.any(ApplicationId.class))).thenReturn(report);

    // Every application id resolves to the same mocked report; its final status
    // is stubbed with three chained answers.
    Mockito.when(report.getFinalApplicationStatus())
            .thenReturn(FinalApplicationStatus.UNDEFINED)
            .thenReturn(FinalApplicationStatus.FAILED)
            .thenReturn(FinalApplicationStatus.KILLED);

    final ApplicationId[] childApps = {
            ApplicationId.newInstance(System.currentTimeMillis(), 1),
            ApplicationId.newInstance(System.currentTimeMillis(), 2),
            ApplicationId.newInstance(System.currentTimeMillis(), 3) };

    final Collection<ApplicationId> returned = LauncherMain.checkAndKillChildYarnJobs(yarnClient, null,
            Arrays.asList(childApps));

    // Only the first application id is expected in the returned collection.
    assertEquals(1, returned.size());
    assertEquals(childApps[0].getId(), returned.iterator().next().getId());
}

From source file:org.apache.reef.runtime.yarn.driver.YarnContainerManager.java

License:Apache License

/**
 * Handles a fatal runtime error.
 * <p>
 * Closes {@code reefEventHandlers}, unregisters the application master with
 * {@link FinalApplicationStatus#FAILED}, stops {@code resourceManager}, and then publishes a
 * runtime status in state FAILED that carries the error name, message and the serialized
 * throwable.
 *
 * @param throwable the error that brought the runtime down
 */
private void onRuntimeError(final Throwable throwable) {

    // Throwable.getMessage() may be null (e.g. exceptions created without a
    // message). Generated protobuf builder setters reject null arguments, so
    // fall back to toString() to keep this error handler from failing itself.
    final String rawMessage = throwable.getMessage();
    final String errorMessage = rawMessage != null ? rawMessage : throwable.toString();

    // SHUTDOWN YARN
    try {
        this.reefEventHandlers.close();
        this.resourceManager.unregisterApplicationMaster(FinalApplicationStatus.FAILED, errorMessage, null);
    } catch (final Exception e) {
        LOG.log(Level.WARNING, "Error shutting down YARN application", e);
    } finally {
        // Always stop the resource manager handle, even if unregistering failed.
        this.resourceManager.stop();
    }

    final RuntimeStatusProto.Builder runtimeStatusBuilder = RuntimeStatusProto.newBuilder()
            .setState(ReefServiceProtos.State.FAILED).setName(RUNTIME_NAME);

    final Encoder<Throwable> codec = new ObjectSerializableCodec<>();
    // The builder is mutated in place; the message is built once, below.
    runtimeStatusBuilder.setError(ReefServiceProtos.RuntimeErrorProto.newBuilder().setName(RUNTIME_NAME)
            .setMessage(errorMessage).setException(ByteString.copyFrom(codec.encode(throwable)))
            .build());

    // NOTE(review): reefEventHandlers was closed in the try block above; confirm
    // it is still expected to accept this final status event.
    this.reefEventHandlers.onRuntimeStatus(runtimeStatusBuilder.build());
}

From source file:org.apache.samza.job.yarn.SamzaTaskManager.java

License:Apache License

/**
 * This method handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as complete or failure.
 * <ul>
 *   <li>{@code SUCCESS}: counts the container as completed and, once the completed count equals
 *       {@code state.containerCount}, sets the job status to SUCCEEDED.</li>
 *   <li>{@code DISKS_FAILED}, {@code ABORTED}, {@code PREEMPTED}: counts the container as released and,
 *       if it was mapped to a task group ID, requests a replacement on any host.</li>
 *   <li>Any other exit status: counts the container as failed, applies the retry-count / retry-window
 *       policy, and either requests a replacement or flags the job as failed.</li>
 * </ul>
 *
 * @param containerStatus status of the container that has completed
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    // Reverse lookup: find the task group ID whose running container matches
    // the completed YARN container. -1 means "not one of our running containers".
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            // A clean exit clears the failure history for this task group.
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either due to being released by the application "
                        + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                tooManyFailedContainers = true;
                // Actually mark the job as failed, as the log message above states
                // and as the too-many-failures branch below already does.
                state.status = FinalApplicationStatus.FAILED;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    // First recorded failure: a last-failure time of 0 makes the
                    // computed diff below equal to the current epoch millis.
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so reset shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}

From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java

License:Apache License

/**
 * Without host-affinity, a container failing with an unknown exit code must lead to a
 * replacement request for ANY_HOST; a second failure must shut the job down with status
 * FAILED.
 */
@Test
public void testNewContainerRequestedOnFailureWithUnknownCode() throws Exception {
    Config config = getConfig();
    SamzaTaskManager manager = new SamzaTaskManager(config, state, amRmClientAsync, new YarnConfiguration());
    MockContainerAllocator mockAllocator = new MockContainerAllocator(amRmClientAsync,
            TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(config));
    // Swap the manager's private allocator and its thread for the mock versions.
    getPrivateFieldFromTaskManager("containerAllocator", manager).set(manager, mockAllocator);

    Thread allocatorThread = new Thread(mockAllocator);
    getPrivateFieldFromTaskManager("allocatorThread", manager).set(manager, allocatorThread);

    // Initialisation is expected to queue exactly one container request.
    manager.onInit();

    assertFalse(manager.shouldShutdown());
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());

    Container allocated = TestUtil
            .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
    manager.onContainerAllocated(allocated);

    // Give the allocator time to process the allocation and update state.
    Thread.sleep(300);

    // First failure, reported with exit code 1.
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(allocated.getId(), 1, "Expecting a failure here"));

    // A replacement must have been requested, with no host preference.
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertEquals(ContainerRequestState.ANY_HOST,
            mockAllocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());
    assertFalse(manager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());

    manager.onContainerAllocated(allocated);

    // Give the allocator time to process the allocation and update state.
    Thread.sleep(300);

    assertTrue(state.jobHealthy.get());

    // Second failure for the same container.
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(allocated.getId(), 1, "Expecting a failure here"));

    // With a retry count of 1 the second failure must bring the job down.
    assertEquals(0, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertFalse(state.jobHealthy.get());
    assertTrue(manager.shouldShutdown());
    assertEquals(FinalApplicationStatus.FAILED, state.status);

    manager.onShutdown();
}

From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java

License:Apache License

/**
 * With host-affinity enabled, a container failing with an unknown exit code must lead to a
 * replacement request for the host it was last seen on; a second failure must shut the job
 * down with status FAILED.
 */
@Test
public void testSameContainerRequestedOnFailureWithUnknownCode() throws Exception {
    Config config = getConfigWithHostAffinity();
    SamzaTaskManager manager = new SamzaTaskManager(config, state, amRmClientAsync, new YarnConfiguration());
    MockContainerAllocator mockAllocator = new MockContainerAllocator(amRmClientAsync,
            TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(config));
    // Swap the manager's private allocator and its thread for the mock versions.
    getPrivateFieldFromTaskManager("containerAllocator", manager).set(manager, mockAllocator);

    Thread allocatorThread = new Thread(mockAllocator);
    getPrivateFieldFromTaskManager("allocatorThread", manager).set(manager, allocatorThread);

    // Initialisation is expected to queue exactly one container request.
    manager.onInit();

    assertFalse(manager.shouldShutdown());
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());

    Container allocated = TestUtil
            .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
    manager.onContainerAllocated(allocated);

    // Give the allocator time to process the allocation and update state.
    Thread.sleep(300);

    // First failure, reported with exit code 1.
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(allocated.getId(), 1, "Expecting a failure here"));

    // A replacement must have been requested on the same host ("abc").
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertEquals("abc", mockAllocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());
    assertFalse(manager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());

    manager.onContainerAllocated(allocated);

    // Give the allocator time to process the allocation and update state.
    Thread.sleep(300);

    assertTrue(state.jobHealthy.get());

    // Second failure for the same container.
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(allocated.getId(), 1, "Expecting a failure here"));

    // With a retry count of 1 the second failure must bring the job down.
    assertEquals(0, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertFalse(state.jobHealthy.get());
    assertTrue(manager.shouldShutdown());
    assertEquals(FinalApplicationStatus.FAILED, state.status);

    manager.onShutdown();
}

From source file:org.apache.slider.server.appmaster.actions.ActionStopSlider.java

License:Apache License

/**
 * Build from an exception./* w ww  .  j  av a  2  s . co  m*/
 * <p>
 * If the exception implements
 * {@link ExitCodeProvider} then the exit code is extracted from that
 * @param ex exception.
 */
public ActionStopSlider(Exception ex) {
    super("stop");
    if (ex instanceof ExitCodeProvider) {
        setExitCode(((ExitCodeProvider) ex).getExitCode());
    } else {
        setExitCode(LauncherExitCodes.EXIT_EXCEPTION_THROWN);
    }
    setFinalApplicationStatus(FinalApplicationStatus.FAILED);
    setMessage(ex.getMessage());
}

From source file:org.apache.slider.server.appmaster.SliderAppMaster.java

License:Apache License

/**
 * Logs the reported error and signals AM completion with a FAILED stop action
 * carrying {@code EXIT_EXCEPTION_THROWN} and the same text that was logged.
 * @param e the error passed to the callback
 */
@Override //AMRMClientAsync
public void onError(Throwable e) {
    // the callback means it is time to finish
    final String text = "AMRMClientAsync.onError() received " + e;
    LOG_YARN.error(text, e);
    final ActionStopSlider stop = new ActionStopSlider("stop", EXIT_EXCEPTION_THROWN,
            FinalApplicationStatus.FAILED, text);
    signalAMComplete(stop);
}

From source file:org.apache.slider.server.appmaster.SliderAppMaster.java

License:Apache License

/**
 * Service state-change callback.
 * <p>
 * Only a STOPPED {@code providerService} is handled here; every other event is
 * passed to the superclass. A non-zero exit code seen before AM completion has
 * been flagged triggers termination with a FAILED stop action; anything else is
 * merely logged.
 * @param service the service that has changed.
 */
@Override //ServiceStateChangeListener
public void stateChanged(Service service) {
    if (service != providerService || !service.isInState(STATE.STOPPED)) {
        // not the stopped provider service: let the parent deal with it
        super.stateChanged(service);
        return;
    }

    // it is the current master process in play
    final int rawExitCode = providerService.getExitCode();
    final int mappedExitCode = rawExitCode;

    if (amCompletionFlag.get() || mappedExitCode == 0) {
        // either shutdown is already under way or the process exited cleanly
        log.info("Process has exited with exit code {} mapped to {} -ignoring", rawExitCode,
                mappedExitCode);
        return;
    }

    final String reason = "Spawned process failed with raw " + rawExitCode + " mapped to "
            + mappedExitCode;
    final ActionStopSlider stop = new ActionStopSlider("stop", mappedExitCode,
            FinalApplicationStatus.FAILED, reason);
    // unexpected: the process finished before shutdown was triggered
    spawnedProcessExitedBeforeShutdownTriggered = true;
    log.info("Process has exited with exit code {} mapped to {} -triggering termination", rawExitCode,
            mappedExitCode);

    // tell the AM the cluster is complete
    signalAMComplete(stop);
}

From source file:org.apache.slider.server.appmaster.SliderAppMaster.java

License:Apache License

/**
 * Handle any exception in a thread. If the exception provides an exit
 * code, that is the one that will be used
 * @param thread thread throwing the exception
 * @param exception exception/*from   w  w w  .ja va 2  s . com*/
 */
public void onExceptionInThread(Thread thread, Exception exception) {
    log.error("Exception in {}: {}", thread.getName(), exception, exception);

    // if there is a teardown in progress, ignore it
    if (amCompletionFlag.get()) {
        log.info("Ignoring exception: shutdown in progress");
    } else {
        int exitCode = EXIT_EXCEPTION_THROWN;
        if (exception instanceof ExitCodeProvider) {
            exitCode = ((ExitCodeProvider) exception).getExitCode();
        }
        signalAMComplete(
                new ActionStopSlider("stop", exitCode, FinalApplicationStatus.FAILED, exception.toString()));
    }
}

From source file:org.apache.slider.server.appmaster.SliderAppMaster.java

License:Apache License

/**
 * Start the chaos monkey/*from ww w  .j a v  a 2s  .c o  m*/
 * @return true if it started
 */
private boolean maybeStartMonkey() {
    MapOperations internals = getGlobalInternalOptions();

    Boolean enabled = internals.getOptionBool(InternalKeys.CHAOS_MONKEY_ENABLED,
            InternalKeys.DEFAULT_CHAOS_MONKEY_ENABLED);
    if (!enabled) {
        log.info("Chaos monkey disabled");
        return false;
    }

    long monkeyInterval = internals.getTimeRange(InternalKeys.CHAOS_MONKEY_INTERVAL,
            InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_DAYS, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_HOURS,
            InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_MINUTES, 0);
    if (monkeyInterval == 0) {
        log.debug("Chaos monkey not configured with a time interval...not enabling");
        return false;
    }

    long monkeyDelay = internals.getTimeRange(InternalKeys.CHAOS_MONKEY_DELAY, 0, 0, 0, (int) monkeyInterval);

    log.info("Adding Chaos Monkey scheduled every {} seconds ({} hours -delay {}", monkeyInterval,
            monkeyInterval / (60 * 60), monkeyDelay);
    monkey = new ChaosMonkeyService(metrics, actionQueues);
    initAndAddService(monkey);

    // configure the targets

    // launch failure: special case with explicit failure triggered now
    int amLaunchFailProbability = internals
            .getOptionInt(InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_LAUNCH_FAILURE, 0);
    if (amLaunchFailProbability > 0 && monkey.chaosCheck(amLaunchFailProbability)) {
        log.info("Chaos Monkey has triggered AM Launch failure");
        // trigger a failure
        ActionStopSlider stop = new ActionStopSlider("stop", 0, TimeUnit.SECONDS, LauncherExitCodes.EXIT_FALSE,
                FinalApplicationStatus.FAILED, E_TRIGGERED_LAUNCH_FAILURE);
        queue(stop);
    }

    int amKillProbability = internals.getOptionInt(InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_FAILURE,
            InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_AM_FAILURE);
    monkey.addTarget("AM killer", new ChaosKillAM(actionQueues, -1), amKillProbability);
    int containerKillProbability = internals.getOptionInt(
            InternalKeys.CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE,
            InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE);
    monkey.addTarget("Container killer", new ChaosKillContainer(appState, actionQueues, rmOperationHandler),
            containerKillProbability);

    // and schedule it
    if (monkey.schedule(monkeyDelay, monkeyInterval, TimeUnit.SECONDS)) {
        log.info("Chaos Monkey is running");
        return true;
    } else {
        log.info("Chaos monkey not started");
        return false;
    }
}