Example usage for org.apache.hadoop.yarn.api.records ContainerExitStatus DISKS_FAILED

List of usage examples for org.apache.hadoop.yarn.api.records ContainerExitStatus DISKS_FAILED

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records ContainerExitStatus DISKS_FAILED.

Prototype

int DISKS_FAILED
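
In the Hadoop source this field is declared as shown below. The value -101 matches recent Hadoop 2.x/3.x releases; check the ContainerExitStatus class in your own version if the exact value matters.

public static final int DISKS_FAILED = -101;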

Document

Returned when a threshold number of the NodeManager's local directories (nodemanager-local-directories) or a threshold number of its log directories (nodemanager-log-directories) become bad.
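
Before the usage examples, here is a minimal sketch (hypothetical, not taken from any of the projects below) of where this constant typically shows up: the ResourceManager reports completed containers back to the ApplicationMaster, whose AMRMClientAsync.CallbackHandler inspects each exit status. The class name and the logging are illustrative only.

import java.util.List;

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public class DiskFailureAwareCallbackHandler {
    // Typically invoked from AMRMClientAsync.CallbackHandler#onContainersCompleted.
    public void onContainersCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            if (status.getExitStatus() == ContainerExitStatus.DISKS_FAILED) {
                // The node's local or log directories went bad, so the node
                // itself is suspect; re-request the work on a different node.
                System.out.println("Container " + status.getContainerId()
                        + " lost to disk failure: " + status.getDiagnostics());
            }
        }
    }
}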

Usage

From source file:gobblin.yarn.YarnService.java

License:Apache License

/**
 * Check the exit status of a completed container to decide whether the replacement
 * container should be started on the same node. Some exit statuses indicate a disk
 * or node failure, and in such cases the replacement container should be started on
 * a different node.
 */
private boolean shouldStickToTheSameNode(int containerExitStatus) {
    switch (containerExitStatus) {
    case ContainerExitStatus.DISKS_FAILED:
        return false;
    case ContainerExitStatus.ABORTED:
        // Most likely this exit status is due to node failures, because the
        // application itself will not release containers.
        return false;
    default:
        // Stick to the same node for other cases if host affinity is enabled.
        return this.containerHostAffinityEnabled;
    }
}
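
Note the design choice: DISKS_FAILED and ABORTED both return false because those statuses implicate the node itself, so the replacement container is requested elsewhere even when host affinity is enabled. Only exit statuses that do not point at a bad node honor the containerHostAffinityEnabled flag.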

From source file:org.apache.samza.job.yarn.SamzaTaskManager.java

License:Apache License

/**
 * This method handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus,
 * it decides whether a container that exited should be marked as a completion or a failure.
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either because it was released by the application "
                        + "master, 'lost' due to node failure, or preempted by the RM.",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up and request a new container for the tasks. This should only
        // happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures within the window
                        // boundary, so shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}

From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java

License:Apache License

/**
 * Test that the AM requests a new container when a task fails.
 * Exit codes with the same behavior: disk failure, preemption, and aborted.
 */
@Test
public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception {
    Map<String, String> config = new HashMap<>();
    config.putAll(getConfig());
    config.remove("yarn.container.retry.count");

    SamzaTaskManager taskManager = new SamzaTaskManager(new MapConfig(config), state, amRmClientAsync,
            new YarnConfiguration());
    MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync,
            TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(new MapConfig(config)));
    getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator);

    Thread thread = new Thread(allocator);
    getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread);

    // Start the task manager
    taskManager.onInit();
    assertFalse(taskManager.shouldShutdown());
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());

    Container container = TestUtil
            .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
    taskManager.onContainerAllocated(container);

    // Allow container to run and update state
    Thread.sleep(300);

    // Create container failure - with ContainerExitStatus.DISKS_FAILED
    taskManager.onContainerCompleted(
            TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.DISKS_FAILED, "Disk failure"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Create container failure - with ContainerExitStatus.PREEMPTED
    taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(),
            ContainerExitStatus.PREEMPTED, "Task Preempted by RM"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Create container failure - with ContainerExitStatus.ABORTED
    taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.ABORTED,
            "Task Aborted by the NM"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    taskManager.onShutdown();
}

From source file:org.apache.slider.server.appmaster.state.ContainerOutcome.java

License:Apache License

/**
 * Build a container outcome from an exit status.
 * The values in {@link ContainerExitStatus} are used
 * here.
 * @param exitStatus exit status
 * @return an enumeration of the outcome.
 */
public static ContainerOutcome fromExitStatus(int exitStatus) {
    switch (exitStatus) {
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.KILLED_BY_APPMASTER:
    case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER:
    case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION:
        // could either be a release or node failure. Treat as completion
        return Completed;
    case ContainerExitStatus.DISKS_FAILED:
        return Node_failure;
    case ContainerExitStatus.PREEMPTED:
        return Preempted;
    case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
    case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
        return Failed_limits_exceeded;
    default:
        return exitStatus == 0 ? Completed : Failed;
    }
}
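
A hypothetical caller (not part of the Slider source above) might consume this mapping as follows; the placementHistory object and its method are illustrative only.

ContainerOutcome outcome = ContainerOutcome.fromExitStatus(status.getExitStatus());
if (outcome == ContainerOutcome.Node_failure) {
    // DISKS_FAILED implicates the node rather than the component, so penalize
    // the node in placement history instead of counting a component failure.
    placementHistory.recordNodeFailure(status.getContainerId());
}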

From source file:org.apache.tez.dag.app.rm.container.AMContainerEventCompleted.java

License:Apache License

public boolean isDiskFailed() {
    return (exitStatus == ContainerExitStatus.DISKS_FAILED);
}

From source file:org.apache.tez.dag.app.rm.container.TestAMContainer.java

License:Apache License

@SuppressWarnings("rawtypes")
@Test(timeout = 5000)
public void testContainerCompletedAtLaunchingSpecificClusterError() {
    WrappedContainer wc = new WrappedContainer();
    List<Event> outgoingEvents;

    wc.launchContainer();

    wc.assignTaskAttempt(wc.taskAttemptID);

    wc.containerCompleted(ContainerExitStatus.DISKS_FAILED, TaskAttemptTerminationCause.NODE_DISK_ERROR);
    wc.verifyState(AMContainerState.COMPLETED);
    verify(wc.tal).registerRunningContainer(wc.containerID);
    verify(wc.tal).unregisterRunningContainer(wc.containerID);

    outgoingEvents = wc.verifyCountAndGetOutgoingEvents(1);
    verifyUnOrderedOutgoingEventTypes(outgoingEvents, TaskAttemptEventType.TA_CONTAINER_TERMINATED_BY_SYSTEM);
    Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR,
            ((TaskAttemptEventContainerTerminatedBySystem) outgoingEvents.get(0)).getTerminationCause());

    assertFalse(wc.amContainer.isInErrorState());

    // Container launched event generated by the NM call.
    wc.containerLaunched();
    wc.verifyNoOutgoingEvents();

    assertFalse(wc.amContainer.isInErrorState());
}

From source file:org.apache.tez.dag.app.rm.container.TestAMContainer.java

License:Apache License

@SuppressWarnings("rawtypes")
@Test(timeout = 5000)
public void testContainerDiskFailedAtRunning() {
    WrappedContainer wc = new WrappedContainer();
    List<Event> outgoingEvents;

    wc.launchContainer();

    wc.assignTaskAttempt(wc.taskAttemptID);
    wc.containerLaunched();
    wc.verifyState(AMContainerState.RUNNING);

    wc.containerCompleted(ContainerExitStatus.DISKS_FAILED, TaskAttemptTerminationCause.NODE_DISK_ERROR);
    wc.verifyState(AMContainerState.COMPLETED);
    verify(wc.tal).registerRunningContainer(wc.containerID);
    verify(wc.tal).unregisterRunningContainer(wc.containerID);
    verify(wc.chh).register(wc.containerID);
    verify(wc.chh).unregister(wc.containerID);

    outgoingEvents = wc.verifyCountAndGetOutgoingEvents(1);
    Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR,
            ((TaskAttemptEventContainerTerminatedBySystem) outgoingEvents.get(0)).getTerminationCause());
    verifyUnOrderedOutgoingEventTypes(outgoingEvents, TaskAttemptEventType.TA_CONTAINER_TERMINATED_BY_SYSTEM);

    assertFalse(wc.amContainer.isInErrorState());

    // Pending task complete. (Ideally, the container should be dead at this point
    // and this event should not be generated. A network timeout on the NM-RM
    // heartbeat can cause it to be generated.)
    wc.taskAttemptSucceeded(wc.taskAttemptID);
    wc.verifyNoOutgoingEvents();
    wc.verifyHistoryStopEvent();

    assertFalse(wc.amContainer.isInErrorState());
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java

License:Apache License

@Override
public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) {
    // Inform the Containers about completion.
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer != null) {
        String message = "Container completed. ";
        TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        int exitStatus = containerStatus.getExitStatus();
        if (exitStatus == ContainerExitStatus.PREEMPTED) {
            message = "Container preempted externally. ";
            errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
            message = "Container disk failed. ";
            errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        } else if (exitStatus != ContainerExitStatus.SUCCESS) {
            message = "Container failed. ";
        }
        if (containerStatus.getDiagnostics() != null) {
            message += containerStatus.getDiagnostics();
        }
        sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
    }
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java

License:Apache License

public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
    // SchedulerId isn't used here since no node updates are sent out
    // Inform the Containers about completion.
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer != null) {
        String message = "Container completed. ";
        TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        int exitStatus = containerStatus.getExitStatus();
        if (exitStatus == ContainerExitStatus.PREEMPTED) {
            message = "Container preempted externally. ";
            errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
            message = "Container disk failed. ";
            errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        } else if (exitStatus != ContainerExitStatus.SUCCESS) {
            message = "Container failed, exitCode=" + exitStatus + ". ";
        }
        if (containerStatus.getDiagnostics() != null) {
            message += containerStatus.getDiagnostics();
        }
        sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
    }
}
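
This is the multi-scheduler successor of the TaskSchedulerEventHandler example above; the DISKS_FAILED branch is unchanged, and only the generic failure message differs by including the exit code.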

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java

License:Apache License

@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
    Configuration conf = new Configuration(false);
    schedulerHandler.init(conf);
    schedulerHandler.start();

    String diagnostics = "NM disk failed.";
    TaskAttemptImpl mockTask = mock(TaskAttemptImpl.class);
    ContainerStatus mockStatus = mock(ContainerStatus.class);
    ContainerId mockCId = mock(ContainerId.class);
    AMContainer mockAMContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(mockCId)).thenReturn(mockAMContainer);
    when(mockAMContainer.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getDiagnostics()).thenReturn(diagnostics);
    when(mockStatus.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);
    schedulerHandler.containerCompleted(mockTask, mockStatus);
    Assert.assertEquals(1, mockEventHandler.events.size());
    Event event = mockEventHandler.events.get(0);
    Assert.assertEquals(AMContainerEventType.C_COMPLETED, event.getType());
    AMContainerEventCompleted completedEvent = (AMContainerEventCompleted) event;
    Assert.assertEquals(mockCId, completedEvent.getContainerId());
    Assert.assertEquals("Container disk failed. NM disk failed.", completedEvent.getDiagnostics());
    Assert.assertFalse(completedEvent.isPreempted());
    Assert.assertTrue(completedEvent.isDiskFailed());
    Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, completedEvent.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}