Example usage for org.apache.hadoop.yarn.api.records ContainerExitStatus PREEMPTED

List of usage examples for org.apache.hadoop.yarn.api.records ContainerExitStatus PREEMPTED

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records ContainerExitStatus PREEMPTED.

Prototype

int PREEMPTED

To view the source code for org.apache.hadoop.yarn.api.records ContainerExitStatus PREEMPTED, follow the source link below.

Click Source Link

Document

Containers preempted by the framework.

Usage

From source file:com.cloudera.llama.am.yarn.YarnRMConnector.java

License:Apache License

/**
 * Translates completed-container notifications into resource status-change events.
 * <p>
 * Each completed container is removed from {@code containerToResourceMap}. Containers that
 * still had a mapping produce one {@code RMEvent}; all events are then handed to
 * {@code llamaCallback} in a single call (also when no event was produced).
 *
 * @param containerStatuses statuses of the containers reported as completed
 */
@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    List<RMEvent> events = new ArrayList<RMEvent>();
    for (ContainerStatus completed : containerStatuses) {
        UUID resourceId = containerToResourceMap.remove(completed.getContainerId());
        if (resourceId == null) {
            // The mapping is only still present when we did not release the container
            // ourselves, so there is nothing to report here.
            continue;
        }

        int exitStatus = completed.getExitStatus();
        if (exitStatus == ContainerExitStatus.SUCCESS) {
            LOG.warn("It should never happen, container for resource '{}' exited on its own", resourceId);
            // Reported as LOST so that the client can take corrective measures.
            events.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
        } else if (exitStatus == ContainerExitStatus.PREEMPTED) {
            LOG.warn("Container for resource '{}' has been preempted", resourceId);
            events.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.PREEMPTED));
        } else {
            // ABORTED and every other exit status are treated as a lost container.
            LOG.warn("Container for resource '{}' has been lost, exit status '{}'", resourceId, exitStatus);
            events.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
        }
    }
    llamaCallback.onEvent(events);
}

From source file:org.apache.samza.job.yarn.SamzaTaskManager.java

License:Apache License

/**
 * Handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as complete or failure.
 * <p>
 * Three groups of exit status are distinguished:
 * <ul>
 *   <li>{@code SUCCESS}: the completed counter is bumped and, once it equals
 *       {@code state.containerCount}, the job status is set to SUCCEEDED.</li>
 *   <li>{@code DISKS_FAILED}, {@code ABORTED}, {@code PREEMPTED}: the released counter is bumped and,
 *       if the container was a known running container, a replacement is requested on any host.</li>
 *   <li>anything else: the failure is recorded, the retry count / retry window policy is evaluated,
 *       and a replacement is requested unless too many failures have occurred.</li>
 * </ul>
 *
 * @param containerStatus status of the container that completed
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    // Find the integer key under which this YARN container is registered as running;
    // -1 means no running entry matched.
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    // Note: when no match was found this asks the map to remove key -1.
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        // Only containers that were tracked as running are recorded as finished,
        // and their failure history is cleared.
        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either due to being released by the application "
                        + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        // Record the failure regardless of whether the container was tracked as running.
        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            // Fall back to any host when host affinity is off or no previous host is known.
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                // NOTE(review): unlike the retry-window branch below, this branch does not set
                // state.status to FAILED although the log message says the job is marked as
                // failed - confirm whether the status is set elsewhere.
                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                // Work out the failure count including this failure, and the time of the
                // previous failure (0 when there is no recorded failure for this ID).
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                // NOTE(review): the comparison is >=, while the rule described above and the
                // error message below say "greater than" - confirm which is intended.
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    // Still under the retry limit: remember the new count and failure time.
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}

From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java

License:Apache License

/**
 * Verifies that the AM requests a new container when a container exits with a known code.
 * Disk failure, preemption and abort are expected to be handled the same way.
 */
@Test
public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception {
    // Start from the shared config, minus the retry-count setting.
    Map<String, String> settings = new HashMap<>(getConfig());
    settings.remove("yarn.container.retry.count");

    SamzaTaskManager manager = new SamzaTaskManager(new MapConfig(settings), state, amRmClientAsync,
            new YarnConfiguration());
    MockContainerAllocator mockAllocator = new MockContainerAllocator(amRmClientAsync,
            TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(new MapConfig(settings)));

    // Swap the mock allocator and its thread into the task manager's private fields.
    getPrivateFieldFromTaskManager("containerAllocator", manager).set(manager, mockAllocator);
    getPrivateFieldFromTaskManager("allocatorThread", manager).set(manager, new Thread(mockAllocator));

    // Bring the task manager up; exactly one request should be queued.
    manager.onInit();
    assertFalse(manager.shouldShutdown());
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());

    Container granted = TestUtil.getContainer(
            ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
    manager.onContainerAllocated(granted);

    // Pause so the container can run and state can be updated.
    Thread.sleep(300);

    // Failure #1: ContainerExitStatus.DISKS_FAILED
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(granted.getId(), ContainerExitStatus.DISKS_FAILED, "Disk failure"));

    // A replacement request on any host is expected.
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertFalse(manager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertEquals(ContainerRequestState.ANY_HOST,
            mockAllocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Failure #2: ContainerExitStatus.PREEMPTED
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(granted.getId(), ContainerExitStatus.PREEMPTED, "Task Preempted by RM"));

    // A replacement request on any host is expected.
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertFalse(manager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            mockAllocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Failure #3: ContainerExitStatus.ABORTED
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(granted.getId(), ContainerExitStatus.ABORTED, "Task Aborted by the NM"));

    // A replacement request on any host is expected.
    assertEquals(1, mockAllocator.containerRequestState.getRequestsQueue().size());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertFalse(manager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            mockAllocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    manager.onShutdown();
}

From source file:org.apache.slider.server.appmaster.state.ContainerOutcome.java

License:Apache License

/**
 * Map a container exit status onto a container outcome.
 * The status is compared against the constants declared in
 * {@link ContainerExitStatus}.
 * @param exitStatus exit status
 * @return an enumeration of the outcome.
 */
public static ContainerOutcome fromExitStatus(int exitStatus) {
    if (exitStatus == ContainerExitStatus.PREEMPTED) {
        return Preempted;
    }
    if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
        return Node_failure;
    }

    boolean limitsExceeded = exitStatus == ContainerExitStatus.KILLED_EXCEEDED_PMEM
            || exitStatus == ContainerExitStatus.KILLED_EXCEEDED_VMEM;
    if (limitsExceeded) {
        return Failed_limits_exceeded;
    }

    // Aborts and kills could either be a release or a node failure; they are
    // treated as completion, as is a plain zero exit status.
    boolean treatedAsCompleted = exitStatus == 0
            || exitStatus == ContainerExitStatus.ABORTED
            || exitStatus == ContainerExitStatus.KILLED_BY_APPMASTER
            || exitStatus == ContainerExitStatus.KILLED_BY_RESOURCEMANAGER
            || exitStatus == ContainerExitStatus.KILLED_AFTER_APP_COMPLETION;
    return treatedAsCompleted ? Completed : Failed;
}

From source file:org.apache.tez.dag.app.rm.container.AMContainerEventCompleted.java

License:Apache License

/**
 * Tells whether this completion event represents a preemption.
 *
 * @return {@code true} when the exit status is {@code ContainerExitStatus.PREEMPTED} or the
 *         termination cause is {@code TaskAttemptTerminationCause.INTERNAL_PREEMPTION}
 */
public boolean isPreempted() {
    if (exitStatus == ContainerExitStatus.PREEMPTED) {
        return true;
    }
    return errCause == TaskAttemptTerminationCause.INTERNAL_PREEMPTION;
}

From source file:org.apache.tez.dag.app.rm.container.TestAMContainer.java

License:Apache License

/**
 * Preempts a container that has reached RUNNING and checks the resulting state, the
 * register/unregister calls and the single outgoing task-attempt event.
 */
@SuppressWarnings("rawtypes")
@Test(timeout = 5000)
public void testContainerPreemptedAtRunning() {
    WrappedContainer wrapped = new WrappedContainer();

    // Drive the container to RUNNING.
    wrapped.launchContainer();
    wrapped.assignTaskAttempt(wrapped.taskAttemptID);
    wrapped.containerLaunched();
    wrapped.verifyState(AMContainerState.RUNNING);

    // Preempt it.
    wrapped.containerCompleted(ContainerExitStatus.PREEMPTED, TaskAttemptTerminationCause.EXTERNAL_PREEMPTION);
    wrapped.verifyState(AMContainerState.COMPLETED);
    verify(wrapped.tal).registerRunningContainer(wrapped.containerID);
    verify(wrapped.tal).unregisterRunningContainer(wrapped.containerID);
    verify(wrapped.chh).register(wrapped.containerID);
    verify(wrapped.chh).unregister(wrapped.containerID);

    // Exactly one event goes out, carrying the external-preemption cause.
    List<Event> emitted = wrapped.verifyCountAndGetOutgoingEvents(1);
    TaskAttemptEventContainerTerminatedBySystem terminated =
            (TaskAttemptEventContainerTerminatedBySystem) emitted.get(0);
    Assert.assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, terminated.getTerminationCause());
    verifyUnOrderedOutgoingEventTypes(emitted, TaskAttemptEventType.TA_CONTAINER_TERMINATED_BY_SYSTEM);

    assertFalse(wrapped.amContainer.isInErrorState());

    // Pending task complete. (Ideally, the container should be dead at this point
    // and this event should not be generated. A network timeout on the NM-RM heartbeat
    // can cause it to be generated.)
    wrapped.taskAttemptSucceeded(wrapped.taskAttemptID);
    wrapped.verifyNoOutgoingEvents();
    wrapped.verifyHistoryStopEvent();

    assertFalse(wrapped.amContainer.isInErrorState());
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java

License:Apache License

/**
 * Informs the matching {@code AMContainer} that its container has completed.
 * <p>
 * Nothing happens when the container is not present in {@code appContext.getAllContainers()}.
 * Otherwise an {@code AMContainerEventCompleted} is sent whose message and termination cause
 * depend on the exit status, with the container diagnostics (if any) appended to the message.
 *
 * @param task            the task associated with the container; not used here
 * @param containerStatus status of the container that completed
 */
@Override
public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) {
    AMContainer completed = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (completed == null) {
        return;
    }

    final int exitStatus = containerStatus.getExitStatus();
    final String prefix;
    final TaskAttemptTerminationCause cause;
    switch (exitStatus) {
    case ContainerExitStatus.PREEMPTED:
        prefix = "Container preempted externally. ";
        cause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        break;
    case ContainerExitStatus.DISKS_FAILED:
        prefix = "Container disk failed. ";
        cause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        break;
    case ContainerExitStatus.SUCCESS:
        prefix = "Container completed. ";
        cause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        break;
    default:
        prefix = "Container failed. ";
        cause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        break;
    }

    final String diagnostics = containerStatus.getDiagnostics();
    final String message = (diagnostics == null) ? prefix : prefix + diagnostics;
    sendEvent(new AMContainerEventCompleted(completed.getContainerId(), exitStatus, message, cause));
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java

License:Apache License

/**
 * Informs the matching {@code AMContainer} that its container has completed.
 * <p>
 * Nothing happens when the container is not present in {@code appContext.getAllContainers()}.
 * Otherwise an {@code AMContainerEventCompleted} is sent whose message and termination cause
 * depend on the exit status, with the container diagnostics (if any) appended to the message.
 *
 * @param schedulerId     not used here, since no node updates are sent out
 * @param task            the task associated with the container; not used here
 * @param containerStatus status of the container that completed
 */
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
    AMContainer completed = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (completed == null) {
        return;
    }

    final int exitStatus = containerStatus.getExitStatus();
    final StringBuilder text = new StringBuilder();
    final TaskAttemptTerminationCause cause;
    if (exitStatus == ContainerExitStatus.PREEMPTED) {
        text.append("Container preempted externally. ");
        cause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
    } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
        text.append("Container disk failed. ");
        cause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
    } else {
        cause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        if (exitStatus == ContainerExitStatus.SUCCESS) {
            text.append("Container completed. ");
        } else {
            text.append("Container failed, exitCode=").append(exitStatus).append(". ");
        }
    }

    final String diagnostics = containerStatus.getDiagnostics();
    if (diagnostics != null) {
        text.append(diagnostics);
    }
    sendEvent(new AMContainerEventCompleted(completed.getContainerId(), exitStatus, text.toString(), cause));
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java

License:Apache License

/**
 * Reports a PREEMPTED container to the scheduler handler and checks the single
 * {@code AMContainerEventCompleted} it emits.
 */
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    String rmDiagnostics = "Container preempted by RM.";
    TaskAttemptImpl task = mock(TaskAttemptImpl.class);
    ContainerStatus status = mock(ContainerStatus.class);
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);

    // The container is known to the AM and reports a preempted exit status.
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getDiagnostics()).thenReturn(rmDiagnostics);
    when(status.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);

    schedulerHandler.containerCompleted(task, status);

    Assert.assertEquals(1, mockEventHandler.events.size());
    Event emitted = mockEventHandler.events.get(0);
    Assert.assertEquals(AMContainerEventType.C_COMPLETED, emitted.getType());

    AMContainerEventCompleted completion = (AMContainerEventCompleted) emitted;
    Assert.assertEquals(containerId, completion.getContainerId());
    Assert.assertEquals("Container preempted externally. Container preempted by RM.",
            completion.getDiagnostics());
    Assert.assertTrue(completion.isPreempted());
    Assert.assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, completion.getTerminationCause());
    Assert.assertFalse(completion.isDiskFailed());

    schedulerHandler.stop();
    schedulerHandler.close();
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java

License:Apache License

/**
 * Reports a PREEMPTED container to the scheduler manager (scheduler id 0) and checks the
 * single {@code AMContainerEventCompleted} it emits.
 */
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    String rmDiagnostics = "Container preempted by RM.";
    TaskAttemptImpl task = mock(TaskAttemptImpl.class);
    ContainerStatus status = mock(ContainerStatus.class);
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);

    // The container is known to the AM and reports a preempted exit status.
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getDiagnostics()).thenReturn(rmDiagnostics);
    when(status.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);

    schedulerHandler.containerCompleted(0, task, status);

    assertEquals(1, mockEventHandler.events.size());
    Event emitted = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, emitted.getType());

    AMContainerEventCompleted completion = (AMContainerEventCompleted) emitted;
    assertEquals(containerId, completion.getContainerId());
    assertEquals("Container preempted externally. Container preempted by RM.", completion.getDiagnostics());
    assertTrue(completion.isPreempted());
    assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, completion.getTerminationCause());
    Assert.assertFalse(completion.isDiskFailed());

    schedulerHandler.stop();
    schedulerHandler.close();
}