Example usage for org.apache.hadoop.yarn.api.records ContainerExitStatus ABORTED

List of usage examples for org.apache.hadoop.yarn.api.records ContainerExitStatus ABORTED

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records ContainerExitStatus ABORTED.

Prototype

int ABORTED

Document

Containers killed by the framework, either due to being released by the application or being 'lost' due to node failures etc.
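Before the project examples below, here is a minimal, self-contained sketch of the typical pattern: in an ApplicationMaster completion callback, ABORTED is separated from genuine failures so that framework-killed containers are not counted against the application's failure budget. The class and counter names are illustrative only and do not come from any of the projects listed here.

import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public class ExitStatusExample {

    private final AtomicInteger failedContainers = new AtomicInteger();
    private final AtomicInteger abortedContainers = new AtomicInteger();

    // Typically called from an AMRMClientAsync.CallbackHandler implementation.
    public void onContainersCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            int exitStatus = status.getExitStatus();
            if (exitStatus == ContainerExitStatus.SUCCESS) {
                // Container finished normally; nothing to do.
                continue;
            }
            if (exitStatus == ContainerExitStatus.ABORTED) {
                // Killed by the framework: released by the AM or lost to a node
                // failure. Usually retried rather than counted as a failure.
                abortedContainers.incrementAndGet();
            } else {
                // Any other non-zero exit status is treated as a real failure.
                failedContainers.incrementAndGet();
            }
        }
    }
}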

Usage

From source file:alluxio.yarn.ApplicationMaster.java

License:Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    for (ContainerStatus status : statuses) {
        // Releasing worker containers because we already have workers on their host will generate a
        // callback to this method, so we use debug instead of error.
        if (status.getExitStatus() == ContainerExitStatus.ABORTED) {
            LOG.debug("Aborted container {}", status.getContainerId());
        } else {
            LOG.error("Container {} completed with exit status {}", status.getContainerId(),
                    status.getExitStatus());
        }
    }
}

From source file:com.cloudera.kitten.appmaster.service.ApplicationMasterServiceImpl1.java

License:Open Source License

@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    LOG.info(containerStatuses.size() + " containers have completed");
    for (ContainerStatus status : containerStatuses) {
        int exitStatus = status.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                totalFailures.incrementAndGet();
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
            }
        } else {
            // nothing to do
            // container completed successfully
            containerAllocation.get(status.getContainerId()).containerCompleted(status.getContainerId());
            LOG.info("Container id = " + status.getContainerId() + " completed successfully");
        }
    }
}

From source file:com.cloudera.kitten.appmaster.service.WorkflowService.java

License:Open Source License

@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    LOG.info(containerStatuses.size() + " containers have completed");
    for (ContainerStatus status : containerStatuses) {
        int exitStatus = status.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                totalFailures.incrementAndGet();
                containerAllocation.remove(status.getContainerId()).containerCompleted(status.getContainerId());
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
            }
        } else {
            // nothing to do
            // container completed successfully
            LOG.info("Container id = " + status.getContainerId() + " completed successfully");
            containerAllocation.remove(status.getContainerId()).containerCompleted(status.getContainerId());
        }
    }
}

From source file:com.cloudera.llama.am.yarn.YarnRMConnector.java

License:Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    List<RMEvent> changes = new ArrayList<RMEvent>();
    for (ContainerStatus containerStatus : containerStatuses) {
        ContainerId containerId = containerStatus.getContainerId();
        UUID resourceId = containerToResourceMap.remove(containerId);
        // we have the containerId only if we did not release it.
        if (resourceId != null) {
            switch (containerStatus.getExitStatus()) {
            case ContainerExitStatus.SUCCESS:
                LOG.warn("It should never happen, container for resource '{}' " + "exited on its own",
                        resourceId);
                //reporting it as LOST for the client to take corrective measures.
                changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
                break;
            case ContainerExitStatus.PREEMPTED:
                LOG.warn("Container for resource '{}' has been preempted", resourceId);
                changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.PREEMPTED));
                break;
            case ContainerExitStatus.ABORTED:
            default:
                LOG.warn("Container for resource '{}' has been lost, exit status" + " '{}'", resourceId,
                        containerStatus.getExitStatus());
                changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
                break;
            }
        }
    }
    llamaCallback.onEvent(changes);
}

From source file:edu.cmu.graphchi.toolkits.collaborative_filtering.yarn.ApplicationMaster.java

License:Apache License

public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());

        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);

        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed
                // counts as completed
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
                //TODO: Add retry
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();

                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            //nothing to do
            // container completed successfully
            numCompletedContainers.incrementAndGet();
            LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
    }
}

From source file:gobblin.yarn.YarnService.java

License:Apache License

/**
 * Check the exit status of a completed container and see if the replacement container
 * should try to be started on the same node. Some exit statuses indicate a disk or
 * node failure, and in such cases the replacement container should try to be started on
 * a different node.
 */
private boolean shouldStickToTheSameNode(int containerExitStatus) {
    switch (containerExitStatus) {
    case ContainerExitStatus.DISKS_FAILED:
        return false;
    case ContainerExitStatus.ABORTED:
        // Most likely this exit status is due to node failures because the
        // application itself will not release containers.
        return false;
    default:
        // Stick to the same node for other cases if host affinity is enabled.
        return this.containerHostAffinityEnabled;
    }
}

From source file:org.apache.helix.provisioning.yarn.RMCallbackHandler.java

License:Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        GenericApplicationMaster.LOG.info("Got container status for containerID="
                + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus="
                + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());

        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        synchronized (_genericApplicationMaster.allocatedContainerSet) {
            _genericApplicationMaster.allocatedContainerSet.remove(containerStatus.getContainerId());
            SettableFuture<ContainerStopResponse> stopResponseFuture = _genericApplicationMaster.containerStopMap
                    .remove(containerStatus.getContainerId());
            if (stopResponseFuture != null) {
                ContainerStopResponse value = new ContainerStopResponse();
                stopResponseFuture.set(value);
            } else {
                SettableFuture<ContainerReleaseResponse> releaseResponseFuture = _genericApplicationMaster.containerReleaseMap
                        .remove(containerStatus.getContainerId());
                if (releaseResponseFuture != null) {
                    ContainerReleaseResponse value = new ContainerReleaseResponse();
                    releaseResponseFuture.set(value);
                }
            }
        }
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // container failed on its own; this handler does not track failure counts
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason

                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            GenericApplicationMaster.LOG.info(
                    "Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
    }
}

From source file:org.apache.metron.maas.service.callback.ContainerRequestListener.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());
        removeContainer(containerStatus.getContainerId());
        LOG.info("REMOVING CONTAINER " + containerStatus.getContainerId());
        serviceDiscoverer.unregisterByContainer(containerStatus.getContainerId() + "");
        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed
                // counts as completed
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
        if (timelineClient != null) {
            YarnUtils.INSTANCE.publishContainerEndEvent(timelineClient, containerStatus, domainId,
                    appSubmitterUgi);
        }
    }
}

From source file:org.apache.samza.job.yarn.SamzaTaskManager.java

License:Apache License

/**
 * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as complete or failure.
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either due to being released by the application "
                        + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}

From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java

License:Apache License

/**
 * Test that the AM requests a new container when a task fails.
 * Exit codes with the same behavior: disk failure, preemption, and aborted.
 */
@Test
public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception {
    Map<String, String> config = new HashMap<>();
    config.putAll(getConfig());
    config.remove("yarn.container.retry.count");

    SamzaTaskManager taskManager = new SamzaTaskManager(new MapConfig(config), state, amRmClientAsync,
            new YarnConfiguration());
    MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync,
            TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(new MapConfig(config)));
    getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator);

    Thread thread = new Thread(allocator);
    getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread);

    // Start the task manager
    taskManager.onInit();
    assertFalse(taskManager.shouldShutdown());
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());

    Container container = TestUtil
            .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
    taskManager.onContainerAllocated(container);

    // Allow container to run and update state
    Thread.sleep(300);

    // Create container failure - with ContainerExitStatus.DISKS_FAILED
    taskManager.onContainerCompleted(
            TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.DISKS_FAILED, "Disk failure"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Create container failure - with ContainerExitStatus.PREEMPTED
    taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(),
            ContainerExitStatus.PREEMPTED, "Task Preempted by RM"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Create container failure - with ContainerExitStatus.ABORTED
    taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.ABORTED,
            "Task Aborted by the NM"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    taskManager.onShutdown();
}