List of usage examples for org.apache.hadoop.yarn.api.records.ContainerExitStatus.ABORTED
public static final int ABORTED

The exit status YARN assigns to containers killed by the framework, either because the application master released them or because they were 'lost' to node failures (the constant's value is -100 in Hadoop's ContainerExitStatus).
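A pattern shared by most of the examples below: in the AMRMClientAsync callback, a non-zero exit status counts as a failure unless it is ABORTED (or a related status such as DISKS_FAILED or PREEMPTED), in which case YARN itself released, lost, or preempted the container and the application master should re-request it rather than record a failure. A minimal sketch of that check, assuming only the standard ContainerExitStatus constants; the helper class and method names are hypothetical, not part of the YARN API:

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;

public final class ExitStatusHelper {

    private ExitStatusHelper() {
    }

    // Hypothetical helper: true when YARN took the container away (released by
    // the AM, lost to a node/disk failure, or preempted by the RM) rather than
    // the container's own process failing.
    public static boolean isContainerLost(int exitStatus) {
        switch (exitStatus) {
        case ContainerExitStatus.ABORTED:
        case ContainerExitStatus.DISKS_FAILED:
        case ContainerExitStatus.PREEMPTED:
            return true;
        default:
            return false;
        }
    }

    // Hypothetical helper: a genuine failure is any non-SUCCESS status that is
    // not one of the "lost" statuses above.
    public static boolean isGenuineFailure(int exitStatus) {
        return exitStatus != ContainerExitStatus.SUCCESS && !isContainerLost(exitStatus);
    }
}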
From source file: alluxio.yarn.ApplicationMaster.java
License: Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    for (ContainerStatus status : statuses) {
        // Releasing worker containers because we already have workers on their host will generate a
        // callback to this method, so we use debug instead of error.
        if (status.getExitStatus() == ContainerExitStatus.ABORTED) {
            LOG.debug("Aborted container {}", status.getContainerId());
        } else {
            LOG.error("Container {} completed with exit status {}", status.getContainerId(),
                    status.getExitStatus());
        }
    }
}
From source file: com.cloudera.kitten.appmaster.service.ApplicationMasterServiceImpl1.java
License: Open Source License

@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    LOG.info(containerStatuses.size() + " containers have completed");
    for (ContainerStatus status : containerStatuses) {
        int exitStatus = status.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                totalFailures.incrementAndGet();
            } else {
                // container was killed by the framework, possibly preempted;
                // we should re-try, as the container was lost for some reason
            }
        } else {
            // container completed successfully
            containerAllocation.get(status.getContainerId()).containerCompleted(status.getContainerId());
            LOG.info("Container id = " + status.getContainerId() + " completed successfully");
        }
    }
}
From source file: com.cloudera.kitten.appmaster.service.WorkflowService.java
License: Open Source License

@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    LOG.info(containerStatuses.size() + " containers have completed");
    for (ContainerStatus status : containerStatuses) {
        int exitStatus = status.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                totalFailures.incrementAndGet();
                containerAllocation.remove(status.getContainerId()).containerCompleted(status.getContainerId());
            } else {
                // container was killed by the framework, possibly preempted;
                // we should re-try, as the container was lost for some reason
            }
        } else {
            // container completed successfully
            LOG.info("Container id = " + status.getContainerId() + " completed successfully");
            containerAllocation.remove(status.getContainerId()).containerCompleted(status.getContainerId());
        }
    }
}
From source file: com.cloudera.llama.am.yarn.YarnRMConnector.java
License: Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    List<RMEvent> changes = new ArrayList<RMEvent>();
    for (ContainerStatus containerStatus : containerStatuses) {
        ContainerId containerId = containerStatus.getContainerId();
        UUID resourceId = containerToResourceMap.remove(containerId);
        // we have the containerId only if we did not release it.
        if (resourceId != null) {
            switch (containerStatus.getExitStatus()) {
            case ContainerExitStatus.SUCCESS:
                LOG.warn("It should never happen, container for resource '{}' exited on its own", resourceId);
                // reporting it as LOST for the client to take corrective measures.
                changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
                break;
            case ContainerExitStatus.PREEMPTED:
                LOG.warn("Container for resource '{}' has been preempted", resourceId);
                changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.PREEMPTED));
                break;
            case ContainerExitStatus.ABORTED:
            default:
                LOG.warn("Container for resource '{}' has been lost, exit status '{}'", resourceId,
                        containerStatus.getExitStatus());
                changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
                break;
            }
        }
    }
    llamaCallback.onEvent(changes);
}
From source file: edu.cmu.graphchi.toolkits.collaborative_filtering.yarn.ApplicationMaster.java
License: Apache License

public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());
        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed; counts as completed
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();
            } else {
                // container was killed by the framework, possibly preempted;
                // we should re-try, as the container was lost for some reason
                // TODO: Add retry
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();
                // we do not need to release the container as that is done by the RM
            }
        } else {
            // container completed successfully
            numCompletedContainers.incrementAndGet();
            LOG.info("Container completed successfully, containerId=" + containerStatus.getContainerId());
        }
    }
}
From source file: gobblin.yarn.YarnService.java
License: Apache License

/**
 * Check the exit status of a completed container and decide whether the replacement
 * container should try to start on the same node. Some exit statuses indicate a disk
 * or node failure, and in such cases the replacement container should try to start on
 * a different node.
 */
private boolean shouldStickToTheSameNode(int containerExitStatus) {
    switch (containerExitStatus) {
    case ContainerExitStatus.DISKS_FAILED:
        return false;
    case ContainerExitStatus.ABORTED:
        // Most likely this exit status is due to node failures because the
        // application itself will not release containers.
        return false;
    default:
        // Stick to the same node for other cases if host affinity is enabled.
        return this.containerHostAffinityEnabled;
    }
}
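For context, a hedged sketch of how such a check might be consumed when re-requesting a replacement container; the parameter names and the requestContainerOn(...) helper are illustrative, not Gobblin's actual API:

// Illustrative only: prefer the same host when the exit status does not point
// at a node or disk problem; otherwise let YARN pick any node.
private void replaceContainer(ContainerStatus completedStatus, String completedContainerHost) {
    int exitStatus = completedStatus.getExitStatus();
    if (shouldStickToTheSameNode(exitStatus)) {
        requestContainerOn(completedContainerHost); // hypothetical helper
    } else {
        requestContainerOn(null); // hypothetical: null means any host
    }
}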
From source file: org.apache.helix.provisioning.yarn.RMCallbackHandler.java
License: Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        GenericApplicationMaster.LOG.info("Got container status for containerID="
                + containerStatus.getContainerId() + ", state=" + containerStatus.getState()
                + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics="
                + containerStatus.getDiagnostics());
        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        synchronized (_genericApplicationMaster.allocatedContainerSet) {
            _genericApplicationMaster.allocatedContainerSet.remove(containerStatus.getContainerId());
            SettableFuture<ContainerStopResponse> stopResponseFuture =
                    _genericApplicationMaster.containerStopMap.remove(containerStatus.getContainerId());
            if (stopResponseFuture != null) {
                ContainerStopResponse value = new ContainerStopResponse();
                stopResponseFuture.set(value);
            } else {
                SettableFuture<ContainerReleaseResponse> releaseResponseFuture =
                        _genericApplicationMaster.containerReleaseMap.remove(containerStatus.getContainerId());
                if (releaseResponseFuture != null) {
                    ContainerReleaseResponse value = new ContainerReleaseResponse();
                    releaseResponseFuture.set(value);
                }
            }
        }
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // genuine failure; this handler does not track failure counts
            } else {
                // container was killed by the framework, possibly preempted;
                // we should re-try, as the container was lost for some reason.
                // we do not need to release the container as that is done by the RM
            }
        } else {
            // container completed successfully
            GenericApplicationMaster.LOG.info("Container completed successfully, containerId="
                    + containerStatus.getContainerId());
        }
    }
}
From source file: org.apache.metron.maas.service.callback.ContainerRequestListener.java
License: Apache License

@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());
        removeContainer(containerStatus.getContainerId());
        LOG.info("REMOVING CONTAINER " + containerStatus.getContainerId());
        serviceDiscoverer.unregisterByContainer(containerStatus.getContainerId() + "");
        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed; counts as completed
            } else {
                // container was killed by the framework, possibly preempted;
                // we should re-try, as the container was lost for some reason.
                // we do not need to release the container as that is done by the RM
            }
        } else {
            // container completed successfully
            LOG.info("Container completed successfully, containerId=" + containerStatus.getContainerId());
        }
        if (timelineClient != null) {
            YarnUtils.INSTANCE.publishContainerEndEvent(timelineClient, containerStatus, domainId,
                    appSubmitterUgi);
        }
    }
}
From source file: org.apache.samza.job.yarn.SamzaTaskManager.java
License: Apache License

/**
 * This method handles the onContainerCompleted callback from the RM. Based on the
 * ContainerExitStatus, it decides whether a container that exited is marked as a
 * completion or a failure.
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);
        state.completedContainers.incrementAndGet();
        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }
        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;
    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info("Got an exit code of {}. This means that container {} was "
                + "killed by YARN, either due to being released by the application "
                + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);
        state.releasedContainers.incrementAndGet();
        // If this container was assigned some partitions (a containerId), then
        // clean up and request a new container for the tasks. This should only
        // happen if the container was 'lost' due to node failure, not if the AM
        // released the container.
        if (containerId != -1) {
            log.info("Released container {} was assigned task group ID {}. "
                    + "Requesting a new container for the task group.", containerIdStr, containerId);
            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);
            // request a container on a new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;
    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());
        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);
        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out where the container was previously running
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Check whether the whole
            // app master needs to shut down because too many container failures
            // have happened. The rules for failing are: the failure count for a
            // task group ID must be > the configured retry count, and the last
            // failure (the one prior to this one) must have happened less than
            // the retry window (in ms) ago. If the retry count is set to 0, the
            // app master fails on any container failure. If the retry count is
            // set to a number < 0, a container failure never triggers an app
            // master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();
            if (retryCount == 0) {
                log.error("Container ID {} ({}) failed, and retry count is set to 0, "
                        + "so shutting down the application master and marking the job as failed.",
                        containerId, containerIdStr);
                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;
                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + " (" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than the retry count of " + retryCount
                                + " and within the window of " + retryWindowMs
                                + "ms, so shutting down the application master and marking the job as failed.");
                        // Too many failures within the window boundary, so shut
                        // down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info("Resetting fail count for container ID {} back to 1, since the last "
                                + "container failure ({}) for this container ID was outside the bounds "
                                + "of the retry window.", containerId, containerIdStr);
                        // Reset the counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }
            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }
    }
}
From source file: org.apache.samza.job.yarn.TestSamzaTaskManager.java
License: Apache License

/**
 * Test that the AM requests a new container when a task fails.
 * Exit codes with the same behavior: disk failure, preemption, and aborted.
 */
@Test
public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception {
    Map<String, String> config = new HashMap<>();
    config.putAll(getConfig());
    config.remove("yarn.container.retry.count");

    SamzaTaskManager taskManager = new SamzaTaskManager(new MapConfig(config), state, amRmClientAsync,
            new YarnConfiguration());
    MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync,
            TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(new MapConfig(config)));
    getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator);

    Thread thread = new Thread(allocator);
    getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread);

    // Start the task manager
    taskManager.onInit();
    assertFalse(taskManager.shouldShutdown());
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());

    Container container = TestUtil.getContainer(
            ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
    taskManager.onContainerAllocated(container);

    // Allow the container to run and update state
    Thread.sleep(300);

    // Create container failure - with ContainerExitStatus.DISKS_FAILED
    taskManager.onContainerCompleted(
            TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.DISKS_FAILED, "Disk failure"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Create container failure - with ContainerExitStatus.PREEMPTED
    taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(),
            ContainerExitStatus.PREEMPTED, "Task Preempted by RM"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    // Create container failure - with ContainerExitStatus.ABORTED
    taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(),
            ContainerExitStatus.ABORTED, "Task Aborted by the NM"));

    // The above failure should trigger a container request
    assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
    assertEquals(2, testAMRMClient.requests.size());
    assertEquals(0, testAMRMClient.getRelease().size());
    assertFalse(taskManager.shouldShutdown());
    assertFalse(state.jobHealthy.get());
    assertEquals(ContainerRequestState.ANY_HOST,
            allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

    taskManager.onShutdown();
}