List of usage examples for org.apache.hadoop.yarn.api.records ContainerExitStatus PREEMPTED
int PREEMPTED
To view the source code for org.apache.hadoop.yarn.api.records.ContainerExitStatus.PREEMPTED, click the Source Link.
From source file:com.cloudera.llama.am.yarn.YarnRMConnector.java
License:Apache License
@Override public void onContainersCompleted(List<ContainerStatus> containerStatuses) { List<RMEvent> changes = new ArrayList<RMEvent>(); for (ContainerStatus containerStatus : containerStatuses) { ContainerId containerId = containerStatus.getContainerId(); UUID resourceId = containerToResourceMap.remove(containerId); // we have the containerId only if we did not release it. if (resourceId != null) { switch (containerStatus.getExitStatus()) { case ContainerExitStatus.SUCCESS: LOG.warn("It should never happen, container for resource '{}' " + "exited on its own", resourceId);/*from w w w . j ava 2s . com*/ //reporting it as LOST for the client to take corrective measures. changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST)); break; case ContainerExitStatus.PREEMPTED: LOG.warn("Container for resource '{}' has been preempted", resourceId); changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.PREEMPTED)); break; case ContainerExitStatus.ABORTED: default: LOG.warn("Container for resource '{}' has been lost, exit status" + " '{}'", resourceId, containerStatus.getExitStatus()); changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST)); break; } } } llamaCallback.onEvent(changes); }
From source file:org.apache.samza.job.yarn.SamzaTaskManager.java
License:Apache License
/** * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides * whether a container that exited is marked as complete or failure. *//*from w w w . j a v a2 s .c om*/ @Override public void onContainerCompleted(ContainerStatus containerStatus) { String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId()); int containerId = -1; for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) { if (entry.getValue().id().equals(containerStatus.getContainerId())) { containerId = entry.getKey(); break; } } state.runningContainers.remove(containerId); int exitStatus = containerStatus.getExitStatus(); switch (exitStatus) { case ContainerExitStatus.SUCCESS: log.info("Container {} completed successfully.", containerIdStr); state.completedContainers.incrementAndGet(); if (containerId != -1) { state.finishedContainers.add(containerId); containerFailures.remove(containerId); } if (state.completedContainers.get() == state.containerCount) { log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed."); state.status = FinalApplicationStatus.SUCCEEDED; } break; case ContainerExitStatus.DISKS_FAILED: case ContainerExitStatus.ABORTED: case ContainerExitStatus.PREEMPTED: log.info( "Got an exit code of {}. This means that container {} was " + "killed by YARN, either due to being released by the application " + "master or being 'lost' due to node failures etc. or due to preemption by the RM", exitStatus, containerIdStr); state.releasedContainers.incrementAndGet(); // If this container was assigned some partitions (a containerId), then // clean up, and request a new container for the tasks. This only // should happen if the container was 'lost' due to node failure, not // if the AM released the container. if (containerId != -1) { log.info( "Released container {} was assigned task group ID {}. 
Requesting a new container for the task group.", containerIdStr, containerId); state.neededContainers.incrementAndGet(); state.jobHealthy.set(false); // request a container on new host containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST); } break; default: // TODO: Handle failure more intelligently. Should track NodeFailures! log.info("Container failed for some reason. Let's start it again"); log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - " + containerStatus.getDiagnostics()); state.failedContainers.incrementAndGet(); state.failedContainersStatus.put(containerIdStr, containerStatus); state.jobHealthy.set(false); if (containerId != -1) { state.neededContainers.incrementAndGet(); // Find out previously running container location String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId, SetContainerHostMapping.HOST_KEY); if (!hostAffinityEnabled || lastSeenOn == null) { lastSeenOn = ContainerAllocator.ANY_HOST; } // A container failed for an unknown reason. Let's check to see if // we need to shutdown the whole app master if too many container // failures have happened. The rules for failing are that the // failure count for a task group id must be > the configured retry // count, and the last failure (the one prior to this one) must have // happened less than retry window ms ago. If retry count is set to // 0, the app master will fail on any container failure. If the // retry count is set to a number < 0, a container failure will // never trigger an app master failure. 
int retryCount = yarnConfig.getContainerRetryCount(); int retryWindowMs = yarnConfig.getContainerRetryWindowMs(); if (retryCount == 0) { log.error( "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.", containerId, containerIdStr); tooManyFailedContainers = true; } else if (retryCount > 0) { int currentFailCount; long lastFailureTime; if (containerFailures.containsKey(containerId)) { ContainerFailure failure = containerFailures.get(containerId); currentFailCount = failure.getCount() + 1; lastFailureTime = failure.getLastFailure(); } else { currentFailCount = 1; lastFailureTime = 0L; } if (currentFailCount >= retryCount) { long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime; if (lastFailureMsDiff < retryWindowMs) { log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed " + currentFailCount + " times, with last failure " + lastFailureMsDiff + "ms ago. This is greater than retry count of " + retryCount + " and window of " + retryWindowMs + "ms , so shutting down the application master, and marking the job as failed."); // We have too many failures, and we're within the window // boundary, so reset shut down the app master. tooManyFailedContainers = true; state.status = FinalApplicationStatus.FAILED; } else { log.info( "Resetting fail count for container ID {} back to 1, since last container failure ({}) for " + "this container ID was outside the bounds of the retry window.", containerId, containerIdStr); // Reset counter back to 1, since the last failure for this // container happened outside the window boundary. 
containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis())); } } else { log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount); containerFailures.put(containerId, new ContainerFailure(currentFailCount, System.currentTimeMillis())); } } if (!tooManyFailedContainers) { // Request a new container containerAllocator.requestContainer(containerId, lastSeenOn); } } } }
From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java
License:Apache License
/** * Test AM requests a new container when a task fails * Error codes with same behavior - Disk failure, preemption and aborted *///from ww w . ja va 2 s .com @Test public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception { Map<String, String> config = new HashMap<>(); config.putAll(getConfig()); config.remove("yarn.container.retry.count"); SamzaTaskManager taskManager = new SamzaTaskManager(new MapConfig(config), state, amRmClientAsync, new YarnConfiguration()); MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync, TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(new MapConfig(config))); getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator); Thread thread = new Thread(allocator); getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread); // Start the task manager taskManager.onInit(); assertFalse(taskManager.shouldShutdown()); assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); Container container = TestUtil .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123); taskManager.onContainerAllocated(container); // Allow container to run and update state Thread.sleep(300); // Create container failure - with ContainerExitStatus.DISKS_FAILED taskManager.onContainerCompleted( TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.DISKS_FAILED, "Disk failure")); // The above failure should trigger a container request assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); assertFalse(taskManager.shouldShutdown()); assertFalse(state.jobHealthy.get()); assertEquals(2, testAMRMClient.requests.size()); assertEquals(0, testAMRMClient.getRelease().size()); assertEquals(ContainerRequestState.ANY_HOST, allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost()); // Create container failure - with ContainerExitStatus.PREEMPTED 
taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.PREEMPTED, "Task Preempted by RM")); // The above failure should trigger a container request assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); assertFalse(taskManager.shouldShutdown()); assertFalse(state.jobHealthy.get()); assertEquals(ContainerRequestState.ANY_HOST, allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost()); // Create container failure - with ContainerExitStatus.ABORTED taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.ABORTED, "Task Aborted by the NM")); // The above failure should trigger a container request assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); assertEquals(2, testAMRMClient.requests.size()); assertEquals(0, testAMRMClient.getRelease().size()); assertFalse(taskManager.shouldShutdown()); assertFalse(state.jobHealthy.get()); assertEquals(ContainerRequestState.ANY_HOST, allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost()); taskManager.onShutdown(); }
From source file:org.apache.slider.server.appmaster.state.ContainerOutcome.java
License:Apache License
/** * Build a container outcome from an exit status. * The values in {@link ContainerExitStatus} are used * here.// w ww . ja v a 2 s. co m * @param exitStatus exit status * @return an enumeration of the outcome. */ public static ContainerOutcome fromExitStatus(int exitStatus) { switch (exitStatus) { case ContainerExitStatus.ABORTED: case ContainerExitStatus.KILLED_BY_APPMASTER: case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER: case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION: // could either be a release or node failure. Treat as completion return Completed; case ContainerExitStatus.DISKS_FAILED: return Node_failure; case ContainerExitStatus.PREEMPTED: return Preempted; case ContainerExitStatus.KILLED_EXCEEDED_PMEM: case ContainerExitStatus.KILLED_EXCEEDED_VMEM: return Failed_limits_exceeded; default: return exitStatus == 0 ? Completed : Failed; } }
From source file:org.apache.tez.dag.app.rm.container.AMContainerEventCompleted.java
License:Apache License
/**
 * Reports whether this completion event represents a preemption.
 *
 * @return {@code true} when the exit status is
 *         {@code ContainerExitStatus.PREEMPTED} or the error cause is
 *         {@code TaskAttemptTerminationCause.INTERNAL_PREEMPTION}
 */
public boolean isPreempted() {
  if (exitStatus == ContainerExitStatus.PREEMPTED) {
    return true;
  }
  return errCause == TaskAttemptTerminationCause.INTERNAL_PREEMPTION;
}
From source file:org.apache.tez.dag.app.rm.container.TestAMContainer.java
License:Apache License
@SuppressWarnings("rawtypes") @Test(timeout = 5000)// w w w. ja va 2 s . c o m public void testContainerPreemptedAtRunning() { WrappedContainer wc = new WrappedContainer(); List<Event> outgoingEvents; wc.launchContainer(); wc.assignTaskAttempt(wc.taskAttemptID); wc.containerLaunched(); wc.verifyState(AMContainerState.RUNNING); wc.containerCompleted(ContainerExitStatus.PREEMPTED, TaskAttemptTerminationCause.EXTERNAL_PREEMPTION); wc.verifyState(AMContainerState.COMPLETED); verify(wc.tal).registerRunningContainer(wc.containerID); verify(wc.tal).unregisterRunningContainer(wc.containerID); verify(wc.chh).register(wc.containerID); verify(wc.chh).unregister(wc.containerID); outgoingEvents = wc.verifyCountAndGetOutgoingEvents(1); Assert.assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, ((TaskAttemptEventContainerTerminatedBySystem) outgoingEvents.get(0)).getTerminationCause()); verifyUnOrderedOutgoingEventTypes(outgoingEvents, TaskAttemptEventType.TA_CONTAINER_TERMINATED_BY_SYSTEM); assertFalse(wc.amContainer.isInErrorState()); // Pending task complete. (Ideally, container should be dead at this point // and this event should not be generated. Network timeout on NM-RM heartbeat // can cause it to be genreated) wc.taskAttemptSucceeded(wc.taskAttemptID); wc.verifyNoOutgoingEvents(); wc.verifyHistoryStopEvent(); assertFalse(wc.amContainer.isInErrorState()); }
From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java
License:Apache License
@Override public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) { // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS) { message = "Container failed. "; }/*from w w w . j a v a 2 s . c o m*/ if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java
License:Apache License
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) { // SchedulerId isn't used here since no node updates are sent out // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS) { message = "Container failed, exitCode=" + exitStatus + ". "; }// ww w . ja v a2s . com if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java
License:Apache License
/**
 * Checks that a container completing with a PREEMPTED exit status produces
 * exactly one completion event carrying the preemption details.
 */
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
  Configuration conf = new Configuration(false);
  schedulerHandler.init(conf);
  schedulerHandler.start();

  // Mocks: a container id known to the AM container map and a status
  // reporting preemption with diagnostics.
  String rmDiagnostics = "Container preempted by RM.";
  TaskAttemptImpl task = mock(TaskAttemptImpl.class);
  ContainerStatus status = mock(ContainerStatus.class);
  ContainerId containerId = mock(ContainerId.class);
  AMContainer amContainer = mock(AMContainer.class);
  when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
  when(amContainer.getContainerId()).thenReturn(containerId);
  when(status.getContainerId()).thenReturn(containerId);
  when(status.getDiagnostics()).thenReturn(rmDiagnostics);
  when(status.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);

  schedulerHandler.containerCompleted(task, status);

  Assert.assertEquals(1, mockEventHandler.events.size());
  Event firstEvent = mockEventHandler.events.get(0);
  Assert.assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());

  AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
  Assert.assertEquals(containerId, completed.getContainerId());
  Assert.assertEquals("Container preempted externally. Container preempted by RM.",
      completed.getDiagnostics());
  Assert.assertTrue(completed.isPreempted());
  Assert.assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, completed.getTerminationCause());
  Assert.assertFalse(completed.isDiskFailed());

  schedulerHandler.stop();
  schedulerHandler.close();
}
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java
License:Apache License
/**
 * Checks that a container completing with a PREEMPTED exit status produces
 * exactly one completion event carrying the preemption details.
 */
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
  Configuration conf = new Configuration(false);
  schedulerHandler.init(conf);
  schedulerHandler.start();

  // Mocks: a container id known to the AM container map and a status
  // reporting preemption with diagnostics.
  String rmDiagnostics = "Container preempted by RM.";
  TaskAttemptImpl task = mock(TaskAttemptImpl.class);
  ContainerStatus status = mock(ContainerStatus.class);
  ContainerId containerId = mock(ContainerId.class);
  AMContainer amContainer = mock(AMContainer.class);
  when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
  when(amContainer.getContainerId()).thenReturn(containerId);
  when(status.getContainerId()).thenReturn(containerId);
  when(status.getDiagnostics()).thenReturn(rmDiagnostics);
  when(status.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);

  // The scheduler id passed here is 0.
  schedulerHandler.containerCompleted(0, task, status);

  assertEquals(1, mockEventHandler.events.size());
  Event firstEvent = mockEventHandler.events.get(0);
  assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());

  AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
  assertEquals(containerId, completed.getContainerId());
  assertEquals("Container preempted externally. Container preempted by RM.", completed.getDiagnostics());
  assertTrue(completed.isPreempted());
  assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, completed.getTerminationCause());
  Assert.assertFalse(completed.isDiskFailed());

  schedulerHandler.stop();
  schedulerHandler.close();
}