List of usage examples for org.apache.hadoop.yarn.api.records ContainerExitStatus DISKS_FAILED
int DISKS_FAILED
To view the source code for org.apache.hadoop.yarn.api.records.ContainerExitStatus.DISKS_FAILED, click the Source Link.
From source file:gobblin.yarn.YarnService.java
License:Apache License
/** * Check the exit status of a completed container and see if the replacement container * should try to be started on the same node. Some exit status indicates a disk or * node failure and in such cases the replacement container should try to be started on * a different node./*w w w. jav a2s .c o m*/ */ private boolean shouldStickToTheSameNode(int containerExitStatus) { switch (containerExitStatus) { case ContainerExitStatus.DISKS_FAILED: return false; case ContainerExitStatus.ABORTED: // Mostly likely this exit status is due to node failures because the // application itself will not release containers. return false; default: // Stick to the same node for other cases if host affinity is enabled. return this.containerHostAffinityEnabled; } }
From source file:org.apache.samza.job.yarn.SamzaTaskManager.java
License:Apache License
/** * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides * whether a container that exited is marked as complete or failure. *//*w w w. j a v a2 s .co m*/ @Override public void onContainerCompleted(ContainerStatus containerStatus) { String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId()); int containerId = -1; for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) { if (entry.getValue().id().equals(containerStatus.getContainerId())) { containerId = entry.getKey(); break; } } state.runningContainers.remove(containerId); int exitStatus = containerStatus.getExitStatus(); switch (exitStatus) { case ContainerExitStatus.SUCCESS: log.info("Container {} completed successfully.", containerIdStr); state.completedContainers.incrementAndGet(); if (containerId != -1) { state.finishedContainers.add(containerId); containerFailures.remove(containerId); } if (state.completedContainers.get() == state.containerCount) { log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed."); state.status = FinalApplicationStatus.SUCCEEDED; } break; case ContainerExitStatus.DISKS_FAILED: case ContainerExitStatus.ABORTED: case ContainerExitStatus.PREEMPTED: log.info( "Got an exit code of {}. This means that container {} was " + "killed by YARN, either due to being released by the application " + "master or being 'lost' due to node failures etc. or due to preemption by the RM", exitStatus, containerIdStr); state.releasedContainers.incrementAndGet(); // If this container was assigned some partitions (a containerId), then // clean up, and request a new container for the tasks. This only // should happen if the container was 'lost' due to node failure, not // if the AM released the container. if (containerId != -1) { log.info( "Released container {} was assigned task group ID {}. 
Requesting a new container for the task group.", containerIdStr, containerId); state.neededContainers.incrementAndGet(); state.jobHealthy.set(false); // request a container on new host containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST); } break; default: // TODO: Handle failure more intelligently. Should track NodeFailures! log.info("Container failed for some reason. Let's start it again"); log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - " + containerStatus.getDiagnostics()); state.failedContainers.incrementAndGet(); state.failedContainersStatus.put(containerIdStr, containerStatus); state.jobHealthy.set(false); if (containerId != -1) { state.neededContainers.incrementAndGet(); // Find out previously running container location String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId, SetContainerHostMapping.HOST_KEY); if (!hostAffinityEnabled || lastSeenOn == null) { lastSeenOn = ContainerAllocator.ANY_HOST; } // A container failed for an unknown reason. Let's check to see if // we need to shutdown the whole app master if too many container // failures have happened. The rules for failing are that the // failure count for a task group id must be > the configured retry // count, and the last failure (the one prior to this one) must have // happened less than retry window ms ago. If retry count is set to // 0, the app master will fail on any container failure. If the // retry count is set to a number < 0, a container failure will // never trigger an app master failure. 
int retryCount = yarnConfig.getContainerRetryCount(); int retryWindowMs = yarnConfig.getContainerRetryWindowMs(); if (retryCount == 0) { log.error( "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.", containerId, containerIdStr); tooManyFailedContainers = true; } else if (retryCount > 0) { int currentFailCount; long lastFailureTime; if (containerFailures.containsKey(containerId)) { ContainerFailure failure = containerFailures.get(containerId); currentFailCount = failure.getCount() + 1; lastFailureTime = failure.getLastFailure(); } else { currentFailCount = 1; lastFailureTime = 0L; } if (currentFailCount >= retryCount) { long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime; if (lastFailureMsDiff < retryWindowMs) { log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed " + currentFailCount + " times, with last failure " + lastFailureMsDiff + "ms ago. This is greater than retry count of " + retryCount + " and window of " + retryWindowMs + "ms , so shutting down the application master, and marking the job as failed."); // We have too many failures, and we're within the window // boundary, so reset shut down the app master. tooManyFailedContainers = true; state.status = FinalApplicationStatus.FAILED; } else { log.info( "Resetting fail count for container ID {} back to 1, since last container failure ({}) for " + "this container ID was outside the bounds of the retry window.", containerId, containerIdStr); // Reset counter back to 1, since the last failure for this // container happened outside the window boundary. 
containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis())); } } else { log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount); containerFailures.put(containerId, new ContainerFailure(currentFailCount, System.currentTimeMillis())); } } if (!tooManyFailedContainers) { // Request a new container containerAllocator.requestContainer(containerId, lastSeenOn); } } } }
From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java
License:Apache License
/**
 * Test AM requests a new container when a task fails.
 * Error codes with same behavior - Disk failure, preemption and aborted.
 *
 * <p>The same container is reported as completed three times (DISKS_FAILED, PREEMPTED,
 * ABORTED); after each report the test expects one pending request with preferred host
 * {@code ANY_HOST}, no shutdown, and an unhealthy job.
 *
 * @throws Exception if reflection-based field injection or the sleep fails
 */
@Test
public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception {
  Map<String, String> config = new HashMap<>();
  config.putAll(getConfig());
  // Drop the explicit retry-count setting (presumably so the default applies - TODO confirm).
  config.remove("yarn.container.retry.count");

  SamzaTaskManager taskManager = new SamzaTaskManager(new MapConfig(config), state, amRmClientAsync,
      new YarnConfiguration());

  // Swap in a mock allocator and its thread through the private fields of the task manager.
  MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync,
      TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(new MapConfig(config)));
  getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator);

  Thread thread = new Thread(allocator);
  getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread);

  // Start the task manager
  taskManager.onInit();
  assertFalse(taskManager.shouldShutdown());
  assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());

  Container container = TestUtil
      .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123);
  taskManager.onContainerAllocated(container);

  // Allow container to run and update state
  Thread.sleep(300);

  // Create container failure - with ContainerExitStatus.DISKS_FAILED
  taskManager.onContainerCompleted(
      TestUtil.getContainerStatus(container.getId(), ContainerExitStatus.DISKS_FAILED, "Disk failure"));

  // The above failure should trigger a container request
  assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
  assertFalse(taskManager.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(2, testAMRMClient.requests.size());
  assertEquals(0, testAMRMClient.getRelease().size());
  assertEquals(ContainerRequestState.ANY_HOST,
      allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

  // Create container failure - with ContainerExitStatus.PREEMPTED
  taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(),
      ContainerExitStatus.PREEMPTED, "Task Preempted by RM"));

  // The above failure should trigger a container request
  assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
  assertFalse(taskManager.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(ContainerRequestState.ANY_HOST,
      allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

  // Create container failure - with ContainerExitStatus.ABORTED
  taskManager.onContainerCompleted(TestUtil.getContainerStatus(container.getId(),
      ContainerExitStatus.ABORTED, "Task Aborted by the NM"));

  // The above failure should trigger a container request
  assertEquals(1, allocator.containerRequestState.getRequestsQueue().size());
  assertEquals(2, testAMRMClient.requests.size());
  assertEquals(0, testAMRMClient.getRelease().size());
  assertFalse(taskManager.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(ContainerRequestState.ANY_HOST,
      allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost());

  taskManager.onShutdown();
}
From source file:org.apache.slider.server.appmaster.state.ContainerOutcome.java
License:Apache License
/** * Build a container outcome from an exit status. * The values in {@link ContainerExitStatus} are used * here./*from w w w . j av a2 s. c o m*/ * @param exitStatus exit status * @return an enumeration of the outcome. */ public static ContainerOutcome fromExitStatus(int exitStatus) { switch (exitStatus) { case ContainerExitStatus.ABORTED: case ContainerExitStatus.KILLED_BY_APPMASTER: case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER: case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION: // could either be a release or node failure. Treat as completion return Completed; case ContainerExitStatus.DISKS_FAILED: return Node_failure; case ContainerExitStatus.PREEMPTED: return Preempted; case ContainerExitStatus.KILLED_EXCEEDED_PMEM: case ContainerExitStatus.KILLED_EXCEEDED_VMEM: return Failed_limits_exceeded; default: return exitStatus == 0 ? Completed : Failed; } }
From source file:org.apache.tez.dag.app.rm.container.AMContainerEventCompleted.java
License:Apache License
/**
 * Reports whether this completion event carries the disk-failure exit status.
 *
 * @return {@code true} when the exit status equals {@code ContainerExitStatus.DISKS_FAILED}
 */
public boolean isDiskFailed() {
  return ContainerExitStatus.DISKS_FAILED == exitStatus;
}
From source file:org.apache.tez.dag.app.rm.container.TestAMContainer.java
License:Apache License
@SuppressWarnings("rawtypes") @Test(timeout = 5000)//from w w w.jav a 2 s. c om public void testContainerCompletedAtLaunchingSpecificClusterError() { WrappedContainer wc = new WrappedContainer(); List<Event> outgoingEvents; wc.launchContainer(); wc.assignTaskAttempt(wc.taskAttemptID); wc.containerCompleted(ContainerExitStatus.DISKS_FAILED, TaskAttemptTerminationCause.NODE_DISK_ERROR); wc.verifyState(AMContainerState.COMPLETED); verify(wc.tal).registerRunningContainer(wc.containerID); verify(wc.tal).unregisterRunningContainer(wc.containerID); outgoingEvents = wc.verifyCountAndGetOutgoingEvents(1); verifyUnOrderedOutgoingEventTypes(outgoingEvents, TaskAttemptEventType.TA_CONTAINER_TERMINATED_BY_SYSTEM); Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, ((TaskAttemptEventContainerTerminatedBySystem) outgoingEvents.get(0)).getTerminationCause()); assertFalse(wc.amContainer.isInErrorState()); // Container launched generated by NM call. wc.containerLaunched(); wc.verifyNoOutgoingEvents(); assertFalse(wc.amContainer.isInErrorState()); }
From source file:org.apache.tez.dag.app.rm.container.TestAMContainer.java
License:Apache License
@SuppressWarnings("rawtypes") @Test(timeout = 5000)/*from w ww . ja va 2 s . c om*/ public void testContainerDiskFailedAtRunning() { WrappedContainer wc = new WrappedContainer(); List<Event> outgoingEvents; wc.launchContainer(); wc.assignTaskAttempt(wc.taskAttemptID); wc.containerLaunched(); wc.verifyState(AMContainerState.RUNNING); wc.containerCompleted(ContainerExitStatus.DISKS_FAILED, TaskAttemptTerminationCause.NODE_DISK_ERROR); wc.verifyState(AMContainerState.COMPLETED); verify(wc.tal).registerRunningContainer(wc.containerID); verify(wc.tal).unregisterRunningContainer(wc.containerID); verify(wc.chh).register(wc.containerID); verify(wc.chh).unregister(wc.containerID); outgoingEvents = wc.verifyCountAndGetOutgoingEvents(1); Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, ((TaskAttemptEventContainerTerminatedBySystem) outgoingEvents.get(0)).getTerminationCause()); verifyUnOrderedOutgoingEventTypes(outgoingEvents, TaskAttemptEventType.TA_CONTAINER_TERMINATED_BY_SYSTEM); assertFalse(wc.amContainer.isInErrorState()); // Pending task complete. (Ideally, container should be dead at this point // and this event should not be generated. Network timeout on NM-RM heartbeat // can cause it to be genreated) wc.taskAttemptSucceeded(wc.taskAttemptID); wc.verifyNoOutgoingEvents(); wc.verifyHistoryStopEvent(); assertFalse(wc.amContainer.isInErrorState()); }
From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java
License:Apache License
@Override public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) { // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS) { message = "Container failed. "; }/*from ww w . j a v a2 s . c o m*/ if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java
License:Apache License
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) { // SchedulerId isn't used here since no node updates are sent out // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS) { message = "Container failed, exitCode=" + exitStatus + ". "; }//from www . j ava 2 s . c o m if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java
License:Apache License
/**
 * Reports a mocked container status with {@code DISKS_FAILED} to the scheduler handler and
 * checks that exactly one C_COMPLETED event is emitted, carrying the container ID, the
 * "Container disk failed. " prefix plus diagnostics, the disk-failed flag, and the
 * NODE_DISK_ERROR termination cause.
 *
 * @throws IOException declared by the test signature
 */
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
  Configuration conf = new Configuration(false);
  schedulerHandler.init(conf);
  schedulerHandler.start();

  String nmDiagnostics = "NM disk failed.";
  TaskAttemptImpl taskAttempt = mock(TaskAttemptImpl.class);
  ContainerStatus status = mock(ContainerStatus.class);
  ContainerId containerId = mock(ContainerId.class);
  AMContainer amContainer = mock(AMContainer.class);

  // Wire the mocks so the handler resolves the AMContainer from the status' container ID.
  when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
  when(amContainer.getContainerId()).thenReturn(containerId);
  when(status.getContainerId()).thenReturn(containerId);
  when(status.getDiagnostics()).thenReturn(nmDiagnostics);
  when(status.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);

  schedulerHandler.containerCompleted(taskAttempt, status);

  Assert.assertEquals(1, mockEventHandler.events.size());
  Event firstEvent = mockEventHandler.events.get(0);
  Assert.assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());

  AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
  Assert.assertEquals(containerId, completed.getContainerId());
  Assert.assertEquals("Container disk failed. NM disk failed.", completed.getDiagnostics());
  Assert.assertFalse(completed.isPreempted());
  Assert.assertTrue(completed.isDiskFailed());
  Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, completed.getTerminationCause());

  schedulerHandler.stop();
  schedulerHandler.close();
}