List of usage examples for org.apache.hadoop.yarn.api.records.ContainerExitStatus.SUCCESS
int SUCCESS
To view the source code for org.apache.hadoop.yarn.api.records.ContainerExitStatus.SUCCESS, click the Source Link.
From source file:com.cloudera.llama.am.yarn.YarnRMConnector.java
License:Apache License
@Override public void onContainersCompleted(List<ContainerStatus> containerStatuses) { List<RMEvent> changes = new ArrayList<RMEvent>(); for (ContainerStatus containerStatus : containerStatuses) { ContainerId containerId = containerStatus.getContainerId(); UUID resourceId = containerToResourceMap.remove(containerId); // we have the containerId only if we did not release it. if (resourceId != null) { switch (containerStatus.getExitStatus()) { case ContainerExitStatus.SUCCESS: LOG.warn("It should never happen, container for resource '{}' " + "exited on its own", resourceId);//from w w w. j av a 2 s . com //reporting it as LOST for the client to take corrective measures. changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST)); break; case ContainerExitStatus.PREEMPTED: LOG.warn("Container for resource '{}' has been preempted", resourceId); changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.PREEMPTED)); break; case ContainerExitStatus.ABORTED: default: LOG.warn("Container for resource '{}' has been lost, exit status" + " '{}'", resourceId, containerStatus.getExitStatus()); changes.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST)); break; } } } llamaCallback.onEvent(changes); }
From source file:org.apache.samza.job.yarn.refactor.YarnClusterResourceManager.java
License:Apache License
/** * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific * ones./* w w w . ja v a2 s . co m*/ * * @param statuses the YarnContainerStatus callbacks from Yarn. */ @Override public void onContainersCompleted(List<ContainerStatus> statuses) { List<SamzaResourceStatus> samzaResrcStatuses = new ArrayList<>(); for (ContainerStatus status : statuses) { log.info("Container completed from RM " + status); SamzaResourceStatus samzaResrcStatus = new SamzaResourceStatus(status.getContainerId().toString(), status.getDiagnostics(), status.getExitStatus()); samzaResrcStatuses.add(samzaResrcStatus); int completedContainerID = getIDForContainer(status.getContainerId().toString()); log.info("Completed container had ID: {}", completedContainerID); //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of //failed containers. if (completedContainerID != INVALID_YARN_CONTAINER_ID) { if (state.runningYarnContainers.containsKey(completedContainerID)) { log.info("Removing container ID {} from completed containers", completedContainerID); state.runningYarnContainers.remove(completedContainerID); if (status.getExitStatus() != ContainerExitStatus.SUCCESS) state.failedContainersStatus.put(status.getContainerId().toString(), status); } } } _callback.onResourcesCompleted(samzaResrcStatuses); }
From source file:org.apache.samza.job.yarn.SamzaTaskManager.java
License:Apache License
/** * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides * whether a container that exited is marked as complete or failure. *//*w w w.ja va 2 s .c om*/ @Override public void onContainerCompleted(ContainerStatus containerStatus) { String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId()); int containerId = -1; for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) { if (entry.getValue().id().equals(containerStatus.getContainerId())) { containerId = entry.getKey(); break; } } state.runningContainers.remove(containerId); int exitStatus = containerStatus.getExitStatus(); switch (exitStatus) { case ContainerExitStatus.SUCCESS: log.info("Container {} completed successfully.", containerIdStr); state.completedContainers.incrementAndGet(); if (containerId != -1) { state.finishedContainers.add(containerId); containerFailures.remove(containerId); } if (state.completedContainers.get() == state.containerCount) { log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed."); state.status = FinalApplicationStatus.SUCCEEDED; } break; case ContainerExitStatus.DISKS_FAILED: case ContainerExitStatus.ABORTED: case ContainerExitStatus.PREEMPTED: log.info( "Got an exit code of {}. This means that container {} was " + "killed by YARN, either due to being released by the application " + "master or being 'lost' due to node failures etc. or due to preemption by the RM", exitStatus, containerIdStr); state.releasedContainers.incrementAndGet(); // If this container was assigned some partitions (a containerId), then // clean up, and request a new container for the tasks. This only // should happen if the container was 'lost' due to node failure, not // if the AM released the container. if (containerId != -1) { log.info( "Released container {} was assigned task group ID {}. 
Requesting a new container for the task group.", containerIdStr, containerId); state.neededContainers.incrementAndGet(); state.jobHealthy.set(false); // request a container on new host containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST); } break; default: // TODO: Handle failure more intelligently. Should track NodeFailures! log.info("Container failed for some reason. Let's start it again"); log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - " + containerStatus.getDiagnostics()); state.failedContainers.incrementAndGet(); state.failedContainersStatus.put(containerIdStr, containerStatus); state.jobHealthy.set(false); if (containerId != -1) { state.neededContainers.incrementAndGet(); // Find out previously running container location String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId, SetContainerHostMapping.HOST_KEY); if (!hostAffinityEnabled || lastSeenOn == null) { lastSeenOn = ContainerAllocator.ANY_HOST; } // A container failed for an unknown reason. Let's check to see if // we need to shutdown the whole app master if too many container // failures have happened. The rules for failing are that the // failure count for a task group id must be > the configured retry // count, and the last failure (the one prior to this one) must have // happened less than retry window ms ago. If retry count is set to // 0, the app master will fail on any container failure. If the // retry count is set to a number < 0, a container failure will // never trigger an app master failure. 
int retryCount = yarnConfig.getContainerRetryCount(); int retryWindowMs = yarnConfig.getContainerRetryWindowMs(); if (retryCount == 0) { log.error( "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.", containerId, containerIdStr); tooManyFailedContainers = true; } else if (retryCount > 0) { int currentFailCount; long lastFailureTime; if (containerFailures.containsKey(containerId)) { ContainerFailure failure = containerFailures.get(containerId); currentFailCount = failure.getCount() + 1; lastFailureTime = failure.getLastFailure(); } else { currentFailCount = 1; lastFailureTime = 0L; } if (currentFailCount >= retryCount) { long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime; if (lastFailureMsDiff < retryWindowMs) { log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed " + currentFailCount + " times, with last failure " + lastFailureMsDiff + "ms ago. This is greater than retry count of " + retryCount + " and window of " + retryWindowMs + "ms , so shutting down the application master, and marking the job as failed."); // We have too many failures, and we're within the window // boundary, so reset shut down the app master. tooManyFailedContainers = true; state.status = FinalApplicationStatus.FAILED; } else { log.info( "Resetting fail count for container ID {} back to 1, since last container failure ({}) for " + "this container ID was outside the bounds of the retry window.", containerId, containerIdStr); // Reset counter back to 1, since the last failure for this // container happened outside the window boundary. 
containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis())); } } else { log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount); containerFailures.put(containerId, new ContainerFailure(currentFailCount, System.currentTimeMillis())); } } if (!tooManyFailedContainers) { // Request a new container containerAllocator.requestContainer(containerId, lastSeenOn); } } } }
From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java
License:Apache License
/** * Test Task Manager should stop when all containers finish */// www .ja v a 2 s.c o m @Test public void testTaskManagerShouldStopWhenContainersFinish() { SamzaTaskManager taskManager = new SamzaTaskManager(getConfig(), state, amRmClientAsync, new YarnConfiguration()); taskManager.onInit(); assertFalse(taskManager.shouldShutdown()); taskManager.onContainerCompleted( TestUtil.getContainerStatus(state.amContainerId, ContainerExitStatus.SUCCESS, "")); assertTrue(taskManager.shouldShutdown()); }
From source file:org.apache.samza.job.yarn.YarnClusterResourceManager.java
License:Apache License
/** * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific * ones.//from w w w . j a v a2 s. co m * * @param statuses the YarnContainerStatus callbacks from Yarn. */ @Override public void onContainersCompleted(List<ContainerStatus> statuses) { List<SamzaResourceStatus> samzaResourceStatuses = new ArrayList<>(); for (ContainerStatus status : statuses) { log.info( "Got completion notification for Container ID: {} with status: {} and state: {}. Diagnostics information: {}.", status.getContainerId(), status.getExitStatus(), status.getState(), status.getDiagnostics()); SamzaResourceStatus samzaResourceStatus = new SamzaResourceStatus(status.getContainerId().toString(), status.getDiagnostics(), status.getExitStatus()); samzaResourceStatuses.add(samzaResourceStatus); String completedProcessorID = getRunningProcessorId(status.getContainerId().toString()); log.info("Completed Container ID: {} had Processor ID: {}", status.getContainerId(), completedProcessorID); //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of //failed containers. if (!completedProcessorID.equals(INVALID_PROCESSOR_ID)) { if (state.runningProcessors.containsKey(completedProcessorID)) { log.info("Removing Processor ID: {} from YarnClusterResourceManager running processors.", completedProcessorID); state.runningProcessors.remove(completedProcessorID); if (status.getExitStatus() != ContainerExitStatus.SUCCESS) state.failedContainersStatus.put(status.getContainerId().toString(), status); } } } clusterManagerCallback.onResourcesCompleted(samzaResourceStatuses); }
From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java
License:Apache License
@Override public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) { // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS) { message = "Container failed. "; }// www . j av a2 s .c o m if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java
License:Apache License
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) { // SchedulerId isn't used here since no node updates are sent out // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS) { message = "Container failed, exitCode=" + exitStatus + ". "; }/* ww w . ja v a 2 s . c o m*/ if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
From source file:org.apache.tez.dag.history.events.TestHistoryEventsProtoConversion.java
License:Apache License
/**
 * Round-trips a {@code ContainerStoppedEvent} through {@code testProtoConversion} and checks
 * that the container ID, stopped time and application attempt ID are equal before and after.
 *
 * @throws Exception if the conversion fails
 */
private void testContainerStoppedEvent() throws Exception {
    ContainerId stoppedContainer = ContainerId.newInstance(
            ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1), 1001);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1);

    ContainerStoppedEvent original =
            new ContainerStoppedEvent(stoppedContainer, 100034566, ContainerExitStatus.SUCCESS, attemptId);
    ContainerStoppedEvent roundTripped = (ContainerStoppedEvent) testProtoConversion(original);

    Assert.assertEquals(original.getContainerId(), roundTripped.getContainerId());
    Assert.assertEquals(original.getStoppedTime(), roundTripped.getStoppedTime());
    Assert.assertEquals(original.getApplicationAttemptId(), roundTripped.getApplicationAttemptId());

    logEvents(original, roundTripped);
}
From source file:org.elasticsearch.hadoop.yarn.am.EsCluster.java
License:Apache License
public void start() { running = true;//w w w . j a va2 s . c o m nmRpc.start(); UserGroupInformation.setConfiguration(cfg); log.info(String.format("Allocating Elasticsearch cluster with %d nodes", appConfig.containersToAllocate())); // register requests Resource capability = YarnCompat.resource(cfg, appConfig.containerMem(), appConfig.containerVCores()); Priority prio = Priority.newInstance(appConfig.amPriority()); for (int i = 0; i < appConfig.containersToAllocate(); i++) { // TODO: Add allocation (host/rack rules) - and disable location constraints ContainerRequest req = new ContainerRequest(capability, null, null, prio); amRpc.addContainerRequest(req); } // update status every 5 sec final long heartBeatRate = TimeUnit.SECONDS.toMillis(5); // start the allocation loop // when a new container is allocated, launch it right away int responseId = 0; try { do { AllocateResponse alloc = amRpc.allocate(responseId++); List<Container> currentlyAllocated = alloc.getAllocatedContainers(); for (Container container : currentlyAllocated) { launchContainer(container); allocatedContainers.add(container.getId()); } if (currentlyAllocated.size() > 0) { int needed = appConfig.containersToAllocate() - allocatedContainers.size(); if (needed > 0) { log.info(String.format("%s containers allocated, %s remaining", allocatedContainers.size(), needed)); } else { log.info(String.format("Fully allocated %s containers", allocatedContainers.size())); } } List<ContainerStatus> completed = alloc.getCompletedContainersStatuses(); for (ContainerStatus status : completed) { if (!completedContainers.contains(status.getContainerId())) { ContainerId containerId = status.getContainerId(); completedContainers.add(containerId); boolean containerSuccesful = false; switch (status.getExitStatus()) { case ContainerExitStatus.SUCCESS: log.info(String.format("Container %s finished succesfully...", containerId)); containerSuccesful = true; break; case ContainerExitStatus.ABORTED: 
log.warn(String.format("Container %s aborted...", containerId)); break; case ContainerExitStatus.DISKS_FAILED: log.warn(String.format("Container %s ran out of disk...", containerId)); break; case ContainerExitStatus.PREEMPTED: log.warn(String.format("Container %s preempted...", containerId)); break; default: log.warn(String.format("Container %s exited with an invalid/unknown exit code...", containerId)); } if (!containerSuccesful) { log.warn("Cluster has not completed succesfully..."); clusterHasFailed = true; running = false; } } } if (completedContainers.size() == appConfig.containersToAllocate()) { running = false; } if (running) { try { Thread.sleep(heartBeatRate); } catch (Exception ex) { throw new EsYarnNmException("Cluster interrupted"); } } } while (running); } finally { log.info("Cluster has completed running..."); try { Thread.sleep(TimeUnit.SECONDS.toMillis(15)); } catch (InterruptedException e) { throw new RuntimeException(e); } close(); } }
From source file:yarnkit.appmaster.ApplicationMasterService.java
License:Apache License
@Override public void onContainersCompleted(@Nonnull List<ContainerStatus> containerStatuses) { LOG.info(containerStatuses.size() + " container(s) have completed"); for (ContainerStatus status : containerStatuses) { LOG.info(YarnUtils.getContainerExitStatusMessage(status)); int exitStatus = status.getExitStatus(); if (exitStatus == ContainerExitStatus.SUCCESS) { totalCompleted.incrementAndGet(); } else {/*from w ww . j av a 2 s .c o m*/ if (exitStatus != ContainerExitStatus.ABORTED) { totalCompleted.incrementAndGet(); totalFailures.incrementAndGet(); } else { // Containers killed by the framework, either due to being released by // the application or being 'lost' due to node failures etc. } } } }