List of usage examples for org.apache.hadoop.yarn.api.records ContainerStatus getExitStatus
@Public @Unstable public abstract int getExitStatus();
Get the exit status for the container.
Note: This is valid only for completed containers, i.e. containers with state ContainerState.COMPLETE; otherwise it returns ContainerExitStatus.INVALID.
From source file:gobblin.yarn.YarnService.java
License:Apache License
/** * Handle the completion of a container. A new container will be requested to replace the one * that just exited. Depending on the exit status and if container host affinity is enabled, * the new container may or may not try to be started on the same node. * * A container completes in either of the following conditions: 1) some error happens in the * container and caused the container to exit, 2) the container gets killed due to some reason, * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster. * A replacement container is needed in all but the last case. *///from w w w . j a v a 2s . c o m private void handleContainerCompletion(ContainerStatus containerStatus) { Map.Entry<Container, String> completedContainerEntry = this.containerMap .remove(containerStatus.getContainerId()); String completedInstanceName = completedContainerEntry.getValue(); LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d", containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus())); if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { LOGGER.info(String.format("Received the following diagnostics information for container %s: %s", containerStatus.getContainerId(), containerStatus.getDiagnostics())); } if (this.shutdownInProgress) { return; } int retryCount = this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0)) .incrementAndGet(); // Populate event metadata Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent(); if (this.eventSubmitter.isPresent()) { eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus)); eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName); 
eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + ""); } if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) { if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName); return; } // Add the Helix instance name of the completed container to the queue of unused // instance names so they can be reused by a replacement container. this.unusedHelixInstanceNames.offer(completedInstanceName); if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s", containerStatus.getContainerId(), completedInstanceName)); this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus()) ? Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent())); }
From source file:gobblin.yarn.YarnService.java
License:Apache License
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata( ContainerStatus containerStatus) { ImmutableMap.Builder<String, String> eventMetadataBuilder = new ImmutableMap.Builder<>(); eventMetadataBuilder.put(GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString()); eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE, containerStatus.getState().toString()); if (ContainerExitStatus.INVALID != containerStatus.getExitStatus()) { eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS, containerStatus.getExitStatus() + ""); }//w ww . j a va 2s . c om if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, containerStatus.getDiagnostics()); } return eventMetadataBuilder; }
From source file:husky.server.HuskyRMCallbackHandler.java
License:Apache License
public void onContainersCompleted(List<ContainerStatus> completedContainerStatus) { LOG.info("Get response from RM for container request, completedCnt = " + completedContainerStatus.size()); mNumCompletedContainers += completedContainerStatus.size(); for (ContainerStatus status : completedContainerStatus) { LOG.info(String.format("Container %s: %s, exit status: %d", status.getContainerId().toString(), status.getState().toString(), status.getExitStatus())); if (status.getExitStatus() == 0) { mNumSuccess += 1;/*from w w w. java2 s . co m*/ } } LOG.info("Total containers: " + mNumContainers + ", completed containers: " + mNumCompletedContainers); if (mNumContainers == mNumCompletedContainers) { // If all workers and master finish synchronized (finalResultLock) { finalResultLock.unlock(); finalResultLock.notifyAll(); } } }
From source file:io.hops.tensorflow.TimelineHandler.java
License:Apache License
public void publishContainerEndEvent(ContainerStatus container) { final TimelineEntity entity = new TimelineEntity(); entity.setEntityId(container.getContainerId().toString()); entity.setEntityType(ApplicationMaster.YarntfEntity.YARNTF_CONTAINER.toString()); entity.setDomainId(domainId);// ww w . j ava 2 s . c o m entity.addPrimaryFilter("user", ugi.getShortUserName()); TimelineEvent event = new TimelineEvent(); event.setTimestamp(System.currentTimeMillis()); event.setEventType(ApplicationMaster.YarntfEvent.YARNTF_CONTAINER_END.toString()); event.addEventInfo("State", container.getState().name()); event.addEventInfo("Exit Status", container.getExitStatus()); entity.addEvent(event); try { timelineClient.putEntities(entity); } catch (YarnException | IOException e) { LOG.error("Container end event could not be published for " + container.getContainerId().toString(), e); } }
From source file:org.apache.drill.yarn.appMaster.TaskState.java
License:Apache License
/**
 * Records the completion status on the task carried by the event context.
 *
 * Emits a trace message with the task label and exit status, followed by the diagnostics when
 * they are not blank, and then stores the status on the task.
 *
 * @param context the event context holding the task that completed
 * @param status the completion status reported for the task's container
 */
protected void completed(EventContext context, ContainerStatus status) {
    final Task finishedTask = context.task;
    final String diagnostics = status.getDiagnostics();

    final StringBuilder message = new StringBuilder()
            .append(finishedTask.getLabel())
            .append(" Completed, exit status: ")
            .append(status.getExitStatus());
    if (!DoYUtil.isBlank(diagnostics)) {
        message.append(": ").append(status.getDiagnostics());
    }
    LOG.trace(message.toString());

    finishedTask.completionStatus = status;
}
From source file:org.apache.flink.yarn.YarnFlinkResourceManager.java
License:Apache License
/** * Invoked when the ResourceManager informs of completed containers. * Called via an actor message by the callback from the ResourceManager client. * //from w w w. java 2 s . c om * @param containers The containers that have completed. */ private void containersComplete(List<ContainerStatus> containers) { // the list contains both failed containers, as well as containers that // were gracefully returned by this application master for (ContainerStatus status : containers) { final ResourceID id = new ResourceID(status.getContainerId().toString()); // check if this is a failed container or a completed container if (containersBeingReturned.remove(status.getContainerId()) != null) { // regular completed container that we released LOG.info("Container {} completed successfully with diagnostics: {}", id, status.getDiagnostics()); } else { // failed container, either at startup, or running final String exitStatus; switch (status.getExitStatus()) { case -103: exitStatus = "Vmem limit exceeded (-103)"; break; case -104: exitStatus = "Pmem limit exceeded (-104)"; break; default: exitStatus = String.valueOf(status.getExitStatus()); } final YarnContainerInLaunch launched = containersInLaunch.remove(id); if (launched != null) { LOG.info("Container {} failed, with a TaskManager in launch or registration. " + "Exit status: {}", id, exitStatus); // we will trigger re-acquiring new containers at the end } else { // failed registered worker LOG.info("Container {} failed. Exit status: {}", id, exitStatus); // notify the generic logic, which notifies the JobManager, etc. notifyWorkerFailed(id, "Container " + id + " failed. 
" + "Exit status: {}" + exitStatus); } // general failure logging failedContainersSoFar++; String diagMessage = String.format( "Diagnostics for container %s in state %s : " + "exitStatus=%s diagnostics=%s", id, status.getState(), exitStatus, status.getDiagnostics()); sendInfoMessage(diagMessage); LOG.info(diagMessage); LOG.info("Total number of failed containers so far: " + failedContainersSoFar); // maxFailedContainers == -1 is infinite number of retries. if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) { String msg = "Stopping YARN session because the number of failed containers (" + failedContainersSoFar + ") exceeded the maximum failed containers (" + maxFailedContainers + "). This number is controlled by the '" + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. " + "By default its the number of requested containers."; LOG.error(msg); self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)), ActorRef.noSender()); // no need to do anything else return; } } } updateProgress(); // in case failed containers were among the finished containers, make // sure we re-examine and request new ones triggerCheckWorkers(); }
From source file:org.apache.flink.yarn.YarnResourceManager.java
License:Apache License
/**
 * Reports every completed container with a negative exit status as a failed worker,
 * passing the container's diagnostics along as the failure message.
 *
 * @param list statuses of the containers that have completed
 */
@Override
public void onContainersCompleted(List<ContainerStatus> list) {
    for (ContainerStatus finished : list) {
        if (finished.getExitStatus() >= 0) {
            // Non-negative exit statuses are not reported.
            continue;
        }
        final ResourceID workerId = new ResourceID(finished.getContainerId().toString());
        notifyWorkerFailed(workerId, finished.getDiagnostics());
    }
}
From source file:org.apache.flink.yarn.YarnResourceManagerTest.java
License:Apache License
/**
 * Builds a mocked {@code ContainerStatus} for the given container id that reports state
 * {@code COMPLETE}, exit status -1 and the diagnostics "Test exit".
 *
 * @param containerId the id the mocked status should report
 * @return the stubbed mock
 */
private static ContainerStatus mockContainerStatus(ContainerId containerId) {
    final ContainerStatus status = mock(ContainerStatus.class);

    // Identity of the container.
    when(status.getContainerId()).thenReturn(containerId);

    // Outcome reported for the container.
    when(status.getExitStatus()).thenReturn(-1);
    when(status.getDiagnostics()).thenReturn("Test exit");
    when(status.getState()).thenReturn(ContainerState.COMPLETE);

    return status;
}
From source file:org.apache.gobblin.yarn.YarnService.java
License:Apache License
/** * Handle the completion of a container. A new container will be requested to replace the one * that just exited. Depending on the exit status and if container host affinity is enabled, * the new container may or may not try to be started on the same node. * * A container completes in either of the following conditions: 1) some error happens in the * container and caused the container to exit, 2) the container gets killed due to some reason, * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster. * A replacement container is needed in all but the last case. */// www . j a v a2s. c om private void handleContainerCompletion(ContainerStatus containerStatus) { Map.Entry<Container, String> completedContainerEntry = this.containerMap .remove(containerStatus.getContainerId()); String completedInstanceName = completedContainerEntry.getValue(); LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d", containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus())); if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { LOGGER.info(String.format("Received the following diagnostics information for container %s: %s", containerStatus.getContainerId(), containerStatus.getDiagnostics())); } if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) { LOGGER.info("Container release requested, so not spawning a replacement for containerId {}", containerStatus.getContainerId()); return; } if (this.shutdownInProgress) { return; } this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0)); int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet(); // Populate event metadata Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent(); if 
(this.eventSubmitter.isPresent()) { eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus)); eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName); eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + ""); } if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) { if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName); return; } // Add the Helix instance name of the completed container to the queue of unused // instance names so they can be reused by a replacement container. this.unusedHelixInstanceNames.offer(completedInstanceName); if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s", containerStatus.getContainerId(), completedInstanceName)); this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus()) ? Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent())); }
From source file:org.apache.hama.bsp.BSPTaskLauncher.java
License:Apache License
/**
 * Polls the container manager for the current status of the allocated container.
 *
 * Every status returned by the container manager is logged until the one belonging to the
 * allocated container is found.
 *
 * @return a task status carrying the container's exit status once the container has reached
 *         {@code ContainerState.COMPLETE}; {@code null} if the container hasn't finished yet
 *         or if no status was reported for the allocated container
 * @throws Exception propagated from the container manager status request
 */
public BSPTaskStatus poll() throws Exception {
    ContainerStatus lastStatus = null;
    GetContainerStatusesResponse getContainerStatusesResponse = cm.getContainerStatuses(statusRequest);
    List<ContainerStatus> containerStatuses = getContainerStatusesResponse.getContainerStatuses();
    for (ContainerStatus containerStatus : containerStatuses) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());

        if (containerStatus.getContainerId().equals(allocatedContainer.getId())) {
            lastStatus = containerStatus;
            break;
        }
    }

    // lastStatus stays null when the response holds no status for the allocated container;
    // treat that the same as "not finished yet" instead of dereferencing null.
    if (lastStatus == null || lastStatus.getState() != ContainerState.COMPLETE) {
        return null;
    }

    LOG.info(this.id + " Last report comes with exitstatus of " + lastStatus.getExitStatus()
            + " and diagnose string of " + lastStatus.getDiagnostics());

    return new BSPTaskStatus(id, lastStatus.getExitStatus());
}