Usage examples for org.apache.hadoop.yarn.api.records.ContainerStatus.getExitStatus()
@Public @Unstable public abstract int getExitStatus();
Get the exit status for the container.
Note: This is valid only for completed containers, i.e. containers with state ContainerState.COMPLETE.
From source file:org.apache.helix.provisioning.yarn.RMCallbackHandler.java
License:Apache License
@Override public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { GenericApplicationMaster.LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); synchronized (_genericApplicationMaster.allocatedContainerSet) { _genericApplicationMaster.allocatedContainerSet.remove(containerStatus.getContainerId()); SettableFuture<ContainerStopResponse> stopResponseFuture = _genericApplicationMaster.containerStopMap .remove(containerStatus.getContainerId()); if (stopResponseFuture != null) { ContainerStopResponse value = new ContainerStopResponse(); stopResponseFuture.set(value); } else { SettableFuture<ContainerReleaseResponse> releaseResponseFuture = _genericApplicationMaster.containerReleaseMap .remove(containerStatus.getContainerId()); if (releaseResponseFuture != null) { ContainerReleaseResponse value = new ContainerReleaseResponse(); releaseResponseFuture.set(value); }//from ww w . j ava 2 s . c om } } // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason // we do not need to release the container as it would be done // by the RM } } else { // nothing to do // container completed successfully GenericApplicationMaster.LOG.info( "Container completed successfully." + ", containerId=" + containerStatus.getContainerId()); } } }
From source file:org.apache.hoya.yarn.appmaster.HoyaAppMaster.java
License:Apache License
@Override //AMRMClientAsync public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size()); for (ContainerStatus status : completedContainers) { ContainerId containerId = status.getContainerId(); LOG_YARN.info(/* www .j a va 2 s .c o m*/ "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={}," + " diagnostics={}", containerId, status.getState(), status.getExitStatus(), status.getDiagnostics()); // non complete containers should not be here assert (status.getState() == ContainerState.COMPLETE); AppState.NodeCompletionResult result = appState.onCompletedNode(conf, status); if (result.containerFailed) { RoleInstance ri = result.roleInstance; log.error("Role instance {} failed ", ri); } } // ask for more containers if any failed // In the case of Hoya, we don't expect containers to complete since // Hoya is a long running application. Keep track of how many containers // are completing. If too many complete, abort the application // TODO: this needs to be better thought about (and maybe something to // better handle in Yarn for long running apps) try { reviewRequestAndReleaseNodes(); } catch (HoyaInternalStateException e) { log.warn("Exception while flexing nodes", e); } }
From source file:org.apache.hoya.yarn.appmaster.state.AppState.java
License:Apache License
/** * handle completed node in the CD -move something from the live * server list to the completed server list * @param amConf YarnConfiguration/*w w w . ja va2s. c o m*/ * @param status the node that has just completed * @return NodeCompletionResult */ public synchronized NodeCompletionResult onCompletedNode(YarnConfiguration amConf, ContainerStatus status) { ContainerId containerId = status.getContainerId(); NodeCompletionResult result = new NodeCompletionResult(); RoleInstance roleInstance; if (containersBeingReleased.containsKey(containerId)) { log.info("Container was queued for release"); Container container = containersBeingReleased.remove(containerId); RoleStatus roleStatus = lookupRoleStatus(container); log.info("decrementing role count for role {}", roleStatus.getName()); roleStatus.decReleasing(); roleStatus.decActual(); roleStatus.incCompleted(); roleHistory.onReleaseCompleted(container); } else if (surplusNodes.remove(containerId)) { //its a surplus one being purged result.surplusNode = true; } else { //a container has failed result.containerFailed = true; roleInstance = activeContainers.remove(containerId); if (roleInstance != null) { //it was active, move it to failed incFailedCountainerCount(); failedNodes.put(containerId, roleInstance); } else { // the container may have been noted as failed already, so look // it up roleInstance = failedNodes.get(containerId); } if (roleInstance != null) { int roleId = roleInstance.roleId; log.info("Failed container in role {}", roleId); try { RoleStatus roleStatus = lookupRoleStatus(roleId); roleStatus.decActual(); boolean shortLived = isShortLived(roleInstance); String message; if (roleInstance.container != null) { String user = null; try { user = HoyaUtils.getCurrentUser().getShortUserName(); } catch (IOException ioe) { } String completedLogsUrl = null; Container c = roleInstance.container; String url = null; if (amConf != null) { url = amConf.get(YarnConfiguration.YARN_LOG_SERVER_URL); } if (user != null && url 
!= null) { completedLogsUrl = url + "/" + c.getNodeId() + "/" + roleInstance.getContainerId() + "/ctx/" + user; } message = String.format( "Failure %s on host %s" + (completedLogsUrl != null ? ", see %s" : ""), roleInstance.getContainerId(), c.getNodeId().getHost(), completedLogsUrl); } else { message = String.format("Failure %s", containerId.toString()); } roleStatus.noteFailed(message); //have a look to see if it short lived if (shortLived) { roleStatus.incStartFailed(); } if (roleInstance.container != null) { roleHistory.onFailedContainer(roleInstance.container, shortLived); } } catch (YarnRuntimeException e1) { log.error("Failed container of unknown role {}", roleId); } } else { //this isn't a known container. log.error("Notified of completed container {} that is not in the list" + " of active or failed containers", containerId); completionOfUnknownContainerEvent.incrementAndGet(); } } if (result.surplusNode) { //a surplus node return result; } //record the complete node's details; this pulls it from the livenode set //remove the node ContainerId id = status.getContainerId(); RoleInstance node = getLiveNodes().remove(id); if (node == null) { log.warn("Received notification of completion of unknown node {}", id); completionOfNodeNotInLiveListEvent.incrementAndGet(); } else { node.state = ClusterDescription.STATE_DESTROYED; node.exitCode = status.getExitStatus(); node.diagnostics = status.getDiagnostics(); getCompletedNodes().put(id, node); result.roleInstance = node; } return result; }
From source file:org.apache.metron.maas.service.callback.ContainerRequestListener.java
License:Apache License
@SuppressWarnings("unchecked") @Override/*from w w w .j a v a 2 s . c o m*/ public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); removeContainer(containerStatus.getContainerId()); LOG.info("REMOVING CONTAINER " + containerStatus.getContainerId()); serviceDiscoverer.unregisterByContainer(containerStatus.getContainerId() + ""); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { // shell script failed // counts as completed } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason // we do not need to release the container as it would be done // by the RM } } else { // nothing to do // container completed successfully LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId()); } if (timelineClient != null) { YarnUtils.INSTANCE.publishContainerEndEvent(timelineClient, containerStatus, domainId, appSubmitterUgi); } } }
From source file:org.apache.metron.maas.service.yarn.YarnUtils.java
License:Apache License
public void publishContainerEndEvent(final TimelineClient timelineClient, ContainerStatus container, String domainId, UserGroupInformation ugi) { final TimelineEntity entity = new TimelineEntity(); entity.setEntityId(container.getContainerId().toString()); entity.setEntityType(ApplicationMaster.DSEntity.DS_CONTAINER.toString()); entity.setDomainId(domainId);//w w w . ja v a2 s.c o m entity.addPrimaryFilter("user", ugi.getShortUserName()); TimelineEvent event = new TimelineEvent(); event.setTimestamp(System.currentTimeMillis()); event.setEventType(ContainerEvents.CONTAINER_END.toString()); event.addEventInfo("State", container.getState().name()); event.addEventInfo("Exit Status", container.getExitStatus()); entity.addEvent(event); try { timelineClient.putEntities(entity); } catch (YarnException | IOException e) { LOG.error("Container end event could not be published for " + container.getContainerId().toString(), e); } }
From source file:org.apache.reef.runtime.yarn.driver.YarnContainerManager.java
License:Apache License
/** * Handles container status reports. Calls come from YARN. * * @param value containing the container status *///from ww w.ja va 2 s. c o m private void onContainerStatus(final ContainerStatus value) { final String containerId = value.getContainerId().toString(); final boolean hasContainer = this.containers.hasContainer(containerId); if (hasContainer) { LOG.log(Level.FINE, "Received container status: {0}", containerId); final ResourceStatusProto.Builder status = ResourceStatusProto.newBuilder().setIdentifier(containerId); switch (value.getState()) { case COMPLETE: LOG.log(Level.FINE, "Container completed: status {0}", value.getExitStatus()); switch (value.getExitStatus()) { case 0: status.setState(ReefServiceProtos.State.DONE); break; case 143: status.setState(ReefServiceProtos.State.KILLED); break; default: status.setState(ReefServiceProtos.State.FAILED); } status.setExitCode(value.getExitStatus()); // remove the completed container (can be either done/killed/failed) from book keeping this.containers.removeAndGet(containerId); logContainerRemoval(containerId); break; default: LOG.info("Container running"); status.setState(ReefServiceProtos.State.RUNNING); } if (value.getDiagnostics() != null) { LOG.log(Level.FINE, "Container diagnostics: {0}", value.getDiagnostics()); status.setDiagnostics(value.getDiagnostics()); } this.reefEventHandlers.onResourceStatus(status.build()); } }
From source file:org.apache.samza.job.yarn.refactor.YarnClusterResourceManager.java
License:Apache License
/** * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific * ones./* ww w .jav a2 s . co m*/ * * @param statuses the YarnContainerStatus callbacks from Yarn. */ @Override public void onContainersCompleted(List<ContainerStatus> statuses) { List<SamzaResourceStatus> samzaResrcStatuses = new ArrayList<>(); for (ContainerStatus status : statuses) { log.info("Container completed from RM " + status); SamzaResourceStatus samzaResrcStatus = new SamzaResourceStatus(status.getContainerId().toString(), status.getDiagnostics(), status.getExitStatus()); samzaResrcStatuses.add(samzaResrcStatus); int completedContainerID = getIDForContainer(status.getContainerId().toString()); log.info("Completed container had ID: {}", completedContainerID); //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of //failed containers. if (completedContainerID != INVALID_YARN_CONTAINER_ID) { if (state.runningYarnContainers.containsKey(completedContainerID)) { log.info("Removing container ID {} from completed containers", completedContainerID); state.runningYarnContainers.remove(completedContainerID); if (status.getExitStatus() != ContainerExitStatus.SUCCESS) state.failedContainersStatus.put(status.getContainerId().toString(), status); } } } _callback.onResourcesCompleted(samzaResrcStatuses); }
From source file:org.apache.samza.job.yarn.SamzaTaskManager.java
License:Apache License
/** * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides * whether a container that exited is marked as complete or failure. *//*from w ww. j av a 2s .c om*/ @Override public void onContainerCompleted(ContainerStatus containerStatus) { String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId()); int containerId = -1; for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) { if (entry.getValue().id().equals(containerStatus.getContainerId())) { containerId = entry.getKey(); break; } } state.runningContainers.remove(containerId); int exitStatus = containerStatus.getExitStatus(); switch (exitStatus) { case ContainerExitStatus.SUCCESS: log.info("Container {} completed successfully.", containerIdStr); state.completedContainers.incrementAndGet(); if (containerId != -1) { state.finishedContainers.add(containerId); containerFailures.remove(containerId); } if (state.completedContainers.get() == state.containerCount) { log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed."); state.status = FinalApplicationStatus.SUCCEEDED; } break; case ContainerExitStatus.DISKS_FAILED: case ContainerExitStatus.ABORTED: case ContainerExitStatus.PREEMPTED: log.info( "Got an exit code of {}. This means that container {} was " + "killed by YARN, either due to being released by the application " + "master or being 'lost' due to node failures etc. or due to preemption by the RM", exitStatus, containerIdStr); state.releasedContainers.incrementAndGet(); // If this container was assigned some partitions (a containerId), then // clean up, and request a new container for the tasks. This only // should happen if the container was 'lost' due to node failure, not // if the AM released the container. if (containerId != -1) { log.info( "Released container {} was assigned task group ID {}. 
Requesting a new container for the task group.", containerIdStr, containerId); state.neededContainers.incrementAndGet(); state.jobHealthy.set(false); // request a container on new host containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST); } break; default: // TODO: Handle failure more intelligently. Should track NodeFailures! log.info("Container failed for some reason. Let's start it again"); log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - " + containerStatus.getDiagnostics()); state.failedContainers.incrementAndGet(); state.failedContainersStatus.put(containerIdStr, containerStatus); state.jobHealthy.set(false); if (containerId != -1) { state.neededContainers.incrementAndGet(); // Find out previously running container location String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId, SetContainerHostMapping.HOST_KEY); if (!hostAffinityEnabled || lastSeenOn == null) { lastSeenOn = ContainerAllocator.ANY_HOST; } // A container failed for an unknown reason. Let's check to see if // we need to shutdown the whole app master if too many container // failures have happened. The rules for failing are that the // failure count for a task group id must be > the configured retry // count, and the last failure (the one prior to this one) must have // happened less than retry window ms ago. If retry count is set to // 0, the app master will fail on any container failure. If the // retry count is set to a number < 0, a container failure will // never trigger an app master failure. 
int retryCount = yarnConfig.getContainerRetryCount(); int retryWindowMs = yarnConfig.getContainerRetryWindowMs(); if (retryCount == 0) { log.error( "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.", containerId, containerIdStr); tooManyFailedContainers = true; } else if (retryCount > 0) { int currentFailCount; long lastFailureTime; if (containerFailures.containsKey(containerId)) { ContainerFailure failure = containerFailures.get(containerId); currentFailCount = failure.getCount() + 1; lastFailureTime = failure.getLastFailure(); } else { currentFailCount = 1; lastFailureTime = 0L; } if (currentFailCount >= retryCount) { long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime; if (lastFailureMsDiff < retryWindowMs) { log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed " + currentFailCount + " times, with last failure " + lastFailureMsDiff + "ms ago. This is greater than retry count of " + retryCount + " and window of " + retryWindowMs + "ms , so shutting down the application master, and marking the job as failed."); // We have too many failures, and we're within the window // boundary, so reset shut down the app master. tooManyFailedContainers = true; state.status = FinalApplicationStatus.FAILED; } else { log.info( "Resetting fail count for container ID {} back to 1, since last container failure ({}) for " + "this container ID was outside the bounds of the retry window.", containerId, containerIdStr); // Reset counter back to 1, since the last failure for this // container happened outside the window boundary. 
containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis())); } } else { log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount); containerFailures.put(containerId, new ContainerFailure(currentFailCount, System.currentTimeMillis())); } } if (!tooManyFailedContainers) { // Request a new container containerAllocator.requestContainer(containerId, lastSeenOn); } } } }
From source file:org.apache.samza.job.yarn.YarnClusterResourceManager.java
License:Apache License
/** * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific * ones.//w w w . ja v a2s.c o m * * @param statuses the YarnContainerStatus callbacks from Yarn. */ @Override public void onContainersCompleted(List<ContainerStatus> statuses) { List<SamzaResourceStatus> samzaResourceStatuses = new ArrayList<>(); for (ContainerStatus status : statuses) { log.info( "Got completion notification for Container ID: {} with status: {} and state: {}. Diagnostics information: {}.", status.getContainerId(), status.getExitStatus(), status.getState(), status.getDiagnostics()); SamzaResourceStatus samzaResourceStatus = new SamzaResourceStatus(status.getContainerId().toString(), status.getDiagnostics(), status.getExitStatus()); samzaResourceStatuses.add(samzaResourceStatus); String completedProcessorID = getRunningProcessorId(status.getContainerId().toString()); log.info("Completed Container ID: {} had Processor ID: {}", status.getContainerId(), completedProcessorID); //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of //failed containers. if (!completedProcessorID.equals(INVALID_PROCESSOR_ID)) { if (state.runningProcessors.containsKey(completedProcessorID)) { log.info("Removing Processor ID: {} from YarnClusterResourceManager running processors.", completedProcessorID); state.runningProcessors.remove(completedProcessorID); if (status.getExitStatus() != ContainerExitStatus.SUCCESS) state.failedContainersStatus.put(status.getContainerId().toString(), status); } } } clusterManagerCallback.onResourcesCompleted(samzaResourceStatuses); }
From source file:org.apache.slider.server.appmaster.SliderAppMaster.java
License:Apache License
@Override //AMRMClientAsync public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size()); for (ContainerStatus status : completedContainers) { ContainerId containerId = status.getContainerId(); LOG_YARN.info(/*w ww.ja v a 2 s . co m*/ "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={}," + " diagnostics={}", containerId, status.getState(), status.getExitStatus(), status.getDiagnostics()); // non complete containers should not be here assert (status.getState() == ContainerState.COMPLETE); AppState.NodeCompletionResult result = appState.onCompletedNode(status); if (result.containerFailed) { RoleInstance ri = result.roleInstance; log.error("Role instance {} failed ", ri); } // known nodes trigger notifications if (!result.unknownNode) { getProviderService().notifyContainerCompleted(containerId); queue(new UnregisterComponentInstance(containerId, 0, TimeUnit.MILLISECONDS)); } } reviewRequestAndReleaseNodes("onContainersCompleted"); }