List of usage examples for org.apache.hadoop.yarn.api.records.ContainerStatus#getContainerId
@Public @Stable public abstract ContainerId getContainerId();
Returns: the ContainerId of the container.
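Before the project examples below, here is a minimal sketch of the pattern most of these snippets share: using the ContainerId returned by ContainerStatus#getContainerId() as the key into a map of tracked containers inside a completion callback. The class name CompletedContainerTracker and its running map are hypothetical illustrations; only the org.apache.hadoop.yarn.api.records types come from the API itself.

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

// Hypothetical helper illustrating the typical use of ContainerStatus#getContainerId:
// keying a map of live containers and removing entries as YARN reports completions.
public class CompletedContainerTracker {

    // Containers the application master believes are still running,
    // keyed by ContainerId (which implements equals/hashCode).
    private final Map<ContainerId, Container> running = new ConcurrentHashMap<>();

    public void track(Container container) {
        running.put(container.getId(), container);
    }

    // Mirrors the shape of AMRMClientAsync.CallbackHandler#onContainersCompleted.
    public void onContainersCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            ContainerId id = status.getContainerId();
            Container removed = running.remove(id);
            if (removed == null) {
                System.err.println("Completion for unknown container: " + id);
            } else {
                System.out.println("Container " + id + " finished with exit status "
                    + status.getExitStatus() + ": " + status.getDiagnostics());
            }
        }
    }
}

A concurrent map is used because AMRMClientAsync delivers callbacks on its own dispatcher thread, separate from the thread that registers containers.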
From source file: org.apache.ignite.yarn.ApplicationMaster.java
License: Apache License
/** {@inheritDoc} */
public synchronized void onContainersCompleted(List<ContainerStatus> statuses) {
    for (ContainerStatus status : statuses) {
        containers.remove(status.getContainerId());

        log.log(Level.INFO, "Container completed. Container id: {0}. State: {1}.",
            new Object[] { status.getContainerId(), status.getState() });
    }
}
From source file: org.apache.metron.maas.service.callback.ContainerRequestListener.java
License: Apache License
@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId()
            + ", state=" + containerStatus.getState()
            + ", exitStatus=" + containerStatus.getExitStatus()
            + ", diagnostics=" + containerStatus.getDiagnostics());
        removeContainer(containerStatus.getContainerId());
        LOG.info("REMOVING CONTAINER " + containerStatus.getContainerId());
        serviceDiscoverer.unregisterByContainer(containerStatus.getContainerId() + "");

        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);

        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed
                // counts as completed
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }

        if (timelineClient != null) {
            YarnUtils.INSTANCE.publishContainerEndEvent(timelineClient, containerStatus, domainId, appSubmitterUgi);
        }
    }
}
From source file: org.apache.metron.maas.service.yarn.YarnUtils.java
License: Apache License
public void publishContainerEndEvent(final TimelineClient timelineClient, ContainerStatus container,
        String domainId, UserGroupInformation ugi) {
    final TimelineEntity entity = new TimelineEntity();
    entity.setEntityId(container.getContainerId().toString());
    entity.setEntityType(ApplicationMaster.DSEntity.DS_CONTAINER.toString());
    entity.setDomainId(domainId);
    entity.addPrimaryFilter("user", ugi.getShortUserName());
    TimelineEvent event = new TimelineEvent();
    event.setTimestamp(System.currentTimeMillis());
    event.setEventType(ContainerEvents.CONTAINER_END.toString());
    event.addEventInfo("State", container.getState().name());
    event.addEventInfo("Exit Status", container.getExitStatus());
    entity.addEvent(event);
    try {
        timelineClient.putEntities(entity);
    } catch (YarnException | IOException e) {
        LOG.error("Container end event could not be published for " + container.getContainerId().toString(), e);
    }
}
From source file: org.apache.myriad.scheduler.fgs.NMHeartBeatHandler.java
License: Apache License
@VisibleForTesting
protected Resource getResourcesUnderUse(RMNodeStatusEvent statusEvent) {
    Resource usedResources = Resource.newInstance(0, 0);
    for (ContainerStatus status : statusEvent.getContainers()) {
        if (containerInUse(status)) {
            RMContainer rmContainer = yarnScheduler.getRMContainer(status.getContainerId());
            // (sdaingade) This check is needed as RMContainer information may not be populated
            // immediately after a RM restart.
            if (rmContainer != null) {
                Resources.addTo(usedResources, rmContainer.getAllocatedResource());
            }
        }
    }
    return usedResources;
}
From source file: org.apache.reef.runtime.yarn.driver.YarnContainerManager.java
License: Apache License
/**
 * Handles container status reports. Calls come from YARN.
 *
 * @param value containing the container status
 */
private void onContainerStatus(final ContainerStatus value) {
    final String containerId = value.getContainerId().toString();
    final boolean hasContainer = this.containers.hasContainer(containerId);

    if (hasContainer) {
        LOG.log(Level.FINE, "Received container status: {0}", containerId);

        final ResourceStatusProto.Builder status = ResourceStatusProto.newBuilder().setIdentifier(containerId);

        switch (value.getState()) {
        case COMPLETE:
            LOG.log(Level.FINE, "Container completed: status {0}", value.getExitStatus());
            switch (value.getExitStatus()) {
            case 0:
                status.setState(ReefServiceProtos.State.DONE);
                break;
            case 143:
                status.setState(ReefServiceProtos.State.KILLED);
                break;
            default:
                status.setState(ReefServiceProtos.State.FAILED);
            }
            status.setExitCode(value.getExitStatus());
            // remove the completed container (can be either done/killed/failed) from book keeping
            this.containers.removeAndGet(containerId);
            logContainerRemoval(containerId);
            break;
        default:
            LOG.info("Container running");
            status.setState(ReefServiceProtos.State.RUNNING);
        }

        if (value.getDiagnostics() != null) {
            LOG.log(Level.FINE, "Container diagnostics: {0}", value.getDiagnostics());
            status.setDiagnostics(value.getDiagnostics());
        }

        this.reefEventHandlers.onResourceStatus(status.build());
    }
}
From source file: org.apache.samza.job.yarn.refactor.YarnClusterResourceManager.java
License: Apache License
/**
 * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific
 * ones.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResrcStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info("Container completed from RM " + status);

        SamzaResourceStatus samzaResrcStatus = new SamzaResourceStatus(status.getContainerId().toString(),
            status.getDiagnostics(), status.getExitStatus());
        samzaResrcStatuses.add(samzaResrcStatus);

        int completedContainerID = getIDForContainer(status.getContainerId().toString());
        log.info("Completed container had ID: {}", completedContainerID);

        // remove the container from the list of running containers; if it failed with a
        // non-zero exit code, add it to the list of failed containers.
        if (completedContainerID != INVALID_YARN_CONTAINER_ID) {
            if (state.runningYarnContainers.containsKey(completedContainerID)) {
                log.info("Removing container ID {} from completed containers", completedContainerID);
                state.runningYarnContainers.remove(completedContainerID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    _callback.onResourcesCompleted(samzaResrcStatuses);
}
From source file: org.apache.samza.job.yarn.SamzaTaskManager.java
License: Apache License
/**
 * This method handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as complete or failure.
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info("Got an exit code of {}. This means that container {} was "
            + "killed by YARN, either due to being released by the application "
            + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
            exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info("Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus
            + " - " + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }

            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error("Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                    containerId, containerIdStr);

                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                            + currentFailCount + " times, with last failure " + lastFailureMsDiff
                            + "ms ago. This is greater than retry count of " + retryCount + " and window of "
                            + retryWindowMs + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so reset shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info("Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                            + "this container ID was outside the bounds of the retry window.",
                            containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId, new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }
    }
}
From source file: org.apache.samza.job.yarn.YarnClusterResourceManager.java
License: Apache License
/**
 * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific
 * ones.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResourceStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info("Got completion notification for Container ID: {} with status: {} and state: {}. Diagnostics information: {}.",
            status.getContainerId(), status.getExitStatus(), status.getState(), status.getDiagnostics());

        SamzaResourceStatus samzaResourceStatus = new SamzaResourceStatus(status.getContainerId().toString(),
            status.getDiagnostics(), status.getExitStatus());
        samzaResourceStatuses.add(samzaResourceStatus);

        String completedProcessorID = getRunningProcessorId(status.getContainerId().toString());
        log.info("Completed Container ID: {} had Processor ID: {}", status.getContainerId(), completedProcessorID);

        // remove the container from the list of running containers; if it failed with a
        // non-zero exit code, add it to the list of failed containers.
        if (!completedProcessorID.equals(INVALID_PROCESSOR_ID)) {
            if (state.runningProcessors.containsKey(completedProcessorID)) {
                log.info("Removing Processor ID: {} from YarnClusterResourceManager running processors.", completedProcessorID);
                state.runningProcessors.remove(completedProcessorID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    clusterManagerCallback.onResourcesCompleted(samzaResourceStatuses);
}
From source file: org.apache.slider.server.appmaster.SliderAppMaster.java
License: Apache License
@Override //AMRMClientAsync
public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size());
    for (ContainerStatus status : completedContainers) {
        ContainerId containerId = status.getContainerId();
        LOG_YARN.info("Container Completion for"
            + " containerID={}," + " state={}," + " exitStatus={}," + " diagnostics={}",
            containerId, status.getState(), status.getExitStatus(), status.getDiagnostics());

        // non complete containers should not be here
        assert (status.getState() == ContainerState.COMPLETE);

        AppState.NodeCompletionResult result = appState.onCompletedNode(status);
        if (result.containerFailed) {
            RoleInstance ri = result.roleInstance;
            log.error("Role instance {} failed ", ri);
        }

        // known nodes trigger notifications
        if (!result.unknownNode) {
            getProviderService().notifyContainerCompleted(containerId);
            queue(new UnregisterComponentInstance(containerId, 0, TimeUnit.MILLISECONDS));
        }
    }
    reviewRequestAndReleaseNodes("onContainersCompleted");
}
From source file: org.apache.slider.server.appmaster.state.AppState.java
License: Apache License
/**
 * handle completed node in the CD - move something from the live
 * server list to the completed server list
 * @param status the node that has just completed
 * @return NodeCompletionResult
 */
public synchronized NodeCompletionResult onCompletedNode(ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release : {}", containerId);
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        int releasing = roleStatus.decReleasing();
        int actual = roleStatus.decActual();
        int completedCount = roleStatus.incCompleted();
        log.info("decrementing role count for role {} to {}; releasing={}, completed={}",
            roleStatus.getName(), actual, releasing, completedCount);
        roleHistory.onReleaseCompleted(container, true);

    } else if (surplusNodes.remove(containerId)) {
        //its a surplus one being purged
        result.surplusNode = true;
    } else {
        //a container has failed
        result.containerFailed = true;
        roleInstance = removeOwnedContainer(containerId);
        if (roleInstance != null) {
            //it was active, move it to failed
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look
            // it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            String rolename = roleInstance.role;
            log.info("Failed container in role[{}] : {}", roleId, rolename);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                Container failedContainer = roleInstance.container;

                //build the failure message
                if (failedContainer != null) {
                    String completedLogsUrl = getLogsURLForContainer(failedContainer);
                    message = String.format("Failure %s on host %s: %s",
                        roleInstance.getContainerId().toString(),
                        failedContainer.getNodeId().getHost(), completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId);
                }
                int failed = roleStatus.noteFailed(shortLived, message);
                log.info("Current count of failed role[{}] {} = {}", roleId, rolename, failed);
                if (failedContainer != null) {
                    roleHistory.onFailedContainer(failedContainer, shortLived);
                }
            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            //this isn't a known container.
            log.error("Notified of completed container {} that is not in the list"
                + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
            result.unknownNode = true;
        }
    }

    if (result.surplusNode) {
        //a surplus node
        return result;
    }

    //record the complete node's details; this pulls it from the livenode set
    //remove the node
    ContainerId id = status.getContainerId();
    log.info("Removing node ID {}", id);
    RoleInstance node = getLiveNodes().remove(id);
    if (node != null) {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    } else {
        // not in the list
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();
    }

    // and the active node list if present
    removeOwnedContainer(containerId);

    // finally, verify the node doesn't exist any more
    assert !containersBeingReleased.containsKey(containerId) : "container still in release queue";
    assert !getLiveNodes().containsKey(containerId) : " container still in live nodes";
    assert getOwnedContainer(containerId) == null : "Container still in active container list";

    return result;
}