Usage examples for org.apache.hadoop.yarn.api.records.ContainerStatus.getContainerId()
@Public @Stable public abstract ContainerId getContainerId();
Returns the ContainerId of the container.
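A minimal sketch of the typical usage pattern, inside an AMRMClientAsync.CallbackHandler (the logger name is illustrative):

@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    for (ContainerStatus status : statuses) {
        // getContainerId() identifies which container this status refers to
        ContainerId id = status.getContainerId();
        LOG.info("Container {} finished with exit status {}", id, status.getExitStatus());
    }
}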
From source file:org.apache.flink.yarn.appMaster.ApplicationMaster.java
License:Apache License
/**
 * Run a Thread to allocate new containers until taskManagerCount
 * is correct again.
 */
private void allocateOutstandingContainer(StringBuffer containerDiag) throws Exception {

    // respect custom JVM options in the YAML file
    final String javaOpts = GlobalConfiguration.getString(ConfigConstants.FLINK_JVM_OPTIONS, "");

    int allocatedContainers = 0;
    while (allocatedContainers < taskManagerCount) {
        AllocateResponse response = rmClient.allocate(0);
        for (Container container : response.getAllocatedContainers()) {
            LOG.info("Got new Container for TM " + container.getId() + " on host "
                    + container.getNodeId().getHost());
            ++allocatedContainers;

            // Launch the container by creating a ContainerLaunchContext
            ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);

            String tmCommand = "$JAVA_HOME/bin/java -Xmx" + heapLimit + "m " + javaOpts;
            if (hasLogback) {
                tmCommand += " -Dlog.file=\"" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                        + "/taskmanager-logback" + ".log\" -Dlogback.configurationFile=file:logback.xml";
            }
            tmCommand += " " + YarnTaskManagerRunner.class.getName() + " -configDir . " + " 1>"
                    + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stdout.log" + " 2>"
                    + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stderr.log";
            ctx.setCommands(Collections.singletonList(tmCommand));

            LOG.info("Starting TM with command=" + tmCommand);

            ctx.setLocalResources(taskManagerLocalResources);

            // Setup CLASSPATH for the container (= TaskManager)
            Map<String, String> containerEnv = new HashMap<String, String>();
            Utils.setupEnv(conf, containerEnv); // add flink.jar to the classpath
            containerEnv.put(Client.ENV_CLIENT_USERNAME, yarnClientUsername);
            ctx.setEnvironment(containerEnv);

            UserGroupInformation user = UserGroupInformation.getCurrentUser();
            try {
                Credentials credentials = user.getCredentials();
                DataOutputBuffer dob = new DataOutputBuffer();
                credentials.writeTokenStorageToStream(dob);
                ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
                ctx.setTokens(securityTokens);
            } catch (IOException e) {
                LOG.warn("Getting current user info failed when trying to launch the container", e);
            }

            LOG.info("Launching container " + allocatedContainers);
            nmClient.startContainer(container, ctx);
        }
        for (ContainerStatus status : response.getCompletedContainersStatuses()) {
            ++completedContainers;
            LOG.info("Completed container (while allocating) " + status.getContainerId()
                    + ". Total Completed:" + completedContainers);
            LOG.info("Diagnostics " + status.getDiagnostics());
            logDeadContainer(status, containerDiag);
        }
        Thread.sleep(100);
    }
}
From source file:org.apache.flink.yarn.appMaster.ApplicationMaster.java
License:Apache License
private void logDeadContainer(ContainerStatus status, StringBuffer containerDiag) {
    String msg = "Diagnostics for containerId=" + status.getContainerId() + " in state=" + status.getState()
            + "\n" + status.getDiagnostics();
    messages.add(new Message(msg));
    containerDiag.append("\n\n");
    containerDiag.append(msg);
}
From source file:org.apache.flink.yarn.YarnFlinkResourceManager.java
License:Apache License
/**
 * Invoked when the ResourceManager informs of completed containers.
 * Called via an actor message by the callback from the ResourceManager client.
 *
 * @param containers The containers that have completed.
 */
private void containersComplete(List<ContainerStatus> containers) {
    // the list contains both failed containers and containers that
    // were gracefully returned by this application master
    for (ContainerStatus status : containers) {
        final ResourceID id = new ResourceID(status.getContainerId().toString());

        // check if this is a failed container or a completed container
        if (containersBeingReturned.remove(status.getContainerId()) != null) {
            // regular completed container that we released
            LOG.info("Container {} completed successfully with diagnostics: {}", id,
                    status.getDiagnostics());
        } else {
            // failed container, either at startup or while running
            final String exitStatus;
            switch (status.getExitStatus()) {
                case -103:
                    exitStatus = "Vmem limit exceeded (-103)";
                    break;
                case -104:
                    exitStatus = "Pmem limit exceeded (-104)";
                    break;
                default:
                    exitStatus = String.valueOf(status.getExitStatus());
            }

            final YarnContainerInLaunch launched = containersInLaunch.remove(id);
            if (launched != null) {
                LOG.info("Container {} failed, with a TaskManager in launch or registration. "
                        + "Exit status: {}", id, exitStatus);
                // we will trigger re-acquiring new containers at the end
            } else {
                // failed registered worker
                LOG.info("Container {} failed. Exit status: {}", id, exitStatus);

                // notify the generic logic, which notifies the JobManager, etc.
                notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
            }

            // general failure logging
            failedContainersSoFar++;

            String diagMessage = String.format(
                    "Diagnostics for container %s in state %s : exitStatus=%s diagnostics=%s",
                    id, status.getState(), exitStatus, status.getDiagnostics());
            sendInfoMessage(diagMessage);

            LOG.info(diagMessage);
            LOG.info("Total number of failed containers so far: " + failedContainersSoFar);

            // maxFailedContainers == -1 means an infinite number of retries
            if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                String msg = "Stopping YARN session because the number of failed containers ("
                        + failedContainersSoFar + ") exceeded the maximum failed containers ("
                        + maxFailedContainers + "). This number is controlled by the '"
                        + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. "
                        + "By default it is the number of requested containers.";

                LOG.error(msg);
                self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)),
                        ActorRef.noSender());

                // no need to do anything else
                return;
            }
        }
    }

    updateProgress();

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
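The raw -103/-104 literals in the switch above correspond to named constants in org.apache.hadoop.yarn.api.records.ContainerExitStatus; an equivalent sketch, assuming a Hadoop version that defines these constants:

switch (status.getExitStatus()) {
    case ContainerExitStatus.KILLED_EXCEEDED_VMEM: // -103
        exitStatus = "Vmem limit exceeded (-103)";
        break;
    case ContainerExitStatus.KILLED_EXCEEDED_PMEM: // -104
        exitStatus = "Pmem limit exceeded (-104)";
        break;
    default:
        exitStatus = String.valueOf(status.getExitStatus());
}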
From source file:org.apache.flink.yarn.YarnResourceManager.java
License:Apache License
@Override
public void onContainersCompleted(List<ContainerStatus> list) {
    for (ContainerStatus container : list) {
        if (container.getExitStatus() < 0) {
            notifyWorkerFailed(new ResourceID(container.getContainerId().toString()),
                    container.getDiagnostics());
        }
    }
}
From source file:org.apache.flink.yarn.YarnResourceManagerTest.java
License:Apache License
private static ContainerStatus mockContainerStatus(ContainerId containerId) {
    ContainerStatus mockContainerStatus = mock(ContainerStatus.class);

    when(mockContainerStatus.getContainerId()).thenReturn(containerId);
    when(mockContainerStatus.getState()).thenReturn(ContainerState.COMPLETE);
    when(mockContainerStatus.getDiagnostics()).thenReturn("Test exit");
    when(mockContainerStatus.getExitStatus()).thenReturn(-1);

    return mockContainerStatus;
}
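The helper above can drive the onContainersCompleted path in a test; a minimal sketch, where the resourceManager under test and the appAttemptId are assumed to be set up elsewhere in the test class:

ContainerId containerId = ContainerId.newContainerId(appAttemptId, 1);
ContainerStatus status = mockContainerStatus(containerId);
// simulate the RM reporting the container as failed (exit status -1)
resourceManager.onContainersCompleted(Collections.singletonList(status));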
From source file:org.apache.gobblin.yarn.YarnService.java
License:Apache License
/**
 * Handle the completion of a container. A new container will be requested to replace the one
 * that just exited. Depending on the exit status and whether container host affinity is enabled,
 * the new container may or may not try to be started on the same node.
 *
 * A container completes under one of the following conditions: 1) some error happens in the
 * container and causes it to exit, 2) the container gets killed for some reason, for example,
 * if it runs over the allowed amount of virtual or physical memory, 3) the container gets
 * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
 * A replacement container is needed in all but the last case.
 */
private void handleContainerCompletion(ContainerStatus containerStatus) {
    Map.Entry<Container, String> completedContainerEntry = this.containerMap
            .remove(containerStatus.getContainerId());
    String completedInstanceName = completedContainerEntry.getValue();

    LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
        LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
                containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) {
        LOGGER.info("Container release requested, so not spawning a replacement for containerId {}",
                containerStatus.getContainerId());
        return;
    }

    if (this.shutdownInProgress) {
        return;
    }

    this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0));
    int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet();

    // Populate event metadata
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
        eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
        eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID,
                completedInstanceName);
        eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
                retryCount + "");
    }

    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
        if (this.eventSubmitter.isPresent()) {
            this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                    eventMetadataBuilder.get().build());
        }
        LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
        return;
    }

    // Add the Helix instance name of the completed container to the queue of unused
    // instance names so they can be reused by a replacement container.
    this.unusedHelixInstanceNames.offer(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadataBuilder.get().build());
    }

    LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), completedInstanceName));
    this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus())
            ? Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent()));
}
From source file:org.apache.hama.bsp.BSPTaskLauncher.java
License:Apache License
/**
 * Polls the current container status from the container manager.
 *
 * @return the completed task status, or null if the container hasn't finished yet.
 * @throws Exception
 */
public BSPTaskStatus poll() throws Exception {
    ContainerStatus lastStatus = null;
    GetContainerStatusesResponse getContainerStatusesResponse = cm.getContainerStatuses(statusRequest);
    List<ContainerStatus> containerStatuses = getContainerStatusesResponse.getContainerStatuses();
    for (ContainerStatus containerStatus : containerStatuses) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());
        if (containerStatus.getContainerId().equals(allocatedContainer.getId())) {
            lastStatus = containerStatus;
            break;
        }
    }

    // guard against the response not containing our container at all,
    // and return null while the container is still running
    if (lastStatus == null || lastStatus.getState() != ContainerState.COMPLETE) {
        return null;
    }

    LOG.info(this.id + " Last report comes with exitstatus of " + lastStatus.getExitStatus()
            + " and diagnose string of " + lastStatus.getDiagnostics());
    return new BSPTaskStatus(id, lastStatus.getExitStatus());
}
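For context, the statusRequest consumed by poll() can be built from the allocated container's id; a minimal sketch, assuming the allocatedContainer field from the surrounding class:

// GetContainerStatusesRequest lives in org.apache.hadoop.yarn.api.protocolrecords
List<ContainerId> ids = Collections.singletonList(allocatedContainer.getId());
GetContainerStatusesRequest statusRequest = GetContainerStatusesRequest.newInstance(ids);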
From source file:org.apache.helix.provisioning.yarn.RMCallbackHandler.java
License:Apache License
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        GenericApplicationMaster.LOG.info("Got container status for containerID="
                + containerStatus.getContainerId() + ", state=" + containerStatus.getState()
                + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics="
                + containerStatus.getDiagnostics());

        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);

        synchronized (_genericApplicationMaster.allocatedContainerSet) {
            _genericApplicationMaster.allocatedContainerSet.remove(containerStatus.getContainerId());
            SettableFuture<ContainerStopResponse> stopResponseFuture = _genericApplicationMaster.containerStopMap
                    .remove(containerStatus.getContainerId());
            if (stopResponseFuture != null) {
                ContainerStopResponse value = new ContainerStopResponse();
                stopResponseFuture.set(value);
            } else {
                SettableFuture<ContainerReleaseResponse> releaseResponseFuture = _genericApplicationMaster.containerReleaseMap
                        .remove(containerStatus.getContainerId());
                if (releaseResponseFuture != null) {
                    ContainerReleaseResponse value = new ContainerReleaseResponse();
                    releaseResponseFuture.set(value);
                }
            }
        }

        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // failed for a reason other than being aborted; nothing to do here
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // container completed successfully; nothing to do
            GenericApplicationMaster.LOG.info(
                    "Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
    }
}
From source file:org.apache.hoya.yarn.appmaster.HoyaAppMaster.java
License:Apache License
@Override // AMRMClientAsync
public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG_YARN.info("onContainersCompleted([{}])", completedContainers.size());
    for (ContainerStatus status : completedContainers) {
        ContainerId containerId = status.getContainerId();
        LOG_YARN.info(
                "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={},"
                        + " diagnostics={}",
                containerId, status.getState(), status.getExitStatus(), status.getDiagnostics());

        // non-complete containers should not be here
        assert (status.getState() == ContainerState.COMPLETE);

        AppState.NodeCompletionResult result = appState.onCompletedNode(conf, status);
        if (result.containerFailed) {
            RoleInstance ri = result.roleInstance;
            log.error("Role instance {} failed ", ri);
        }
    }

    // ask for more containers if any failed
    // In the case of Hoya, we don't expect containers to complete since
    // Hoya is a long-running application. Keep track of how many containers
    // are completing. If too many complete, abort the application.
    // TODO: this needs to be better thought about (and maybe something to
    // better handle in YARN for long-running apps)
    try {
        reviewRequestAndReleaseNodes();
    } catch (HoyaInternalStateException e) {
        log.warn("Exception while flexing nodes", e);
    }
}
From source file:org.apache.hoya.yarn.appmaster.state.AppState.java
License:Apache License
/**
 * Handle a completed node: move it from the live node list to the
 * completed node list.
 * @param amConf YarnConfiguration
 * @param status the node that has just completed
 * @return NodeCompletionResult
 */
public synchronized NodeCompletionResult onCompletedNode(YarnConfiguration amConf, ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release");
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        log.info("decrementing role count for role {}", roleStatus.getName());
        roleStatus.decReleasing();
        roleStatus.decActual();
        roleStatus.incCompleted();
        roleHistory.onReleaseCompleted(container);
    } else if (surplusNodes.remove(containerId)) {
        // it's a surplus container being purged
        result.surplusNode = true;
    } else {
        // a container has failed
        result.containerFailed = true;
        roleInstance = activeContainers.remove(containerId);
        if (roleInstance != null) {
            // it was active, move it to failed
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            log.info("Failed container in role {}", roleId);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                if (roleInstance.container != null) {
                    String user = null;
                    try {
                        user = HoyaUtils.getCurrentUser().getShortUserName();
                    } catch (IOException ioe) {
                    }
                    String completedLogsUrl = null;
                    Container c = roleInstance.container;
                    String url = null;
                    if (amConf != null) {
                        url = amConf.get(YarnConfiguration.YARN_LOG_SERVER_URL);
                    }
                    if (user != null && url != null) {
                        completedLogsUrl = url + "/" + c.getNodeId() + "/" + roleInstance.getContainerId()
                                + "/ctx/" + user;
                    }
                    message = String.format(
                            "Failure %s on host %s" + (completedLogsUrl != null ? ", see %s" : ""),
                            roleInstance.getContainerId(), c.getNodeId().getHost(), completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId.toString());
                }
                roleStatus.noteFailed(message);
                // check whether the container was short-lived
                if (shortLived) {
                    roleStatus.incStartFailed();
                }
                if (roleInstance.container != null) {
                    roleHistory.onFailedContainer(roleInstance.container, shortLived);
                }
            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            // this isn't a known container
            log.error("Notified of completed container {} that is not in the list"
                    + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
        }
    }

    if (result.surplusNode) {
        // a surplus node; nothing further to record
        return result;
    }

    // record the completed node's details; this removes it from the live node set
    ContainerId id = status.getContainerId();
    RoleInstance node = getLiveNodes().remove(id);
    if (node == null) {
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();
    } else {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    }
    return result;
}