Example usage for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId

List of usage examples for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId

Introduction

On this page you can find example usage for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId.

Prototype

@Public
@Stable
public abstract ContainerId getContainerId();

Document

Get the ContainerId of the container.

Usage
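
Before the project-specific examples, here is a minimal, self-contained sketch of the common pattern: an ApplicationMaster callback receives a list of ContainerStatus objects from the ResourceManager and uses getContainerId() to correlate each completed container with its own bookkeeping. The ContainerTracker class, its methods, and the runningWorkers map are illustrative assumptions, not part of the YARN API; only ContainerStatus, ContainerId, and their getters come from Hadoop.

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public class ContainerTracker {

    // Hypothetical bookkeeping: maps each running container to a worker name.
    private final Map<ContainerId, String> runningWorkers = new ConcurrentHashMap<ContainerId, String>();

    public void registerWorker(ContainerId id, String workerName) {
        runningWorkers.put(id, workerName);
    }

    // Typically invoked with the statuses reported by the ResourceManager, e.g. from
    // AMRMClientAsync.CallbackHandler#onContainersCompleted.
    public void handleCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            // getContainerId() is the key for matching the status to our own state
            ContainerId id = status.getContainerId();
            String worker = runningWorkers.remove(id);
            if (worker != null) {
                System.out.println("Worker " + worker + " (container " + id + ") exited with status "
                        + status.getExitStatus() + ": " + status.getDiagnostics());
            }
        }
    }
}

All of the project examples below follow this same correlate-by-ContainerId pattern.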

From source file: org.apache.flink.yarn.appMaster.ApplicationMaster.java

License: Apache License

/**
 * Run a Thread to allocate new containers until taskManagerCount
 * is correct again.
 */
private void allocateOutstandingContainer(StringBuffer containerDiag) throws Exception {

    // respect custom JVM options in the YAML file
    final String javaOpts = GlobalConfiguration.getString(ConfigConstants.FLINK_JVM_OPTIONS, "");

    int allocatedContainers = 0;
    while (allocatedContainers < taskManagerCount) {
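        // heartbeat the ResourceManager; the response carries any newly allocated containers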
        AllocateResponse response = rmClient.allocate(0);
        for (Container container : response.getAllocatedContainers()) {
            LOG.info("Got new Container for TM " + container.getId() + " on host "
                    + container.getNodeId().getHost());
            ++allocatedContainers;

            // Launch container by create ContainerLaunchContext
            ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);

            String tmCommand = "$JAVA_HOME/bin/java -Xmx" + heapLimit + "m " + javaOpts;
            if (hasLogback) {
                tmCommand += " -Dlog.file=\"" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                        + "/taskmanager-logback" + ".log\" -Dlogback.configurationFile=file:logback.xml";
            }
            tmCommand += " " + YarnTaskManagerRunner.class.getName() + " -configDir . " + " 1>"
                    + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stdout.log" + " 2>"
                    + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stderr.log";
            ctx.setCommands(Collections.singletonList(tmCommand));

            LOG.info("Starting TM with command=" + tmCommand);

            ctx.setLocalResources(taskManagerLocalResources);

            // Setup CLASSPATH for Container (=TaskTracker)
            Map<String, String> containerEnv = new HashMap<String, String>();
            Utils.setupEnv(conf, containerEnv); //add flink.jar to class path.
            containerEnv.put(Client.ENV_CLIENT_USERNAME, yarnClientUsername);

            ctx.setEnvironment(containerEnv);

            UserGroupInformation user = UserGroupInformation.getCurrentUser();
            try {
                Credentials credentials = user.getCredentials();
                DataOutputBuffer dob = new DataOutputBuffer();
                credentials.writeTokenStorageToStream(dob);
                ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
                ctx.setTokens(securityTokens);
            } catch (IOException e) {
                LOG.warn("Getting current user info failed when trying to launch the container", e);
            }

            LOG.info("Launching container " + allocatedContainers);
            nmClient.startContainer(container, ctx);
        }
        for (ContainerStatus status : response.getCompletedContainersStatuses()) {
            ++completedContainers;
            LOG.info("Completed container (while allocating) " + status.getContainerId() + ". Total Completed:"
                    + completedContainers);
            LOG.info("Diagnostics " + status.getDiagnostics());
            logDeadContainer(status, containerDiag);
        }
        Thread.sleep(100);
    }
}

From source file: org.apache.flink.yarn.appMaster.ApplicationMaster.java

License: Apache License

private void logDeadContainer(ContainerStatus status, StringBuffer containerDiag) {
    String msg = "Diagnostics for containerId=" + status.getContainerId() + " in state=" + status.getState()
            + "\n" + status.getDiagnostics();
    messages.add(new Message(msg));
    containerDiag.append("\n\n");
    containerDiag.append(msg);
}

From source file: org.apache.flink.yarn.YarnFlinkResourceManager.java

License: Apache License

/**
 * Invoked when the ResourceManager informs of completed containers.
 * Called via an actor message by the callback from the ResourceManager client.
 *
 * @param containers The containers that have completed.
 */
private void containersComplete(List<ContainerStatus> containers) {
    // the list contains both failed containers, as well as containers that
    // were gracefully returned by this application master

    for (ContainerStatus status : containers) {
        final ResourceID id = new ResourceID(status.getContainerId().toString());

        // check if this is a failed container or a completed container
        if (containersBeingReturned.remove(status.getContainerId()) != null) {
            // regular completed container that we released
            LOG.info("Container {} completed successfully with diagnostics: {}", id, status.getDiagnostics());
        } else {
            // failed container, either at startup, or running
            final String exitStatus;
            switch (status.getExitStatus()) {
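            // Note: these literals match ContainerExitStatus.KILLED_EXCEEDED_VMEM (-103) and
            // KILLED_EXCEEDED_PMEM (-104) in newer Hadoop versions; numeric values are
            // presumably used here for compatibility with older Hadoop releases.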
            case -103:
                exitStatus = "Vmem limit exceeded (-103)";
                break;
            case -104:
                exitStatus = "Pmem limit exceeded (-104)";
                break;
            default:
                exitStatus = String.valueOf(status.getExitStatus());
            }

            final YarnContainerInLaunch launched = containersInLaunch.remove(id);
            if (launched != null) {
                LOG.info("Container {} failed, with a TaskManager in launch or registration. "
                        + "Exit status: {}", id, exitStatus);
                // we will trigger re-acquiring new containers at the end
            } else {
                // failed registered worker
                LOG.info("Container {} failed. Exit status: {}", id, exitStatus);

                // notify the generic logic, which notifies the JobManager, etc.
                notifyWorkerFailed(id, "Container " + id + " failed. " + "Exit status: {}" + exitStatus);
            }

            // general failure logging
            failedContainersSoFar++;

            String diagMessage = String.format(
                    "Diagnostics for container %s in state %s : " + "exitStatus=%s diagnostics=%s", id,
                    status.getState(), exitStatus, status.getDiagnostics());
            sendInfoMessage(diagMessage);

            LOG.info(diagMessage);
            LOG.info("Total number of failed containers so far: " + failedContainersSoFar);

            // maxFailedContainers == -1 is infinite number of retries.
            if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                String msg = "Stopping YARN session because the number of failed containers ("
                        + failedContainersSoFar + ") exceeded the maximum failed containers ("
                        + maxFailedContainers + "). This number is controlled by the '"
                        + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. "
                        + "By default its the number of requested containers.";

                LOG.error(msg);
                self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)),
                        ActorRef.noSender());

                // no need to do anything else
                return;
            }
        }
    }

    updateProgress();

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}

From source file: org.apache.flink.yarn.YarnResourceManager.java

License: Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> list) {
    for (ContainerStatus container : list) {
        if (container.getExitStatus() < 0) {
            notifyWorkerFailed(new ResourceID(container.getContainerId().toString()),
                    container.getDiagnostics());
        }
    }
}

From source file: org.apache.flink.yarn.YarnResourceManagerTest.java

License: Apache License

private static ContainerStatus mockContainerStatus(ContainerId containerId) {
    ContainerStatus mockContainerStatus = mock(ContainerStatus.class);

    when(mockContainerStatus.getContainerId()).thenReturn(containerId);
    when(mockContainerStatus.getState()).thenReturn(ContainerState.COMPLETE);
    when(mockContainerStatus.getDiagnostics()).thenReturn("Test exit");
    when(mockContainerStatus.getExitStatus()).thenReturn(-1);

    return mockContainerStatus;
}

From source file: org.apache.gobblin.yarn.YarnService.java

License: Apache License

/**
 * Handle the completion of a container. A new container will be requested to replace the one
 * that just exited. Depending on the exit status and whether container host affinity is enabled,
 * the new container may or may not try to start on the same node.
 *
 * A container completes under any of the following conditions: 1) an error happens in the
 * container and causes it to exit, 2) the container gets killed for some reason, for example
 * because it runs over the allowed amount of virtual or physical memory, 3) the container gets
 * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
 * A replacement container is needed in all but the last case.
 */
private void handleContainerCompletion(ContainerStatus containerStatus) {
    Map.Entry<Container, String> completedContainerEntry = this.containerMap
            .remove(containerStatus.getContainerId());
    String completedInstanceName = completedContainerEntry.getValue();

    LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
        LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
                containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) {
        LOGGER.info("Container release requested, so not spawning a replacement for containerId {}",
                containerStatus.getContainerId());
        return;
    }

    if (this.shutdownInProgress) {
        return;
    }

    this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0));
    int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet();

    // Populate event metadata
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
        eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
        eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID,
                completedInstanceName);
        eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
                retryCount + "");
    }

    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
        if (this.eventSubmitter.isPresent()) {
            this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                    eventMetadataBuilder.get().build());
        }

        LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
        return;
    }

    // Add the Helix instance name of the completed container to the queue of unused
    // instance names so they can be reused by a replacement container.
    this.unusedHelixInstanceNames.offer(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadataBuilder.get().build());
    }

    LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), completedInstanceName));
    this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus())
            ? Optional.of(completedContainerEntry.getKey())
            : Optional.<Container>absent()));
}

From source file: org.apache.hama.bsp.BSPTaskLauncher.java

License: Apache License

/**
 * Polls the current container status from the container manager.
 *
 * @return the BSPTaskStatus, or null if the container hasn't finished yet
 * @throws Exception
 */
public BSPTaskStatus poll() throws Exception {

    ContainerStatus lastStatus = null;
    GetContainerStatusesResponse getContainerStatusesResponse = cm.getContainerStatuses(statusRequest);
    List<ContainerStatus> containerStatuses = getContainerStatusesResponse.getContainerStatuses();
    for (ContainerStatus containerStatus : containerStatuses) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());

        if (containerStatus.getContainerId().equals(allocatedContainer.getId())) {
            lastStatus = containerStatus;
            break;
        }
    }
    if (lastStatus == null || lastStatus.getState() != ContainerState.COMPLETE) {
        return null;
    }
    LOG.info(this.id + " Last report comes with exit status " + lastStatus.getExitStatus()
            + " and diagnostics " + lastStatus.getDiagnostics());

    return new BSPTaskStatus(id, lastStatus.getExitStatus());
}

From source file: org.apache.helix.provisioning.yarn.RMCallbackHandler.java

License: Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        GenericApplicationMaster.LOG.info("Got container status for containerID="
                + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus="
                + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());

        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        synchronized (_genericApplicationMaster.allocatedContainerSet) {
            _genericApplicationMaster.allocatedContainerSet.remove(containerStatus.getContainerId());
            SettableFuture<ContainerStopResponse> stopResponseFuture = _genericApplicationMaster.containerStopMap
                    .remove(containerStatus.getContainerId());
            if (stopResponseFuture != null) {
                ContainerStopResponse value = new ContainerStopResponse();
                stopResponseFuture.set(value);
            } else {
                SettableFuture<ContainerReleaseResponse> releaseResponseFuture = _genericApplicationMaster.containerReleaseMap
                        .remove(containerStatus.getContainerId());
                if (releaseResponseFuture != null) {
                    ContainerReleaseResponse value = new ContainerReleaseResponse();
                    releaseResponseFuture.set(value);
                }
            }
        }
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
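                // the container failed on its own (exit status is neither 0 nor ABORTED);
                // this branch is left empty in this example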

            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason

                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            GenericApplicationMaster.LOG.info(
                    "Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
    }
}

From source file: org.apache.hoya.yarn.appmaster.HoyaAppMaster.java

License: Apache License

@Override //AMRMClientAsync
public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size());
    for (ContainerStatus status : completedContainers) {
        ContainerId containerId = status.getContainerId();
        LOG_YARN.info(
                "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={},"
                        + " diagnostics={}",
                containerId, status.getState(), status.getExitStatus(), status.getDiagnostics());

        // non-complete containers should not be here
        assert (status.getState() == ContainerState.COMPLETE);
        AppState.NodeCompletionResult result = appState.onCompletedNode(conf, status);
        if (result.containerFailed) {
            RoleInstance ri = result.roleInstance;
            log.error("Role instance {} failed ", ri);
        }
    }

    // ask for more containers if any failed
    // In the case of Hoya, we don't expect containers to complete since
    // Hoya is a long running application. Keep track of how many containers
    // are completing. If too many complete, abort the application
    // TODO: this needs to be better thought about (and maybe something to
    // better handle in Yarn for long running apps)

    try {
        reviewRequestAndReleaseNodes();
    } catch (HoyaInternalStateException e) {
        log.warn("Exception while flexing nodes", e);
    }
}

From source file: org.apache.hoya.yarn.appmaster.state.AppState.java

License: Apache License

/**
 * Handle a completed node in the cluster description: move it from the live
 * server list to the completed server list.
 * @param amConf the YarnConfiguration
 * @param status the node that has just completed
 * @return the NodeCompletionResult
 */
public synchronized NodeCompletionResult onCompletedNode(YarnConfiguration amConf, ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release");
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        log.info("decrementing role count for role {}", roleStatus.getName());
        roleStatus.decReleasing();
        roleStatus.decActual();
        roleStatus.incCompleted();
        roleHistory.onReleaseCompleted(container);

    } else if (surplusNodes.remove(containerId)) {
        //it's a surplus one being purged
        result.surplusNode = true;
    } else {
        //a container has failed 
        result.containerFailed = true;
        roleInstance = activeContainers.remove(containerId);
        if (roleInstance != null) {
            //it was active, move it to failed 
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look
            // it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            log.info("Failed container in role {}", roleId);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                if (roleInstance.container != null) {
                    String user = null;
                    try {
                        user = HoyaUtils.getCurrentUser().getShortUserName();
                    } catch (IOException ioe) {
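                        // ignored: the user name is only used to build the completed-logs URL,
                        // which is simply omitted if unavailable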
                    }
                    String completedLogsUrl = null;
                    Container c = roleInstance.container;
                    String url = null;
                    if (amConf != null) {
                        url = amConf.get(YarnConfiguration.YARN_LOG_SERVER_URL);
                    }
                    if (user != null && url != null) {
                        completedLogsUrl = url + "/" + c.getNodeId() + "/" + roleInstance.getContainerId()
                                + "/ctx/" + user;
                    }
                    message = String.format(
                            "Failure %s on host %s" + (completedLogsUrl != null ? ", see %s" : ""),
                            roleInstance.getContainerId(), c.getNodeId().getHost(), completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId.toString());
                }
                roleStatus.noteFailed(message);
                //have a look to see if it was short-lived
                if (shortLived) {
                    roleStatus.incStartFailed();
                }

                if (roleInstance.container != null) {
                    roleHistory.onFailedContainer(roleInstance.container, shortLived);
                }

            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            //this isn't a known container.

            log.error("Notified of completed container {} that is not in the list"
                    + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
        }
    }

    if (result.surplusNode) {
        //a surplus node
        return result;
    }

    //record the completed node's details; this pulls it from the live node set
    //remove the node
    ContainerId id = status.getContainerId();
    RoleInstance node = getLiveNodes().remove(id);
    if (node == null) {
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();

    } else {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    }
    return result;
}