Example usage for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId

List of usage examples for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId

Introduction

On this page you can find example usage for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId.

Prototype

@Public
@Stable
public abstract ContainerId getContainerId();

Document

Get the ContainerId of the container.
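
Below is a minimal sketch, assuming an AMRMClientAsync-style completion callback, of how getContainerId() is typically read from a ContainerStatus. The class name LoggingCompletionHandler is hypothetical and used only for illustration; the real call sites follow in the Usage section.

import java.util.List;

import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public class LoggingCompletionHandler {
    // Hypothetical handler: logs the id and exit status of each completed container.
    public void onContainersCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            ContainerId containerId = status.getContainerId();
            System.out.println("Container " + containerId
                    + " completed with exit status " + status.getExitStatus());
        }
    }
}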

Usage

From source file: org.apache.ignite.yarn.ApplicationMaster.java

License: Apache License

/** {@inheritDoc} */
public synchronized void onContainersCompleted(List<ContainerStatus> statuses) {
    for (ContainerStatus status : statuses) {
        containers.remove(status.getContainerId());

        log.log(Level.INFO, "Container completed. Container id: {0}. State: {1}.",
                new Object[] { status.getContainerId(), status.getState() });
    }
}

From source file: org.apache.metron.maas.service.callback.ContainerRequestListener.java

License: Apache License

@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());
        removeContainer(containerStatus.getContainerId());
        LOG.info("REMOVING CONTAINER " + containerStatus.getContainerId());
        serviceDiscoverer.unregisterByContainer(containerStatus.getContainerId() + "");
        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed
                // counts as completed
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
        if (timelineClient != null) {
            YarnUtils.INSTANCE.publishContainerEndEvent(timelineClient, containerStatus, domainId,
                    appSubmitterUgi);
        }
    }
}

From source file: org.apache.metron.maas.service.yarn.YarnUtils.java

License: Apache License

public void publishContainerEndEvent(final TimelineClient timelineClient, ContainerStatus container,
        String domainId, UserGroupInformation ugi) {
    final TimelineEntity entity = new TimelineEntity();
    entity.setEntityId(container.getContainerId().toString());
    entity.setEntityType(ApplicationMaster.DSEntity.DS_CONTAINER.toString());
    entity.setDomainId(domainId);
    entity.addPrimaryFilter("user", ugi.getShortUserName());
    TimelineEvent event = new TimelineEvent();
    event.setTimestamp(System.currentTimeMillis());
    event.setEventType(ContainerEvents.CONTAINER_END.toString());
    event.addEventInfo("State", container.getState().name());
    event.addEventInfo("Exit Status", container.getExitStatus());
    entity.addEvent(event);
    try {
        timelineClient.putEntities(entity);
    } catch (YarnException | IOException e) {
        LOG.error("Container end event could not be published for " + container.getContainerId().toString(), e);
    }
}

From source file: org.apache.myriad.scheduler.fgs.NMHeartBeatHandler.java

License: Apache License

@VisibleForTesting
protected Resource getResourcesUnderUse(RMNodeStatusEvent statusEvent) {
    Resource usedResources = Resource.newInstance(0, 0);
    for (ContainerStatus status : statusEvent.getContainers()) {
        if (containerInUse(status)) {
            RMContainer rmContainer = yarnScheduler.getRMContainer(status.getContainerId());
            // (sdaingade) This check is needed as RMContainer information may not be populated
            // immediately after a RM restart.
            if (rmContainer != null) {
                Resources.addTo(usedResources, rmContainer.getAllocatedResource());
            }
        }
    }
    return usedResources;
}

From source file: org.apache.reef.runtime.yarn.driver.YarnContainerManager.java

License: Apache License

/**
 * Handles container status reports. Calls come from YARN.
 *
 * @param value containing the container status
 */
private void onContainerStatus(final ContainerStatus value) {

    final String containerId = value.getContainerId().toString();
    final boolean hasContainer = this.containers.hasContainer(containerId);

    if (hasContainer) {
        LOG.log(Level.FINE, "Received container status: {0}", containerId);

        final ResourceStatusProto.Builder status = ResourceStatusProto.newBuilder().setIdentifier(containerId);

        switch (value.getState()) {
        case COMPLETE:
            LOG.log(Level.FINE, "Container completed: status {0}", value.getExitStatus());
            switch (value.getExitStatus()) {
            case 0:
                status.setState(ReefServiceProtos.State.DONE);
                break;
            case 143:
                status.setState(ReefServiceProtos.State.KILLED);
                break;
            default:
                status.setState(ReefServiceProtos.State.FAILED);
            }
            status.setExitCode(value.getExitStatus());
            // remove the completed container (can be either done/killed/failed) from book keeping
            this.containers.removeAndGet(containerId);
            logContainerRemoval(containerId);
            break;
        default:
            LOG.info("Container running");
            status.setState(ReefServiceProtos.State.RUNNING);
        }

        if (value.getDiagnostics() != null) {
            LOG.log(Level.FINE, "Container diagnostics: {0}", value.getDiagnostics());
            status.setDiagnostics(value.getDiagnostics());
        }

        this.reefEventHandlers.onResourceStatus(status.build());
    }
}

From source file: org.apache.samza.job.yarn.refactor.YarnClusterResourceManager.java

License: Apache License

/**
 * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific
 * ones.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResrcStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info("Container completed from RM " + status);

        SamzaResourceStatus samzaResrcStatus = new SamzaResourceStatus(status.getContainerId().toString(),
                status.getDiagnostics(), status.getExitStatus());
        samzaResrcStatuses.add(samzaResrcStatus);

        int completedContainerID = getIDForContainer(status.getContainerId().toString());
        log.info("Completed container had ID: {}", completedContainerID);

        //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of
        //failed containers.
        if (completedContainerID != INVALID_YARN_CONTAINER_ID) {
            if (state.runningYarnContainers.containsKey(completedContainerID)) {
                log.info("Removing container ID {} from completed containers", completedContainerID);
                state.runningYarnContainers.remove(completedContainerID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    _callback.onResourcesCompleted(samzaResrcStatuses);
}

From source file: org.apache.samza.job.yarn.SamzaTaskManager.java

License: Apache License

/**
 * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as complete or failure.
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either due to being released by the application "
                        + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so reset shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}

From source file: org.apache.samza.job.yarn.YarnClusterResourceManager.java

License: Apache License

/**
 * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific
 * ones.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResourceStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info(
                "Got completion notification for Container ID: {} with status: {} and state: {}. Diagnostics information: {}.",
                status.getContainerId(), status.getExitStatus(), status.getState(), status.getDiagnostics());

        SamzaResourceStatus samzaResourceStatus = new SamzaResourceStatus(status.getContainerId().toString(),
                status.getDiagnostics(), status.getExitStatus());
        samzaResourceStatuses.add(samzaResourceStatus);

        String completedProcessorID = getRunningProcessorId(status.getContainerId().toString());
        log.info("Completed Container ID: {} had Processor ID: {}", status.getContainerId(),
                completedProcessorID);

        //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of
        //failed containers.
        if (!completedProcessorID.equals(INVALID_PROCESSOR_ID)) {
            if (state.runningProcessors.containsKey(completedProcessorID)) {
                log.info("Removing Processor ID: {} from YarnClusterResourceManager running processors.",
                        completedProcessorID);
                state.runningProcessors.remove(completedProcessorID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    clusterManagerCallback.onResourcesCompleted(samzaResourceStatuses);
}

From source file: org.apache.slider.server.appmaster.SliderAppMaster.java

License: Apache License

@Override //AMRMClientAsync
public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size());
    for (ContainerStatus status : completedContainers) {
        ContainerId containerId = status.getContainerId();
        LOG_YARN.info(
                "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={},"
                        + " diagnostics={}",
                containerId, status.getState(), status.getExitStatus(), status.getDiagnostics());

        // non complete containers should not be here
        assert (status.getState() == ContainerState.COMPLETE);
        AppState.NodeCompletionResult result = appState.onCompletedNode(status);
        if (result.containerFailed) {
            RoleInstance ri = result.roleInstance;
            log.error("Role instance {} failed ", ri);
        }

        //  known nodes trigger notifications
        if (!result.unknownNode) {
            getProviderService().notifyContainerCompleted(containerId);
            queue(new UnregisterComponentInstance(containerId, 0, TimeUnit.MILLISECONDS));
        }
    }

    reviewRequestAndReleaseNodes("onContainersCompleted");
}

From source file: org.apache.slider.server.appmaster.state.AppState.java

License: Apache License

/**
 * handle completed node in the CD -move something from the live
 * server list to the completed server list
 * @param status the node that has just completed
 * @return NodeCompletionResult
 */
public synchronized NodeCompletionResult onCompletedNode(ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release : {}", containerId);
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        int releasing = roleStatus.decReleasing();
        int actual = roleStatus.decActual();
        int completedCount = roleStatus.incCompleted();
        log.info("decrementing role count for role {} to {}; releasing={}, completed={}", roleStatus.getName(),
                actual, releasing, completedCount);
        roleHistory.onReleaseCompleted(container, true);

    } else if (surplusNodes.remove(containerId)) {
        //its a surplus one being purged
        result.surplusNode = true;
    } else {
        //a container has failed 
        result.containerFailed = true;
        roleInstance = removeOwnedContainer(containerId);
        if (roleInstance != null) {
            //it was active, move it to failed 
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look
            // it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            String rolename = roleInstance.role;
            log.info("Failed container in role[{}] : {}", roleId, rolename);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                Container failedContainer = roleInstance.container;

                //build the failure message
                if (failedContainer != null) {
                    String completedLogsUrl = getLogsURLForContainer(failedContainer);
                    message = String.format("Failure %s on host %s: %s",
                            roleInstance.getContainerId().toString(), failedContainer.getNodeId().getHost(),
                            completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId);
                }
                int failed = roleStatus.noteFailed(shortLived, message);
                log.info("Current count of failed role[{}] {} =  {}", roleId, rolename, failed);
                if (failedContainer != null) {
                    roleHistory.onFailedContainer(failedContainer, shortLived);
                }

            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            //this isn't a known container.

            log.error("Notified of completed container {} that is not in the list"
                    + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
            result.unknownNode = true;
        }
    }

    if (result.surplusNode) {
        //a surplus node
        return result;
    }

    //record the complete node's details; this pulls it from the livenode set 
    //remove the node
    ContainerId id = status.getContainerId();
    log.info("Removing node ID {}", id);
    RoleInstance node = getLiveNodes().remove(id);
    if (node != null) {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    } else {
        // not in the list
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();

    }

    // and the active node list if present
    removeOwnedContainer(containerId);

    // finally, verify the node doesn't exist any more
    assert !containersBeingReleased.containsKey(containerId) : "container still in release queue";
    assert !getLiveNodes().containsKey(containerId) : " container still in live nodes";
    assert getOwnedContainer(containerId) == null : "Container still in active container list";

    return result;
}