Example usage for org.apache.hadoop.yarn.api.records ContainerStatus getExitStatus

Introduction

On this page you can find example usage for org.apache.hadoop.yarn.api.records.ContainerStatus.getExitStatus().

Prototype

@Public
@Unstable
public abstract int getExitStatus();

Document

Get the exit status for the container.

Note: This is valid only for completed containers, i.e. containers with state ContainerState.COMPLETE; for a container in any other state it returns ContainerExitStatus.INVALID.
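
Since the exit status is only meaningful once the container has reached the COMPLETE state, a defensive caller checks the state before reading it. A minimal sketch (the class and method names here are illustrative, not taken from any of the projects below):

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public final class ExitStatusCheck {
    /** Returns true only when the container has completed with a clean exit. */
    public static boolean completedSuccessfully(ContainerStatus status) {
        if (status.getState() != ContainerState.COMPLETE) {
            // getExitStatus() is not meaningful yet; YARN reports ContainerExitStatus.INVALID here
            return false;
        }
        return status.getExitStatus() == ContainerExitStatus.SUCCESS;
    }
}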

Usage

From source file:org.apache.helix.provisioning.yarn.RMCallbackHandler.java

License:Apache License

@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        GenericApplicationMaster.LOG.info("Got container status for containerID="
                + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus="
                + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());

        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        synchronized (_genericApplicationMaster.allocatedContainerSet) {
            _genericApplicationMaster.allocatedContainerSet.remove(containerStatus.getContainerId());
            SettableFuture<ContainerStopResponse> stopResponseFuture = _genericApplicationMaster.containerStopMap
                    .remove(containerStatus.getContainerId());
            if (stopResponseFuture != null) {
                ContainerStopResponse value = new ContainerStopResponse();
                stopResponseFuture.set(value);
            } else {
                SettableFuture<ContainerReleaseResponse> releaseResponseFuture = _genericApplicationMaster.containerReleaseMap
                        .remove(containerStatus.getContainerId());
                if (releaseResponseFuture != null) {
                    ContainerReleaseResponse value = new ContainerReleaseResponse();
                    releaseResponseFuture.set(value);
                }
            }
        }
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // container exited with an application failure; nothing further is done here
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason

                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            GenericApplicationMaster.LOG.info(
                    "Container completed successfully, containerId=" + containerStatus.getContainerId());
        }
    }
}
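
The branches above intentionally take no action; what typically belongs there is a classification of the exit status into framework-initiated exits, which are worth re-requesting, versus application failures. A minimal sketch of that classification, assuming only the standard ContainerExitStatus constants (the class and method names are hypothetical):

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;

final class ExitStatusPolicy {
    // Hypothetical helper, not part of the Helix source above.
    static boolean isRetriable(int exitStatus) {
        switch (exitStatus) {
        case ContainerExitStatus.ABORTED:      // released by the AM, or lost with its node
        case ContainerExitStatus.PREEMPTED:    // preempted by the scheduler
        case ContainerExitStatus.DISKS_FAILED: // local disks on the node failed
            return true;  // framework-initiated: re-requesting a container is reasonable
        default:
            return false; // SUCCESS (0) or an application failure: do not blindly retry
        }
    }
}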

From source file:org.apache.hoya.yarn.appmaster.HoyaAppMaster.java

License:Apache License

@Override //AMRMClientAsync
public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size());
    for (ContainerStatus status : completedContainers) {
        ContainerId containerId = status.getContainerId();
        LOG_YARN.info(
                "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={},"
                        + " diagnostics={}",
                containerId, status.getState(), status.getExitStatus(), status.getDiagnostics());

        // non-complete containers should not be here
        assert (status.getState() == ContainerState.COMPLETE);
        AppState.NodeCompletionResult result = appState.onCompletedNode(conf, status);
        if (result.containerFailed) {
            RoleInstance ri = result.roleInstance;
            log.error("Role instance {} failed ", ri);
        }
    }

    // ask for more containers if any failed
    // In the case of Hoya, we don't expect containers to complete since
    // Hoya is a long running application. Keep track of how many containers
    // are completing. If too many complete, abort the application
    // TODO: this needs to be better thought about (and maybe something to
    // better handle in Yarn for long running apps)

    try {
        reviewRequestAndReleaseNodes();
    } catch (HoyaInternalStateException e) {
        log.warn("Exception while flexing nodes", e);
    }
}

From source file:org.apache.hoya.yarn.appmaster.state.AppState.java

License:Apache License

/**
 * handle a completed node in the cluster description: move it from the live
 * server list to the completed server list
 * @param amConf YarnConfiguration
 * @param status the node that has just completed
 * @return NodeCompletionResult
 */
public synchronized NodeCompletionResult onCompletedNode(YarnConfiguration amConf, ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release");
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        log.info("decrementing role count for role {}", roleStatus.getName());
        roleStatus.decReleasing();
        roleStatus.decActual();
        roleStatus.incCompleted();
        roleHistory.onReleaseCompleted(container);

    } else if (surplusNodes.remove(containerId)) {
        //it's a surplus one being purged
        result.surplusNode = true;
    } else {
        //a container has failed 
        result.containerFailed = true;
        roleInstance = activeContainers.remove(containerId);
        if (roleInstance != null) {
            //it was active, move it to failed 
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look
            // it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            log.info("Failed container in role {}", roleId);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                if (roleInstance.container != null) {
                    String user = null;
                    try {
                        user = HoyaUtils.getCurrentUser().getShortUserName();
                    } catch (IOException ioe) {
                        // ignore; with a null user the completed-logs URL below is simply omitted
                    }
                    String completedLogsUrl = null;
                    Container c = roleInstance.container;
                    String url = null;
                    if (amConf != null) {
                        url = amConf.get(YarnConfiguration.YARN_LOG_SERVER_URL);
                    }
                    if (user != null && url != null) {
                        completedLogsUrl = url + "/" + c.getNodeId() + "/" + roleInstance.getContainerId()
                                + "/ctx/" + user;
                    }
                    message = String.format(
                            "Failure %s on host %s" + (completedLogsUrl != null ? ", see %s" : ""),
                            roleInstance.getContainerId(), c.getNodeId().getHost(), completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId.toString());
                }
                roleStatus.noteFailed(message);
                //have a look to see if it was short-lived
                if (shortLived) {
                    roleStatus.incStartFailed();
                }

                if (roleInstance.container != null) {
                    roleHistory.onFailedContainer(roleInstance.container, shortLived);
                }

            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            //this isn't a known container.

            log.error("Notified of completed container {} that is not in the list"
                    + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
        }
    }

    if (result.surplusNode) {
        //a surplus node
        return result;
    }

    //record the completed node's details; this removes it from the live-node set
    ContainerId id = status.getContainerId();
    RoleInstance node = getLiveNodes().remove(id);
    if (node == null) {
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();

    } else {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    }
    return result;
}

From source file:org.apache.metron.maas.service.callback.ContainerRequestListener.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
    for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());
        removeContainer(containerStatus.getContainerId());
        LOG.info("REMOVING CONTAINER " + containerStatus.getContainerId());
        serviceDiscoverer.unregisterByContainer(containerStatus.getContainerId() + "");
        // non-complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
            // container failed
            if (ContainerExitStatus.ABORTED != exitStatus) {
                // shell script failed
                // counts as completed
            } else {
                // container was killed by framework, possibly preempted
                // we should re-try as the container was lost for some reason
                // we do not need to release the container as it would be done
                // by the RM
            }
        } else {
            // nothing to do
            // container completed successfully
            LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
        }
        if (timelineClient != null) {
            YarnUtils.INSTANCE.publishContainerEndEvent(timelineClient, containerStatus, domainId,
                    appSubmitterUgi);
        }
    }
}

From source file:org.apache.metron.maas.service.yarn.YarnUtils.java

License:Apache License

public void publishContainerEndEvent(final TimelineClient timelineClient, ContainerStatus container,
        String domainId, UserGroupInformation ugi) {
    final TimelineEntity entity = new TimelineEntity();
    entity.setEntityId(container.getContainerId().toString());
    entity.setEntityType(ApplicationMaster.DSEntity.DS_CONTAINER.toString());
    entity.setDomainId(domainId);
    entity.addPrimaryFilter("user", ugi.getShortUserName());
    TimelineEvent event = new TimelineEvent();
    event.setTimestamp(System.currentTimeMillis());
    event.setEventType(ContainerEvents.CONTAINER_END.toString());
    event.addEventInfo("State", container.getState().name());
    event.addEventInfo("Exit Status", container.getExitStatus());
    entity.addEvent(event);
    try {
        timelineClient.putEntities(entity);
    } catch (YarnException | IOException e) {
        LOG.error("Container end event could not be published for " + container.getContainerId().toString(), e);
    }
}

From source file:org.apache.reef.runtime.yarn.driver.YarnContainerManager.java

License:Apache License

/**
 * Handles container status reports. Calls come from YARN.
 *
 * @param value containing the container status
 */
private void onContainerStatus(final ContainerStatus value) {

    final String containerId = value.getContainerId().toString();
    final boolean hasContainer = this.containers.hasContainer(containerId);

    if (hasContainer) {
        LOG.log(Level.FINE, "Received container status: {0}", containerId);

        final ResourceStatusProto.Builder status = ResourceStatusProto.newBuilder().setIdentifier(containerId);

        switch (value.getState()) {
        case COMPLETE:
            LOG.log(Level.FINE, "Container completed: status {0}", value.getExitStatus());
            switch (value.getExitStatus()) {
            case 0:
                status.setState(ReefServiceProtos.State.DONE);
                break;
            case 143:
                // 143 = 128 + SIGTERM(15): the container process was killed
                status.setState(ReefServiceProtos.State.KILLED);
                break;
            default:
                status.setState(ReefServiceProtos.State.FAILED);
            }
            status.setExitCode(value.getExitStatus());
            // remove the completed container (can be either done/killed/failed) from bookkeeping
            this.containers.removeAndGet(containerId);
            logContainerRemoval(containerId);
            break;
        default:
            LOG.info("Container running");
            status.setState(ReefServiceProtos.State.RUNNING);
        }

        if (value.getDiagnostics() != null) {
            LOG.log(Level.FINE, "Container diagnostics: {0}", value.getDiagnostics());
            status.setDiagnostics(value.getDiagnostics());
        }

        this.reefEventHandlers.onResourceStatus(status.build());
    }
}

From source file:org.apache.samza.job.yarn.refactor.YarnClusterResourceManager.java

License:Apache License

/**
 * Callback invoked from Yarn when containers complete. This translates the Yarn callbacks into Samza-specific
 * ones.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResrcStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info("Container completed from RM " + status);

        SamzaResourceStatus samzaResrcStatus = new SamzaResourceStatus(status.getContainerId().toString(),
                status.getDiagnostics(), status.getExitStatus());
        samzaResrcStatuses.add(samzaResrcStatus);

        int completedContainerID = getIDForContainer(status.getContainerId().toString());
        log.info("Completed container had ID: {}", completedContainerID);

        //remove the container from the list of running containers; if it failed with a non-zero exit code,
        //add it to the list of failed containers.
        if (completedContainerID != INVALID_YARN_CONTAINER_ID) {
            if (state.runningYarnContainers.containsKey(completedContainerID)) {
                log.info("Removing container ID {} from completed containers", completedContainerID);
                state.runningYarnContainers.remove(completedContainerID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    _callback.onResourcesCompleted(samzaResrcStatuses);
}

From source file:org.apache.samza.job.yarn.SamzaTaskManager.java

License:Apache License

/**
 * This method handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as completed or failed.
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either due to being released by the application "
                        + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}
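
To make the retry rules above concrete: with a retry count of 3, the first two failures of a task group only record a ContainerFailure; the third raises currentFailCount to the limit, and if it arrives within the retry window of the second failure the handler sets tooManyFailedContainers and marks the job FAILED, while a third failure outside the window resets the count to 1 and requests a replacement container.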

From source file:org.apache.samza.job.yarn.YarnClusterResourceManager.java

License:Apache License

/**
 * Callback invoked from Yarn when containers complete. This translates the Yarn callbacks into Samza-specific
 * ones.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResourceStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info(
                "Got completion notification for Container ID: {} with status: {} and state: {}. Diagnostics information: {}.",
                status.getContainerId(), status.getExitStatus(), status.getState(), status.getDiagnostics());

        SamzaResourceStatus samzaResourceStatus = new SamzaResourceStatus(status.getContainerId().toString(),
                status.getDiagnostics(), status.getExitStatus());
        samzaResourceStatuses.add(samzaResourceStatus);

        String completedProcessorID = getRunningProcessorId(status.getContainerId().toString());
        log.info("Completed Container ID: {} had Processor ID: {}", status.getContainerId(),
                completedProcessorID);

        //remove the container from the list of running containers; if it failed with a non-zero exit code,
        //add it to the list of failed containers.
        if (!completedProcessorID.equals(INVALID_PROCESSOR_ID)) {
            if (state.runningProcessors.containsKey(completedProcessorID)) {
                log.info("Removing Processor ID: {} from YarnClusterResourceManager running processors.",
                        completedProcessorID);
                state.runningProcessors.remove(completedProcessorID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    clusterManagerCallback.onResourcesCompleted(samzaResourceStatuses);
}

From source file:org.apache.slider.server.appmaster.SliderAppMaster.java

License:Apache License

@Override //AMRMClientAsync
public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) {
    LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size());
    for (ContainerStatus status : completedContainers) {
        ContainerId containerId = status.getContainerId();
        LOG_YARN.info(
                "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={},"
                        + " diagnostics={}",
                containerId, status.getState(), status.getExitStatus(), status.getDiagnostics());

        // non-complete containers should not be here
        assert (status.getState() == ContainerState.COMPLETE);
        AppState.NodeCompletionResult result = appState.onCompletedNode(status);
        if (result.containerFailed) {
            RoleInstance ri = result.roleInstance;
            log.error("Role instance {} failed ", ri);
        }

        //  known nodes trigger notifications
        if (!result.unknownNode) {
            getProviderService().notifyContainerCompleted(containerId);
            queue(new UnregisterComponentInstance(containerId, 0, TimeUnit.MILLISECONDS));
        }
    }

    reviewRequestAndReleaseNodes("onContainersCompleted");
}