Example usage for org.apache.hadoop.yarn.api.records ContainerStatus getExitStatus

List of usage examples for org.apache.hadoop.yarn.api.records ContainerStatus getExitStatus

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records.ContainerStatus#getExitStatus().

Prototype

@Public
@Unstable
public abstract int getExitStatus();

Source Link

Document

Get the exit status for the container.

Note: This is valid only for completed containers, i.e. containers with state ContainerState.COMPLETE.

Usage

From source file:gobblin.yarn.YarnService.java

License:Apache License

/**
 * Handle the completion of a container. A new container will be requested to replace the one
 * that just exited. Depending on the exit status and if container host affinity is enabled,
 * the new container may or may not try to be started on the same node.
 *
 * A container completes in either of the following conditions: 1) some error happens in the
 * container and caused the container to exit, 2) the container gets killed due to some reason,
 * for example, if it runs over the allowed amount of virtual or physical memory, 3) the container
 * gets preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
 * A replacement container is needed in all but the last case.
 *
 * No replacement is requested while a shutdown is in progress, or once the Helix instance has
 * exceeded the configured maximum number of retries (a maximum of zero or less disables the limit).
 *
 * @param containerStatus the status reported for the completed container
 */
private void handleContainerCompletion(ContainerStatus containerStatus) {
    // NOTE(review): if the container id is not tracked, remove() presumably yields null and the
    // getValue() call below would throw -- confirm that only tracked containers are reported here.
    Map.Entry<Container, String> completedContainerEntry = this.containerMap
            .remove(containerStatus.getContainerId());
    String completedInstanceName = completedContainerEntry.getValue();

    LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
        LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
                containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.shutdownInProgress) {
        return;
    }

    // putIfAbsent() returns the PREVIOUS mapping, which is null the first time an instance
    // completes. Chaining incrementAndGet() onto it therefore throws on the first completion,
    // so register the counter first and look it up afterwards.
    this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0));
    int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet();

    // Populate event metadata (only needed when an event submitter is configured)
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
        eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
        eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID,
                completedInstanceName);
        eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
                retryCount + "");
    }

    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
        // Retries exhausted: still report the completion, but do not request a replacement.
        if (this.eventSubmitter.isPresent()) {
            this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                    eventMetadataBuilder.get().build());
        }

        LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
        return;
    }

    // Add the Helix instance name of the completed container to the queue of unused
    // instance names so they can be reused by a replacement container.
    this.unusedHelixInstanceNames.offer(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadataBuilder.get().build());
    }

    LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), completedInstanceName));
    // Pass the completed container along only when the replacement should stay on the same node.
    this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus())
            ? Optional.of(completedContainerEntry.getKey())
            : Optional.<Container>absent()));
}

From source file:gobblin.yarn.YarnService.java

License:Apache License

/**
 * Creates an event-metadata builder pre-populated from the given container status.
 *
 * The container id and container state are always recorded. The exit status is recorded only
 * when it differs from {@code ContainerExitStatus.INVALID}, and the diagnostics only when they
 * are neither null nor empty.
 *
 * @param containerStatus the container status to describe
 * @return a builder (not yet built) so that callers can append further entries
 */
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(
        ContainerStatus containerStatus) {
    ImmutableMap.Builder<String, String> metadata = new ImmutableMap.Builder<>();

    final String containerId = containerStatus.getContainerId().toString();
    metadata.put(GobblinYarnMetricTagNames.CONTAINER_ID, containerId);

    final String containerState = containerStatus.getState().toString();
    metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE, containerState);

    final int exitStatus = containerStatus.getExitStatus();
    if (exitStatus != ContainerExitStatus.INVALID) {
        metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
                String.valueOf(exitStatus));
    }

    final String diagnostics = containerStatus.getDiagnostics();
    if (!Strings.isNullOrEmpty(diagnostics)) {
        metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, diagnostics);
    }

    return metadata;
}

From source file:husky.server.HuskyRMCallbackHandler.java

License:Apache License

/**
 * Accounts for the containers reported as completed: logs each one, counts those that exited
 * with status 0 as successes, and, once the number of completed containers equals the total
 * number of containers, calls {@code unlock()} and {@code notifyAll()} on {@code finalResultLock}.
 *
 * @param completedContainerStatus statuses of the containers that have just completed
 */
public void onContainersCompleted(List<ContainerStatus> completedContainerStatus) {
    final int justCompleted = completedContainerStatus.size();
    LOG.info("Get response from RM for container request, completedCnt = " + justCompleted);
    mNumCompletedContainers += justCompleted;

    for (ContainerStatus status : completedContainerStatus) {
        final String containerId = status.getContainerId().toString();
        final String state = status.getState().toString();
        final int exitCode = status.getExitStatus();

        LOG.info(String.format("Container %s: %s, exit status: %d", containerId, state, exitCode));
        if (exitCode == 0) {
            mNumSuccess++;
        }
    }

    LOG.info("Total containers: " + mNumContainers + ", completed containers: " + mNumCompletedContainers);
    if (mNumContainers != mNumCompletedContainers) {
        // Some containers are still outstanding; nothing to signal yet.
        return;
    }

    // If all workers and master finish
    synchronized (finalResultLock) {
        finalResultLock.unlock();
        finalResultLock.notifyAll();
    }
}

From source file:io.hops.tensorflow.TimelineHandler.java

License:Apache License

/**
 * Publishes a container-end timeline event for the given container through {@code timelineClient}.
 *
 * The event records the container state name and exit status. A failure to publish is logged
 * and not rethrown.
 *
 * @param container the status of the container that has ended
 */
public void publishContainerEndEvent(ContainerStatus container) {
    final String containerId = container.getContainerId().toString();

    // Entity identifying the container within this application's timeline domain.
    final TimelineEntity containerEntity = new TimelineEntity();
    containerEntity.setEntityId(containerId);
    containerEntity.setEntityType(ApplicationMaster.YarntfEntity.YARNTF_CONTAINER.toString());
    containerEntity.setDomainId(domainId);
    containerEntity.addPrimaryFilter("user", ugi.getShortUserName());

    // The end event itself, stamped with the current wall-clock time.
    final TimelineEvent endEvent = new TimelineEvent();
    endEvent.setTimestamp(System.currentTimeMillis());
    endEvent.setEventType(ApplicationMaster.YarntfEvent.YARNTF_CONTAINER_END.toString());
    endEvent.addEventInfo("State", container.getState().name());
    endEvent.addEventInfo("Exit Status", container.getExitStatus());
    containerEntity.addEvent(endEvent);

    try {
        timelineClient.putEntities(containerEntity);
    } catch (YarnException | IOException e) {
        LOG.error("Container end event could not be published for " + containerId, e);
    }
}

From source file:org.apache.drill.yarn.appMaster.TaskState.java

License:Apache License

/**
 * Traces the completion of the context's task, including its exit status and any non-blank
 * diagnostics, and stores the container status on the task as its completion status.
 *
 * @param context event context holding the task that completed
 * @param status  the container status reported for the task
 */
protected void completed(EventContext context, ContainerStatus status) {
    final Task finishedTask = context.task;
    final String diagnostics = status.getDiagnostics();

    final String headline = finishedTask.getLabel() + " Completed, exit status: " + status.getExitStatus();
    // Diagnostics are appended only when there is something to show.
    final String detail = DoYUtil.isBlank(diagnostics) ? "" : ": " + status.getDiagnostics();
    LOG.trace(headline + detail);

    finishedTask.completionStatus = status;
}

From source file:org.apache.flink.yarn.YarnFlinkResourceManager.java

License:Apache License

/**
 * Invoked when the ResourceManager informs of completed containers.
 * Called via an actor message by the callback from the ResourceManager client.
 *
 * Containers that this application master returned itself are only logged. Every other
 * completed container is treated as failed: it is logged, counted, and (if it belonged to a
 * registered worker) reported through {@code notifyWorkerFailed}. When the number of failed
 * containers exceeds {@code maxFailedContainers}, a {@code StopCluster} message is sent and
 * processing stops immediately.
 *
 * @param containers The containers that have completed.
 */
private void containersComplete(List<ContainerStatus> containers) {
    // the list contains both failed containers, as well as containers that
    // were gracefully returned by this application master

    for (ContainerStatus status : containers) {
        final ResourceID id = new ResourceID(status.getContainerId().toString());

        // check if this is a failed container or a completed container
        if (containersBeingReturned.remove(status.getContainerId()) != null) {
            // regular completed container that we released
            LOG.info("Container {} completed successfully with diagnostics: {}", id, status.getDiagnostics());
        } else {
            // failed container, either at startup, or running
            final String exitStatus;
            switch (status.getExitStatus()) {
            case -103:
                exitStatus = "Vmem limit exceeded (-103)";
                break;
            case -104:
                exitStatus = "Pmem limit exceeded (-104)";
                break;
            default:
                exitStatus = String.valueOf(status.getExitStatus());
            }

            final YarnContainerInLaunch launched = containersInLaunch.remove(id);
            if (launched != null) {
                LOG.info("Container {} failed, with a TaskManager in launch or registration. "
                        + "Exit status: {}", id, exitStatus);
                // we will trigger re-acquiring new containers at the end
            } else {
                // failed registered worker
                LOG.info("Container {} failed. Exit status: {}", id, exitStatus);

                // notify the generic logic, which notifies the JobManager, etc.
                // This message is built by plain concatenation, so it must not contain a
                // logger-style "{}" placeholder (it would show up literally in the reason).
                notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
            }

            // general failure logging
            failedContainersSoFar++;

            String diagMessage = String.format(
                    "Diagnostics for container %s in state %s : " + "exitStatus=%s diagnostics=%s", id,
                    status.getState(), exitStatus, status.getDiagnostics());
            sendInfoMessage(diagMessage);

            LOG.info(diagMessage);
            LOG.info("Total number of failed containers so far: " + failedContainersSoFar);

            // maxFailedContainers == -1 is infinite number of retries.
            if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                String msg = "Stopping YARN session because the number of failed containers ("
                        + failedContainersSoFar + ") exceeded the maximum failed containers ("
                        + maxFailedContainers + "). This number is controlled by the '"
                        + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. "
                        + "By default its the number of requested containers.";

                LOG.error(msg);
                self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)),
                        ActorRef.noSender());

                // no need to do anything else
                return;
            }
        }
    }

    updateProgress();

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}

From source file:org.apache.flink.yarn.YarnResourceManager.java

License:Apache License

/**
 * Reports every completed container with a negative exit status as a failed worker,
 * passing along the container's diagnostics as the failure reason.
 *
 * @param list statuses of the containers that have completed
 */
@Override
public void onContainersCompleted(List<ContainerStatus> list) {
    for (ContainerStatus container : list) {
        if (container.getExitStatus() >= 0) {
            // Non-negative exit status: not reported as a failure.
            continue;
        }

        final ResourceID failedWorkerId = new ResourceID(container.getContainerId().toString());
        final String diagnostics = container.getDiagnostics();
        notifyWorkerFailed(failedWorkerId, diagnostics);
    }
}

From source file:org.apache.flink.yarn.YarnResourceManagerTest.java

License:Apache License

/**
 * Creates a mocked {@code ContainerStatus} for the given container id that reports state
 * {@code COMPLETE}, exit status {@code -1} and the diagnostics text "Test exit".
 *
 * @param containerId the id the mocked status should report
 * @return the stubbed mock
 */
private static ContainerStatus mockContainerStatus(ContainerId containerId) {
    final ContainerStatus completedStatus = mock(ContainerStatus.class);

    // Identity of the container
    when(completedStatus.getContainerId()).thenReturn(containerId);

    // Outcome reported by the container
    when(completedStatus.getExitStatus()).thenReturn(-1);
    when(completedStatus.getDiagnostics()).thenReturn("Test exit");
    when(completedStatus.getState()).thenReturn(ContainerState.COMPLETE);

    return completedStatus;
}

From source file:org.apache.gobblin.yarn.YarnService.java

License:Apache License

/**
 * Reacts to a completed container and, unless told otherwise, requests a replacement for it.
 * Whether the replacement is asked to run on the same node depends on the exit status (and on
 * container host affinity, as decided by {@code shouldStickToTheSameNode}).
 *
 * A container can complete because 1) an error inside the container made it exit, 2) it was
 * killed, for example for exceeding its virtual or physical memory allowance, 3) it was
 * preempted by the ResourceManager, or 4) it was stopped by the ApplicationMaster. Only the
 * last case needs no replacement.
 *
 * No replacement is requested when the container's release was requested, when a shutdown is
 * in progress, or when the Helix instance has exceeded the maximum number of retries.
 *
 * @param containerStatus the status reported for the completed container
 */
private void handleContainerCompletion(ContainerStatus containerStatus) {
    Map.Entry<Container, String> finishedEntry = this.containerMap.remove(containerStatus.getContainerId());
    String instanceName = finishedEntry.getValue();

    LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(), instanceName, containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
        LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
                containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) {
        LOGGER.info("Container release requested, so not spawning a replacement for containerId {}",
                containerStatus.getContainerId());
        return;
    }

    if (this.shutdownInProgress) {
        return;
    }

    // Register a counter on first sight of this instance, then bump whichever counter is mapped.
    this.helixInstanceRetryCount.putIfAbsent(instanceName, new AtomicInteger(0));
    AtomicInteger retryCounter = this.helixInstanceRetryCount.get(instanceName);
    int attempt = retryCounter.incrementAndGet();

    // Event metadata is only assembled when there is a submitter to send it to.
    Optional<ImmutableMap.Builder<String, String>> eventMetadata = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
        ImmutableMap.Builder<String, String> metadata = buildContainerStatusEventMetadata(containerStatus);
        eventMetadata = Optional.of(metadata);
        metadata.put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, instanceName);
        metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
                String.valueOf(attempt));
    }

    boolean retriesExhausted = this.helixInstanceMaxRetries > 0 && attempt > this.helixInstanceMaxRetries;

    if (!retriesExhausted) {
        // Return the instance name to the pool of unused names so that the replacement
        // container can pick it up again.
        this.unusedHelixInstanceNames.offer(instanceName);
    }

    // The completion event is submitted whether or not a replacement will be requested.
    if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadata.get().build());
    }

    if (retriesExhausted) {
        LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + instanceName);
        return;
    }

    LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), instanceName));

    Optional<Container> preferredContainer;
    if (shouldStickToTheSameNode(containerStatus.getExitStatus())) {
        preferredContainer = Optional.of(finishedEntry.getKey());
    } else {
        preferredContainer = Optional.absent();
    }
    this.eventBus.post(new NewContainerRequest(preferredContainer));
}

From source file:org.apache.hama.bsp.BSPTaskLauncher.java

License:Apache License

/**
 * Polls the container manager for the current status of the allocated container.
 *
 * @return the task status carrying the container's exit status once the container has reached
 *         state {@code COMPLETE}; {@code null} if the container hasn't finished yet or if the
 *         container manager returned no status for the allocated container
 * @throws Exception propagated from the status request to the container manager
 */
public BSPTaskStatus poll() throws Exception {

    ContainerStatus lastStatus = null;
    GetContainerStatusesResponse getContainerStatusesResponse = cm.getContainerStatuses(statusRequest);
    List<ContainerStatus> containerStatuses = getContainerStatusesResponse.getContainerStatuses();
    for (ContainerStatus containerStatus : containerStatuses) {
        LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
                + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
                + ", diagnostics=" + containerStatus.getDiagnostics());

        // Only the status of the container allocated to this launcher is of interest.
        if (containerStatus.getContainerId().equals(allocatedContainer.getId())) {
            lastStatus = containerStatus;
            break;
        }
    }

    // lastStatus stays null when the response contained no entry for the allocated container;
    // treat that like "not finished yet" instead of dereferencing null.
    if (lastStatus == null || lastStatus.getState() != ContainerState.COMPLETE) {
        return null;
    }
    LOG.info(this.id + " Last report comes with exitstatus of " + lastStatus.getExitStatus()
            + " and diagnose string of " + lastStatus.getDiagnostics());

    return new BSPTaskStatus(id, lastStatus.getExitStatus());
}