Example usage for org.apache.hadoop.yarn.api.records ContainerExitStatus SUCCESS

List of usage examples for org.apache.hadoop.yarn.api.records ContainerExitStatus SUCCESS

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records ContainerExitStatus SUCCESS.

Prototype

int SUCCESS

To view the source code for org.apache.hadoop.yarn.api.records ContainerExitStatus SUCCESS, use the source link below.

Click Source Link

Usage

From source file:com.cloudera.llama.am.yarn.YarnRMConnector.java

License:Apache License

/**
 * Converts container-completion notifications into resource status-change events.
 *
 * <p>For every completed container the entry in {@code containerToResourceMap} is removed. Containers
 * without an entry are skipped. For the others one status-change event is queued, and all queued events
 * are handed to {@code llamaCallback} in a single call after every status has been examined.
 *
 * @param containerStatuses statuses of the containers that have completed
 */
@Override
public void onContainersCompleted(List<ContainerStatus> containerStatuses) {
    List<RMEvent> events = new ArrayList<RMEvent>();
    for (ContainerStatus completed : containerStatuses) {
        UUID resourceId = containerToResourceMap.remove(completed.getContainerId());
        if (resourceId == null) {
            // A mapping only exists for containers that we did not release ourselves.
            continue;
        }

        int exitStatus = completed.getExitStatus();
        if (exitStatus == ContainerExitStatus.SUCCESS) {
            LOG.warn("It should never happen, container for resource '{}' exited on its own", resourceId);
            // Reported as LOST so that the client can take corrective measures.
            events.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
        } else if (exitStatus == ContainerExitStatus.PREEMPTED) {
            LOG.warn("Container for resource '{}' has been preempted", resourceId);
            events.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.PREEMPTED));
        } else {
            // ABORTED and every other exit status are treated as a lost container.
            LOG.warn("Container for resource '{}' has been lost, exit status '{}'", resourceId, exitStatus);
            events.add(RMEvent.createStatusChangeEvent(resourceId, PlacedResource.Status.LOST));
        }
    }
    llamaCallback.onEvent(events);
}

From source file:org.apache.samza.job.yarn.refactor.YarnClusterResourceManager.java

License:Apache License

/**
 * Callback invoked from Yarn when containers complete. This translates the yarn callbacks into Samza specific
 * ones./* w  w  w .  ja  v  a2 s  .  co  m*/
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> samzaResrcStatuses = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info("Container completed from RM " + status);

        SamzaResourceStatus samzaResrcStatus = new SamzaResourceStatus(status.getContainerId().toString(),
                status.getDiagnostics(), status.getExitStatus());
        samzaResrcStatuses.add(samzaResrcStatus);

        int completedContainerID = getIDForContainer(status.getContainerId().toString());
        log.info("Completed container had ID: {}", completedContainerID);

        //remove the container from the list of running containers, if failed with a non-zero exit code, add it to the list of
        //failed containers.
        if (completedContainerID != INVALID_YARN_CONTAINER_ID) {
            if (state.runningYarnContainers.containsKey(completedContainerID)) {
                log.info("Removing container ID {} from completed containers", completedContainerID);
                state.runningYarnContainers.remove(completedContainerID);

                if (status.getExitStatus() != ContainerExitStatus.SUCCESS)
                    state.failedContainersStatus.put(status.getContainerId().toString(), status);
            }
        }
    }
    _callback.onResourcesCompleted(samzaResrcStatuses);
}

From source file:org.apache.samza.job.yarn.SamzaTaskManager.java

License:Apache License

/**
 * This method handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides
 * whether a container that exited is marked as complete or failure.
 *
 * <p>The Samza container ID is looked up by scanning {@code state.runningContainers} for an entry whose
 * YarnContainer id equals the completed container's id; {@code -1} is used when no entry matches. The exit
 * status then selects one of three paths: SUCCESS, killed by YARN (DISKS_FAILED / ABORTED / PREEMPTED), or
 * any other exit status, which is treated as a failure and is subject to the retry-count / retry-window
 * rules described in the body.
 *
 * @param containerStatus status of the container that completed
 */
@Override
public void onContainerCompleted(ContainerStatus containerStatus) {
    String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId());
    // -1 means "not one of the containers currently tracked in state.runningContainers".
    int containerId = -1;
    for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) {
        if (entry.getValue().id().equals(containerStatus.getContainerId())) {
            containerId = entry.getKey();
            break;
        }
    }
    state.runningContainers.remove(containerId);

    int exitStatus = containerStatus.getExitStatus();
    switch (exitStatus) {
    case ContainerExitStatus.SUCCESS:
        log.info("Container {} completed successfully.", containerIdStr);

        state.completedContainers.incrementAndGet();

        if (containerId != -1) {
            // A clean exit also clears the failure history kept for this container ID.
            state.finishedContainers.add(containerId);
            containerFailures.remove(containerId);
        }

        if (state.completedContainers.get() == state.containerCount) {
            log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed.");
            state.status = FinalApplicationStatus.SUCCEEDED;
        }
        break;

    case ContainerExitStatus.DISKS_FAILED:
    case ContainerExitStatus.ABORTED:
    case ContainerExitStatus.PREEMPTED:
        log.info(
                "Got an exit code of {}. This means that container {} was "
                        + "killed by YARN, either due to being released by the application "
                        + "master or being 'lost' due to node failures etc. or due to preemption by the RM",
                exitStatus, containerIdStr);

        state.releasedContainers.incrementAndGet();

        // If this container was assigned some partitions (a containerId), then
        // clean up, and request a new container for the tasks. This only
        // should happen if the container was 'lost' due to node failure, not
        // if the AM released the container.
        if (containerId != -1) {
            log.info(
                    "Released container {} was assigned task group ID {}. Requesting a new container for the task group.",
                    containerIdStr, containerId);

            state.neededContainers.incrementAndGet();
            state.jobHealthy.set(false);

            // request a container on new host
            containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST);
        }
        break;

    default:
        // TODO: Handle failure more intelligently. Should track NodeFailures!
        log.info("Container failed for some reason. Let's start it again");
        log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - "
                + containerStatus.getDiagnostics());

        state.failedContainers.incrementAndGet();
        state.failedContainersStatus.put(containerIdStr, containerStatus);
        state.jobHealthy.set(false);

        if (containerId != -1) {
            state.neededContainers.incrementAndGet();
            // Find out previously running container location
            String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId,
                    SetContainerHostMapping.HOST_KEY);
            // Fall back to any host when host affinity is disabled or no previous host is known.
            if (!hostAffinityEnabled || lastSeenOn == null) {
                lastSeenOn = ContainerAllocator.ANY_HOST;
            }
            // A container failed for an unknown reason. Let's check to see if
            // we need to shutdown the whole app master if too many container
            // failures have happened. The rules for failing are that the
            // failure count for a task group id must be > the configured retry
            // count, and the last failure (the one prior to this one) must have
            // happened less than retry window ms ago. If retry count is set to
            // 0, the app master will fail on any container failure. If the
            // retry count is set to a number < 0, a container failure will
            // never trigger an app master failure.
            // NOTE(review): the check below uses ">=" while the rule above says ">" - confirm which is intended.
            int retryCount = yarnConfig.getContainerRetryCount();
            int retryWindowMs = yarnConfig.getContainerRetryWindowMs();

            if (retryCount == 0) {
                log.error(
                        "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.",
                        containerId, containerIdStr);

                // NOTE(review): unlike the retry-window branch below, state.status is not set to FAILED
                // here even though the log message says the job is marked as failed - confirm it is set elsewhere.
                tooManyFailedContainers = true;
            } else if (retryCount > 0) {
                int currentFailCount;
                long lastFailureTime;
                if (containerFailures.containsKey(containerId)) {
                    ContainerFailure failure = containerFailures.get(containerId);
                    currentFailCount = failure.getCount() + 1;
                    lastFailureTime = failure.getLastFailure();
                } else {
                    // First recorded failure for this container ID.
                    currentFailCount = 1;
                    lastFailureTime = 0L;
                }
                if (currentFailCount >= retryCount) {
                    long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime;

                    if (lastFailureMsDiff < retryWindowMs) {
                        log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed "
                                + currentFailCount + " times, with last failure " + lastFailureMsDiff
                                + "ms ago. This is greater than retry count of " + retryCount
                                + " and window of " + retryWindowMs
                                + "ms , so shutting down the application master, and marking the job as failed.");

                        // We have too many failures, and we're within the window
                        // boundary, so shut down the app master.
                        tooManyFailedContainers = true;
                        state.status = FinalApplicationStatus.FAILED;
                    } else {
                        log.info(
                                "Resetting fail count for container ID {} back to 1, since last container failure ({}) for "
                                        + "this container ID was outside the bounds of the retry window.",
                                containerId, containerIdStr);

                        // Reset counter back to 1, since the last failure for this
                        // container happened outside the window boundary.
                        containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis()));
                    }
                } else {
                    log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount);
                    containerFailures.put(containerId,
                            new ContainerFailure(currentFailCount, System.currentTimeMillis()));
                }
            }

            if (!tooManyFailedContainers) {
                // Request a new container
                containerAllocator.requestContainer(containerId, lastSeenOn);
            }
        }

    }
}

From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java

License:Apache License

/**
 * Checks that the task manager does not ask for shutdown right after initialisation, and that it does
 * once a SUCCESS completion for {@code state.amContainerId} has been delivered.
 */
@Test
public void testTaskManagerShouldStopWhenContainersFinish() {
    SamzaTaskManager manager = new SamzaTaskManager(getConfig(), state, amRmClientAsync,
            new YarnConfiguration());
    manager.onInit();

    // No completion has been reported yet.
    assertFalse(manager.shouldShutdown());

    // Report a successful exit with empty diagnostics.
    manager.onContainerCompleted(
            TestUtil.getContainerStatus(state.amContainerId, ContainerExitStatus.SUCCESS, ""));

    assertTrue(manager.shouldShutdown());
}

From source file:org.apache.samza.job.yarn.YarnClusterResourceManager.java

License:Apache License

/**
 * Receives container-completion notifications from Yarn and forwards them as Samza resource statuses.
 *
 * <p>Each status is logged and converted to a {@code SamzaResourceStatus}. When the container maps to a
 * processor that is still listed in {@code state.runningProcessors}, the processor is removed from that map
 * and, if the exit status is not SUCCESS, the status is recorded in {@code state.failedContainersStatus}.
 * The converted statuses are passed to the cluster manager callback in one call.
 *
 * @param statuses the YarnContainerStatus callbacks from Yarn.
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    List<SamzaResourceStatus> converted = new ArrayList<>();

    for (ContainerStatus status : statuses) {
        log.info(
                "Got completion notification for Container ID: {} with status: {} and state: {}. Diagnostics information: {}.",
                status.getContainerId(), status.getExitStatus(), status.getState(), status.getDiagnostics());

        String yarnContainerId = status.getContainerId().toString();
        converted.add(new SamzaResourceStatus(yarnContainerId, status.getDiagnostics(), status.getExitStatus()));

        String processorId = getRunningProcessorId(yarnContainerId);
        log.info("Completed Container ID: {} had Processor ID: {}", status.getContainerId(), processorId);

        // Only processors that are known and still marked as running need bookkeeping.
        boolean trackedAsRunning = !processorId.equals(INVALID_PROCESSOR_ID)
                && state.runningProcessors.containsKey(processorId);
        if (!trackedAsRunning) {
            continue;
        }

        log.info("Removing Processor ID: {} from YarnClusterResourceManager running processors.", processorId);
        state.runningProcessors.remove(processorId);

        // A non-zero exit code is additionally remembered as a failed container.
        if (status.getExitStatus() != ContainerExitStatus.SUCCESS) {
            state.failedContainersStatus.put(yarnContainerId, status);
        }
    }
    clusterManagerCallback.onResourcesCompleted(converted);
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java

License:Apache License

/**
 * Informs the matching AM container, if one is registered, that its container has completed.
 *
 * <p>The exit status selects the message prefix and the termination cause. The diagnostics, when present,
 * are appended to the prefix, and the result is sent as an {@code AMContainerEventCompleted} event.
 * Nothing is sent when the container is not found in the application context.
 *
 * @param task            not read by this method
 * @param containerStatus status of the completed container
 */
@Override
public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) {
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer == null) {
        return;
    }

    final int exitStatus = containerStatus.getExitStatus();
    String message;
    TaskAttemptTerminationCause errCause;
    switch (exitStatus) {
    case ContainerExitStatus.PREEMPTED:
        message = "Container preempted externally. ";
        errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        break;
    case ContainerExitStatus.DISKS_FAILED:
        message = "Container disk failed. ";
        errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        break;
    case ContainerExitStatus.SUCCESS:
        message = "Container completed. ";
        errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        break;
    default:
        message = "Container failed. ";
        errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        break;
    }

    if (containerStatus.getDiagnostics() != null) {
        message = message + containerStatus.getDiagnostics();
    }
    sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java

License:Apache License

/**
 * Informs the matching AM container, if one is registered, that its container has completed.
 *
 * <p>The message starts with a prefix chosen from the exit status (for a generic failure the prefix includes
 * the exit code), followed by the diagnostics when they are present.
 *
 * @param schedulerId     not used here since no node updates are sent out
 * @param task            not read by this method
 * @param containerStatus status of the completed container
 */
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
    AMContainer completedContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (completedContainer == null) {
        // Unknown container: there is nobody to inform.
        return;
    }

    final int exitStatus = containerStatus.getExitStatus();
    final boolean preempted = exitStatus == ContainerExitStatus.PREEMPTED;
    final boolean diskFailed = !preempted && exitStatus == ContainerExitStatus.DISKS_FAILED;

    TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
    StringBuilder message = new StringBuilder();
    if (preempted) {
        message.append("Container preempted externally. ");
        errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
    } else if (diskFailed) {
        message.append("Container disk failed. ");
        errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
    } else if (exitStatus == ContainerExitStatus.SUCCESS) {
        message.append("Container completed. ");
    } else {
        message.append("Container failed, exitCode=").append(exitStatus).append(". ");
    }

    if (containerStatus.getDiagnostics() != null) {
        message.append(containerStatus.getDiagnostics());
    }
    sendEvent(new AMContainerEventCompleted(completedContainer.getContainerId(), exitStatus,
            message.toString(), errCause));
}

From source file:org.apache.tez.dag.history.events.TestHistoryEventsProtoConversion.java

License:Apache License

/**
 * Round-trips a {@code ContainerStoppedEvent} through {@code testProtoConversion} and checks that the
 * container id, the stopped time and the application attempt id are equal before and after.
 *
 * @throws Exception propagated from the conversion helper
 */
private void testContainerStoppedEvent() throws Exception {
    ContainerId containerId = ContainerId.newInstance(
            ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1), 1001);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1);

    ContainerStoppedEvent original = new ContainerStoppedEvent(containerId, 100034566,
            ContainerExitStatus.SUCCESS, attemptId);
    ContainerStoppedEvent restored = (ContainerStoppedEvent) testProtoConversion(original);

    Assert.assertEquals(original.getContainerId(), restored.getContainerId());
    Assert.assertEquals(original.getStoppedTime(), restored.getStoppedTime());
    Assert.assertEquals(original.getApplicationAttemptId(), restored.getApplicationAttemptId());
    logEvents(original, restored);
}

From source file:org.elasticsearch.hadoop.yarn.am.EsCluster.java

License:Apache License

/**
 * Requests the configured number of containers and runs the allocation loop until the cluster stops.
 *
 * <p>On every heartbeat (5 seconds) newly allocated containers are launched and completed containers are
 * inspected. Any container that finishes with an exit status other than SUCCESS marks the cluster as failed
 * and ends the loop; the loop also ends once as many containers have completed as were requested.
 *
 * <p>When the loop ends, normally or through an exception, the method waits 15 seconds and then calls
 * {@code close()}. {@code close()} is always called, even if that wait is interrupted.
 *
 * @throws EsYarnNmException if the heartbeat wait is interrupted
 * @throws RuntimeException  wrapping the {@link InterruptedException} if the final wait is interrupted; the
 *                           thread's interrupt status is restored before it is thrown
 */
public void start() {
    running = true;
    nmRpc.start();

    UserGroupInformation.setConfiguration(cfg);

    log.info(String.format("Allocating Elasticsearch cluster with %d nodes", appConfig.containersToAllocate()));

    // register requests
    Resource capability = YarnCompat.resource(cfg, appConfig.containerMem(), appConfig.containerVCores());
    Priority prio = Priority.newInstance(appConfig.amPriority());

    for (int i = 0; i < appConfig.containersToAllocate(); i++) {
        // TODO: Add allocation (host/rack rules) - and disable location constraints
        ContainerRequest req = new ContainerRequest(capability, null, null, prio);
        amRpc.addContainerRequest(req);
    }

    // update status every 5 sec
    final long heartBeatRate = TimeUnit.SECONDS.toMillis(5);

    // start the allocation loop
    // when a new container is allocated, launch it right away

    int responseId = 0;

    try {
        do {
            AllocateResponse alloc = amRpc.allocate(responseId++);
            List<Container> currentlyAllocated = alloc.getAllocatedContainers();
            for (Container container : currentlyAllocated) {
                launchContainer(container);
                allocatedContainers.add(container.getId());
            }

            if (!currentlyAllocated.isEmpty()) {
                int needed = appConfig.containersToAllocate() - allocatedContainers.size();
                if (needed > 0) {
                    log.info(String.format("%s containers allocated, %s remaining", allocatedContainers.size(),
                            needed));
                } else {
                    log.info(String.format("Fully allocated %s containers", allocatedContainers.size()));
                }
            }

            List<ContainerStatus> completed = alloc.getCompletedContainersStatuses();
            for (ContainerStatus status : completed) {
                // Each completed container is processed only once.
                if (!completedContainers.contains(status.getContainerId())) {
                    ContainerId containerId = status.getContainerId();
                    completedContainers.add(containerId);

                    boolean containerSuccessful = false;

                    switch (status.getExitStatus()) {
                    case ContainerExitStatus.SUCCESS:
                        log.info(String.format("Container %s finished succesfully...", containerId));
                        containerSuccessful = true;
                        break;
                    case ContainerExitStatus.ABORTED:
                        log.warn(String.format("Container %s aborted...", containerId));
                        break;
                    case ContainerExitStatus.DISKS_FAILED:
                        log.warn(String.format("Container %s ran out of disk...", containerId));
                        break;
                    case ContainerExitStatus.PREEMPTED:
                        log.warn(String.format("Container %s preempted...", containerId));
                        break;
                    default:
                        log.warn(String.format("Container %s exited with an invalid/unknown exit code...",
                                containerId));
                    }

                    // A single unsuccessful container fails the whole cluster and stops the loop.
                    if (!containerSuccessful) {
                        log.warn("Cluster has not completed succesfully...");
                        clusterHasFailed = true;
                        running = false;
                    }
                }
            }

            if (completedContainers.size() == appConfig.containersToAllocate()) {
                running = false;
            }

            if (running) {
                try {
                    Thread.sleep(heartBeatRate);
                } catch (InterruptedException ex) {
                    // The interrupt status is deliberately left cleared so that the grace period in the
                    // finally block below still runs before close().
                    throw new EsYarnNmException("Cluster interrupted");
                }
            }
        } while (running);
    } finally {
        log.info("Cluster has completed running...");
        InterruptedException interruption = null;
        try {
            Thread.sleep(TimeUnit.SECONDS.toMillis(15));
        } catch (InterruptedException e) {
            interruption = e;
        }
        // Always release resources, even when the grace period was cut short. close() runs before the
        // interrupt status is restored so that it is not affected by a pending interrupt.
        close();
        if (interruption != null) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(interruption);
        }
    }
}

From source file:yarnkit.appmaster.ApplicationMasterService.java

License:Apache License

/**
 * Updates the completion and failure counters for a batch of completed containers.
 *
 * <p>ABORTED containers are not counted at all. Every other container increments {@code totalCompleted},
 * and those whose exit status is not SUCCESS also increment {@code totalFailures}.
 *
 * @param containerStatuses statuses of the containers that have completed
 */
@Override
public void onContainersCompleted(@Nonnull List<ContainerStatus> containerStatuses) {
    LOG.info(containerStatuses.size() + " container(s) have completed");

    for (ContainerStatus completed : containerStatuses) {
        LOG.info(YarnUtils.getContainerExitStatusMessage(completed));

        final int exitStatus = completed.getExitStatus();
        if (exitStatus == ContainerExitStatus.ABORTED) {
            // Containers killed by the framework, either due to being released by
            // the application or being 'lost' due to node failures etc.
            continue;
        }

        totalCompleted.incrementAndGet();
        if (exitStatus != ContainerExitStatus.SUCCESS) {
            totalFailures.incrementAndGet();
        }
    }

}