Example usage for org.apache.hadoop.yarn.api.records ContainerStatus getExitStatus

List of usage examples for org.apache.hadoop.yarn.api.records ContainerStatus getExitStatus

Introduction

On this page you can find example usages of org.apache.hadoop.yarn.api.records.ContainerStatus.getExitStatus().

Prototype

/**
 * Get the exit status for the container.
 * <p>
 * Note: this is valid only for completed containers.
 *
 * @return the exit status of the container
 */
@Public
@Unstable
public abstract int getExitStatus();

Source Link

Document

Get the exit status for the container.

Note: This is valid only for completed containers, i.e. containers with state ContainerState.COMPLETE.

Usage

From source file:org.apache.slider.server.appmaster.state.AppState.java

License:Apache License

/**
 * handle completed node in the CD -move something from the live
 * server list to the completed server list
 * @param status the node that has just completed
 * @return NodeCompletionResult/* w  w w  . j ava 2 s . c o  m*/
 */
public synchronized NodeCompletionResult onCompletedNode(ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release : {}", containerId);
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        int releasing = roleStatus.decReleasing();
        int actual = roleStatus.decActual();
        int completedCount = roleStatus.incCompleted();
        log.info("decrementing role count for role {} to {}; releasing={}, completed={}", roleStatus.getName(),
                actual, releasing, completedCount);
        roleHistory.onReleaseCompleted(container, true);

    } else if (surplusNodes.remove(containerId)) {
        //its a surplus one being purged
        result.surplusNode = true;
    } else {
        //a container has failed 
        result.containerFailed = true;
        roleInstance = removeOwnedContainer(containerId);
        if (roleInstance != null) {
            //it was active, move it to failed 
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look
            // it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            String rolename = roleInstance.role;
            log.info("Failed container in role[{}] : {}", roleId, rolename);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                Container failedContainer = roleInstance.container;

                //build the failure message
                if (failedContainer != null) {
                    String completedLogsUrl = getLogsURLForContainer(failedContainer);
                    message = String.format("Failure %s on host %s: %s",
                            roleInstance.getContainerId().toString(), failedContainer.getNodeId().getHost(),
                            completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId);
                }
                int failed = roleStatus.noteFailed(shortLived, message);
                log.info("Current count of failed role[{}] {} =  {}", roleId, rolename, failed);
                if (failedContainer != null) {
                    roleHistory.onFailedContainer(failedContainer, shortLived);
                }

            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            //this isn't a known container.

            log.error("Notified of completed container {} that is not in the list"
                    + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
            result.unknownNode = true;
        }
    }

    if (result.surplusNode) {
        //a surplus node
        return result;
    }

    //record the complete node's details; this pulls it from the livenode set 
    //remove the node
    ContainerId id = status.getContainerId();
    log.info("Removing node ID {}", id);
    RoleInstance node = getLiveNodes().remove(id);
    if (node != null) {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    } else {
        // not in the list
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();

    }

    // and the active node list if present
    removeOwnedContainer(containerId);

    // finally, verify the node doesn't exist any more
    assert !containersBeingReleased.containsKey(containerId) : "container still in release queue";
    assert !getLiveNodes().containsKey(containerId) : " container still in live nodes";
    assert getOwnedContainer(containerId) == null : "Container still in active container list";

    return result;
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java

License:Apache License

/**
 * Notifies the matching {@code AMContainer}, if any, that its container has completed.
 * <p>
 * The exit status selects the message prefix and the termination cause; the
 * container's diagnostics, when present, are appended to the message. Nothing
 * is sent when the container id is not found in the application's containers.
 *
 * @param task the task associated with the container (not read here)
 * @param containerStatus the completion status reported for the container
 */
@Override
public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) {
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer == null) {
        // No registered container for this id: there is nobody to inform.
        return;
    }

    final int exitStatus = containerStatus.getExitStatus();
    final String prefix;
    final TaskAttemptTerminationCause errCause;
    switch (exitStatus) {
    case ContainerExitStatus.PREEMPTED:
        prefix = "Container preempted externally. ";
        errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        break;
    case ContainerExitStatus.DISKS_FAILED:
        prefix = "Container disk failed. ";
        errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        break;
    case ContainerExitStatus.SUCCESS:
        prefix = "Container completed. ";
        errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        break;
    default:
        // Any other exit status is reported as a plain container failure.
        prefix = "Container failed. ";
        errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        break;
    }

    String message = prefix;
    if (containerStatus.getDiagnostics() != null) {
        message += containerStatus.getDiagnostics();
    }
    sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
}

From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java

License:Apache License

/**
 * Notifies the matching {@code AMContainer}, if any, that its container has completed.
 * <p>
 * The exit status selects the message prefix and the termination cause; for
 * exit statuses other than success, preemption and disk failure the exit code
 * is included in the message. Diagnostics, when present, are appended.
 *
 * @param schedulerId id of the reporting scheduler (unused: no node updates are sent out)
 * @param task the task associated with the container (not read here)
 * @param containerStatus the completion status reported for the container
 */
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer == null) {
        // No registered container for this id: there is nobody to inform.
        return;
    }

    int exitStatus = containerStatus.getExitStatus();
    StringBuilder text = new StringBuilder();
    TaskAttemptTerminationCause errCause;
    if (exitStatus == ContainerExitStatus.PREEMPTED) {
        text.append("Container preempted externally. ");
        errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
    } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
        text.append("Container disk failed. ");
        errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
    } else {
        errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        if (exitStatus == ContainerExitStatus.SUCCESS) {
            text.append("Container completed. ");
        } else {
            text.append("Container failed, exitCode=").append(exitStatus).append(". ");
        }
    }

    if (containerStatus.getDiagnostics() != null) {
        text.append(containerStatus.getDiagnostics());
    }
    sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, text.toString(),
            errCause));
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java

License:Apache License

/**
 * A container completing with exit status PREEMPTED must yield exactly one
 * C_COMPLETED event flagged as preempted, with the EXTERNAL_PREEMPTION cause
 * and the RM diagnostics appended to the message.
 */
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    // Arrange: a registered AM container whose status reports preemption.
    final String rmDiagnostics = "Container preempted by RM.";
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);

    ContainerStatus status = mock(ContainerStatus.class);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);
    when(status.getDiagnostics()).thenReturn(rmDiagnostics);

    TaskAttemptImpl task = mock(TaskAttemptImpl.class);

    // Act
    schedulerHandler.containerCompleted(task, status);

    // Assert: one completion event carrying the preemption details.
    Assert.assertEquals(1, mockEventHandler.events.size());
    Event firstEvent = mockEventHandler.events.get(0);
    Assert.assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());
    AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
    Assert.assertEquals(containerId, completed.getContainerId());
    Assert.assertEquals("Container preempted externally. Container preempted by RM.",
            completed.getDiagnostics());
    Assert.assertTrue(completed.isPreempted());
    Assert.assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, completed.getTerminationCause());
    Assert.assertFalse(completed.isDiskFailed());

    schedulerHandler.stop();
    schedulerHandler.close();
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java

License:Apache License

/**
 * A container completing with exit status DISKS_FAILED must yield exactly one
 * C_COMPLETED event flagged as disk-failed, with the NODE_DISK_ERROR cause
 * and the NM diagnostics appended to the message.
 */
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    // Arrange: a registered AM container whose status reports a disk failure.
    final String nmDiagnostics = "NM disk failed.";
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);

    ContainerStatus status = mock(ContainerStatus.class);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);
    when(status.getDiagnostics()).thenReturn(nmDiagnostics);

    TaskAttemptImpl task = mock(TaskAttemptImpl.class);

    // Act
    schedulerHandler.containerCompleted(task, status);

    // Assert: one completion event carrying the disk-failure details.
    Assert.assertEquals(1, mockEventHandler.events.size());
    Event firstEvent = mockEventHandler.events.get(0);
    Assert.assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());
    AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
    Assert.assertEquals(containerId, completed.getContainerId());
    Assert.assertEquals("Container disk failed. NM disk failed.", completed.getDiagnostics());
    Assert.assertFalse(completed.isPreempted());
    Assert.assertTrue(completed.isDiskFailed());
    Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, completed.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java

License:Apache License

/**
 * A container completing with exit status PREEMPTED (reported for scheduler 0)
 * must yield exactly one C_COMPLETED event flagged as preempted, with the
 * EXTERNAL_PREEMPTION cause and the RM diagnostics appended to the message.
 */
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    // Arrange: a registered AM container whose status reports preemption.
    final String rmDiagnostics = "Container preempted by RM.";
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);

    ContainerStatus status = mock(ContainerStatus.class);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);
    when(status.getDiagnostics()).thenReturn(rmDiagnostics);

    TaskAttemptImpl task = mock(TaskAttemptImpl.class);

    // Act
    schedulerHandler.containerCompleted(0, task, status);

    // Assert: one completion event carrying the preemption details.
    assertEquals(1, mockEventHandler.events.size());
    Event firstEvent = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());
    AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
    assertEquals(containerId, completed.getContainerId());
    assertEquals("Container preempted externally. Container preempted by RM.", completed.getDiagnostics());
    assertTrue(completed.isPreempted());
    assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION, completed.getTerminationCause());
    Assert.assertFalse(completed.isDiskFailed());

    schedulerHandler.stop();
    schedulerHandler.close();
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java

License:Apache License

/**
 * A container completing with exit status DISKS_FAILED (reported for scheduler 0)
 * must yield exactly one C_COMPLETED event flagged as disk-failed, with the
 * NODE_DISK_ERROR cause and the NM diagnostics appended to the message.
 */
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    // Arrange: a registered AM container whose status reports a disk failure.
    final String nmDiagnostics = "NM disk failed.";
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);

    ContainerStatus status = mock(ContainerStatus.class);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);
    when(status.getDiagnostics()).thenReturn(nmDiagnostics);

    TaskAttemptImpl task = mock(TaskAttemptImpl.class);

    // Act
    schedulerHandler.containerCompleted(0, task, status);

    // Assert: one completion event carrying the disk-failure details.
    assertEquals(1, mockEventHandler.events.size());
    Event firstEvent = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());
    AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
    assertEquals(containerId, completed.getContainerId());
    assertEquals("Container disk failed. NM disk failed.", completed.getDiagnostics());
    Assert.assertFalse(completed.isPreempted());
    assertTrue(completed.isDiskFailed());
    assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, completed.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}

From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java

License:Apache License

/**
 * A container completing with exit status -104 must yield exactly one
 * C_COMPLETED event that is neither preempted nor disk-failed, with the
 * CONTAINER_EXITED cause and a message that includes the exit code followed
 * by the diagnostics.
 */
@Test(timeout = 5000)
public void testContainerExceededPMem() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    // Arrange: a registered AM container whose status reports exit code -104.
    final String memDiagnostics = "Exceeded Physical Memory";
    ContainerId containerId = mock(ContainerId.class);
    AMContainer amContainer = mock(AMContainer.class);
    when(amContainer.getContainerId()).thenReturn(containerId);
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);

    ContainerStatus status = mock(ContainerStatus.class);
    when(status.getContainerId()).thenReturn(containerId);
    when(status.getDiagnostics()).thenReturn(memDiagnostics);
    // use -104 rather than ContainerExitStatus.KILLED_EXCEEDED_PMEM because
    // ContainerExitStatus.KILLED_EXCEEDED_PMEM is only available after hadoop-2.5
    when(status.getExitStatus()).thenReturn(-104);

    TaskAttemptImpl task = mock(TaskAttemptImpl.class);

    // Act
    schedulerHandler.containerCompleted(0, task, status);

    // Assert: one completion event reporting a generic container exit.
    assertEquals(1, mockEventHandler.events.size());
    Event firstEvent = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());
    AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
    assertEquals(containerId, completed.getContainerId());
    assertEquals("Container failed, exitCode=-104. Exceeded Physical Memory", completed.getDiagnostics());
    Assert.assertFalse(completed.isPreempted());
    Assert.assertFalse(completed.isDiskFailed());
    assertEquals(TaskAttemptTerminationCause.CONTAINER_EXITED, completed.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}

From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License:Apache License

/**
 * Runs the application master end to end: registers with the resource
 * manager, requests one container per configuration tuple, starts the master
 * service, launches the worker containers, heartbeats until every container
 * has completed, and finally writes the master's results to the configured
 * output path.
 *
 * @param args command-line arguments (not read by this method)
 * @return {@code ReturnCode.MASTER_ERROR} if registration, configuration,
 *         allocation or the heartbeat wait fails; {@code -10} as soon as any
 *         container reports a non-zero exit status; otherwise
 *         {@code ReturnCode.OK} or {@code ReturnCode.CONTAINER_ERROR}
 *         depending on the failed-container count
 * @throws Exception if an unhandled error occurs, including interruption
 *         while waiting between allocation attempts or a failure of the
 *         master service future
 */
@Override
public int run(String[] args) throws Exception {
    // Set our own configuration (ToolRunner only sets it prior to calling run())
    conf = getConf();

    // Our own RM Handler
    ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId);

    // Connect
    rmHandler.getAMResourceManager();

    // Register
    try {
        rmHandler.registerApplicationMaster(masterHost, masterPort);
    } catch (YarnRemoteException ex) {
        LOG.error("Error encountered while trying to register application master", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Get file splits, configuration, etc.
    Set<ConfigurationTuple> configTuples;
    try {
        configTuples = getConfigurationTuples();
    } catch (IOException ex) {
        LOG.error("Error encountered while trying to generate configurations", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }
    // Needed for our master service later
    Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples);

    // Initial containers we want, based off of the file splits
    List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler);
    List<ContainerId> releasedContainers = new ArrayList<>();

    // Keep asking for containers until we have enough or run out of attempts
    List<Container> allocatedContainers = new ArrayList<>();
    try {
        int needed = configTuples.size();
        int got = 0;
        int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10"));
        int attempts = 0;

        while (got < needed && attempts < maxAttempts) {
            LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts
                    + ", maxAttempts=" + maxAttempts);

            List<Container> acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getAllocatedContainers();

            // Remember the count before the list is cleared below, so the log
            // line reports how many containers this response actually held.
            int acquired = acquiredContainers.size();
            got += acquired;
            attempts++;

            allocatedContainers.addAll(acquiredContainers);
            acquiredContainers.clear();

            LOG.info("Got allocation response, allocatedContainers=" + acquired);

            Thread.sleep(2500);
        }
    } catch (YarnRemoteException ex) {
        LOG.error("Encountered an error while trying to allocate containers", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    final int numContainers = configTuples.size();

    // Make sure we got all our containers, or else bail
    if (allocatedContainers.size() < numContainers) {
        LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers
                + ", allocated=" + allocatedContainers.size());

        requestedContainers.clear(); // We don't want new containers!

        // Add containers into released list
        for (Container c : allocatedContainers) {
            releasedContainers.add(c.getId());
        }

        // Release containers
        try {
            rmHandler.allocateRequest(requestedContainers, releasedContainers);
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to release unwanted containers", ex);
        }

        // Notify our handlers that we got a problem
        rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got "
                + allocatedContainers.size(), FinalApplicationStatus.FAILED);
        // bail
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Launch our master service, as we now expect workers to actually do
    // something
    LOG.info("Starting master service");
    ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf,
            masterComputable, masterUpdateable, appConfig, conf);

    ExecutorService executor = Executors.newSingleThreadExecutor();
    Future<Integer> masterThread = executor.submit(masterService);

    // We got the number of containers we wanted, let's launch them
    LOG.info("Launching child containers");
    List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers);

    // Use an empty list for heartbeat purposes
    requestedContainers.clear();

    // Some local counters. Do we really need Atomic?
    AtomicInteger numCompletedContainers = new AtomicInteger();
    AtomicInteger numFailedContainers = new AtomicInteger();

    LOG.info("Waiting for containers to complete...");
    // Go into run-loop waiting for containers to finish, also our heartbeat
    while (numCompletedContainers.get() < numContainers) {
        // Don't pound the RM
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting on completed containers", ex);
            // Restore the interrupt flag so the caller can see the interruption
            Thread.currentThread().interrupt();
            return ReturnCode.MASTER_ERROR.getCode();
        }

        // Heartbeat, effectively
        List<ContainerStatus> completedContainers;

        try {
            completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getCompletedContainersStatuses();
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex);

            continue; // Nothing to report, probably an error / endless loop
        }

        for (ContainerStatus cs : completedContainers) {
            int exitCode = cs.getExitStatus();
            if (exitCode != 0) {
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();

                masterService.fail();
                executor.shutdown();

                // Fail fast: one failed container kills the whole application
                LOG.info("At least one container failed with a non-zero exit code (" + exitCode
                        + "); killing application");
                rmHandler.finishApplication(
                        "Failing, due to at least container coming back with an non-zero exit code.",
                        FinalApplicationStatus.KILLED);

                return -10;
            } else {
                numCompletedContainers.incrementAndGet();
            }
        }
    }

    // All containers have completed
    // Wait for launch threads to complete (this shouldn't really happen)
    LOG.info("Containers completed");
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(1000);
        } catch (InterruptedException ex) {
            // Best effort: keep going so the master results can still be collected
            LOG.warn("Interrupted while waiting for Launcher threads to complete", ex);
        }
    }

    // Ensure that our master service has completed as well
    if (!masterThread.isDone()) {
        masterService.fail();
    }

    int masterExit = masterThread.get();
    LOG.info("Master service completed with exitCode=" + masterExit);
    executor.shutdown();

    if (masterExit == 0) {

        String impersonatedUser = System.getenv("USER");

        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser);
        ugi.doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() {
                Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH));
                try {
                    FileSystem fs = out.getFileSystem(conf);

                    // try-with-resources closes the stream even when
                    // complete() or flush() throws
                    try (FSDataOutputStream fos = fs.create(out)) {
                        LOG.info("Writing master results to " + out.toString());

                        masterComputable.complete(fos);

                        fos.flush();
                    }
                } catch (IOException e) {
                    LOG.error("Failed to write master results to " + out, e);
                }

                return null;
            }
        });

    } else {
        LOG.warn("Not writing master results, as the master came back with errors!");
    }

    // Application finished. Any failed container returns early from the
    // heartbeat loop above, so the failed count is zero whenever we get here.
    ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR;

    try {
        if (numFailedContainers.get() == 0) {
            rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED);
        } else {
            String diag = "Completed with " + numFailedContainers.get() + " failed containers";
            rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED);
        }
    } catch (YarnRemoteException ex) {
        LOG.warn("Encountered an error while trying to send final status to resource manager", ex);
    }

    return rc.getCode();
}

From source file:org.elasticsearch.hadoop.yarn.am.EsCluster.java

License:Apache License

/**
 * Starts the cluster and blocks until it has finished running.
 * <p>
 * Registers one container request per configured container, then loops on the
 * resource manager heartbeat: newly allocated containers are launched right
 * away, and completed containers are recorded. The loop ends when every
 * requested container has completed or when any container finishes with a
 * status other than success (which also marks the cluster as failed). On the
 * way out the method waits 15 seconds and then always calls {@code close()}.
 *
 * @throws EsYarnNmException if the wait between heartbeats fails
 * @throws RuntimeException if the final wait is interrupted
 */
public void start() {
    running = true;
    nmRpc.start();

    UserGroupInformation.setConfiguration(cfg);

    log.info(String.format("Allocating Elasticsearch cluster with %d nodes", appConfig.containersToAllocate()));

    // register requests
    Resource capability = YarnCompat.resource(cfg, appConfig.containerMem(), appConfig.containerVCores());
    Priority prio = Priority.newInstance(appConfig.amPriority());

    for (int i = 0; i < appConfig.containersToAllocate(); i++) {
        // TODO: Add allocation (host/rack rules) - and disable location constraints
        ContainerRequest req = new ContainerRequest(capability, null, null, prio);
        amRpc.addContainerRequest(req);
    }

    // update status every 5 sec
    final long heartBeatRate = TimeUnit.SECONDS.toMillis(5);

    // start the allocation loop
    // when a new container is allocated, launch it right away

    int responseId = 0;

    try {
        do {
            AllocateResponse alloc = amRpc.allocate(responseId++);
            List<Container> currentlyAllocated = alloc.getAllocatedContainers();
            for (Container container : currentlyAllocated) {
                launchContainer(container);
                allocatedContainers.add(container.getId());
            }

            if (currentlyAllocated.size() > 0) {
                int needed = appConfig.containersToAllocate() - allocatedContainers.size();
                if (needed > 0) {
                    log.info(String.format("%s containers allocated, %s remaining", allocatedContainers.size(),
                            needed));
                } else {
                    log.info(String.format("Fully allocated %s containers", allocatedContainers.size()));
                }
            }

            List<ContainerStatus> completed = alloc.getCompletedContainersStatuses();
            for (ContainerStatus status : completed) {
                ContainerId containerId = status.getContainerId();
                // only handle each completed container once
                if (!completedContainers.contains(containerId)) {
                    completedContainers.add(containerId);

                    boolean containerSuccesful = false;

                    switch (status.getExitStatus()) {
                    case ContainerExitStatus.SUCCESS:
                        log.info(String.format("Container %s finished succesfully...", containerId));
                        containerSuccesful = true;
                        break;
                    case ContainerExitStatus.ABORTED:
                        log.warn(String.format("Container %s aborted...", containerId));
                        break;
                    case ContainerExitStatus.DISKS_FAILED:
                        log.warn(String.format("Container %s ran out of disk...", containerId));
                        break;
                    case ContainerExitStatus.PREEMPTED:
                        log.warn(String.format("Container %s preempted...", containerId));
                        break;
                    default:
                        log.warn(String.format("Container %s exited with an invalid/unknown exit code...",
                                containerId));
                    }

                    if (!containerSuccesful) {
                        // a single unsuccessful container stops the whole cluster
                        log.warn("Cluster has not completed succesfully...");
                        clusterHasFailed = true;
                        running = false;
                    }
                }
            }

            if (completedContainers.size() == appConfig.containersToAllocate()) {
                running = false;
            }

            if (running) {
                try {
                    Thread.sleep(heartBeatRate);
                } catch (Exception ex) {
                    throw new EsYarnNmException("Cluster interrupted");
                }
            }
        } while (running);
    } finally {
        log.info("Cluster has completed running...");
        try {
            Thread.sleep(TimeUnit.SECONDS.toMillis(15));
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        } finally {
            // always release resources, even when the final wait is interrupted
            close();
        }
    }
}