List of usage examples for org.apache.hadoop.yarn.api.records.ContainerStatus.getExitStatus
@Public @Unstable public abstract int getExitStatus();
Get the exit status for the container.
Note: this is valid only for completed containers, i.e. containers in the ContainerState.COMPLETE state; for a container that has not yet completed, an invalid exit status (ContainerExitStatus.INVALID, i.e. -1000) is returned.
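The recurring pattern in the examples below: when the ResourceManager reports a completed container, compare getExitStatus() against the ContainerExitStatus constants to distinguish success, preemption, and node-side failures. A minimal sketch of that pattern (the class name and log output are illustrative, not taken from any of the projects below):

import java.util.List;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public class ExitStatusHandler {
    /** Classify completed containers reported in an AM heartbeat. */
    public void onContainersCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            // Only meaningful for containers in state ContainerState.COMPLETE
            int exitStatus = status.getExitStatus();
            if (exitStatus == ContainerExitStatus.SUCCESS) {
                System.out.println(status.getContainerId() + " succeeded");
            } else if (exitStatus == ContainerExitStatus.PREEMPTED) {
                // Preempted by the scheduler: usually retried, not counted as a failure
                System.out.println(status.getContainerId() + " was preempted");
            } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
                // Node-side disk problem rather than an application bug
                System.out.println(status.getContainerId() + " lost its disks");
            } else {
                System.out.println(status.getContainerId() + " failed, exitCode="
                        + exitStatus + ": " + status.getDiagnostics());
            }
        }
    }
}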
From source file:org.apache.slider.server.appmaster.state.AppState.java
License:Apache License
/**
 * Handle a completed node in the CD - move something from the live
 * server list to the completed server list.
 * @param status the node that has just completed
 * @return NodeCompletionResult
 */
public synchronized NodeCompletionResult onCompletedNode(ContainerStatus status) {
    ContainerId containerId = status.getContainerId();
    NodeCompletionResult result = new NodeCompletionResult();
    RoleInstance roleInstance;

    if (containersBeingReleased.containsKey(containerId)) {
        log.info("Container was queued for release : {}", containerId);
        Container container = containersBeingReleased.remove(containerId);
        RoleStatus roleStatus = lookupRoleStatus(container);
        int releasing = roleStatus.decReleasing();
        int actual = roleStatus.decActual();
        int completedCount = roleStatus.incCompleted();
        log.info("decrementing role count for role {} to {}; releasing={}, completed={}",
                roleStatus.getName(), actual, releasing, completedCount);
        roleHistory.onReleaseCompleted(container, true);
    } else if (surplusNodes.remove(containerId)) {
        // it's a surplus one being purged
        result.surplusNode = true;
    } else {
        // a container has failed
        result.containerFailed = true;
        roleInstance = removeOwnedContainer(containerId);
        if (roleInstance != null) {
            // it was active, move it to failed
            incFailedCountainerCount();
            failedNodes.put(containerId, roleInstance);
        } else {
            // the container may have been noted as failed already, so look it up
            roleInstance = failedNodes.get(containerId);
        }
        if (roleInstance != null) {
            int roleId = roleInstance.roleId;
            String rolename = roleInstance.role;
            log.info("Failed container in role[{}] : {}", roleId, rolename);
            try {
                RoleStatus roleStatus = lookupRoleStatus(roleId);
                roleStatus.decActual();
                boolean shortLived = isShortLived(roleInstance);
                String message;
                Container failedContainer = roleInstance.container;

                // build the failure message
                if (failedContainer != null) {
                    String completedLogsUrl = getLogsURLForContainer(failedContainer);
                    message = String.format("Failure %s on host %s: %s",
                            roleInstance.getContainerId().toString(),
                            failedContainer.getNodeId().getHost(),
                            completedLogsUrl);
                } else {
                    message = String.format("Failure %s", containerId);
                }
                int failed = roleStatus.noteFailed(shortLived, message);
                log.info("Current count of failed role[{}] {} = {}", roleId, rolename, failed);
                if (failedContainer != null) {
                    roleHistory.onFailedContainer(failedContainer, shortLived);
                }
            } catch (YarnRuntimeException e1) {
                log.error("Failed container of unknown role {}", roleId);
            }
        } else {
            // this isn't a known container
            log.error("Notified of completed container {} that is not in the list"
                    + " of active or failed containers", containerId);
            completionOfUnknownContainerEvent.incrementAndGet();
            result.unknownNode = true;
        }
    }

    if (result.surplusNode) {
        // a surplus node
        return result;
    }

    // record the completed node's details; this pulls it from the live node set
    // remove the node
    ContainerId id = status.getContainerId();
    log.info("Removing node ID {}", id);
    RoleInstance node = getLiveNodes().remove(id);
    if (node != null) {
        node.state = ClusterDescription.STATE_DESTROYED;
        node.exitCode = status.getExitStatus();
        node.diagnostics = status.getDiagnostics();
        getCompletedNodes().put(id, node);
        result.roleInstance = node;
    } else {
        // not in the list
        log.warn("Received notification of completion of unknown node {}", id);
        completionOfNodeNotInLiveListEvent.incrementAndGet();
    }

    // and the active node list if present
    removeOwnedContainer(containerId);

    // finally, verify the node doesn't exist any more
    assert !containersBeingReleased.containsKey(containerId) : "container still in release queue";
    assert !getLiveNodes().containsKey(containerId) : " container still in live nodes";
    assert getOwnedContainer(containerId) == null : "Container still in active container list";

    return result;
}
From source file:org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.java
License:Apache License
@Override
public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) {
    // Inform the Containers about completion.
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer != null) {
        String message = "Container completed. ";
        TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        int exitStatus = containerStatus.getExitStatus();
        if (exitStatus == ContainerExitStatus.PREEMPTED) {
            message = "Container preempted externally. ";
            errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
            message = "Container disk failed. ";
            errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        } else if (exitStatus != ContainerExitStatus.SUCCESS) {
            message = "Container failed. ";
        }
        if (containerStatus.getDiagnostics() != null) {
            message += containerStatus.getDiagnostics();
        }
        sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
    }
}
From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java
License:Apache License
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
    // SchedulerId isn't used here since no node updates are sent out
    // Inform the Containers about completion.
    AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (amContainer != null) {
        String message = "Container completed. ";
        TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        int exitStatus = containerStatus.getExitStatus();
        if (exitStatus == ContainerExitStatus.PREEMPTED) {
            message = "Container preempted externally. ";
            errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
        } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
            message = "Container disk failed. ";
            errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
        } else if (exitStatus != ContainerExitStatus.SUCCESS) {
            message = "Container failed, exitCode=" + exitStatus + ". ";
        }
        if (containerStatus.getDiagnostics() != null) {
            message += containerStatus.getDiagnostics();
        }
        sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
    }
}
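This is essentially the same completion handler as the TaskSchedulerEventHandler example above, extended with a schedulerId parameter (unused here, since no node updates are sent out) and with the numeric exit code included in the generic failure message.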
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java
License:Apache License
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
    Configuration conf = new Configuration(false);
    schedulerHandler.init(conf);
    schedulerHandler.start();

    String diagnostics = "Container preempted by RM.";
    TaskAttemptImpl mockTask = mock(TaskAttemptImpl.class);
    ContainerStatus mockStatus = mock(ContainerStatus.class);
    ContainerId mockCId = mock(ContainerId.class);
    AMContainer mockAMContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(mockCId)).thenReturn(mockAMContainer);
    when(mockAMContainer.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getDiagnostics()).thenReturn(diagnostics);
    when(mockStatus.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);
    schedulerHandler.containerCompleted(mockTask, mockStatus);

    Assert.assertEquals(1, mockEventHandler.events.size());
    Event event = mockEventHandler.events.get(0);
    Assert.assertEquals(AMContainerEventType.C_COMPLETED, event.getType());
    AMContainerEventCompleted completedEvent = (AMContainerEventCompleted) event;
    Assert.assertEquals(mockCId, completedEvent.getContainerId());
    Assert.assertEquals("Container preempted externally. Container preempted by RM.",
            completedEvent.getDiagnostics());
    Assert.assertTrue(completedEvent.isPreempted());
    Assert.assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION,
            completedEvent.getTerminationCause());
    Assert.assertFalse(completedEvent.isDiskFailed());

    schedulerHandler.stop();
    schedulerHandler.close();
}
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerEventHandler.java
License:Apache License
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
    Configuration conf = new Configuration(false);
    schedulerHandler.init(conf);
    schedulerHandler.start();

    String diagnostics = "NM disk failed.";
    TaskAttemptImpl mockTask = mock(TaskAttemptImpl.class);
    ContainerStatus mockStatus = mock(ContainerStatus.class);
    ContainerId mockCId = mock(ContainerId.class);
    AMContainer mockAMContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(mockCId)).thenReturn(mockAMContainer);
    when(mockAMContainer.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getDiagnostics()).thenReturn(diagnostics);
    when(mockStatus.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);
    schedulerHandler.containerCompleted(mockTask, mockStatus);

    Assert.assertEquals(1, mockEventHandler.events.size());
    Event event = mockEventHandler.events.get(0);
    Assert.assertEquals(AMContainerEventType.C_COMPLETED, event.getType());
    AMContainerEventCompleted completedEvent = (AMContainerEventCompleted) event;
    Assert.assertEquals(mockCId, completedEvent.getContainerId());
    Assert.assertEquals("Container disk failed. NM disk failed.", completedEvent.getDiagnostics());
    Assert.assertFalse(completedEvent.isPreempted());
    Assert.assertTrue(completedEvent.isDiskFailed());
    Assert.assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR,
            completedEvent.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java
License:Apache License
@Test(timeout = 5000)
public void testContainerPreempted() throws IOException {
    Configuration conf = new Configuration(false);
    schedulerHandler.init(conf);
    schedulerHandler.start();

    String diagnostics = "Container preempted by RM.";
    TaskAttemptImpl mockTask = mock(TaskAttemptImpl.class);
    ContainerStatus mockStatus = mock(ContainerStatus.class);
    ContainerId mockCId = mock(ContainerId.class);
    AMContainer mockAMContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(mockCId)).thenReturn(mockAMContainer);
    when(mockAMContainer.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getDiagnostics()).thenReturn(diagnostics);
    when(mockStatus.getExitStatus()).thenReturn(ContainerExitStatus.PREEMPTED);
    schedulerHandler.containerCompleted(0, mockTask, mockStatus);

    assertEquals(1, mockEventHandler.events.size());
    Event event = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, event.getType());
    AMContainerEventCompleted completedEvent = (AMContainerEventCompleted) event;
    assertEquals(mockCId, completedEvent.getContainerId());
    assertEquals("Container preempted externally. Container preempted by RM.",
            completedEvent.getDiagnostics());
    assertTrue(completedEvent.isPreempted());
    assertEquals(TaskAttemptTerminationCause.EXTERNAL_PREEMPTION,
            completedEvent.getTerminationCause());
    Assert.assertFalse(completedEvent.isDiskFailed());

    schedulerHandler.stop();
    schedulerHandler.close();
}
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java
License:Apache License
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
    Configuration conf = new Configuration(false);
    schedulerHandler.init(conf);
    schedulerHandler.start();

    String diagnostics = "NM disk failed.";
    TaskAttemptImpl mockTask = mock(TaskAttemptImpl.class);
    ContainerStatus mockStatus = mock(ContainerStatus.class);
    ContainerId mockCId = mock(ContainerId.class);
    AMContainer mockAMContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(mockCId)).thenReturn(mockAMContainer);
    when(mockAMContainer.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getDiagnostics()).thenReturn(diagnostics);
    when(mockStatus.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);
    schedulerHandler.containerCompleted(0, mockTask, mockStatus);

    assertEquals(1, mockEventHandler.events.size());
    Event event = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, event.getType());
    AMContainerEventCompleted completedEvent = (AMContainerEventCompleted) event;
    assertEquals(mockCId, completedEvent.getContainerId());
    assertEquals("Container disk failed. NM disk failed.", completedEvent.getDiagnostics());
    Assert.assertFalse(completedEvent.isPreempted());
    assertTrue(completedEvent.isDiskFailed());
    assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR,
            completedEvent.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}
From source file:org.apache.tez.dag.app.rm.TestTaskSchedulerManager.java
License:Apache License
@Test(timeout = 5000)
public void testContainerExceededPMem() throws IOException {
    Configuration conf = new Configuration(false);
    schedulerHandler.init(conf);
    schedulerHandler.start();

    String diagnostics = "Exceeded Physical Memory";
    TaskAttemptImpl mockTask = mock(TaskAttemptImpl.class);
    ContainerStatus mockStatus = mock(ContainerStatus.class);
    ContainerId mockCId = mock(ContainerId.class);
    AMContainer mockAMContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(mockCId)).thenReturn(mockAMContainer);
    when(mockAMContainer.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getContainerId()).thenReturn(mockCId);
    when(mockStatus.getDiagnostics()).thenReturn(diagnostics);
    // use -104 rather than ContainerExitStatus.KILLED_EXCEEDED_PMEM because
    // ContainerExitStatus.KILLED_EXCEEDED_PMEM is only available after hadoop-2.5
    when(mockStatus.getExitStatus()).thenReturn(-104);
    schedulerHandler.containerCompleted(0, mockTask, mockStatus);

    assertEquals(1, mockEventHandler.events.size());
    Event event = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, event.getType());
    AMContainerEventCompleted completedEvent = (AMContainerEventCompleted) event;
    assertEquals(mockCId, completedEvent.getContainerId());
    assertEquals("Container failed, exitCode=-104. Exceeded Physical Memory",
            completedEvent.getDiagnostics());
    Assert.assertFalse(completedEvent.isPreempted());
    Assert.assertFalse(completedEvent.isDiskFailed());
    assertEquals(TaskAttemptTerminationCause.CONTAINER_EXITED, completedEvent.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}
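Note: the hard-coded -104 is the value of ContainerExitStatus.KILLED_EXCEEDED_PMEM; as the inline comment says, the test avoids the constant only because it first appeared in Hadoop 2.5.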
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // Set our own configuration (ToolRunner only sets it prior to calling run())
    conf = getConf();

    // Our own RM Handler
    ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId);

    // Connect
    rmHandler.getAMResourceManager();

    // Register
    try {
        rmHandler.registerApplicationMaster(masterHost, masterPort);
    } catch (YarnRemoteException ex) {
        LOG.error("Error encountered while trying to register application master", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Get file splits, configuration, etc.
    Set<ConfigurationTuple> configTuples;
    try {
        configTuples = getConfigurationTuples();
    } catch (IOException ex) {
        LOG.error("Error encountered while trying to generate configurations", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Needed for our master service later
    Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples);

    // Initial containers we want, based off of the file splits
    List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler);
    List<ContainerId> releasedContainers = new ArrayList<>();

    // Send an initial allocation request
    List<Container> allocatedContainers = new ArrayList<>();
    try {
        int needed = configTuples.size();
        int got = 0;
        int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10"));
        int attempts = 0;

        List<Container> acquiredContainers;

        while (got < needed && attempts < maxAttempts) {
            LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed
                    + ", attempts=" + attempts + ", maxAttempts=" + maxAttempts);
            acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getAllocatedContainers();
            got += acquiredContainers.size();
            attempts++;
            allocatedContainers.addAll(acquiredContainers);
            acquiredContainers.clear();
            LOG.info("Got allocation response, allocatedContainers=" + acquiredContainers.size());
            Thread.sleep(2500);
        }
    } catch (YarnRemoteException ex) {
        LOG.error("Encountered an error while trying to allocate containers", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    final int numContainers = configTuples.size();

    /*
     * TODO: fix this so we try N times to get enough containers!
     */
    // Make sure we got all our containers, or else bail
    if (allocatedContainers.size() < numContainers) {
        LOG.info("Unable to get required number of containers, will not continue"
                + ", needed=" + numContainers + ", allocated=" + allocatedContainers.size());
        requestedContainers.clear(); // We don't want new containers!
        // Add containers into released list
        for (Container c : allocatedContainers) {
            releasedContainers.add(c.getId());
        }

        // Release containers
        try {
            rmHandler.allocateRequest(requestedContainers, releasedContainers);
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to release unwanted containers", ex);
        }

        // Notify our handlers that we got a problem
        rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers
                + ", but got " + allocatedContainers.size(), FinalApplicationStatus.FAILED);

        // bail
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Launch our worker process, as we now expect workers to actually do something
    LOG.info("Starting master service");
    ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf,
            masterComputable, masterUpdateable, appConfig, conf);

    ExecutorService executor = Executors.newSingleThreadExecutor();
    Future<Integer> masterThread = executor.submit(masterService);

    // We got the number of containers we wanted, let's launch them
    LOG.info("Launching child containers");
    List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers);

    // Use an empty list for heartbeat purposes
    requestedContainers.clear();

    // Some local counters. Do we really need Atomic?
    AtomicInteger numCompletedContainers = new AtomicInteger();
    AtomicInteger numFailedContainers = new AtomicInteger();

    LOG.info("Waiting for containers to complete...");
    // Go into run-loop waiting for containers to finish, also our heartbeat
    while (numCompletedContainers.get() < numContainers) {
        // Don't pound the RM
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting on completed containers", ex);
            return ReturnCode.MASTER_ERROR.getCode();
        }

        // Heartbeat, effectively
        List<ContainerStatus> completedContainers;
        try {
            completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getCompletedContainersStatuses();
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex);
            continue; // Nothing to report, probably an error / endless loop
        }

        for (ContainerStatus cs : completedContainers) {
            int exitCode = cs.getExitStatus();
            if (exitCode != 0) {
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();
                masterService.fail();
                executor.shutdown();

                // Force kill our application, fail fast?
                LOG.info("At least one container failed with a non-zero exit code (" + exitCode
                        + "); killing application");
                rmHandler.finishApplication(
                        "Failing, due to at least container coming back with an non-zero exit code.",
                        FinalApplicationStatus.KILLED);
                return -10;
            } else {
                numCompletedContainers.incrementAndGet();
            }
        }
    }

    // All containers have completed
    // Wait for launch threads to complete (this shouldn't really happen)
    LOG.info("Containers completed");
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(1000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting for Launcher threads to complete", ex);
        }
    }

    // Ensure that our master service has completed as well
    if (!masterThread.isDone()) {
        masterService.fail();
    }

    int masterExit = masterThread.get();
    LOG.info("Master service completed with exitCode=" + masterExit);
    executor.shutdown();

    if (masterExit == 0) {
        String impersonatedUser = System.getenv("USER");
        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser);
        //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser());
        ugi.doAs(new PrivilegedExceptionAction<Void>() {
            public Void run() {
                Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH));
                FileSystem fs;
                try {
                    fs = out.getFileSystem(conf);
                    FSDataOutputStream fos = fs.create(out);
                    LOG.info("Writing master results to " + out.toString());
                    masterComputable.complete(fos);
                    fos.flush();
                    fos.close();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                return null;
                //FileSystem fs = FileSystem.get(conf);
                //fs.mkdir( out );
            }
        });

        /*
         * LOG.info( "Here we would try to write to " + out.toString() );
         * LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() );
         * LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() );
         * LOG.info( "Env Var User: " + System.getenv("USER") );
         */
        //LOG.info( "Ideally we'd be user: " + this.props.getProperty( ) );

        // for (Map.Entry<String, String> entry : this.conf) {
        //     LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue());
        // }
    } else {
        LOG.warn("Not writing master results, as the master came back with errors!");
    }

    // Application finished
    ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR;

    try {
        if (numFailedContainers.get() == 0) {
            rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED);
        } else {
            String diag = "Completed with " + numFailedContainers.get() + " failed containers";
            rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED);
        }
    } catch (YarnRemoteException ex) {
        LOG.warn("Encountered an error while trying to send final status to resource manager", ex);
    }

    return rc.getCode();
}
From source file:org.elasticsearch.hadoop.yarn.am.EsCluster.java
License:Apache License
public void start() {
    running = true;
    nmRpc.start();

    UserGroupInformation.setConfiguration(cfg);

    log.info(String.format("Allocating Elasticsearch cluster with %d nodes", appConfig.containersToAllocate()));

    // register requests
    Resource capability = YarnCompat.resource(cfg, appConfig.containerMem(), appConfig.containerVCores());
    Priority prio = Priority.newInstance(appConfig.amPriority());

    for (int i = 0; i < appConfig.containersToAllocate(); i++) {
        // TODO: Add allocation (host/rack rules) - and disable location constraints
        ContainerRequest req = new ContainerRequest(capability, null, null, prio);
        amRpc.addContainerRequest(req);
    }

    // update status every 5 sec
    final long heartBeatRate = TimeUnit.SECONDS.toMillis(5);

    // start the allocation loop
    // when a new container is allocated, launch it right away
    int responseId = 0;

    try {
        do {
            AllocateResponse alloc = amRpc.allocate(responseId++);
            List<Container> currentlyAllocated = alloc.getAllocatedContainers();
            for (Container container : currentlyAllocated) {
                launchContainer(container);
                allocatedContainers.add(container.getId());
            }

            if (currentlyAllocated.size() > 0) {
                int needed = appConfig.containersToAllocate() - allocatedContainers.size();
                if (needed > 0) {
                    log.info(String.format("%s containers allocated, %s remaining",
                            allocatedContainers.size(), needed));
                } else {
                    log.info(String.format("Fully allocated %s containers", allocatedContainers.size()));
                }
            }

            List<ContainerStatus> completed = alloc.getCompletedContainersStatuses();
            for (ContainerStatus status : completed) {
                if (!completedContainers.contains(status.getContainerId())) {
                    ContainerId containerId = status.getContainerId();
                    completedContainers.add(containerId);

                    boolean containerSuccesful = false;

                    switch (status.getExitStatus()) {
                    case ContainerExitStatus.SUCCESS:
                        log.info(String.format("Container %s finished succesfully...", containerId));
                        containerSuccesful = true;
                        break;
                    case ContainerExitStatus.ABORTED:
                        log.warn(String.format("Container %s aborted...", containerId));
                        break;
                    case ContainerExitStatus.DISKS_FAILED:
                        log.warn(String.format("Container %s ran out of disk...", containerId));
                        break;
                    case ContainerExitStatus.PREEMPTED:
                        log.warn(String.format("Container %s preempted...", containerId));
                        break;
                    default:
                        log.warn(String.format("Container %s exited with an invalid/unknown exit code...",
                                containerId));
                    }

                    if (!containerSuccesful) {
                        log.warn("Cluster has not completed succesfully...");
                        clusterHasFailed = true;
                        running = false;
                    }
                }
            }

            if (completedContainers.size() == appConfig.containersToAllocate()) {
                running = false;
            }

            if (running) {
                try {
                    Thread.sleep(heartBeatRate);
                } catch (Exception ex) {
                    throw new EsYarnNmException("Cluster interrupted");
                }
            }
        } while (running);
    } finally {
        log.info("Cluster has completed running...");
        try {
            Thread.sleep(TimeUnit.SECONDS.toMillis(15));
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        close();
    }
}