List of usage examples for org.apache.hadoop.yarn.api.records FinalApplicationStatus KILLED
FinalApplicationStatus KILLED
To view the source code for org.apache.hadoop.yarn.api.records.FinalApplicationStatus.KILLED, click Source Link.
From source file:org.apache.tez.dag.app.rm.TaskSchedulerManager.java
License:Apache License
public AppFinalStatus getFinalAppStatus() { FinalApplicationStatus finishState = FinalApplicationStatus.UNDEFINED; StringBuffer sb = new StringBuffer(); if (dagAppMaster == null) { finishState = FinalApplicationStatus.UNDEFINED; sb.append("App not yet initialized"); } else {/*from w w w.ja va2 s .co m*/ DAGAppMasterState appMasterState = dagAppMaster.getState(); if (appMasterState == DAGAppMasterState.SUCCEEDED) { finishState = FinalApplicationStatus.SUCCEEDED; } else if (appMasterState == DAGAppMasterState.KILLED || (appMasterState == DAGAppMasterState.RUNNING && isSignalled)) { finishState = FinalApplicationStatus.KILLED; } else if (appMasterState == DAGAppMasterState.FAILED || appMasterState == DAGAppMasterState.ERROR) { finishState = FinalApplicationStatus.FAILED; } else { finishState = FinalApplicationStatus.UNDEFINED; } List<String> diagnostics = dagAppMaster.getDiagnostics(); if (diagnostics != null) { for (String s : diagnostics) { sb.append(s).append("\n"); } } } if (LOG.isDebugEnabled()) { LOG.debug("Setting job diagnostics to " + sb.toString()); } // if history url is set use the same, if historyUrl is set to "" then rm ui disables the // history url return new AppFinalStatus(finishState, sb.toString(), historyUrl); }
From source file:org.apache.tez.hadoop.shim.TestHadoopShim28.java
License:Apache License
@Test public void testApplyFinalApplicationStatusCorrection() { HadoopShim shim = new HadoopShim28(); // Session mode success/failure, change to ended Assert.assertEquals(FinalApplicationStatus.ENDED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.SUCCEEDED, true, false)); Assert.assertEquals(FinalApplicationStatus.ENDED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, true, false)); // Non-session mode success/failure, retain success/failure Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.SUCCEEDED, false, false)); Assert.assertEquals(FinalApplicationStatus.FAILED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, false, false)); // Session and non-session mode error, retain failed. Assert.assertEquals(FinalApplicationStatus.FAILED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, true, true)); Assert.assertEquals(FinalApplicationStatus.FAILED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, false, true)); // Session and non-session mode killed is killed. Assert.assertEquals(FinalApplicationStatus.KILLED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.KILLED, true, false)); Assert.assertEquals(FinalApplicationStatus.KILLED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.KILLED, false, false)); // Session and non-session mode undefined is undefined. Assert.assertEquals(FinalApplicationStatus.UNDEFINED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.UNDEFINED, true, false)); Assert.assertEquals(FinalApplicationStatus.UNDEFINED, shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.UNDEFINED, false, false)); }
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
@Override public int run(String[] args) throws Exception { // Set our own configuration (ToolRunner only sets it prior to calling // run())// w w w .j av a 2 s . co m conf = getConf(); // Our own RM Handler ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId); // Connect rmHandler.getAMResourceManager(); // Register try { rmHandler.registerApplicationMaster(masterHost, masterPort); } catch (YarnRemoteException ex) { LOG.error("Error encountered while trying to register application master", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Get file splits, configuration, etc. Set<ConfigurationTuple> configTuples; try { configTuples = getConfigurationTuples(); } catch (IOException ex) { LOG.error("Error encountered while trying to generate configurations", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Needed for our master service later Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples); // Initial containers we want, based off of the file splits List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler); List<ContainerId> releasedContainers = new ArrayList<>(); // Send an initial allocation request List<Container> allocatedContainers = new ArrayList<>(); try { int needed = configTuples.size(); int got = 0; int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10")); int attempts = 0; List<Container> acquiredContainers; while (got < needed && attempts < maxAttempts) { LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts + ", maxAttempts=" + maxAttempts); acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers) .getAllocatedContainers(); got += acquiredContainers.size(); attempts++; allocatedContainers.addAll(acquiredContainers); acquiredContainers.clear(); LOG.info("Got allocation response, allocatedContainers=" + 
acquiredContainers.size()); Thread.sleep(2500); } } catch (YarnRemoteException ex) { LOG.error("Encountered an error while trying to allocate containers", ex); return ReturnCode.MASTER_ERROR.getCode(); } final int numContainers = configTuples.size(); /* * * * TODO: fix this so we try N times to get enough containers! * * * * */ // Make sure we got all our containers, or else bail if (allocatedContainers.size() < numContainers) { LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers + ", allocated=" + allocatedContainers.size()); requestedContainers.clear(); // We don't want new containers! // Add containers into released list for (Container c : allocatedContainers) { releasedContainers.add(c.getId()); } // Release containers try { rmHandler.allocateRequest(requestedContainers, releasedContainers); } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to release unwanted containers", ex); } // Notify our handlers that we got a problem rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got " + allocatedContainers.size(), FinalApplicationStatus.FAILED); // bail return ReturnCode.MASTER_ERROR.getCode(); } // Launch our worker process, as we now expect workers to actally do // something LOG.info("Starting master service"); ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf, masterComputable, masterUpdateable, appConfig, conf); ExecutorService executor = Executors.newSingleThreadExecutor(); Future<Integer> masterThread = executor.submit(masterService); // We got the number of containers we wanted, let's launch them LOG.info("Launching child containers"); List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers); // Use an empty list for heartbeat purposes requestedContainers.clear(); // Some local counters. Do we really need Atomic? 
AtomicInteger numCompletedContainers = new AtomicInteger(); AtomicInteger numFailedContainers = new AtomicInteger(); LOG.info("Waiting for containers to complete..."); // Go into run-loop waiting for containers to finish, also our heartbeat while (numCompletedContainers.get() < numContainers) { // Don't pound the RM try { Thread.sleep(2000); } catch (InterruptedException ex) { LOG.warn("Interrupted while waiting on completed containers", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Heartbeat, effectively List<ContainerStatus> completedContainers; try { completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers) .getCompletedContainersStatuses(); } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex); continue; // Nothing to report, probably an error / endless loop } for (ContainerStatus cs : completedContainers) { int exitCode = cs.getExitStatus(); if (exitCode != 0) { numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); masterService.fail(); executor.shutdown(); // Force kill our application, fail fast? 
LOG.info("At least one container failed with a non-zero exit code (" + exitCode + "); killing application"); rmHandler.finishApplication( "Failing, due to at least container coming back with an non-zero exit code.", FinalApplicationStatus.KILLED); return -10; } else { numCompletedContainers.incrementAndGet(); } } } // All containers have completed // Wait for launch threads to complete (this shouldn't really happen) LOG.info("Containers completed"); for (Thread launchThread : launchThreads) { try { launchThread.join(1000); } catch (InterruptedException ex) { LOG.warn("Interrupted while waiting for Launcher threads to complete", ex); } } // Ensure that our master service has completed as well if (!masterThread.isDone()) { masterService.fail(); } int masterExit = masterThread.get(); LOG.info("Master service completed with exitCode=" + masterExit); executor.shutdown(); if (masterExit == 0) { String impersonatedUser = System.getenv("USER"); UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser); //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser()); ugi.doAs(new PrivilegedExceptionAction<Void>() { public Void run() { Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH)); FileSystem fs; try { fs = out.getFileSystem(conf); FSDataOutputStream fos = fs.create(out); LOG.info("Writing master results to " + out.toString()); masterComputable.complete(fos); fos.flush(); fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; //FileSystem fs = FileSystem.get(conf); //fs.mkdir( out ); } }); /* LOG.info( "Here we would try to write to " + out.toString() ); LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() ); LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() ); LOG.info( "Env Var User: " + System.getenv("USER") ); */ //LOG.info( "Ideally we'd be user: " + 
this.props.getProperty( ) ); // for (Map.Entry<String, String> entry : this.conf) { // LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue()); // } } else { LOG.warn("Not writing master results, as the master came back with errors!"); } // Application finished ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR; try { if (numFailedContainers.get() == 0) { rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED); } else { String diag = "Completed with " + numFailedContainers.get() + " failed containers"; rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED); } } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to send final status to resource manager", ex); } return rc.getCode(); }