List of usage examples for org.apache.hadoop.yarn.api.records.FinalApplicationStatus.FAILED
FinalApplicationStatus FAILED
To view the source code for org.apache.hadoop.yarn.api.records.FinalApplicationStatus.FAILED, click the Source Link.
From source file:org.apache.tez.tests.TestExternalTezServicesErrors.java
License:Apache License
private void testFatalError(String methodName, Vertex.VertexExecutionContext lhsExecutionContext, String dagNameSuffix, List<String> expectedDiagMessages) throws IOException, TezException, YarnException, InterruptedException { TezConfiguration tezClientConf = new TezConfiguration(extServiceTestHelper.getConfForJobs()); TezClient tezClient = TezClient//from w w w .j a v a 2 s.co m .newBuilder(TestExternalTezServicesErrors.class.getSimpleName() + methodName + "_session", tezClientConf) .setIsSession(true).setServicePluginDescriptor(servicePluginsDescriptor).build(); ApplicationId appId = null; try { tezClient.start(); LOG.info("TezSessionStarted for " + methodName); tezClient.waitTillReady(); LOG.info("TezSession ready for submission for " + methodName); JoinValidateConfigured joinValidate = new JoinValidateConfigured(EXECUTION_CONTEXT_DEFAULT, lhsExecutionContext, EXECUTION_CONTEXT_EXT_SERVICE_PUSH, EXECUTION_CONTEXT_EXT_SERVICE_PUSH, dagNameSuffix); DAG dag = joinValidate.createDag(new TezConfiguration(extServiceTestHelper.getConfForJobs()), HASH_JOIN_EXPECTED_RESULT_PATH, HASH_JOIN_OUTPUT_PATH, 3); DAGClient dagClient = tezClient.submitDAG(dag); DAGStatus dagStatus = dagClient .waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS)); assertEquals(DAGStatus.State.ERROR, dagStatus.getState()); boolean foundDiag = false; for (String diag : dagStatus.getDiagnostics()) { foundDiag = checkDiag(diag, expectedDiagMessages); if (foundDiag) { break; } } appId = tezClient.getAppMasterApplicationId(); assertTrue(foundDiag); } catch (InterruptedException e) { e.printStackTrace(); } finally { tezClient.stop(); } // Verify the state of the application. 
if (appId != null) { YarnClient yarnClient = YarnClient.createYarnClient(); try { yarnClient.init(tezClientConf); yarnClient.start(); ApplicationReport appReport = yarnClient.getApplicationReport(appId); YarnApplicationState appState = appReport.getYarnApplicationState(); while (!EnumSet .of(YarnApplicationState.FINISHED, YarnApplicationState.FAILED, YarnApplicationState.KILLED) .contains(appState)) { Thread.sleep(200L); appReport = yarnClient.getApplicationReport(appId); appState = appReport.getYarnApplicationState(); } // TODO Workaround for YARN-4554. AppReport does not provide diagnostics - need to fetch them from ApplicationAttemptReport ApplicationAttemptId appAttemptId = appReport.getCurrentApplicationAttemptId(); ApplicationAttemptReport appAttemptReport = yarnClient.getApplicationAttemptReport(appAttemptId); String diag = appAttemptReport.getDiagnostics(); assertEquals(FinalApplicationStatus.FAILED, appReport.getFinalApplicationStatus()); assertEquals(YarnApplicationState.FINISHED, appReport.getYarnApplicationState()); checkDiag(diag, expectedDiagMessages); } finally { yarnClient.stop(); } } }
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
@Override public int run(String[] args) throws Exception { // Set our own configuration (ToolRunner only sets it prior to calling // run())//from ww w . ja v a 2 s . c o m conf = getConf(); // Our own RM Handler ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId); // Connect rmHandler.getAMResourceManager(); // Register try { rmHandler.registerApplicationMaster(masterHost, masterPort); } catch (YarnRemoteException ex) { LOG.error("Error encountered while trying to register application master", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Get file splits, configuration, etc. Set<ConfigurationTuple> configTuples; try { configTuples = getConfigurationTuples(); } catch (IOException ex) { LOG.error("Error encountered while trying to generate configurations", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Needed for our master service later Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples); // Initial containers we want, based off of the file splits List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler); List<ContainerId> releasedContainers = new ArrayList<>(); // Send an initial allocation request List<Container> allocatedContainers = new ArrayList<>(); try { int needed = configTuples.size(); int got = 0; int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10")); int attempts = 0; List<Container> acquiredContainers; while (got < needed && attempts < maxAttempts) { LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts + ", maxAttempts=" + maxAttempts); acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers) .getAllocatedContainers(); got += acquiredContainers.size(); attempts++; allocatedContainers.addAll(acquiredContainers); acquiredContainers.clear(); LOG.info("Got allocation response, allocatedContainers=" + 
acquiredContainers.size()); Thread.sleep(2500); } } catch (YarnRemoteException ex) { LOG.error("Encountered an error while trying to allocate containers", ex); return ReturnCode.MASTER_ERROR.getCode(); } final int numContainers = configTuples.size(); /* * * * TODO: fix this so we try N times to get enough containers! * * * * */ // Make sure we got all our containers, or else bail if (allocatedContainers.size() < numContainers) { LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers + ", allocated=" + allocatedContainers.size()); requestedContainers.clear(); // We don't want new containers! // Add containers into released list for (Container c : allocatedContainers) { releasedContainers.add(c.getId()); } // Release containers try { rmHandler.allocateRequest(requestedContainers, releasedContainers); } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to release unwanted containers", ex); } // Notify our handlers that we got a problem rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got " + allocatedContainers.size(), FinalApplicationStatus.FAILED); // bail return ReturnCode.MASTER_ERROR.getCode(); } // Launch our worker process, as we now expect workers to actally do // something LOG.info("Starting master service"); ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf, masterComputable, masterUpdateable, appConfig, conf); ExecutorService executor = Executors.newSingleThreadExecutor(); Future<Integer> masterThread = executor.submit(masterService); // We got the number of containers we wanted, let's launch them LOG.info("Launching child containers"); List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers); // Use an empty list for heartbeat purposes requestedContainers.clear(); // Some local counters. Do we really need Atomic? 
AtomicInteger numCompletedContainers = new AtomicInteger(); AtomicInteger numFailedContainers = new AtomicInteger(); LOG.info("Waiting for containers to complete..."); // Go into run-loop waiting for containers to finish, also our heartbeat while (numCompletedContainers.get() < numContainers) { // Don't pound the RM try { Thread.sleep(2000); } catch (InterruptedException ex) { LOG.warn("Interrupted while waiting on completed containers", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Heartbeat, effectively List<ContainerStatus> completedContainers; try { completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers) .getCompletedContainersStatuses(); } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex); continue; // Nothing to report, probably an error / endless loop } for (ContainerStatus cs : completedContainers) { int exitCode = cs.getExitStatus(); if (exitCode != 0) { numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); masterService.fail(); executor.shutdown(); // Force kill our application, fail fast? 
LOG.info("At least one container failed with a non-zero exit code (" + exitCode + "); killing application"); rmHandler.finishApplication( "Failing, due to at least container coming back with an non-zero exit code.", FinalApplicationStatus.KILLED); return -10; } else { numCompletedContainers.incrementAndGet(); } } } // All containers have completed // Wait for launch threads to complete (this shouldn't really happen) LOG.info("Containers completed"); for (Thread launchThread : launchThreads) { try { launchThread.join(1000); } catch (InterruptedException ex) { LOG.warn("Interrupted while waiting for Launcher threads to complete", ex); } } // Ensure that our master service has completed as well if (!masterThread.isDone()) { masterService.fail(); } int masterExit = masterThread.get(); LOG.info("Master service completed with exitCode=" + masterExit); executor.shutdown(); if (masterExit == 0) { String impersonatedUser = System.getenv("USER"); UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser); //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser()); ugi.doAs(new PrivilegedExceptionAction<Void>() { public Void run() { Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH)); FileSystem fs; try { fs = out.getFileSystem(conf); FSDataOutputStream fos = fs.create(out); LOG.info("Writing master results to " + out.toString()); masterComputable.complete(fos); fos.flush(); fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; //FileSystem fs = FileSystem.get(conf); //fs.mkdir( out ); } }); /* LOG.info( "Here we would try to write to " + out.toString() ); LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() ); LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() ); LOG.info( "Env Var User: " + System.getenv("USER") ); */ //LOG.info( "Ideally we'd be user: " + 
this.props.getProperty( ) ); // for (Map.Entry<String, String> entry : this.conf) { // LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue()); // } } else { LOG.warn("Not writing master results, as the master came back with errors!"); } // Application finished ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR; try { if (numFailedContainers.get() == 0) { rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED); } else { String diag = "Completed with " + numFailedContainers.get() + " failed containers"; rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED); } } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to send final status to resource manager", ex); } return rc.getCode(); }
From source file:org.dknight.app.ApplicationMaster.java
License:Apache License
private void finish() { // Join all launched threads // needed for when we time out // and we need to release containers for (Thread launchThread : launchThreads) { try {//from w w w .j a v a 2 s. c om launchThread.join(10000); } catch (InterruptedException e) { LOG.info("Exception thrown in thread join: " + e.getMessage()); e.printStackTrace(); } } //stop yarnClient yarnClient.stop(); // When the application completes, it should stop all running containers LOG.info("Application completed. Stopping running containers"); nmClientAsync.stop(); // When the application completes, it should send a finish application // signal to the RM LOG.info("Application completed. Signalling finish to RM"); FinalApplicationStatus appStatus; String appMessage = null; success = true; if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers) { appStatus = FinalApplicationStatus.SUCCEEDED; } else { appStatus = FinalApplicationStatus.FAILED; appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed=" + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed=" + numFailedContainers.get(); success = false; } try { amRMClient.unregisterApplicationMaster(appStatus, appMessage, null); } catch (YarnException ex) { LOG.error("Failed to unregister application", ex); } catch (IOException e) { LOG.error("Failed to unregister application", e); } amRMClient.stop(); }
From source file:org.elasticsearch.hadoop.yarn.am.AppMasterRpc.java
License:Apache License
/**
 * Unregisters the application master, reporting {@code FinalApplicationStatus.FAILED}.
 */
public void failAM() {
    final FinalApplicationStatus failedStatus = FinalApplicationStatus.FAILED;
    unregisterAM(failedStatus);
}
From source file:org.hdl.caffe.yarn.app.ApplicationMaster.java
License:Apache License
@VisibleForTesting protected boolean finish() { // wait for completion. while (!done && (numCompletedContainers.get() != numTotalContainers)) { try {/* ww w .j av a 2s . co m*/ Thread.sleep(200); } catch (InterruptedException ex) { } } // Join all launched threads // needed for when we time out // and we need to release containers for (Thread launchThread : launchThreads) { try { launchThread.join(10000); } catch (InterruptedException e) { LOG.info("Exception thrown in thread join: " + e.getMessage()); e.printStackTrace(); } } // When the application completes, it should stop all running containers LOG.info("Application completed. Stopping running containers"); nmClientAsync.stop(); // When the application completes, it should send a finish application // signal to the RM LOG.info("Application completed. Signalling finish to RM"); FinalApplicationStatus appStatus; String appMessage = null; boolean success = true; if (numFailedContainers.get() == 0) { appStatus = FinalApplicationStatus.SUCCEEDED; } else { appStatus = FinalApplicationStatus.FAILED; appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed=" + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed=" + numFailedContainers.get(); LOG.info(appMessage); success = false; } try { amRMClient.unregisterApplicationMaster(appStatus, appMessage, null); } catch (YarnException ex) { LOG.error("Failed to unregister application", ex); } catch (IOException e) { LOG.error("Failed to unregister application", e); } amRMClient.stop(); return success; }
From source file:org.hdl.tensorflow.yarn.appmaster.ApplicationMaster.java
License:Apache License
/**
 * Derives the final application status from the container counters.
 *
 * @return {@code SUCCEEDED} when the number of completed containers minus the number of failed
 *         containers is at least {@code args.totalContainerNum}; {@code FAILED} otherwise
 */
private FinalApplicationStatus getFinalAppStatus() {
    boolean enoughSucceeded =
            completedContainerNum.get() - failedContainerNum.get() >= args.totalContainerNum;
    return enoughSucceeded ? FinalApplicationStatus.SUCCEEDED : FinalApplicationStatus.FAILED;
}
From source file:org.hortonworks.dovetail.am.AppMaster.java
License:Apache License
/**
 * Shuts the application master down: waits up to 10 seconds for each launch thread, stops the
 * node-manager client, unregisters from the resource manager with the final status, marks the
 * master as done and stops the resource-manager client.
 *
 * <p>The final status is {@code SUCCEEDED} only when no container failed and every container
 * completed; otherwise it is {@code FAILED} with a diagnostics message. The {@code success}
 * field is set accordingly.
 */
private void finish() {
    boolean interrupted = false;
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(10000);
        } catch (InterruptedException e) {
            // Remember the interrupt but keep shutting down; the status is restored at the end so
            // the remaining joins and the RM calls below are not cut short.
            interrupted = true;
            LOG.log(Level.INFO, "Exception thrown in thread join: " + e.getMessage(), e);
        }
    }

    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus;
    String appMessage = null;
    success = true;
    if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numContainers) {
        appStatus = FinalApplicationStatus.SUCCEEDED;
    } else {
        appStatus = FinalApplicationStatus.FAILED;
        appMessage = "Diagnostics." + ", total=" + numContainers + ", completed="
                + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed="
                + numFailedContainers.get();
        success = false;
    }
    try {
        resourceManager.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (YarnException ex) {
        LOG.log(Level.SEVERE, "Failed to unregister application", ex);
    } catch (IOException e) {
        LOG.log(Level.SEVERE, "Failed to unregister application", e);
    }

    done = true;
    resourceManager.stop();

    if (interrupted) {
        Thread.currentThread().interrupt();
    }
}
From source file:org.springframework.yarn.am.StaticAppmaster.java
License:Apache License
/**
 * Initializes the appmaster and registers a monitor listener that signals completion once the
 * monitor reports no free containers and the completed plus failed count has reached
 * {@code containerCount}. Any failed container marks the application as {@code FAILED} first.
 */
@Override
protected void onInit() throws Exception {
    super.onInit();

    ContainerMonitorListener completionWatcher = new ContainerMonitorListener() {
        @Override
        public void state(ContainerMonitorState state) {
            if (getMonitor().freeCount() != 0) {
                return;
            }

            int completedCount = state.getCompleted();
            int failedCount = state.getFailed();
            if (completedCount + failedCount < containerCount) {
                return;
            }

            if (failedCount > 0) {
                setFinalApplicationStatus(FinalApplicationStatus.FAILED);
            }
            notifyCompleted();
        }
    };

    getMonitor().addContainerMonitorStateListener(completionWatcher);
}
From source file:org.springframework.yarn.am.StaticEventingAppmaster.java
License:Apache License
@Override protected void onContainerCompleted(ContainerStatus status) { super.onContainerCompleted(status); getMonitor().reportContainerStatus(status); int exitStatus = status.getExitStatus(); if (exitStatus == 0) { if (isComplete()) { notifyCompleted();//ww w. j a v a 2s .com } } else { if (!onContainerFailed(status)) { setFinalApplicationStatus(FinalApplicationStatus.FAILED); notifyCompleted(); } } }
From source file:org.springframework.yarn.batch.am.AbstractBatchAppmaster.java
License:Apache License
/**
 * Runs the given job.
 *
 * <p>For an {@code AbstractJob}, a listener is registered that signals completion once the
 * execution status equals {@code BatchStatus.COMPLETED}. If launching the job throws, the
 * application is marked {@code FAILED} and completion is signalled.
 *
 * @param job the job to run
 */
public void runJob(Job job) {
    if (job instanceof AbstractJob) {
        AbstractJob abstractJob = (AbstractJob) job;
        JobExecutionListener completionListener = new JobExecutionListener() {
            @Override
            public void beforeJob(JobExecution jobExecution) {
            }

            @Override
            public void afterJob(JobExecution jobExecution) {
                BatchStatus finalStatus = jobExecution.getStatus();
                if (!finalStatus.equals(BatchStatus.COMPLETED)) {
                    return;
                }
                log.info("Batch status complete, notify listeners");
                notifyCompleted();
            }
        };
        abstractJob.registerJobExecutionListener(completionListener);
    }

    JobParameters jobParameters = getJobParametersConverter().getJobParameters(getParameters());

    try {
        getJobLauncher().run(job, jobParameters);
    } catch (Exception e) {
        log.error("Error running job=" + job, e);
        setFinalApplicationStatus(FinalApplicationStatus.FAILED);
        notifyCompleted();
    }
}