List of usage examples for org.apache.hadoop.yarn.api.records FinalApplicationStatus SUCCEEDED
FinalApplicationStatus SUCCEEDED
To view the source code for org.apache.hadoop.yarn.api.records FinalApplicationStatus SUCCEEDED, click the Source Link.
From source file:org.chenchun.ApplicationMaster.java
License:Apache License
public static void main(String[] args) throws IOException, YarnException, InterruptedException { final String params = args[0]; final int containerNum = Integer.valueOf(args[1]); // Initialize clients to ResourceManager and NodeManagers Configuration conf = new YarnConfiguration(); AMRMClient<AMRMClient.ContainerRequest> rmClient = AMRMClient.createAMRMClient(); rmClient.init(conf);/*from ww w .j a v a 2 s. c o m*/ rmClient.start(); NMClient nmClient = NMClient.createNMClient(); nmClient.init(conf); nmClient.start(); // Register with ResourceManager System.out.println("registerApplicationMaster 0"); rmClient.registerApplicationMaster("", 0, ""); System.out.println("registerApplicationMaster 1"); // Priority for worker containers - priorities are intra-application Priority priority = Records.newRecord(Priority.class); priority.setPriority(0); // Resource requirements for worker containers Resource capability = Records.newRecord(Resource.class); capability.setMemory(128); capability.setVirtualCores(1); // Make container requests to ResourceManager for (int i = 0; i < containerNum; ++i) { AMRMClient.ContainerRequest containerAsk = new AMRMClient.ContainerRequest(capability, null, null, priority); System.out.println("Making res-req " + i); rmClient.addContainerRequest(containerAsk); } // Obtain allocated containers, launch and check for responses int responseId = 0; int completedContainers = 0; while (completedContainers < containerNum) { AllocateResponse response = rmClient.allocate(responseId++); System.out.println("Allocate response " + response.getAMCommand() + " " + "allocate " + response.getAllocatedContainers().size() + "contains"); for (Container container : response.getAllocatedContainers()) { // Launch container by create ContainerLaunchContext ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); ctx.setCommands( Collections.singletonList(params + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout" + " 2>" + 
ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr")); System.out.println("Launching container " + container.getId() + " on " + container.getNodeId()); nmClient.startContainer(container, ctx); } for (ContainerStatus status : response.getCompletedContainersStatuses()) { ++completedContainers; System.out.println("Completed container " + status.getContainerId()); } Thread.sleep(1000); } System.out.println("Unregister ApplicationMaster"); // Un-register with ResourceManager rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "", ""); }
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
@Override public int run(String[] args) throws Exception { // Set our own configuration (ToolRunner only sets it prior to calling // run())// www. java 2s . co m conf = getConf(); // Our own RM Handler ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId); // Connect rmHandler.getAMResourceManager(); // Register try { rmHandler.registerApplicationMaster(masterHost, masterPort); } catch (YarnRemoteException ex) { LOG.error("Error encountered while trying to register application master", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Get file splits, configuration, etc. Set<ConfigurationTuple> configTuples; try { configTuples = getConfigurationTuples(); } catch (IOException ex) { LOG.error("Error encountered while trying to generate configurations", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Needed for our master service later Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples); // Initial containers we want, based off of the file splits List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler); List<ContainerId> releasedContainers = new ArrayList<>(); // Send an initial allocation request List<Container> allocatedContainers = new ArrayList<>(); try { int needed = configTuples.size(); int got = 0; int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10")); int attempts = 0; List<Container> acquiredContainers; while (got < needed && attempts < maxAttempts) { LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts + ", maxAttempts=" + maxAttempts); acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers) .getAllocatedContainers(); got += acquiredContainers.size(); attempts++; allocatedContainers.addAll(acquiredContainers); acquiredContainers.clear(); LOG.info("Got allocation response, allocatedContainers=" + 
acquiredContainers.size()); Thread.sleep(2500); } } catch (YarnRemoteException ex) { LOG.error("Encountered an error while trying to allocate containers", ex); return ReturnCode.MASTER_ERROR.getCode(); } final int numContainers = configTuples.size(); /* * * * TODO: fix this so we try N times to get enough containers! * * * * */ // Make sure we got all our containers, or else bail if (allocatedContainers.size() < numContainers) { LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers + ", allocated=" + allocatedContainers.size()); requestedContainers.clear(); // We don't want new containers! // Add containers into released list for (Container c : allocatedContainers) { releasedContainers.add(c.getId()); } // Release containers try { rmHandler.allocateRequest(requestedContainers, releasedContainers); } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to release unwanted containers", ex); } // Notify our handlers that we got a problem rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got " + allocatedContainers.size(), FinalApplicationStatus.FAILED); // bail return ReturnCode.MASTER_ERROR.getCode(); } // Launch our worker process, as we now expect workers to actally do // something LOG.info("Starting master service"); ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf, masterComputable, masterUpdateable, appConfig, conf); ExecutorService executor = Executors.newSingleThreadExecutor(); Future<Integer> masterThread = executor.submit(masterService); // We got the number of containers we wanted, let's launch them LOG.info("Launching child containers"); List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers); // Use an empty list for heartbeat purposes requestedContainers.clear(); // Some local counters. Do we really need Atomic? 
AtomicInteger numCompletedContainers = new AtomicInteger(); AtomicInteger numFailedContainers = new AtomicInteger(); LOG.info("Waiting for containers to complete..."); // Go into run-loop waiting for containers to finish, also our heartbeat while (numCompletedContainers.get() < numContainers) { // Don't pound the RM try { Thread.sleep(2000); } catch (InterruptedException ex) { LOG.warn("Interrupted while waiting on completed containers", ex); return ReturnCode.MASTER_ERROR.getCode(); } // Heartbeat, effectively List<ContainerStatus> completedContainers; try { completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers) .getCompletedContainersStatuses(); } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex); continue; // Nothing to report, probably an error / endless loop } for (ContainerStatus cs : completedContainers) { int exitCode = cs.getExitStatus(); if (exitCode != 0) { numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); masterService.fail(); executor.shutdown(); // Force kill our application, fail fast? 
LOG.info("At least one container failed with a non-zero exit code (" + exitCode + "); killing application"); rmHandler.finishApplication( "Failing, due to at least container coming back with an non-zero exit code.", FinalApplicationStatus.KILLED); return -10; } else { numCompletedContainers.incrementAndGet(); } } } // All containers have completed // Wait for launch threads to complete (this shouldn't really happen) LOG.info("Containers completed"); for (Thread launchThread : launchThreads) { try { launchThread.join(1000); } catch (InterruptedException ex) { LOG.warn("Interrupted while waiting for Launcher threads to complete", ex); } } // Ensure that our master service has completed as well if (!masterThread.isDone()) { masterService.fail(); } int masterExit = masterThread.get(); LOG.info("Master service completed with exitCode=" + masterExit); executor.shutdown(); if (masterExit == 0) { String impersonatedUser = System.getenv("USER"); UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser); //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser()); ugi.doAs(new PrivilegedExceptionAction<Void>() { public Void run() { Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH)); FileSystem fs; try { fs = out.getFileSystem(conf); FSDataOutputStream fos = fs.create(out); LOG.info("Writing master results to " + out.toString()); masterComputable.complete(fos); fos.flush(); fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; //FileSystem fs = FileSystem.get(conf); //fs.mkdir( out ); } }); /* LOG.info( "Here we would try to write to " + out.toString() ); LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() ); LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() ); LOG.info( "Env Var User: " + System.getenv("USER") ); */ //LOG.info( "Ideally we'd be user: " + 
this.props.getProperty( ) ); // for (Map.Entry<String, String> entry : this.conf) { // LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue()); // } } else { LOG.warn("Not writing master results, as the master came back with errors!"); } // Application finished ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR; try { if (numFailedContainers.get() == 0) { rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED); } else { String diag = "Completed with " + numFailedContainers.get() + " failed containers"; rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED); } } catch (YarnRemoteException ex) { LOG.warn("Encountered an error while trying to send final status to resource manager", ex); } return rc.getCode(); }
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.client.Client.java
License:Apache License
/** * TODO: consider the scenarios where we dont get enough containers * - we need to re-submit the job till we get the containers alloc'd * /*from w w w. j a v a 2 s .c o m*/ */ @Override public int run(String[] args) throws Exception { //System.out.println("IR: Client.run() [start]"); if (args.length < 1) LOG.info("No configuration file specified, using default (" + ConfigFields.DEFAULT_CONFIG_FILE + ")"); long startTime = System.currentTimeMillis(); String configFile = (args.length < 1) ? ConfigFields.DEFAULT_CONFIG_FILE : args[0]; Properties props = new Properties(); Configuration conf = getConf(); try { FileInputStream fis = new FileInputStream(configFile); props.load(fis); } catch (FileNotFoundException ex) { throw ex; // TODO: be nice } catch (IOException ex) { throw ex; // TODO: be nice } // Make sure we have some bare minimums ConfigFields.validateConfig(props); if (LOG.isDebugEnabled()) { LOG.debug("Loaded configuration: "); for (Map.Entry<Object, Object> entry : props.entrySet()) { LOG.debug(entry.getKey() + "=" + entry.getValue()); } } // TODO: make sure input file(s), libs, etc. actually exist! 
// Ensure our input path exists Path p = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH)); FileSystem fs = FileSystem.get(conf); if (!fs.exists(p)) throw new FileNotFoundException("Input path not found: " + p.toString() + " (in " + fs.getUri() + ")"); LOG.info("Using input path: " + p.toString()); // Connect ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, null); rmHandler.getClientResourceManager(); // Create an Application request/ID ApplicationId appId = rmHandler.getApplicationId(); // Our AppId String appName = props.getProperty(ConfigFields.APP_NAME, ConfigFields.DEFAULT_APP_NAME).replace(' ', '_'); LOG.info("Got an application, id=" + appId + ", appName=" + appName); // Copy resources to [HD]FS LOG.debug("Copying resources to filesystem"); Utils.copyLocalResourcesToFs(props, conf, appId, appName); // Local resources Utils.copyLocalResourceToFs(configFile, ConfigFields.APP_CONFIG_FILE, conf, appId, appName); // Config file try { Utils.copyLocalResourceToFs("log4j.properties", "log4j.properties", conf, appId, appName); // Log4j } catch (FileNotFoundException ex) { LOG.warn("log4j.properties file not found"); } // Create our context List<String> commands = Utils.getMasterCommand(conf, props); Map<String, LocalResource> localResources = Utils.getLocalResourcesForApplication(conf, appId, appName, props, LocalResourceVisibility.APPLICATION); // Submit app rmHandler.submitApplication(appId, appName, Utils.getEnvironment(conf, props), localResources, commands, Integer.parseInt(props.getProperty(ConfigFields.YARN_MEMORY, "512"))); /* * TODO: * - look at updating this code region to make sure job is submitted! 
* */ StopWatch watch = new StopWatch(); watch.start(); // Wait for app to complete while (true) { Thread.sleep(2000); ApplicationReport report = rmHandler.getApplicationReport(appId); LOG.info("IterativeReduce report: " + " appId=" + appId.getId() + ", state: " + report.getYarnApplicationState().toString() + ", Running Time: " + watch.toString()); //report.getDiagnostics() if (YarnApplicationState.FINISHED == report.getYarnApplicationState()) { LOG.info("Application finished in " + (System.currentTimeMillis() - startTime) + "ms"); if (FinalApplicationStatus.SUCCEEDED == report.getFinalApplicationStatus()) { LOG.info("Application completed succesfully."); return 0; } else { LOG.info("Application completed with en error: " + report.getDiagnostics()); return -1; } } else if (YarnApplicationState.FAILED == report.getYarnApplicationState() || YarnApplicationState.KILLED == report.getYarnApplicationState()) { LOG.info("Application completed with a failed or killed state: " + report.getDiagnostics()); return -1; } } }
From source file:org.dknight.app.ApplicationMaster.java
License:Apache License
private void finish() { // Join all launched threads // needed for when we time out // and we need to release containers for (Thread launchThread : launchThreads) { try {/*from ww w .ja v a2 s . c o m*/ launchThread.join(10000); } catch (InterruptedException e) { LOG.info("Exception thrown in thread join: " + e.getMessage()); e.printStackTrace(); } } //stop yarnClient yarnClient.stop(); // When the application completes, it should stop all running containers LOG.info("Application completed. Stopping running containers"); nmClientAsync.stop(); // When the application completes, it should send a finish application // signal to the RM LOG.info("Application completed. Signalling finish to RM"); FinalApplicationStatus appStatus; String appMessage = null; success = true; if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers) { appStatus = FinalApplicationStatus.SUCCEEDED; } else { appStatus = FinalApplicationStatus.FAILED; appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed=" + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed=" + numFailedContainers.get(); success = false; } try { amRMClient.unregisterApplicationMaster(appStatus, appMessage, null); } catch (YarnException ex) { LOG.error("Failed to unregister application", ex); } catch (IOException e) { LOG.error("Failed to unregister application", e); } amRMClient.stop(); }
From source file:org.dknight.app.UnmanagedAMLauncher.java
License:Apache License
public boolean run() throws IOException, YarnException { LOG.info("Starting Client"); // Connect to ResourceManager rmClient.start();/*from w w w . j a v a2 s . c o m*/ try { // Create launch context for app master LOG.info("Setting up application submission context for ASM"); ApplicationSubmissionContext appContext = rmClient.createApplication() .getApplicationSubmissionContext(); ApplicationId appId = appContext.getApplicationId(); // set the application name appContext.setApplicationName(appName); // Set the priority for the application master Priority pri = Records.newRecord(Priority.class); pri.setPriority(amPriority); appContext.setPriority(pri); // Set the queue to which this application is to be submitted in the RM appContext.setQueue(amQueue); // Set up the container launch context for the application master ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); appContext.setAMContainerSpec(amContainer); // unmanaged AM appContext.setUnmanagedAM(true); LOG.info("Setting unmanaged AM"); // Submit the application to the applications manager LOG.info("Submitting application to ASM"); rmClient.submitApplication(appContext); // Monitor the application to wait for launch state ApplicationReport appReport = monitorApplication(appId, EnumSet.of(YarnApplicationState.ACCEPTED)); ApplicationAttemptId attemptId = appReport.getCurrentApplicationAttemptId(); LOG.info("Launching application with id: " + attemptId); // launch AM launchAM(attemptId); // Monitor the application for end state appReport = monitorApplication(appId, EnumSet.of(YarnApplicationState.KILLED, YarnApplicationState.FAILED, YarnApplicationState.FINISHED)); YarnApplicationState appState = appReport.getYarnApplicationState(); FinalApplicationStatus appStatus = appReport.getFinalApplicationStatus(); LOG.info("App ended with state: " + appReport.getYarnApplicationState() + " and status: " + appStatus); boolean success; if (YarnApplicationState.FINISHED == appState && 
FinalApplicationStatus.SUCCEEDED == appStatus) { LOG.info("Application has completed successfully."); success = true; } else { LOG.info("Application did finished unsuccessfully." + " YarnState=" + appState.toString() + ", FinalStatus=" + appStatus.toString()); success = false; } return success; } finally { rmClient.stop(); } }
From source file:org.elasticsearch.hadoop.yarn.am.AppMasterRpc.java
License:Apache License
/**
 * Finishes the application master by delegating to {@code unregisterAM} with
 * {@link FinalApplicationStatus#SUCCEEDED} as the final status.
 */
public void finishAM() {
    final FinalApplicationStatus finalStatus = FinalApplicationStatus.SUCCEEDED;
    unregisterAM(finalStatus);
}
From source file:org.hdl.caffe.yarn.app.ApplicationMaster.java
License:Apache License
@VisibleForTesting protected boolean finish() { // wait for completion. while (!done && (numCompletedContainers.get() != numTotalContainers)) { try {//from w w w .j a v a 2s. c o m Thread.sleep(200); } catch (InterruptedException ex) { } } // Join all launched threads // needed for when we time out // and we need to release containers for (Thread launchThread : launchThreads) { try { launchThread.join(10000); } catch (InterruptedException e) { LOG.info("Exception thrown in thread join: " + e.getMessage()); e.printStackTrace(); } } // When the application completes, it should stop all running containers LOG.info("Application completed. Stopping running containers"); nmClientAsync.stop(); // When the application completes, it should send a finish application // signal to the RM LOG.info("Application completed. Signalling finish to RM"); FinalApplicationStatus appStatus; String appMessage = null; boolean success = true; if (numFailedContainers.get() == 0) { appStatus = FinalApplicationStatus.SUCCEEDED; } else { appStatus = FinalApplicationStatus.FAILED; appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed=" + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed=" + numFailedContainers.get(); LOG.info(appMessage); success = false; } try { amRMClient.unregisterApplicationMaster(appStatus, appMessage, null); } catch (YarnException ex) { LOG.error("Failed to unregister application", ex); } catch (IOException e) { LOG.error("Failed to unregister application", e); } amRMClient.stop(); return success; }
From source file:org.hdl.caffe.yarn.app.Client.java
License:Apache License
/** * Monitor the submitted application for completion. * * @param appId Application Id of application to be monitored * @return true if application completed successfully * @throws YarnException/*www . j a v a 2 s . c o m*/ * @throws IOException */ private boolean monitorApplication(ApplicationId appId) throws YarnException, IOException { while (true) { try { Thread.sleep(1000); } catch (InterruptedException e) { LOG.debug("Thread sleep in monitoring loop interrupted"); } ApplicationReport report = yarnClient.getApplicationReport(appId); LOG.info("Got application report from ASM for" + ", appId=" + appId.getId() + ", clientToAMToken=" + report.getClientToAMToken() + ", appDiagnostics=" + report.getDiagnostics() + ", appMasterHost=" + report.getHost() + ", appQueue=" + report.getQueue() + ", appMasterRpcPort=" + report.getRpcPort() + ", appStartTime=" + report.getStartTime() + ", yarnAppState=" + report.getYarnApplicationState().toString() + ", caffeAppFinalState=" + report.getFinalApplicationStatus().toString() + ", appTrackingUrl=" + report.getTrackingUrl() + ", appUser=" + report.getUser()); YarnApplicationState state = report.getYarnApplicationState(); FinalApplicationStatus caffeStatus = report.getFinalApplicationStatus(); if (YarnApplicationState.RUNNING == state) { if (appRpc == null) { String hostname = report.getHost(); int port = report.getRpcPort(); LOG.info("Application master rpc host: " + hostname + "; port: " + port); appRpc = new CaffeApplicationRpcClient(hostname, port).getRpc(); } } if (YarnApplicationState.FINISHED == state) { if (FinalApplicationStatus.SUCCEEDED == caffeStatus) { LOG.info("Application has completed successfully. Breaking monitoring loop"); return true; } else { LOG.info("Application did finished unsuccessfully." + " YarnState=" + state.toString() + ", appFinalState=" + caffeStatus.toString() + ". 
Breaking monitoring loop"); return false; } } else if (YarnApplicationState.KILLED == state || YarnApplicationState.FAILED == state) { LOG.info("Application did not finish." + " YarnState=" + state.toString() + ", appFinalState=" + caffeStatus.toString() + ". Breaking monitoring loop"); return false; } } }
From source file:org.hdl.tensorflow.yarn.appmaster.ApplicationMaster.java
License:Apache License
/** * Main run function for the application master *//*from w ww . ja v a 2 s . c om*/ @SuppressWarnings({ "unchecked" }) public boolean run() throws Exception { int numTotalContainersToRequest = args.totalContainerNum - launchedContainers.size(); // Setup ask for containers from RM // Send request for containers to RM // Until we get our fully allocated quota, we keep on polling RM for // containers // Keep looping until all the containers are launched and shell script // executed on them ( regardless of success/failure). for (int i = 0; i < numTotalContainersToRequest; ++i) { ContainerRequest containerAsk = setupContainerAskForRM(); amRMClient.addContainerRequest(containerAsk); } requestedContainerNum.set(args.totalContainerNum); // wait for completion. while (!done && (completedContainerNum.get() != args.totalContainerNum)) { try { Thread.sleep(200); } catch (InterruptedException e) { LOG.error("Exception thrown when waiting for container completion: " + e.getMessage()); throw e; } } // Join all launched threads // needed for when we time out // and we need to release containers for (Thread launchThread : launchThreads) { try { launchThread.join(10000); } catch (InterruptedException e) { LOG.error("Exception thrown in thread join: " + e.getMessage()); throw e; } } // When the application completes, it should stop all running containers LOG.info("Application completed. Stopping running containers"); nmClientAsync.stop(); // When the application completes, it should send a finish application // signal to the RM LOG.info("Application completed. Signalling finish to RM"); FinalApplicationStatus appStatus = getFinalAppStatus(); String appMessage = "Diagnostics." 
+ ", total=" + args.totalContainerNum + ", completed=" + completedContainerNum.get() + ", allocated=" + allocatedContainerNum.get() + ", failed=" + failedContainerNum.get(); LOG.info(appMessage); try { amRMClient.unregisterApplicationMaster(appStatus, appMessage, null); } catch (Exception ex) { LOG.error("Failed to unregister application", ex); } amRMClient.stop(); return appStatus.equals(FinalApplicationStatus.SUCCEEDED); }
From source file:org.hdl.tensorflow.yarn.appmaster.ApplicationMaster.java
License:Apache License
/**
 * Derives the final application status from the container counters.
 *
 * @return {@link FinalApplicationStatus#SUCCEEDED} when completed minus failed containers is
 *         at least {@code args.totalContainerNum}, otherwise
 *         {@link FinalApplicationStatus#FAILED}
 */
private FinalApplicationStatus getFinalAppStatus() {
    final int completedWithoutFailure = completedContainerNum.get() - failedContainerNum.get();
    return completedWithoutFailure >= args.totalContainerNum
            ? FinalApplicationStatus.SUCCEEDED
            : FinalApplicationStatus.FAILED;
}