Example usage for org.apache.hadoop.yarn.api.records FinalApplicationStatus SUCCEEDED

List of usage examples for org.apache.hadoop.yarn.api.records FinalApplicationStatus SUCCEEDED

Introduction

In this page you can find the example usage for org.apache.hadoop.yarn.api.records FinalApplicationStatus SUCCEEDED.

Prototype

FinalApplicationStatus SUCCEEDED

To view the source code for org.apache.hadoop.yarn.api.records FinalApplicationStatus SUCCEEDED.

Click Source Link

Document

Application which finished successfully.

Usage

From source file:org.chenchun.ApplicationMaster.java

License:Apache License

public static void main(String[] args) throws IOException, YarnException, InterruptedException {
    final String params = args[0];
    final int containerNum = Integer.valueOf(args[1]);

    // Initialize clients to ResourceManager and NodeManagers
    Configuration conf = new YarnConfiguration();

    AMRMClient<AMRMClient.ContainerRequest> rmClient = AMRMClient.createAMRMClient();
    rmClient.init(conf);/*from  ww w  .j a v a 2  s.  c o m*/
    rmClient.start();

    NMClient nmClient = NMClient.createNMClient();
    nmClient.init(conf);
    nmClient.start();

    // Register with ResourceManager
    System.out.println("registerApplicationMaster 0");
    rmClient.registerApplicationMaster("", 0, "");
    System.out.println("registerApplicationMaster 1");

    // Priority for worker containers - priorities are intra-application
    Priority priority = Records.newRecord(Priority.class);
    priority.setPriority(0);

    // Resource requirements for worker containers
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(128);
    capability.setVirtualCores(1);

    // Make container requests to ResourceManager
    for (int i = 0; i < containerNum; ++i) {
        AMRMClient.ContainerRequest containerAsk = new AMRMClient.ContainerRequest(capability, null, null,
                priority);
        System.out.println("Making res-req " + i);
        rmClient.addContainerRequest(containerAsk);
    }

    // Obtain allocated containers, launch and check for responses
    int responseId = 0;
    int completedContainers = 0;
    while (completedContainers < containerNum) {
        AllocateResponse response = rmClient.allocate(responseId++);
        System.out.println("Allocate response " + response.getAMCommand() + " " + "allocate "
                + response.getAllocatedContainers().size() + "contains");
        for (Container container : response.getAllocatedContainers()) {
            // Launch container by create ContainerLaunchContext
            ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);
            ctx.setCommands(
                    Collections.singletonList(params + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                            + "/stdout" + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"));
            System.out.println("Launching container " + container.getId() + " on " + container.getNodeId());
            nmClient.startContainer(container, ctx);
        }
        for (ContainerStatus status : response.getCompletedContainersStatuses()) {
            ++completedContainers;
            System.out.println("Completed container " + status.getContainerId());
        }
        Thread.sleep(1000);
    }
    System.out.println("Unregister ApplicationMaster");

    // Un-register with ResourceManager
    rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "", "");
}

From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // Set our own configuration (ToolRunner only sets it prior to calling
    // run())// www. java 2s . co  m
    conf = getConf();

    // Our own RM Handler
    ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId);

    // Connect
    rmHandler.getAMResourceManager();

    // Register
    try {
        rmHandler.registerApplicationMaster(masterHost, masterPort);
    } catch (YarnRemoteException ex) {
        LOG.error("Error encountered while trying to register application master", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Get file splits, configuration, etc.
    Set<ConfigurationTuple> configTuples;
    try {
        configTuples = getConfigurationTuples();
    } catch (IOException ex) {
        LOG.error("Error encountered while trying to generate configurations", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }
    // Needed for our master service later
    Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples);

    // Initial containers we want, based off of the file splits
    List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler);
    List<ContainerId> releasedContainers = new ArrayList<>();

    // Send an initial allocation request
    List<Container> allocatedContainers = new ArrayList<>();
    try {
        int needed = configTuples.size();
        int got = 0;
        int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10"));
        int attempts = 0;

        List<Container> acquiredContainers;

        while (got < needed && attempts < maxAttempts) {
            LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts
                    + ", maxAttempts=" + maxAttempts);

            acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getAllocatedContainers();

            got += acquiredContainers.size();
            attempts++;

            allocatedContainers.addAll(acquiredContainers);
            acquiredContainers.clear();

            LOG.info("Got allocation response, allocatedContainers=" + acquiredContainers.size());

            Thread.sleep(2500);
        }
    } catch (YarnRemoteException ex) {
        LOG.error("Encountered an error while trying to allocate containers", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    final int numContainers = configTuples.size();

    /*
     * 
     * 
     * TODO: fix this so we try N times to get enough containers!
     * 
     * 
     * 
     * 
     */
    // Make sure we got all our containers, or else bail
    if (allocatedContainers.size() < numContainers) {
        LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers
                + ", allocated=" + allocatedContainers.size());

        requestedContainers.clear(); // We don't want new containers!

        // Add containers into released list
        for (Container c : allocatedContainers) {
            releasedContainers.add(c.getId());
        }

        // Release containers
        try {
            rmHandler.allocateRequest(requestedContainers, releasedContainers);
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to release unwanted containers", ex);
        }

        // Notify our handlers that we got a problem
        rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got "
                + allocatedContainers.size(), FinalApplicationStatus.FAILED);
        // bail
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Launch our worker process, as we now expect workers to actally do
    // something
    LOG.info("Starting master service");
    ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf,
            masterComputable, masterUpdateable, appConfig, conf);

    ExecutorService executor = Executors.newSingleThreadExecutor();
    Future<Integer> masterThread = executor.submit(masterService);

    // We got the number of containers we wanted, let's launch them
    LOG.info("Launching child containers");
    List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers);

    // Use an empty list for heartbeat purposes
    requestedContainers.clear();

    // Some local counters. Do we really need Atomic?
    AtomicInteger numCompletedContainers = new AtomicInteger();
    AtomicInteger numFailedContainers = new AtomicInteger();

    LOG.info("Waiting for containers to complete...");
    // Go into run-loop waiting for containers to finish, also our heartbeat
    while (numCompletedContainers.get() < numContainers) {
        // Don't pound the RM
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting on completed containers", ex);
            return ReturnCode.MASTER_ERROR.getCode();
        }

        // Heartbeat, effectively
        List<ContainerStatus> completedContainers;

        try {
            completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getCompletedContainersStatuses();
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex);

            continue; // Nothing to report, probably an error / endless loop
        }

        for (ContainerStatus cs : completedContainers) {
            int exitCode = cs.getExitStatus();
            if (exitCode != 0) {
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();

                masterService.fail();
                executor.shutdown();

                // Force kill our application, fail fast?
                LOG.info("At least one container failed with a non-zero exit code (" + exitCode
                        + "); killing application");
                rmHandler.finishApplication(
                        "Failing, due to at least container coming back with an non-zero exit code.",
                        FinalApplicationStatus.KILLED);

                return -10;
            } else {
                numCompletedContainers.incrementAndGet();
            }
        }
    }

    // All containers have completed
    // Wait for launch threads to complete (this shouldn't really happen)
    LOG.info("Containers completed");
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(1000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting for Launcher threads to complete", ex);
        }
    }

    // Ensure that our master service has completed as well
    if (!masterThread.isDone()) {
        masterService.fail();
    }

    int masterExit = masterThread.get();
    LOG.info("Master service completed with exitCode=" + masterExit);
    executor.shutdown();

    if (masterExit == 0) {

        String impersonatedUser = System.getenv("USER");

        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser);
        //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser());
        ugi.doAs(new PrivilegedExceptionAction<Void>() {
            public Void run() {

                Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH));
                FileSystem fs;
                try {
                    fs = out.getFileSystem(conf);

                    FSDataOutputStream fos = fs.create(out);
                    LOG.info("Writing master results to " + out.toString());

                    masterComputable.complete(fos);

                    fos.flush();
                    fos.close();

                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }

                return null;

                //FileSystem fs = FileSystem.get(conf);
                //fs.mkdir( out );
            }
        });

        /*
        LOG.info( "Here we would try to write to " + out.toString() );
        LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() );
        LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() );
                
        LOG.info( "Env Var User: " + System.getenv("USER") );
        */
        //LOG.info( "Ideally we'd be user: " + this.props.getProperty(  ) );

        //       for (Map.Entry<String, String> entry : this.conf) {
        //           LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue());
        //     }

    } else {
        LOG.warn("Not writing master results, as the master came back with errors!");
    }

    // Application finished
    ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR;

    try {
        if (numFailedContainers.get() == 0) {
            rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED);
        } else {
            String diag = "Completed with " + numFailedContainers.get() + " failed containers";
            rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED);
        }
    } catch (YarnRemoteException ex) {
        LOG.warn("Encountered an error while trying to send final status to resource manager", ex);
    }

    return rc.getCode();
}

From source file:org.deeplearning4j.iterativereduce.runtime.yarn.client.Client.java

License:Apache License

/**
 * TODO: consider the scenarios where we dont get enough containers 
 * - we need to re-submit the job till we get the containers alloc'd
 * /*from   w w w. j  a v a 2 s .c o  m*/
 */
@Override
public int run(String[] args) throws Exception {

    //System.out.println("IR: Client.run() [start]");

    if (args.length < 1)
        LOG.info("No configuration file specified, using default (" + ConfigFields.DEFAULT_CONFIG_FILE + ")");

    long startTime = System.currentTimeMillis();
    String configFile = (args.length < 1) ? ConfigFields.DEFAULT_CONFIG_FILE : args[0];
    Properties props = new Properties();
    Configuration conf = getConf();

    try {
        FileInputStream fis = new FileInputStream(configFile);
        props.load(fis);
    } catch (FileNotFoundException ex) {
        throw ex; // TODO: be nice
    } catch (IOException ex) {
        throw ex; // TODO: be nice
    }

    // Make sure we have some bare minimums
    ConfigFields.validateConfig(props);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Loaded configuration: ");
        for (Map.Entry<Object, Object> entry : props.entrySet()) {
            LOG.debug(entry.getKey() + "=" + entry.getValue());
        }
    }

    // TODO: make sure input file(s), libs, etc. actually exist!
    // Ensure our input path exists

    Path p = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
    FileSystem fs = FileSystem.get(conf);

    if (!fs.exists(p))
        throw new FileNotFoundException("Input path not found: " + p.toString() + " (in " + fs.getUri() + ")");

    LOG.info("Using input path: " + p.toString());

    // Connect
    ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, null);
    rmHandler.getClientResourceManager();

    // Create an Application request/ID
    ApplicationId appId = rmHandler.getApplicationId(); // Our AppId
    String appName = props.getProperty(ConfigFields.APP_NAME, ConfigFields.DEFAULT_APP_NAME).replace(' ', '_');

    LOG.info("Got an application, id=" + appId + ", appName=" + appName);

    // Copy resources to [HD]FS
    LOG.debug("Copying resources to filesystem");
    Utils.copyLocalResourcesToFs(props, conf, appId, appName); // Local resources
    Utils.copyLocalResourceToFs(configFile, ConfigFields.APP_CONFIG_FILE, conf, appId, appName); // Config file

    try {
        Utils.copyLocalResourceToFs("log4j.properties", "log4j.properties", conf, appId, appName); // Log4j
    } catch (FileNotFoundException ex) {
        LOG.warn("log4j.properties file not found");
    }

    // Create our context
    List<String> commands = Utils.getMasterCommand(conf, props);
    Map<String, LocalResource> localResources = Utils.getLocalResourcesForApplication(conf, appId, appName,
            props, LocalResourceVisibility.APPLICATION);

    // Submit app
    rmHandler.submitApplication(appId, appName, Utils.getEnvironment(conf, props), localResources, commands,
            Integer.parseInt(props.getProperty(ConfigFields.YARN_MEMORY, "512")));

    /*
     * TODO:
     * - look at updating this code region to make sure job is submitted!
     * 
     */

    StopWatch watch = new StopWatch();
    watch.start();

    // Wait for app to complete
    while (true) {
        Thread.sleep(2000);

        ApplicationReport report = rmHandler.getApplicationReport(appId);
        LOG.info("IterativeReduce report: " + " appId=" + appId.getId() + ", state: "
                + report.getYarnApplicationState().toString() + ", Running Time: " + watch.toString());

        //report.getDiagnostics()

        if (YarnApplicationState.FINISHED == report.getYarnApplicationState()) {
            LOG.info("Application finished in " + (System.currentTimeMillis() - startTime) + "ms");

            if (FinalApplicationStatus.SUCCEEDED == report.getFinalApplicationStatus()) {
                LOG.info("Application completed succesfully.");
                return 0;
            } else {
                LOG.info("Application completed with en error: " + report.getDiagnostics());
                return -1;
            }
        } else if (YarnApplicationState.FAILED == report.getYarnApplicationState()
                || YarnApplicationState.KILLED == report.getYarnApplicationState()) {

            LOG.info("Application completed with a failed or killed state: " + report.getDiagnostics());
            return -1;
        }

    }

}

From source file:org.dknight.app.ApplicationMaster.java

License:Apache License

private void finish() {
    // Join all launched threads
    // needed for when we time out
    // and we need to release containers
    for (Thread launchThread : launchThreads) {
        try {/*from  ww  w .ja  v a2 s .  c  o  m*/
            launchThread.join(10000);
        } catch (InterruptedException e) {
            LOG.info("Exception thrown in thread join: " + e.getMessage());
            e.printStackTrace();
        }
    }

    //stop yarnClient
    yarnClient.stop();
    // When the application completes, it should stop all running containers
    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    // When the application completes, it should send a finish application
    // signal to the RM
    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus;
    String appMessage = null;
    success = true;
    if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers) {
        appStatus = FinalApplicationStatus.SUCCEEDED;
    } else {
        appStatus = FinalApplicationStatus.FAILED;
        appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed="
                + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed="
                + numFailedContainers.get();
        success = false;
    }
    try {
        amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (YarnException ex) {
        LOG.error("Failed to unregister application", ex);
    } catch (IOException e) {
        LOG.error("Failed to unregister application", e);
    }

    amRMClient.stop();
}

From source file:org.dknight.app.UnmanagedAMLauncher.java

License:Apache License

public boolean run() throws IOException, YarnException {
    LOG.info("Starting Client");

    // Connect to ResourceManager
    rmClient.start();/*from w  w  w  . j  a v a2  s  .  c o m*/
    try {
        // Create launch context for app master
        LOG.info("Setting up application submission context for ASM");
        ApplicationSubmissionContext appContext = rmClient.createApplication()
                .getApplicationSubmissionContext();
        ApplicationId appId = appContext.getApplicationId();

        // set the application name
        appContext.setApplicationName(appName);

        // Set the priority for the application master
        Priority pri = Records.newRecord(Priority.class);
        pri.setPriority(amPriority);
        appContext.setPriority(pri);

        // Set the queue to which this application is to be submitted in the RM
        appContext.setQueue(amQueue);
        // Set up the container launch context for the application master
        ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);
        appContext.setAMContainerSpec(amContainer);

        // unmanaged AM
        appContext.setUnmanagedAM(true);
        LOG.info("Setting unmanaged AM");

        // Submit the application to the applications manager
        LOG.info("Submitting application to ASM");
        rmClient.submitApplication(appContext);

        // Monitor the application to wait for launch state
        ApplicationReport appReport = monitorApplication(appId, EnumSet.of(YarnApplicationState.ACCEPTED));
        ApplicationAttemptId attemptId = appReport.getCurrentApplicationAttemptId();
        LOG.info("Launching application with id: " + attemptId);

        // launch AM
        launchAM(attemptId);

        // Monitor the application for end state
        appReport = monitorApplication(appId, EnumSet.of(YarnApplicationState.KILLED,
                YarnApplicationState.FAILED, YarnApplicationState.FINISHED));

        YarnApplicationState appState = appReport.getYarnApplicationState();
        FinalApplicationStatus appStatus = appReport.getFinalApplicationStatus();

        LOG.info("App ended with state: " + appReport.getYarnApplicationState() + " and status: " + appStatus);

        boolean success;
        if (YarnApplicationState.FINISHED == appState && FinalApplicationStatus.SUCCEEDED == appStatus) {
            LOG.info("Application has completed successfully.");
            success = true;
        } else {
            LOG.info("Application did finished unsuccessfully." + " YarnState=" + appState.toString()
                    + ", FinalStatus=" + appStatus.toString());
            success = false;
        }

        return success;
    } finally {
        rmClient.stop();
    }
}

From source file:org.elasticsearch.hadoop.yarn.am.AppMasterRpc.java

License:Apache License

public void finishAM() {
    unregisterAM(FinalApplicationStatus.SUCCEEDED);
}

From source file:org.hdl.caffe.yarn.app.ApplicationMaster.java

License:Apache License

@VisibleForTesting
protected boolean finish() {
    // wait for completion.
    while (!done && (numCompletedContainers.get() != numTotalContainers)) {
        try {//from w w w  .j  a  v a 2s.  c o m
            Thread.sleep(200);
        } catch (InterruptedException ex) {

        }
    }

    // Join all launched threads
    // needed for when we time out
    // and we need to release containers
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(10000);
        } catch (InterruptedException e) {
            LOG.info("Exception thrown in thread join: " + e.getMessage());
            e.printStackTrace();
        }
    }

    // When the application completes, it should stop all running containers
    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    // When the application completes, it should send a finish application
    // signal to the RM
    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus;
    String appMessage = null;
    boolean success = true;
    if (numFailedContainers.get() == 0) {
        appStatus = FinalApplicationStatus.SUCCEEDED;
    } else {
        appStatus = FinalApplicationStatus.FAILED;
        appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed="
                + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed="
                + numFailedContainers.get();
        LOG.info(appMessage);
        success = false;
    }
    try {
        amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (YarnException ex) {
        LOG.error("Failed to unregister application", ex);
    } catch (IOException e) {
        LOG.error("Failed to unregister application", e);
    }

    amRMClient.stop();

    return success;
}

From source file:org.hdl.caffe.yarn.app.Client.java

License:Apache License

/**
 * Monitor the submitted application for completion.
 *
 * @param appId Application Id of application to be monitored
 * @return true if application completed successfully
 * @throws YarnException/*www  .  j a v a  2  s  .  c  o  m*/
 * @throws IOException
 */
private boolean monitorApplication(ApplicationId appId) throws YarnException, IOException {

    while (true) {

        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            LOG.debug("Thread sleep in monitoring loop interrupted");
        }

        ApplicationReport report = yarnClient.getApplicationReport(appId);

        LOG.info("Got application report from ASM for" + ", appId=" + appId.getId() + ", clientToAMToken="
                + report.getClientToAMToken() + ", appDiagnostics=" + report.getDiagnostics()
                + ", appMasterHost=" + report.getHost() + ", appQueue=" + report.getQueue()
                + ", appMasterRpcPort=" + report.getRpcPort() + ", appStartTime=" + report.getStartTime()
                + ", yarnAppState=" + report.getYarnApplicationState().toString() + ", caffeAppFinalState="
                + report.getFinalApplicationStatus().toString() + ", appTrackingUrl=" + report.getTrackingUrl()
                + ", appUser=" + report.getUser());

        YarnApplicationState state = report.getYarnApplicationState();
        FinalApplicationStatus caffeStatus = report.getFinalApplicationStatus();

        if (YarnApplicationState.RUNNING == state) {
            if (appRpc == null) {
                String hostname = report.getHost();
                int port = report.getRpcPort();
                LOG.info("Application master rpc host: " + hostname + "; port: " + port);
                appRpc = new CaffeApplicationRpcClient(hostname, port).getRpc();
            }
        }

        if (YarnApplicationState.FINISHED == state) {
            if (FinalApplicationStatus.SUCCEEDED == caffeStatus) {
                LOG.info("Application has completed successfully. Breaking monitoring loop");
                return true;
            } else {
                LOG.info("Application did finished unsuccessfully." + " YarnState=" + state.toString()
                        + ", appFinalState=" + caffeStatus.toString() + ". Breaking monitoring loop");
                return false;
            }
        } else if (YarnApplicationState.KILLED == state || YarnApplicationState.FAILED == state) {
            LOG.info("Application did not finish." + " YarnState=" + state.toString() + ", appFinalState="
                    + caffeStatus.toString() + ". Breaking monitoring loop");
            return false;
        }
    }

}

From source file:org.hdl.tensorflow.yarn.appmaster.ApplicationMaster.java

License:Apache License

/**
 * Main run function for the application master
 *//*from  w ww  .  ja v  a  2  s .  c om*/
@SuppressWarnings({ "unchecked" })
public boolean run() throws Exception {
    int numTotalContainersToRequest = args.totalContainerNum - launchedContainers.size();
    // Setup ask for containers from RM
    // Send request for containers to RM
    // Until we get our fully allocated quota, we keep on polling RM for
    // containers
    // Keep looping until all the containers are launched and shell script
    // executed on them ( regardless of success/failure).
    for (int i = 0; i < numTotalContainersToRequest; ++i) {
        ContainerRequest containerAsk = setupContainerAskForRM();
        amRMClient.addContainerRequest(containerAsk);
    }
    requestedContainerNum.set(args.totalContainerNum);

    // wait for completion.
    while (!done && (completedContainerNum.get() != args.totalContainerNum)) {
        try {
            Thread.sleep(200);
        } catch (InterruptedException e) {
            LOG.error("Exception thrown when waiting for container completion: " + e.getMessage());
            throw e;
        }
    }

    // Join all launched threads
    // needed for when we time out
    // and we need to release containers
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(10000);
        } catch (InterruptedException e) {
            LOG.error("Exception thrown in thread join: " + e.getMessage());
            throw e;
        }
    }

    // When the application completes, it should stop all running containers
    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    // When the application completes, it should send a finish application
    // signal to the RM
    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus = getFinalAppStatus();
    String appMessage = "Diagnostics." + ", total=" + args.totalContainerNum + ", completed="
            + completedContainerNum.get() + ", allocated=" + allocatedContainerNum.get() + ", failed="
            + failedContainerNum.get();
    LOG.info(appMessage);
    try {
        amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (Exception ex) {
        LOG.error("Failed to unregister application", ex);
    }
    amRMClient.stop();
    return appStatus.equals(FinalApplicationStatus.SUCCEEDED);
}

From source file:org.hdl.tensorflow.yarn.appmaster.ApplicationMaster.java

License:Apache License

private FinalApplicationStatus getFinalAppStatus() {
    if (completedContainerNum.get() - failedContainerNum.get() >= args.totalContainerNum) {
        return FinalApplicationStatus.SUCCEEDED;
    } else {/*www  .  ja v  a  2s .  c  o m*/
        return FinalApplicationStatus.FAILED;
    }
}