Example usage for org.apache.hadoop.yarn.api.records FinalApplicationStatus KILLED

List of usage examples for org.apache.hadoop.yarn.api.records FinalApplicationStatus KILLED

Introduction

On this page you can find example usage for org.apache.hadoop.yarn.api.records FinalApplicationStatus KILLED.

Prototype

FinalApplicationStatus KILLED

To view the source code for org.apache.hadoop.yarn.api.records FinalApplicationStatus KILLED, click the Source Link.

Document

Application which was terminated by a user or admin.
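
For orientation, here is a minimal, hypothetical sketch (not taken from the usage examples below) of how an ApplicationMaster might report this status by unregistering from the ResourceManager with FinalApplicationStatus.KILLED via the AMRMClient API; the class name, placeholder host/port, and diagnostics message are illustrative only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;

public class ReportKilledSketch {
    public static void reportKilled(String diagnostics) throws YarnException, IOException {
        Configuration conf = new YarnConfiguration();
        AMRMClient<AMRMClient.ContainerRequest> rmClient = AMRMClient.createAMRMClient();
        rmClient.init(conf);
        rmClient.start();
        try {
            // An AM must register before it can unregister; host/port/tracking URL are placeholders here.
            rmClient.registerApplicationMaster("", 0, "");
            // ... application work; a user/admin kill has been observed ...
            // KILLED tells the RM the application was terminated externally,
            // rather than succeeding or failing on its own.
            rmClient.unregisterApplicationMaster(FinalApplicationStatus.KILLED, diagnostics, "");
        } finally {
            rmClient.stop();
        }
    }
}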

Usage

From source file: org.apache.tez.dag.app.rm.TaskSchedulerManager.java

License: Apache License

public AppFinalStatus getFinalAppStatus() {
    FinalApplicationStatus finishState = FinalApplicationStatus.UNDEFINED;
    StringBuffer sb = new StringBuffer();
    if (dagAppMaster == null) {
        finishState = FinalApplicationStatus.UNDEFINED;
        sb.append("App not yet initialized");
    } else {
        DAGAppMasterState appMasterState = dagAppMaster.getState();
        if (appMasterState == DAGAppMasterState.SUCCEEDED) {
            finishState = FinalApplicationStatus.SUCCEEDED;
        } else if (appMasterState == DAGAppMasterState.KILLED
                || (appMasterState == DAGAppMasterState.RUNNING && isSignalled)) {
            finishState = FinalApplicationStatus.KILLED;
        } else if (appMasterState == DAGAppMasterState.FAILED || appMasterState == DAGAppMasterState.ERROR) {
            finishState = FinalApplicationStatus.FAILED;
        } else {
            finishState = FinalApplicationStatus.UNDEFINED;
        }
        List<String> diagnostics = dagAppMaster.getDiagnostics();
        if (diagnostics != null) {
            for (String s : diagnostics) {
                sb.append(s).append("\n");
            }
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Setting job diagnostics to " + sb.toString());
    }

    // If historyUrl is set, use it as-is; if it is set to "", the RM UI disables the history link.
    return new AppFinalStatus(finishState, sb.toString(), historyUrl);
}

From source file: org.apache.tez.hadoop.shim.TestHadoopShim28.java

License: Apache License

@Test
public void testApplyFinalApplicationStatusCorrection() {
    HadoopShim shim = new HadoopShim28();
    // Session mode success/failure, change to ended
    Assert.assertEquals(FinalApplicationStatus.ENDED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.SUCCEEDED, true, false));
    Assert.assertEquals(FinalApplicationStatus.ENDED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, true, false));

    // Non-session mode success/failure, retain success/failure
    Assert.assertEquals(FinalApplicationStatus.SUCCEEDED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.SUCCEEDED, false, false));
    Assert.assertEquals(FinalApplicationStatus.FAILED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, false, false));

    // Session and non-session mode error, retain failed.
    Assert.assertEquals(FinalApplicationStatus.FAILED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, true, true));
    Assert.assertEquals(FinalApplicationStatus.FAILED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.FAILED, false, true));

    // Session and non-session mode killed is killed.
    Assert.assertEquals(FinalApplicationStatus.KILLED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.KILLED, true, false));
    Assert.assertEquals(FinalApplicationStatus.KILLED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.KILLED, false, false));

    // Session and non-session mode undefined is undefined.
    Assert.assertEquals(FinalApplicationStatus.UNDEFINED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.UNDEFINED, true, false));
    Assert.assertEquals(FinalApplicationStatus.UNDEFINED,
            shim.applyFinalApplicationStatusCorrection(FinalApplicationStatus.UNDEFINED, false, false));
}
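
The test above pins down the correction rule. As a reading aid, here is a hedged sketch (not the actual HadoopShim28 implementation) that satisfies those assertions: in session mode, a non-error SUCCEEDED or FAILED result is rewritten to ENDED, while KILLED, UNDEFINED, and any error case keep their original status.

import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;

public class StatusCorrectionSketch {
    // Illustrative only: reproduces the mapping asserted by the test above.
    public static FinalApplicationStatus correct(FinalApplicationStatus status, boolean isSessionMode,
            boolean isError) {
        boolean sessionCompletion = isSessionMode && !isError
                && (status == FinalApplicationStatus.SUCCEEDED || status == FinalApplicationStatus.FAILED);
        return sessionCompletion ? FinalApplicationStatus.ENDED : status;
    }
}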

From source file: org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    // Set our own configuration (ToolRunner only sets it prior to calling
    // run())
    conf = getConf();

    // Our own RM Handler
    ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId);

    // Connect
    rmHandler.getAMResourceManager();

    // Register
    try {
        rmHandler.registerApplicationMaster(masterHost, masterPort);
    } catch (YarnRemoteException ex) {
        LOG.error("Error encountered while trying to register application master", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Get file splits, configuration, etc.
    Set<ConfigurationTuple> configTuples;
    try {
        configTuples = getConfigurationTuples();
    } catch (IOException ex) {
        LOG.error("Error encountered while trying to generate configurations", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }
    // Needed for our master service later
    Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples);

    // Initial containers we want, based off of the file splits
    List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler);
    List<ContainerId> releasedContainers = new ArrayList<>();

    // Send an initial allocation request
    List<Container> allocatedContainers = new ArrayList<>();
    try {
        int needed = configTuples.size();
        int got = 0;
        int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10"));
        int attempts = 0;

        List<Container> acquiredContainers;

        while (got < needed && attempts < maxAttempts) {
            LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts
                    + ", maxAttempts=" + maxAttempts);

            acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getAllocatedContainers();

            got += acquiredContainers.size();
            attempts++;

            allocatedContainers.addAll(acquiredContainers);
            acquiredContainers.clear();

            LOG.info("Got allocation response, allocatedContainers=" + acquiredContainers.size());

            Thread.sleep(2500);
        }
    } catch (YarnRemoteException ex) {
        LOG.error("Encountered an error while trying to allocate containers", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    final int numContainers = configTuples.size();

    /*
     * TODO: fix this so we try N times to get enough containers!
     */
    // Make sure we got all our containers, or else bail
    if (allocatedContainers.size() < numContainers) {
        LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers
                + ", allocated=" + allocatedContainers.size());

        requestedContainers.clear(); // We don't want new containers!

        // Add containers into released list
        for (Container c : allocatedContainers) {
            releasedContainers.add(c.getId());
        }

        // Release containers
        try {
            rmHandler.allocateRequest(requestedContainers, releasedContainers);
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to release unwanted containers", ex);
        }

        // Notify our handlers that we got a problem
        rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got "
                + allocatedContainers.size(), FinalApplicationStatus.FAILED);
        // bail
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Launch our worker process, as we now expect workers to actually do something
    LOG.info("Starting master service");
    ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf,
            masterComputable, masterUpdateable, appConfig, conf);

    ExecutorService executor = Executors.newSingleThreadExecutor();
    Future<Integer> masterThread = executor.submit(masterService);

    // We got the number of containers we wanted, let's launch them
    LOG.info("Launching child containers");
    List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers);

    // Use an empty list for heartbeat purposes
    requestedContainers.clear();

    // Some local counters. Do we really need Atomic?
    AtomicInteger numCompletedContainers = new AtomicInteger();
    AtomicInteger numFailedContainers = new AtomicInteger();

    LOG.info("Waiting for containers to complete...");
    // Go into run-loop waiting for containers to finish, also our heartbeat
    while (numCompletedContainers.get() < numContainers) {
        // Don't pound the RM
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting on completed containers", ex);
            return ReturnCode.MASTER_ERROR.getCode();
        }

        // Heartbeat, effectively
        List<ContainerStatus> completedContainers;

        try {
            completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getCompletedContainersStatuses();
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex);

            continue; // Nothing to report, probably an error / endless loop
        }

        for (ContainerStatus cs : completedContainers) {
            int exitCode = cs.getExitStatus();
            if (exitCode != 0) {
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();

                masterService.fail();
                executor.shutdown();

                // Force kill our application, fail fast?
                LOG.info("At least one container failed with a non-zero exit code (" + exitCode
                        + "); killing application");
                rmHandler.finishApplication(
                        "Failing, due to at least container coming back with an non-zero exit code.",
                        FinalApplicationStatus.KILLED);

                return -10;
            } else {
                numCompletedContainers.incrementAndGet();
            }
        }
    }

    // All containers have completed
    // Wait for launch threads to complete (this shouldn't really happen)
    LOG.info("Containers completed");
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(1000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting for Launcher threads to complete", ex);
        }
    }

    // Ensure that our master service has completed as well
    if (!masterThread.isDone()) {
        masterService.fail();
    }

    int masterExit = masterThread.get();
    LOG.info("Master service completed with exitCode=" + masterExit);
    executor.shutdown();

    if (masterExit == 0) {

        String impersonatedUser = System.getenv("USER");

        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser);
        //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser());
        ugi.doAs(new PrivilegedExceptionAction<Void>() {
            public Void run() {

                Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH));
                FileSystem fs;
                try {
                    fs = out.getFileSystem(conf);

                    FSDataOutputStream fos = fs.create(out);
                    LOG.info("Writing master results to " + out.toString());

                    masterComputable.complete(fos);

                    fos.flush();
                    fos.close();

                } catch (IOException e) {
                    LOG.error("Error encountered while writing master results", e);
                }

                return null;

                //FileSystem fs = FileSystem.get(conf);
                //fs.mkdir( out );
            }
        });

        /*
        LOG.info( "Here we would try to write to " + out.toString() );
        LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() );
        LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() );
                
        LOG.info( "Env Var User: " + System.getenv("USER") );
        */
        //LOG.info( "Ideally we'd be user: " + this.props.getProperty(  ) );

        //       for (Map.Entry<String, String> entry : this.conf) {
        //           LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue());
        //     }

    } else {
        LOG.warn("Not writing master results, as the master came back with errors!");
    }

    // Application finished
    ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR;

    try {
        if (numFailedContainers.get() == 0) {
            rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED);
        } else {
            String diag = "Completed with " + numFailedContainers.get() + " failed containers";
            rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED);
        }
    } catch (YarnRemoteException ex) {
        LOG.warn("Encountered an error while trying to send final status to resource manager", ex);
    }

    return rc.getCode();
}