Example usage for org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED

List of usage examples for org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED

Introduction

On this page you can find example usage of org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED.

Prototype

FinalApplicationStatus FAILED

To view the source code for org.apache.hadoop.yarn.api.records FinalApplicationStatus FAILED, click the source link.

Document

Application which failed.
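
For context, the common pattern in the examples below is an ApplicationMaster reporting FinalApplicationStatus.FAILED when it unregisters from the ResourceManager after an unsuccessful run. Here is a minimal sketch of that pattern; AMRMClient.unregisterApplicationMaster is the real YARN client API, while the helper class, method name, failedContainers count, and diagnostic message are illustrative placeholders.

import java.io.IOException;

import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.exceptions.YarnException;

public class FinishWithStatusSketch {

    /**
     * Unregisters the ApplicationMaster, reporting FAILED when any
     * container failed. The failedContainers count and the diagnostic
     * text are hypothetical, for illustration only.
     */
    static void unregister(AMRMClient<ContainerRequest> amRMClient, int failedContainers)
            throws YarnException, IOException {
        FinalApplicationStatus status = (failedContainers == 0) ? FinalApplicationStatus.SUCCEEDED
                : FinalApplicationStatus.FAILED;
        String diagnostics = (failedContainers == 0) ? null
                : "Completed with " + failedContainers + " failed containers";
        // The third argument is an optional tracking URL; null is accepted.
        amRMClient.unregisterApplicationMaster(status, diagnostics, null);
        amRMClient.stop();
    }
}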

Usage

From source file:org.apache.tez.tests.TestExternalTezServicesErrors.java

License:Apache License

private void testFatalError(String methodName, Vertex.VertexExecutionContext lhsExecutionContext,
        String dagNameSuffix, List<String> expectedDiagMessages)
        throws IOException, TezException, YarnException, InterruptedException {
    TezConfiguration tezClientConf = new TezConfiguration(extServiceTestHelper.getConfForJobs());
    TezClient tezClient = TezClient
            .newBuilder(TestExternalTezServicesErrors.class.getSimpleName() + methodName + "_session",
                    tezClientConf)
            .setIsSession(true).setServicePluginDescriptor(servicePluginsDescriptor).build();

    ApplicationId appId = null;
    try {
        tezClient.start();
        LOG.info("TezSessionStarted for " + methodName);
        tezClient.waitTillReady();
        LOG.info("TezSession ready for submission for " + methodName);

        JoinValidateConfigured joinValidate = new JoinValidateConfigured(EXECUTION_CONTEXT_DEFAULT,
                lhsExecutionContext, EXECUTION_CONTEXT_EXT_SERVICE_PUSH, EXECUTION_CONTEXT_EXT_SERVICE_PUSH,
                dagNameSuffix);

        DAG dag = joinValidate.createDag(new TezConfiguration(extServiceTestHelper.getConfForJobs()),
                HASH_JOIN_EXPECTED_RESULT_PATH, HASH_JOIN_OUTPUT_PATH, 3);

        DAGClient dagClient = tezClient.submitDAG(dag);

        DAGStatus dagStatus = dagClient
                .waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
        assertEquals(DAGStatus.State.ERROR, dagStatus.getState());
        boolean foundDiag = false;
        for (String diag : dagStatus.getDiagnostics()) {
            foundDiag = checkDiag(diag, expectedDiagMessages);
            if (foundDiag) {
                break;
            }
        }
        appId = tezClient.getAppMasterApplicationId();
        assertTrue(foundDiag);
    } catch (InterruptedException e) {
        e.printStackTrace();
    } finally {
        tezClient.stop();
    }
    // Verify the state of the application.
    if (appId != null) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        try {
            yarnClient.init(tezClientConf);
            yarnClient.start();

            ApplicationReport appReport = yarnClient.getApplicationReport(appId);
            YarnApplicationState appState = appReport.getYarnApplicationState();
            while (!EnumSet
                    .of(YarnApplicationState.FINISHED, YarnApplicationState.FAILED, YarnApplicationState.KILLED)
                    .contains(appState)) {
                Thread.sleep(200L);
                appReport = yarnClient.getApplicationReport(appId);
                appState = appReport.getYarnApplicationState();
            }

            // TODO Workaround for YARN-4554. AppReport does not provide diagnostics - need to fetch them from ApplicationAttemptReport
            ApplicationAttemptId appAttemptId = appReport.getCurrentApplicationAttemptId();
            ApplicationAttemptReport appAttemptReport = yarnClient.getApplicationAttemptReport(appAttemptId);
            String diag = appAttemptReport.getDiagnostics();
            assertEquals(FinalApplicationStatus.FAILED, appReport.getFinalApplicationStatus());
            assertEquals(YarnApplicationState.FINISHED, appReport.getYarnApplicationState());
            checkDiag(diag, expectedDiagMessages);
        } finally {
            yarnClient.stop();
        }
    }
}

From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // Set our own configuration (ToolRunner only sets it prior to calling
    // run())
    conf = getConf();

    // Our own RM Handler
    ResourceManagerHandler rmHandler = new ResourceManagerHandler(conf, appAttemptId);

    // Connect
    rmHandler.getAMResourceManager();

    // Register
    try {
        rmHandler.registerApplicationMaster(masterHost, masterPort);
    } catch (YarnRemoteException ex) {
        LOG.error("Error encountered while trying to register application master", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Get file splits, configuration, etc.
    Set<ConfigurationTuple> configTuples;
    try {
        configTuples = getConfigurationTuples();
    } catch (IOException ex) {
        LOG.error("Error encountered while trying to generate configurations", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }
    // Needed for our master service later
    Map<WorkerId, StartupConfiguration> startupConf = getMasterStartupConfiguration(configTuples);

    // Initial containers we want, based off of the file splits
    List<ResourceRequest> requestedContainers = getRequestedContainersList(configTuples, rmHandler);
    List<ContainerId> releasedContainers = new ArrayList<>();

    // Send an initial allocation request
    List<Container> allocatedContainers = new ArrayList<>();
    try {
        int needed = configTuples.size();
        int got = 0;
        int maxAttempts = Integer.parseInt(props.getProperty(ConfigFields.APP_ALLOCATION_MAX_ATTEMPTS, "10"));
        int attempts = 0;

        List<Container> acquiredContainers;

        while (got < needed && attempts < maxAttempts) {
            LOG.info("Requesting containers" + ", got=" + got + ", needed=" + needed + ", attempts=" + attempts
                    + ", maxAttempts=" + maxAttempts);

            acquiredContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getAllocatedContainers();

            got += acquiredContainers.size();
            attempts++;

            allocatedContainers.addAll(acquiredContainers);

            LOG.info("Got allocation response, allocatedContainers=" + acquiredContainers.size());
            acquiredContainers.clear();

            Thread.sleep(2500);
        }
    } catch (YarnRemoteException ex) {
        LOG.error("Encountered an error while trying to allocate containers", ex);
        return ReturnCode.MASTER_ERROR.getCode();
    }

    final int numContainers = configTuples.size();

    // TODO: fix this so we try N times to get enough containers!
    // Make sure we got all our containers, or else bail
    if (allocatedContainers.size() < numContainers) {
        LOG.info("Unable to get required number of containers, will not continue" + ", needed=" + numContainers
                + ", allocated=" + allocatedContainers.size());

        requestedContainers.clear(); // We don't want new containers!

        // Add containers into released list
        for (Container c : allocatedContainers) {
            releasedContainers.add(c.getId());
        }

        // Release containers
        try {
            rmHandler.allocateRequest(requestedContainers, releasedContainers);
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to release unwanted containers", ex);
        }

        // Notify our handlers that we got a problem
        rmHandler.finishApplication("Unable to allocate containers, needed " + numContainers + ", but got "
                + allocatedContainers.size(), FinalApplicationStatus.FAILED);
        // bail
        return ReturnCode.MASTER_ERROR.getCode();
    }

    // Launch our worker process, as we now expect workers to actually do something
    LOG.info("Starting master service");
    ApplicationMasterService<T> masterService = new ApplicationMasterService<>(masterAddr, startupConf,
            masterComputable, masterUpdateable, appConfig, conf);

    ExecutorService executor = Executors.newSingleThreadExecutor();
    Future<Integer> masterThread = executor.submit(masterService);

    // We got the number of containers we wanted, let's launch them
    LOG.info("Launching child containers");
    List<Thread> launchThreads = launchContainers(configTuples, allocatedContainers);

    // Use an empty list for heartbeat purposes
    requestedContainers.clear();

    // Some local counters. Do we really need Atomic?
    AtomicInteger numCompletedContainers = new AtomicInteger();
    AtomicInteger numFailedContainers = new AtomicInteger();

    LOG.info("Waiting for containers to complete...");
    // Go into run-loop waiting for containers to finish, also our heartbeat
    while (numCompletedContainers.get() < numContainers) {
        // Don't pound the RM
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting on completed containers", ex);
            return ReturnCode.MASTER_ERROR.getCode();
        }

        // Heartbeat, effectively
        List<ContainerStatus> completedContainers;

        try {
            completedContainers = rmHandler.allocateRequest(requestedContainers, releasedContainers)
                    .getCompletedContainersStatuses();
        } catch (YarnRemoteException ex) {
            LOG.warn("Encountered an error while trying to heartbeat to resource manager", ex);

            continue; // Nothing to report, probably an error / endless loop
        }

        for (ContainerStatus cs : completedContainers) {
            int exitCode = cs.getExitStatus();
            if (exitCode != 0) {
                numCompletedContainers.incrementAndGet();
                numFailedContainers.incrementAndGet();

                masterService.fail();
                executor.shutdown();

                // Force kill our application, fail fast?
                LOG.info("At least one container failed with a non-zero exit code (" + exitCode
                        + "); killing application");
                rmHandler.finishApplication(
                        "Failing due to at least one container coming back with a non-zero exit code.",
                        FinalApplicationStatus.KILLED);

                return -10;
            } else {
                numCompletedContainers.incrementAndGet();
            }
        }
    }

    // All containers have completed
    // Wait for launch threads to complete (this shouldn't really happen)
    LOG.info("Containers completed");
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(1000);
        } catch (InterruptedException ex) {
            LOG.warn("Interrupted while waiting for Launcher threads to complete", ex);
        }
    }

    // Ensure that our master service has completed as well
    if (!masterThread.isDone()) {
        masterService.fail();
    }

    int masterExit = masterThread.get();
    LOG.info("Master service completed with exitCode=" + masterExit);
    executor.shutdown();

    if (masterExit == 0) {

        String impersonatedUser = System.getenv("USER");

        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(impersonatedUser);
        //UserGroupInformation.createProxyUser(impersonatedUser, UserGroupInformation.getLoginUser());
        ugi.doAs(new PrivilegedExceptionAction<Void>() {
            public Void run() {

                Path out = new Path(props.getProperty(ConfigFields.APP_OUTPUT_PATH));
                FileSystem fs;
                try {
                    fs = out.getFileSystem(conf);

                    FSDataOutputStream fos = fs.create(out);
                    LOG.info("Writing master results to " + out.toString());

                    masterComputable.complete(fos);

                    fos.flush();
                    fos.close();

                } catch (IOException e) {
                    LOG.error("Failed to write master results", e);
                }

                return null;

                //FileSystem fs = FileSystem.get(conf);
                //fs.mkdir( out );
            }
        });

        /*
        LOG.info( "Here we would try to write to " + out.toString() );
        LOG.info( "As current user: " + UserGroupInformation.getCurrentUser().getShortUserName() );
        LOG.info( "As login user: " + UserGroupInformation.getLoginUser().getShortUserName() );
                
        LOG.info( "Env Var User: " + System.getenv("USER") );
        */
        //LOG.info( "Ideally we'd be user: " + this.props.getProperty(  ) );

        //       for (Map.Entry<String, String> entry : this.conf) {
        //           LOG.info("ApplicationMaster->Conf: " + entry.getKey() + " = " + entry.getValue());
        //     }

    } else {
        LOG.warn("Not writing master results, as the master came back with errors!");
    }

    // Application finished
    ReturnCode rc = (numFailedContainers.get() == 0) ? ReturnCode.OK : ReturnCode.CONTAINER_ERROR;

    try {
        if (numFailedContainers.get() == 0) {
            rmHandler.finishApplication("Completed successfully", FinalApplicationStatus.SUCCEEDED);
        } else {
            String diag = "Completed with " + numFailedContainers.get() + " failed containers";
            rmHandler.finishApplication(diag, FinalApplicationStatus.FAILED);
        }
    } catch (YarnRemoteException ex) {
        LOG.warn("Encountered an error while trying to send final status to resource manager", ex);
    }

    return rc.getCode();
}

From source file:org.dknight.app.ApplicationMaster.java

License:Apache License

private void finish() {
    // Join all launched threads
    // needed for when we time out
    // and we need to release containers
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(10000);
        } catch (InterruptedException e) {
            LOG.info("Exception thrown in thread join: " + e.getMessage());
            e.printStackTrace();
        }
    }

    //stop yarnClient
    yarnClient.stop();
    // When the application completes, it should stop all running containers
    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    // When the application completes, it should send a finish application
    // signal to the RM
    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus;
    String appMessage = null;
    success = true;
    if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers) {
        appStatus = FinalApplicationStatus.SUCCEEDED;
    } else {
        appStatus = FinalApplicationStatus.FAILED;
        appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed="
                + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed="
                + numFailedContainers.get();
        success = false;
    }
    try {
        amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (YarnException ex) {
        LOG.error("Failed to unregister application", ex);
    } catch (IOException e) {
        LOG.error("Failed to unregister application", e);
    }

    amRMClient.stop();
}

From source file:org.elasticsearch.hadoop.yarn.am.AppMasterRpc.java

License:Apache License

public void failAM() {
    unregisterAM(FinalApplicationStatus.FAILED);
}

From source file:org.hdl.caffe.yarn.app.ApplicationMaster.java

License:Apache License

@VisibleForTesting
protected boolean finish() {
    // wait for completion.
    while (!done && (numCompletedContainers.get() != numTotalContainers)) {
        try {
            Thread.sleep(200);
        } catch (InterruptedException ex) {
            // Interrupted while waiting; loop back and re-check the completion condition.
        }
    }

    // Join all launched threads
    // needed for when we time out
    // and we need to release containers
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(10000);
        } catch (InterruptedException e) {
            LOG.info("Exception thrown in thread join: " + e.getMessage());
            e.printStackTrace();
        }
    }

    // When the application completes, it should stop all running containers
    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    // When the application completes, it should send a finish application
    // signal to the RM
    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus;
    String appMessage = null;
    boolean success = true;
    if (numFailedContainers.get() == 0) {
        appStatus = FinalApplicationStatus.SUCCEEDED;
    } else {
        appStatus = FinalApplicationStatus.FAILED;
        appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed="
                + numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed="
                + numFailedContainers.get();
        LOG.info(appMessage);
        success = false;
    }
    try {
        amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (YarnException ex) {
        LOG.error("Failed to unregister application", ex);
    } catch (IOException e) {
        LOG.error("Failed to unregister application", e);
    }

    amRMClient.stop();

    return success;
}

From source file:org.hdl.tensorflow.yarn.appmaster.ApplicationMaster.java

License:Apache License

private FinalApplicationStatus getFinalAppStatus() {
    if (completedContainerNum.get() - failedContainerNum.get() >= args.totalContainerNum) {
        return FinalApplicationStatus.SUCCEEDED;
    } else {
        return FinalApplicationStatus.FAILED;
    }
}

From source file:org.hortonworks.dovetail.am.AppMaster.java

License:Apache License

private void finish() {
    for (Thread launchThread : launchThreads) {
        try {
            launchThread.join(10000);
        } catch (InterruptedException e) {
            LOG.info("Exception thrown in thread join: " + e.getMessage());
            e.printStackTrace();
        }
    }

    LOG.info("Application completed. Stopping running containers");
    nmClientAsync.stop();

    LOG.info("Application completed. Signalling finish to RM");

    FinalApplicationStatus appStatus;
    String appMessage = null;
    success = true;
    if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numContainers) {
        appStatus = FinalApplicationStatus.SUCCEEDED;
    } else {
        appStatus = FinalApplicationStatus.FAILED;
        appMessage = "Diagnostics." + ", total=" + numContainers + ", completed=" + numCompletedContainers.get()
                + ", allocated=" + numAllocatedContainers.get() + ", failed=" + numFailedContainers.get();
        success = false;
    }
    try {
        resourceManager.unregisterApplicationMaster(appStatus, appMessage, null);
    } catch (YarnException ex) {
        LOG.log(Level.SEVERE, "Failed to unregister application", ex);
    } catch (IOException e) {
        LOG.log(Level.SEVERE, "Failed to unregister application", e);
    }

    done = true;
    resourceManager.stop();
}

From source file:org.springframework.yarn.am.StaticAppmaster.java

License:Apache License

@Override
protected void onInit() throws Exception {
    super.onInit();
    getMonitor().addContainerMonitorStateListener(new ContainerMonitorListener() {
        @Override
        public void state(ContainerMonitorState state) {
            if (getMonitor().freeCount() == 0) {
                int completed = state.getCompleted();
                int failed = state.getFailed();
                if (completed + failed >= containerCount) {
                    if (failed > 0) {
                        setFinalApplicationStatus(FinalApplicationStatus.FAILED);
                    }
                    notifyCompleted();
                }
            }
        }
    });
}

From source file:org.springframework.yarn.am.StaticEventingAppmaster.java

License:Apache License

@Override
protected void onContainerCompleted(ContainerStatus status) {
    super.onContainerCompleted(status);

    getMonitor().reportContainerStatus(status);

    int exitStatus = status.getExitStatus();

    if (exitStatus == 0) {
        if (isComplete()) {
            notifyCompleted();
        }
    } else {
        if (!onContainerFailed(status)) {
            setFinalApplicationStatus(FinalApplicationStatus.FAILED);
            notifyCompleted();
        }
    }
}

From source file:org.springframework.yarn.batch.am.AbstractBatchAppmaster.java

License:Apache License

/**
 * Runs the given job.
 *
 * @param job the job to run
 */
public void runJob(Job job) {
    if (job instanceof AbstractJob) {
        ((AbstractJob) job).registerJobExecutionListener(new JobExecutionListener() {
            @Override
            public void beforeJob(JobExecution jobExecution) {
            }

            @Override
            public void afterJob(JobExecution jobExecution) {
                if (jobExecution.getStatus().equals(BatchStatus.COMPLETED)) {
                    log.info("Batch status complete, notify listeners");
                    notifyCompleted();
                }
            }
        });
    }
    JobParameters jobParameters = getJobParametersConverter().getJobParameters(getParameters());
    try {
        getJobLauncher().run(job, jobParameters);
    } catch (Exception e) {
        log.error("Error running job=" + job, e);
        setFinalApplicationStatus(FinalApplicationStatus.FAILED);
        notifyCompleted();
    }
}