List of usage examples for org.apache.hadoop.yarn.api.records.FinalApplicationStatus.FAILED
FinalApplicationStatus FAILED
To view the source code for org.apache.hadoop.yarn.api.records.FinalApplicationStatus.FAILED, click the Source Link.
From source file:org.apache.oozie.action.hadoop.TestLauncherMain.java
License:Apache License
/**
 * Exercises {@code LauncherMain.checkAndKillChildYarnJobs} against a mocked
 * {@code YarnClient}.
 * <p>
 * The mocked application report answers successive status queries with
 * {@code UNDEFINED}, then {@code FAILED}, then {@code KILLED}. The test expects
 * the returned collection to hold exactly one application: the first one passed in.
 */
@Test
public void testKillChildYarnJobs() throws Exception {
    YarnClient yarnClient = Mockito.mock(YarnClient.class);
    ApplicationReport report = Mockito.mock(ApplicationReport.class);

    // Every application id resolves to the same mocked report.
    Mockito.when(yarnClient.getApplicationReport(Mockito.any(ApplicationId.class))).thenReturn(report);
    Mockito.when(report.getFinalApplicationStatus())
            .thenReturn(FinalApplicationStatus.UNDEFINED)
            .thenReturn(FinalApplicationStatus.FAILED)
            .thenReturn(FinalApplicationStatus.KILLED);

    ApplicationId first = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    ApplicationId second = ApplicationId.newInstance(System.currentTimeMillis(), 2);
    ApplicationId third = ApplicationId.newInstance(System.currentTimeMillis(), 3);

    Collection<ApplicationId> result =
            LauncherMain.checkAndKillChildYarnJobs(yarnClient, null, Arrays.asList(first, second, third));

    assertEquals(1, result.size());
    assertEquals(first.getId(), result.iterator().next().getId());
}
From source file:org.apache.reef.runtime.yarn.driver.YarnContainerManager.java
License:Apache License
private void onRuntimeError(final Throwable throwable) { // SHUTDOWN YARN try {/*from www. j a v a 2s .c om*/ this.reefEventHandlers.close(); this.resourceManager.unregisterApplicationMaster(FinalApplicationStatus.FAILED, throwable.getMessage(), null); } catch (final Exception e) { LOG.log(Level.WARNING, "Error shutting down YARN application", e); } finally { this.resourceManager.stop(); } final RuntimeStatusProto.Builder runtimeStatusBuilder = RuntimeStatusProto.newBuilder() .setState(ReefServiceProtos.State.FAILED).setName(RUNTIME_NAME); final Encoder<Throwable> codec = new ObjectSerializableCodec<>(); runtimeStatusBuilder.setError(ReefServiceProtos.RuntimeErrorProto.newBuilder().setName(RUNTIME_NAME) .setMessage(throwable.getMessage()).setException(ByteString.copyFrom(codec.encode(throwable))) .build()).build(); this.reefEventHandlers.onRuntimeStatus(runtimeStatusBuilder.build()); }
From source file:org.apache.samza.job.yarn.SamzaTaskManager.java
License:Apache License
/** * This methods handles the onContainerCompleted callback from the RM. Based on the ContainerExitStatus, it decides * whether a container that exited is marked as complete or failure. *//*from w w w.j av a2 s . c om*/ @Override public void onContainerCompleted(ContainerStatus containerStatus) { String containerIdStr = ConverterUtils.toString(containerStatus.getContainerId()); int containerId = -1; for (Map.Entry<Integer, YarnContainer> entry : state.runningContainers.entrySet()) { if (entry.getValue().id().equals(containerStatus.getContainerId())) { containerId = entry.getKey(); break; } } state.runningContainers.remove(containerId); int exitStatus = containerStatus.getExitStatus(); switch (exitStatus) { case ContainerExitStatus.SUCCESS: log.info("Container {} completed successfully.", containerIdStr); state.completedContainers.incrementAndGet(); if (containerId != -1) { state.finishedContainers.add(containerId); containerFailures.remove(containerId); } if (state.completedContainers.get() == state.containerCount) { log.info("Setting job status to SUCCEEDED, since all containers have been marked as completed."); state.status = FinalApplicationStatus.SUCCEEDED; } break; case ContainerExitStatus.DISKS_FAILED: case ContainerExitStatus.ABORTED: case ContainerExitStatus.PREEMPTED: log.info( "Got an exit code of {}. This means that container {} was " + "killed by YARN, either due to being released by the application " + "master or being 'lost' due to node failures etc. or due to preemption by the RM", exitStatus, containerIdStr); state.releasedContainers.incrementAndGet(); // If this container was assigned some partitions (a containerId), then // clean up, and request a new container for the tasks. This only // should happen if the container was 'lost' due to node failure, not // if the AM released the container. if (containerId != -1) { log.info( "Released container {} was assigned task group ID {}. 
Requesting a new container for the task group.", containerIdStr, containerId); state.neededContainers.incrementAndGet(); state.jobHealthy.set(false); // request a container on new host containerAllocator.requestContainer(containerId, ContainerAllocator.ANY_HOST); } break; default: // TODO: Handle failure more intelligently. Should track NodeFailures! log.info("Container failed for some reason. Let's start it again"); log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - " + containerStatus.getDiagnostics()); state.failedContainers.incrementAndGet(); state.failedContainersStatus.put(containerIdStr, containerStatus); state.jobHealthy.set(false); if (containerId != -1) { state.neededContainers.incrementAndGet(); // Find out previously running container location String lastSeenOn = state.jobCoordinator.jobModel().getContainerToHostValue(containerId, SetContainerHostMapping.HOST_KEY); if (!hostAffinityEnabled || lastSeenOn == null) { lastSeenOn = ContainerAllocator.ANY_HOST; } // A container failed for an unknown reason. Let's check to see if // we need to shutdown the whole app master if too many container // failures have happened. The rules for failing are that the // failure count for a task group id must be > the configured retry // count, and the last failure (the one prior to this one) must have // happened less than retry window ms ago. If retry count is set to // 0, the app master will fail on any container failure. If the // retry count is set to a number < 0, a container failure will // never trigger an app master failure. 
int retryCount = yarnConfig.getContainerRetryCount(); int retryWindowMs = yarnConfig.getContainerRetryWindowMs(); if (retryCount == 0) { log.error( "Container ID {} ({}) failed, and retry count is set to 0, so shutting down the application master, and marking the job as failed.", containerId, containerIdStr); tooManyFailedContainers = true; } else if (retryCount > 0) { int currentFailCount; long lastFailureTime; if (containerFailures.containsKey(containerId)) { ContainerFailure failure = containerFailures.get(containerId); currentFailCount = failure.getCount() + 1; lastFailureTime = failure.getLastFailure(); } else { currentFailCount = 1; lastFailureTime = 0L; } if (currentFailCount >= retryCount) { long lastFailureMsDiff = System.currentTimeMillis() - lastFailureTime; if (lastFailureMsDiff < retryWindowMs) { log.error("Container ID " + containerId + "(" + containerIdStr + ") has failed " + currentFailCount + " times, with last failure " + lastFailureMsDiff + "ms ago. This is greater than retry count of " + retryCount + " and window of " + retryWindowMs + "ms , so shutting down the application master, and marking the job as failed."); // We have too many failures, and we're within the window // boundary, so reset shut down the app master. tooManyFailedContainers = true; state.status = FinalApplicationStatus.FAILED; } else { log.info( "Resetting fail count for container ID {} back to 1, since last container failure ({}) for " + "this container ID was outside the bounds of the retry window.", containerId, containerIdStr); // Reset counter back to 1, since the last failure for this // container happened outside the window boundary. 
containerFailures.put(containerId, new ContainerFailure(1, System.currentTimeMillis())); } } else { log.info("Current fail count for container ID {} is {}.", containerId, currentFailCount); containerFailures.put(containerId, new ContainerFailure(currentFailCount, System.currentTimeMillis())); } } if (!tooManyFailedContainers) { // Request a new container containerAllocator.requestContainer(containerId, lastSeenOn); } } } }
From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java
License:Apache License
/** * Test Task Manager should request a new container when a task fails with unknown exit code * When host-affinity is not enabled, it will always request for ANY_HOST */// ww w . j av a2 s . c om @Test public void testNewContainerRequestedOnFailureWithUnknownCode() throws Exception { Config conf = getConfig(); SamzaTaskManager taskManager = new SamzaTaskManager(conf, state, amRmClientAsync, new YarnConfiguration()); MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync, TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(conf)); getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator); Thread thread = new Thread(allocator); getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread); // onInit triggers a request taskManager.onInit(); assertFalse(taskManager.shouldShutdown()); assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); Container container = TestUtil .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123); taskManager.onContainerAllocated(container); // Allow container to run and update state Thread.sleep(300); // Create first container failure taskManager.onContainerCompleted( TestUtil.getContainerStatus(container.getId(), 1, "Expecting a failure here")); // The above failure should trigger a container request assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); assertEquals(ContainerRequestState.ANY_HOST, allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost()); assertFalse(taskManager.shouldShutdown()); assertFalse(state.jobHealthy.get()); assertEquals(2, testAMRMClient.requests.size()); assertEquals(0, testAMRMClient.getRelease().size()); taskManager.onContainerAllocated(container); // Allow container to run and update state Thread.sleep(300); assertTrue(state.jobHealthy.get()); // Create a second failure taskManager.onContainerCompleted( 
TestUtil.getContainerStatus(container.getId(), 1, "Expecting a failure here")); // The above failure should trigger a job shutdown because our retry count is set to 1 assertEquals(0, allocator.containerRequestState.getRequestsQueue().size()); assertEquals(2, testAMRMClient.requests.size()); assertEquals(0, testAMRMClient.getRelease().size()); assertFalse(state.jobHealthy.get()); assertTrue(taskManager.shouldShutdown()); assertEquals(FinalApplicationStatus.FAILED, state.status); taskManager.onShutdown(); }
From source file:org.apache.samza.job.yarn.TestSamzaTaskManager.java
License:Apache License
/** * Test Task Manager should request a new container when a task fails with unknown exit code * When host-affinity is enabled, it will always request for the same host that it was last seen on *//* ww w . j av a 2s. c o m*/ @Test public void testSameContainerRequestedOnFailureWithUnknownCode() throws Exception { Config conf = getConfigWithHostAffinity(); SamzaTaskManager taskManager = new SamzaTaskManager(conf, state, amRmClientAsync, new YarnConfiguration()); MockContainerAllocator allocator = new MockContainerAllocator(amRmClientAsync, TestUtil.getContainerUtil(getConfig(), state), new YarnConfig(conf)); getPrivateFieldFromTaskManager("containerAllocator", taskManager).set(taskManager, allocator); Thread thread = new Thread(allocator); getPrivateFieldFromTaskManager("allocatorThread", taskManager).set(taskManager, thread); // onInit triggers a request taskManager.onInit(); assertFalse(taskManager.shouldShutdown()); assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); Container container = TestUtil .getContainer(ConverterUtils.toContainerId("container_1350670447861_0003_01_000002"), "abc", 123); taskManager.onContainerAllocated(container); // Allow container to run and update state Thread.sleep(300); // Create first container failure taskManager.onContainerCompleted( TestUtil.getContainerStatus(container.getId(), 1, "Expecting a failure here")); // The above failure should trigger a container request assertEquals(1, allocator.containerRequestState.getRequestsQueue().size()); assertEquals("abc", allocator.containerRequestState.getRequestsQueue().peek().getPreferredHost()); assertFalse(taskManager.shouldShutdown()); assertFalse(state.jobHealthy.get()); assertEquals(2, testAMRMClient.requests.size()); assertEquals(0, testAMRMClient.getRelease().size()); taskManager.onContainerAllocated(container); // Allow container to run and update state Thread.sleep(300); assertTrue(state.jobHealthy.get()); // Create a second failure 
taskManager.onContainerCompleted( TestUtil.getContainerStatus(container.getId(), 1, "Expecting a failure here")); // The above failure should trigger a job shutdown because our retry count is set to 1 assertEquals(0, allocator.containerRequestState.getRequestsQueue().size()); assertEquals(2, testAMRMClient.requests.size()); assertEquals(0, testAMRMClient.getRelease().size()); assertFalse(state.jobHealthy.get()); assertTrue(taskManager.shouldShutdown()); assertEquals(FinalApplicationStatus.FAILED, state.status); taskManager.onShutdown(); }
From source file:org.apache.slider.server.appmaster.actions.ActionStopSlider.java
License:Apache License
/** * Build from an exception./* w ww . j av a 2 s . co m*/ * <p> * If the exception implements * {@link ExitCodeProvider} then the exit code is extracted from that * @param ex exception. */ public ActionStopSlider(Exception ex) { super("stop"); if (ex instanceof ExitCodeProvider) { setExitCode(((ExitCodeProvider) ex).getExitCode()); } else { setExitCode(LauncherExitCodes.EXIT_EXCEPTION_THROWN); } setFinalApplicationStatus(FinalApplicationStatus.FAILED); setMessage(ex.getMessage()); }
From source file:org.apache.slider.server.appmaster.SliderAppMaster.java
License:Apache License
@Override //AMRMClientAsync public void onError(Throwable e) { //callback says it's time to finish LOG_YARN.error("AMRMClientAsync.onError() received " + e, e); signalAMComplete(new ActionStopSlider("stop", EXIT_EXCEPTION_THROWN, FinalApplicationStatus.FAILED, "AMRMClientAsync.onError() received " + e)); }
From source file:org.apache.slider.server.appmaster.SliderAppMaster.java
License:Apache License
/** * Received on listening service termination. * @param service the service that has changed. */// ww w.java 2 s .co m @Override //ServiceStateChangeListener public void stateChanged(Service service) { if (service == providerService && service.isInState(STATE.STOPPED)) { //its the current master process in play int exitCode = providerService.getExitCode(); int mappedProcessExitCode = exitCode; boolean shouldTriggerFailure = !amCompletionFlag.get() && (mappedProcessExitCode != 0); if (shouldTriggerFailure) { String reason = "Spawned process failed with raw " + exitCode + " mapped to " + mappedProcessExitCode; ActionStopSlider stop = new ActionStopSlider("stop", mappedProcessExitCode, FinalApplicationStatus.FAILED, reason); //this wasn't expected: the process finished early spawnedProcessExitedBeforeShutdownTriggered = true; log.info("Process has exited with exit code {} mapped to {} -triggering termination", exitCode, mappedProcessExitCode); //tell the AM the cluster is complete signalAMComplete(stop); } else { //we don't care log.info("Process has exited with exit code {} mapped to {} -ignoring", exitCode, mappedProcessExitCode); } } else { super.stateChanged(service); } }
From source file:org.apache.slider.server.appmaster.SliderAppMaster.java
License:Apache License
/** * Handle any exception in a thread. If the exception provides an exit * code, that is the one that will be used * @param thread thread throwing the exception * @param exception exception/*from w w w .ja va 2 s . com*/ */ public void onExceptionInThread(Thread thread, Exception exception) { log.error("Exception in {}: {}", thread.getName(), exception, exception); // if there is a teardown in progress, ignore it if (amCompletionFlag.get()) { log.info("Ignoring exception: shutdown in progress"); } else { int exitCode = EXIT_EXCEPTION_THROWN; if (exception instanceof ExitCodeProvider) { exitCode = ((ExitCodeProvider) exception).getExitCode(); } signalAMComplete( new ActionStopSlider("stop", exitCode, FinalApplicationStatus.FAILED, exception.toString())); } }
From source file:org.apache.slider.server.appmaster.SliderAppMaster.java
License:Apache License
/** * Start the chaos monkey/*from ww w .j a v a 2s .c o m*/ * @return true if it started */ private boolean maybeStartMonkey() { MapOperations internals = getGlobalInternalOptions(); Boolean enabled = internals.getOptionBool(InternalKeys.CHAOS_MONKEY_ENABLED, InternalKeys.DEFAULT_CHAOS_MONKEY_ENABLED); if (!enabled) { log.info("Chaos monkey disabled"); return false; } long monkeyInterval = internals.getTimeRange(InternalKeys.CHAOS_MONKEY_INTERVAL, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_DAYS, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_HOURS, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_MINUTES, 0); if (monkeyInterval == 0) { log.debug("Chaos monkey not configured with a time interval...not enabling"); return false; } long monkeyDelay = internals.getTimeRange(InternalKeys.CHAOS_MONKEY_DELAY, 0, 0, 0, (int) monkeyInterval); log.info("Adding Chaos Monkey scheduled every {} seconds ({} hours -delay {}", monkeyInterval, monkeyInterval / (60 * 60), monkeyDelay); monkey = new ChaosMonkeyService(metrics, actionQueues); initAndAddService(monkey); // configure the targets // launch failure: special case with explicit failure triggered now int amLaunchFailProbability = internals .getOptionInt(InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_LAUNCH_FAILURE, 0); if (amLaunchFailProbability > 0 && monkey.chaosCheck(amLaunchFailProbability)) { log.info("Chaos Monkey has triggered AM Launch failure"); // trigger a failure ActionStopSlider stop = new ActionStopSlider("stop", 0, TimeUnit.SECONDS, LauncherExitCodes.EXIT_FALSE, FinalApplicationStatus.FAILED, E_TRIGGERED_LAUNCH_FAILURE); queue(stop); } int amKillProbability = internals.getOptionInt(InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_FAILURE, InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_AM_FAILURE); monkey.addTarget("AM killer", new ChaosKillAM(actionQueues, -1), amKillProbability); int containerKillProbability = internals.getOptionInt( InternalKeys.CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE, 
InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE); monkey.addTarget("Container killer", new ChaosKillContainer(appState, actionQueues, rmOperationHandler), containerKillProbability); // and schedule it if (monkey.schedule(monkeyDelay, monkeyInterval, TimeUnit.SECONDS)) { log.info("Chaos Monkey is running"); return true; } else { log.info("Chaos monkey not started"); return false; } }