Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.yarn; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.time.Time; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.ResourceManagerOptions; import org.apache.flink.runtime.clusterframework.ApplicationStatus; import org.apache.flink.runtime.clusterframework.types.AllocationID; import org.apache.flink.runtime.clusterframework.types.ResourceID; import org.apache.flink.runtime.clusterframework.types.ResourceProfile; import org.apache.flink.runtime.clusterframework.types.SlotID; import org.apache.flink.runtime.concurrent.ScheduledExecutor; import org.apache.flink.runtime.concurrent.ScheduledExecutorServiceAdapter; import org.apache.flink.runtime.entrypoint.ClusterInformation; import org.apache.flink.runtime.heartbeat.HeartbeatServices; import org.apache.flink.runtime.heartbeat.TestingHeartbeatServices; import org.apache.flink.runtime.highavailability.HighAvailabilityServices; import org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices; import org.apache.flink.runtime.instance.HardwareDescription; import org.apache.flink.runtime.leaderelection.TestingLeaderElectionService; import org.apache.flink.runtime.messages.Acknowledge; import org.apache.flink.runtime.metrics.MetricRegistry; import org.apache.flink.runtime.metrics.NoOpMetricRegistry; import org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup; import org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups; import org.apache.flink.runtime.registration.RegistrationResponse; import org.apache.flink.runtime.resourcemanager.JobLeaderIdService; import org.apache.flink.runtime.resourcemanager.ResourceManagerGateway; import org.apache.flink.runtime.resourcemanager.SlotRequest; import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager; import org.apache.flink.runtime.rpc.FatalErrorHandler; import org.apache.flink.runtime.rpc.RpcService; import org.apache.flink.runtime.rpc.TestingRpcService; import org.apache.flink.runtime.taskexecutor.SlotReport; import org.apache.flink.runtime.taskexecutor.SlotStatus; import org.apache.flink.runtime.taskexecutor.TaskExecutorGateway; import org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess; import org.apache.flink.runtime.testutils.DirectScheduledExecutorService; import org.apache.flink.runtime.util.TestingFatalErrorHandler; import org.apache.flink.util.TestLogger; import org.apache.flink.util.function.RunnableWithException; import org.apache.flink.shaded.guava18.com.google.common.collect.ImmutableList; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.client.api.AMRMClient; import org.apache.hadoop.yarn.client.api.NMClient; import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.hamcrest.Matchers; import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import javax.annotation.Nullable; import java.io.File; import java.nio.file.Files; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import static org.apache.flink.yarn.YarnConfigKeys.ENV_APP_ID; import static org.apache.flink.yarn.YarnConfigKeys.ENV_CLIENT_HOME_DIR; import static org.apache.flink.yarn.YarnConfigKeys.ENV_CLIENT_SHIP_FILES; import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; import static org.apache.flink.yarn.YarnConfigKeys.ENV_HADOOP_USER_NAME; import static org.apache.flink.yarn.YarnConfigKeys.FLINK_JAR_PATH; import static org.apache.flink.yarn.YarnConfigKeys.FLINK_YARN_FILES; import static org.hamcrest.Matchers.instanceOf; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; /** * General tests for the YARN resource manager component. */ public class YarnResourceManagerTest extends TestLogger { private static final Time TIMEOUT = Time.seconds(10L); private Configuration flinkConfig; private Map<String, String> env; private TestingFatalErrorHandler testingFatalErrorHandler; @Rule public TemporaryFolder folder = new TemporaryFolder(); @Before public void setup() { testingFatalErrorHandler = new TestingFatalErrorHandler(); flinkConfig = new Configuration(); flinkConfig.setInteger(ResourceManagerOptions.CONTAINERIZED_HEAP_CUTOFF_MIN, 100); File root = folder.getRoot(); File home = new File(root, "home"); boolean created = home.mkdir(); assertTrue(created); env = new HashMap<>(); env.put(ENV_APP_ID, "foo"); env.put(ENV_CLIENT_HOME_DIR, home.getAbsolutePath()); env.put(ENV_CLIENT_SHIP_FILES, ""); env.put(ENV_FLINK_CLASSPATH, ""); env.put(ENV_HADOOP_USER_NAME, "foo"); env.put(FLINK_JAR_PATH, root.toURI().toString()); } @After public void teardown() throws Exception { if (testingFatalErrorHandler != null) { testingFatalErrorHandler.rethrowError(); } if (env != null) { env.clear(); } } static class TestingYarnResourceManager extends YarnResourceManager { AMRMClientAsync<AMRMClient.ContainerRequest> mockResourceManagerClient; NMClient mockNMClient; TestingYarnResourceManager(RpcService rpcService, String resourceManagerEndpointId, ResourceID resourceId, Configuration flinkConfig, Map<String, String> env, HighAvailabilityServices highAvailabilityServices, HeartbeatServices heartbeatServices, SlotManager slotManager, MetricRegistry metricRegistry, JobLeaderIdService jobLeaderIdService, ClusterInformation clusterInformation, FatalErrorHandler fatalErrorHandler, @Nullable String webInterfaceUrl, AMRMClientAsync<AMRMClient.ContainerRequest> mockResourceManagerClient, NMClient mockNMClient, JobManagerMetricGroup jobManagerMetricGroup) { super(rpcService, resourceManagerEndpointId, resourceId, flinkConfig, env, highAvailabilityServices, heartbeatServices, slotManager, metricRegistry, jobLeaderIdService, clusterInformation, fatalErrorHandler, webInterfaceUrl, jobManagerMetricGroup); this.mockNMClient = mockNMClient; this.mockResourceManagerClient = mockResourceManagerClient; } <T> CompletableFuture<T> runInMainThread(Callable<T> callable) { return callAsync(callable, TIMEOUT); } MainThreadExecutor getMainThreadExecutorForTesting() { return super.getMainThreadExecutor(); } @Override protected AMRMClientAsync<AMRMClient.ContainerRequest> createAndStartResourceManagerClient( YarnConfiguration yarnConfiguration, int yarnHeartbeatIntervalMillis, @Nullable String webInterfaceUrl) { return mockResourceManagerClient; } @Override protected NMClient createAndStartNodeManagerClient(YarnConfiguration yarnConfiguration) { return mockNMClient; } @Override protected void runAsync(final Runnable runnable) { runnable.run(); } } class Context { // services final TestingRpcService rpcService; final MockResourceManagerRuntimeServices rmServices; // RM final ResourceID rmResourceID; static final String RM_ADDRESS = "resourceManager"; final TestingYarnResourceManager resourceManager; final int dataPort = 1234; final HardwareDescription hardwareDescription = new HardwareDescription(1, 2L, 3L, 4L); // domain objects for test purposes final ResourceProfile resourceProfile1 = new ResourceProfile(1.0, 200); public ContainerId task = ContainerId .newInstance(ApplicationAttemptId.newInstance(ApplicationId.newInstance(1L, 0), 0), 1); public String taskHost = "host1"; public NMClient mockNMClient = mock(NMClient.class); public AMRMClientAsync<AMRMClient.ContainerRequest> mockResourceManagerClient = mock(AMRMClientAsync.class); public JobManagerMetricGroup mockJMMetricGroup = UnregisteredMetricGroups .createUnregisteredJobManagerMetricGroup(); /** * Create mock RM dependencies. */ Context() throws Exception { rpcService = new TestingRpcService(); rmServices = new MockResourceManagerRuntimeServices(); // resource manager rmResourceID = ResourceID.generate(); resourceManager = new TestingYarnResourceManager(rpcService, RM_ADDRESS, rmResourceID, flinkConfig, env, rmServices.highAvailabilityServices, rmServices.heartbeatServices, rmServices.slotManager, rmServices.metricRegistry, rmServices.jobLeaderIdService, new ClusterInformation("localhost", 1234), testingFatalErrorHandler, null, mockResourceManagerClient, mockNMClient, mockJMMetricGroup); } /** * Mock services needed by the resource manager. */ class MockResourceManagerRuntimeServices { private final ScheduledExecutor scheduledExecutor; private final TestingHighAvailabilityServices highAvailabilityServices; private final HeartbeatServices heartbeatServices; private final MetricRegistry metricRegistry; private final TestingLeaderElectionService rmLeaderElectionService; private final JobLeaderIdService jobLeaderIdService; private final SlotManager slotManager; private UUID rmLeaderSessionId; MockResourceManagerRuntimeServices() throws Exception { scheduledExecutor = mock(ScheduledExecutor.class); highAvailabilityServices = new TestingHighAvailabilityServices(); rmLeaderElectionService = new TestingLeaderElectionService(); highAvailabilityServices.setResourceManagerLeaderElectionService(rmLeaderElectionService); heartbeatServices = new TestingHeartbeatServices(5L, 5L, scheduledExecutor); metricRegistry = NoOpMetricRegistry.INSTANCE; slotManager = new SlotManager( new ScheduledExecutorServiceAdapter(new DirectScheduledExecutorService()), Time.seconds(10), Time.seconds(10), Time.minutes(1)); jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, rpcService.getScheduledExecutor(), Time.minutes(5L)); } void grantLeadership() throws Exception { rmLeaderSessionId = UUID.randomUUID(); rmLeaderElectionService.isLeader(rmLeaderSessionId).get(TIMEOUT.toMilliseconds(), TimeUnit.MILLISECONDS); } } /** * Start the resource manager and grant leadership to it. */ void startResourceManager() throws Exception { resourceManager.start(); rmServices.grantLeadership(); } /** * Stop the Akka actor system. */ void stopResourceManager() throws Exception { rpcService.stopService().get(); } /** * A wrapper function for running test. Deal with setup and teardown logic * in Context. * @param testMethod the real test body. */ void runTest(RunnableWithException testMethod) throws Exception { startResourceManager(); try { testMethod.run(); } finally { stopResourceManager(); } } } private static Container mockContainer(String host, int port, int containerId) { Container mockContainer = mock(Container.class); NodeId mockNodeId = NodeId.newInstance(host, port); ContainerId mockContainerId = ContainerId.newInstance( ApplicationAttemptId.newInstance(ApplicationId.newInstance(System.currentTimeMillis(), 1), 1), containerId); when(mockContainer.getId()).thenReturn(mockContainerId); when(mockContainer.getNodeId()).thenReturn(mockNodeId); when(mockContainer.getResource()).thenReturn(Resource.newInstance(200, 1)); when(mockContainer.getPriority()).thenReturn(Priority.UNDEFINED); return mockContainer; } private static ContainerStatus mockContainerStatus(ContainerId containerId) { ContainerStatus mockContainerStatus = mock(ContainerStatus.class); when(mockContainerStatus.getContainerId()).thenReturn(containerId); when(mockContainerStatus.getState()).thenReturn(ContainerState.COMPLETE); when(mockContainerStatus.getDiagnostics()).thenReturn("Test exit"); when(mockContainerStatus.getExitStatus()).thenReturn(-1); return mockContainerStatus; } @Test public void testStopWorker() throws Exception { new Context() { { runTest(() -> { // Request slot from SlotManager. CompletableFuture<?> registerSlotRequestFuture = resourceManager.runInMainThread(() -> { rmServices.slotManager.registerSlotRequest( new SlotRequest(new JobID(), new AllocationID(), resourceProfile1, taskHost)); return null; }); // wait for the registerSlotRequest completion registerSlotRequestFuture.get(); // Callback from YARN when container is allocated. Container testingContainer = mockContainer("container", 1234, 1); resourceManager.onContainersAllocated(ImmutableList.of(testingContainer)); verify(mockResourceManagerClient).addContainerRequest(any(AMRMClient.ContainerRequest.class)); verify(mockNMClient).startContainer(eq(testingContainer), any(ContainerLaunchContext.class)); // Remote task executor registers with YarnResourceManager. TaskExecutorGateway mockTaskExecutorGateway = mock(TaskExecutorGateway.class); rpcService.registerGateway(taskHost, mockTaskExecutorGateway); final ResourceManagerGateway rmGateway = resourceManager .getSelfGateway(ResourceManagerGateway.class); final ResourceID taskManagerResourceId = new ResourceID(testingContainer.getId().toString()); final SlotReport slotReport = new SlotReport( new SlotStatus(new SlotID(taskManagerResourceId, 1), new ResourceProfile(10, 1, 1, 1, 0, Collections.emptyMap()))); CompletableFuture<Integer> numberRegisteredSlotsFuture = rmGateway .registerTaskExecutor(taskHost, taskManagerResourceId, dataPort, hardwareDescription, Time.seconds(10L)) .thenCompose((RegistrationResponse response) -> { assertThat(response, instanceOf(TaskExecutorRegistrationSuccess.class)); final TaskExecutorRegistrationSuccess success = (TaskExecutorRegistrationSuccess) response; return rmGateway.sendSlotReport(taskManagerResourceId, success.getRegistrationId(), slotReport, Time.seconds(10L)); }).handleAsync( (Acknowledge ignored, Throwable throwable) -> rmServices.slotManager .getNumberRegisteredSlots(), resourceManager.getMainThreadExecutorForTesting()); final int numberRegisteredSlots = numberRegisteredSlotsFuture.get(); assertEquals(1, numberRegisteredSlots); // Unregister all task executors and release all containers. CompletableFuture<?> unregisterAndReleaseFuture = resourceManager.runInMainThread(() -> { rmServices.slotManager.unregisterTaskManagersAndReleaseResources(); return null; }); unregisterAndReleaseFuture.get(); verify(mockNMClient).stopContainer(any(ContainerId.class), any(NodeId.class)); verify(mockResourceManagerClient).releaseAssignedContainer(any(ContainerId.class)); }); // It's now safe to access the SlotManager state since the ResourceManager has been stopped. assertThat(rmServices.slotManager.getNumberRegisteredSlots(), Matchers.equalTo(0)); assertThat(resourceManager.getNumberOfRegisteredTaskManagers().get(), Matchers.equalTo(0)); } }; } /** * Tests that application files are deleted when the YARN application master is de-registered. */ @Test public void testDeleteApplicationFiles() throws Exception { new Context() { { final File applicationDir = folder.newFolder(".flink"); env.put(FLINK_YARN_FILES, applicationDir.getCanonicalPath()); runTest(() -> { resourceManager.deregisterApplication(ApplicationStatus.SUCCEEDED, null); assertFalse("YARN application directory was not removed", Files.exists(applicationDir.toPath())); }); } }; } /** * Tests that YarnResourceManager will not request more containers than needs during * callback from Yarn when container is Completed. */ @Test public void testOnContainerCompleted() throws Exception { new Context() { { runTest(() -> { CompletableFuture<?> registerSlotRequestFuture = resourceManager.runInMainThread(() -> { rmServices.slotManager.registerSlotRequest( new SlotRequest(new JobID(), new AllocationID(), resourceProfile1, taskHost)); return null; }); // wait for the registerSlotRequest completion registerSlotRequestFuture.get(); // Callback from YARN when container is allocated. Container testingContainer = mockContainer("container", 1234, 1); resourceManager.onContainersAllocated(ImmutableList.of(testingContainer)); verify(mockResourceManagerClient).addContainerRequest(any(AMRMClient.ContainerRequest.class)); verify(mockNMClient).startContainer(eq(testingContainer), any(ContainerLaunchContext.class)); // Callback from YARN when container is Completed, pending request can not be fulfilled by pending // containers, need to request new container. ContainerStatus testingContainerStatus = mockContainerStatus(testingContainer.getId()); resourceManager.onContainersCompleted(ImmutableList.of(testingContainerStatus)); verify(mockResourceManagerClient, times(2)) .addContainerRequest(any(AMRMClient.ContainerRequest.class)); // Callback from YARN when container is Completed happened before global fail, pending request // slot is already fulfilled by pending containers, no need to request new container. resourceManager.onContainersCompleted(ImmutableList.of(testingContainerStatus)); verify(mockResourceManagerClient, times(2)) .addContainerRequest(any(AMRMClient.ContainerRequest.class)); }); } }; } }