Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.yarn; import org.apache.flink.client.CliFrontend; import org.apache.flink.client.FlinkYarnSessionCli; import org.apache.flink.configuration.ConfigConstants; import org.apache.flink.runtime.akka.AkkaUtils; import org.apache.flink.runtime.jobmanager.RecoveryMode; import org.apache.flink.runtime.yarn.AbstractFlinkYarnClient; import org.apache.flink.runtime.yarn.AbstractFlinkYarnCluster; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.api.ApplicationConstants; import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationReport; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.NodeReport; import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.QueueInfo; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.YarnApplicationState; import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; import org.apache.hadoop.yarn.client.api.YarnClient; import org.apache.hadoop.yarn.client.api.YarnClientApplication; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.util.Records; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; /** * All classes in this package contain code taken from * https://github.com/apache/hadoop-common/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java?source=cc * and * https://github.com/hortonworks/simple-yarn-app * and * https://github.com/yahoo/storm-yarn/blob/master/src/main/java/com/yahoo/storm/yarn/StormOnYarn.java * * The Flink jar is uploaded to HDFS by this client. * The application master and all the TaskManager containers get the jar file downloaded * by YARN into their local fs. * */ public abstract class FlinkYarnClientBase extends AbstractFlinkYarnClient { private static final Logger LOG = LoggerFactory.getLogger(FlinkYarnClient.class); /** * Minimum memory requirements, checked by the Client. */ private static final int MIN_JM_MEMORY = 768; // the minimum memory should be higher than the min heap cutoff private static final int MIN_TM_MEMORY = 768; private Configuration conf; private YarnClient yarnClient; private YarnClientApplication yarnApplication; private Thread deploymentFailureHook = new DeploymentFailureHook(); /** * Files (usually in a distributed file system) used for the YARN session of Flink. * Contains configuration files and jar files. */ private Path sessionFilesDir; /** * If the user has specified a different number of slots, we store them here */ private int slots = -1; private int jobManagerMemoryMb = 1024; private int taskManagerMemoryMb = 1024; private int taskManagerCount = 1; private String yarnQueue = null; private String configurationDirectory; private Path flinkConfigurationPath; private Path flinkLoggingConfigurationPath; // optional private Path flinkJarPath; private String dynamicPropertiesEncoded; private List<File> shipFiles = new ArrayList<>(); private org.apache.flink.configuration.Configuration flinkConfiguration; private boolean detached; private String customName = null; public FlinkYarnClientBase() { conf = new YarnConfiguration(); if (this.yarnClient == null) { // Create yarnClient yarnClient = YarnClient.createYarnClient(); yarnClient.init(conf); yarnClient.start(); } // for unit tests only if (System.getenv("IN_TESTS") != null) { try { conf.addResource(new File(System.getenv("YARN_CONF_DIR") + "/yarn-site.xml").toURI().toURL()); } catch (Throwable t) { throw new RuntimeException("Error", t); } } } /** * The class to bootstrap the application master of the Yarn cluster (runs main method). */ protected abstract Class<?> getApplicationMasterClass(); @Override public void setJobManagerMemory(int memoryMb) { if (memoryMb < MIN_JM_MEMORY) { throw new IllegalArgumentException("The JobManager memory (" + memoryMb + ") is below the minimum required memory amount " + "of " + MIN_JM_MEMORY + " MB"); } this.jobManagerMemoryMb = memoryMb; } @Override public void setTaskManagerMemory(int memoryMb) { if (memoryMb < MIN_TM_MEMORY) { throw new IllegalArgumentException("The TaskManager memory (" + memoryMb + ") is below the minimum required memory amount " + "of " + MIN_TM_MEMORY + " MB"); } this.taskManagerMemoryMb = memoryMb; } @Override public void setFlinkConfiguration(org.apache.flink.configuration.Configuration conf) { this.flinkConfiguration = conf; } @Override public org.apache.flink.configuration.Configuration getFlinkConfiguration() { return flinkConfiguration; } @Override public void setTaskManagerSlots(int slots) { if (slots <= 0) { throw new IllegalArgumentException("Number of TaskManager slots must be positive"); } this.slots = slots; } @Override public int getTaskManagerSlots() { return this.slots; } @Override public void setQueue(String queue) { this.yarnQueue = queue; } @Override public void setLocalJarPath(Path localJarPath) { if (!localJarPath.toString().endsWith("jar")) { throw new IllegalArgumentException( "The passed jar path ('" + localJarPath + "') does not end with the 'jar' extension"); } this.flinkJarPath = localJarPath; } @Override public void setConfigurationFilePath(Path confPath) { flinkConfigurationPath = confPath; } @Override public void setConfigurationDirectory(String configurationDirectory) { this.configurationDirectory = configurationDirectory; } @Override public void setFlinkLoggingConfigurationPath(Path logConfPath) { flinkLoggingConfigurationPath = logConfPath; } @Override public Path getFlinkLoggingConfigurationPath() { return flinkLoggingConfigurationPath; } @Override public void setTaskManagerCount(int tmCount) { if (tmCount < 1) { throw new IllegalArgumentException("The TaskManager count has to be at least 1."); } this.taskManagerCount = tmCount; } @Override public int getTaskManagerCount() { return this.taskManagerCount; } @Override public void setShipFiles(List<File> shipFiles) { for (File shipFile : shipFiles) { // remove uberjar from ship list (by default everything in the lib/ folder is added to // the list of files to ship, but we handle the uberjar separately. if (!(shipFile.getName().startsWith("flink-dist") && shipFile.getName().endsWith("jar"))) { this.shipFiles.add(shipFile); } } } @Override public void setDynamicPropertiesEncoded(String dynamicPropertiesEncoded) { this.dynamicPropertiesEncoded = dynamicPropertiesEncoded; } @Override public String getDynamicPropertiesEncoded() { return this.dynamicPropertiesEncoded; } public void isReadyForDeployment() throws YarnDeploymentException { if (taskManagerCount <= 0) { throw new YarnDeploymentException("Taskmanager count must be positive"); } if (this.flinkJarPath == null) { throw new YarnDeploymentException("The Flink jar path is null"); } if (this.configurationDirectory == null) { throw new YarnDeploymentException("Configuration directory not set"); } if (this.flinkConfigurationPath == null) { throw new YarnDeploymentException("Configuration path not set"); } if (this.flinkConfiguration == null) { throw new YarnDeploymentException("Flink configuration object has not been set"); } // check if required Hadoop environment variables are set. If not, warn user if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set." + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + "configuration for accessing YARN."); } } public static boolean allocateResource(int[] nodeManagers, int toAllocate) { for (int i = 0; i < nodeManagers.length; i++) { if (nodeManagers[i] >= toAllocate) { nodeManagers[i] -= toAllocate; return true; } } return false; } @Override public void setDetachedMode(boolean detachedMode) { this.detached = detachedMode; } @Override public boolean isDetached() { return detached; } @Override public AbstractFlinkYarnCluster deploy() throws Exception { UserGroupInformation.setConfiguration(conf); UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); if (UserGroupInformation.isSecurityEnabled()) { if (!ugi.hasKerberosCredentials()) { throw new YarnDeploymentException( "In secure mode. Please provide Kerberos credentials in order to authenticate. " + "You may use kinit to authenticate and request a TGT from the Kerberos server."); } return ugi.doAs(new PrivilegedExceptionAction<AbstractFlinkYarnCluster>() { @Override public AbstractFlinkYarnCluster run() throws Exception { return deployInternal(); } }); } else { return deployInternal(); } } /** * This method will block until the ApplicationMaster/JobManager have been * deployed on YARN. */ protected AbstractFlinkYarnCluster deployInternal() throws Exception { isReadyForDeployment(); LOG.info("Using values:"); LOG.info("\tTaskManager count = {}", taskManagerCount); LOG.info("\tJobManager memory = {}", jobManagerMemoryMb); LOG.info("\tTaskManager memory = {}", taskManagerMemoryMb); // Create application via yarnClient yarnApplication = yarnClient.createApplication(); GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse(); // ------------------ Add dynamic properties to local flinkConfiguraton ------ Map<String, String> dynProperties = CliFrontend.getDynamicProperties(dynamicPropertiesEncoded); for (Map.Entry<String, String> dynProperty : dynProperties.entrySet()) { flinkConfiguration.setString(dynProperty.getKey(), dynProperty.getValue()); } try { org.apache.flink.core.fs.FileSystem.setDefaultScheme(flinkConfiguration); } catch (IOException e) { throw new IOException("Error while setting the default " + "filesystem scheme from configuration.", e); } // ------------------ Check if the specified queue exists -------------- try { List<QueueInfo> queues = yarnClient.getAllQueues(); if (queues.size() > 0 && this.yarnQueue != null) { // check only if there are queues configured in yarn and for this session. boolean queueFound = false; for (QueueInfo queue : queues) { if (queue.getQueueName().equals(this.yarnQueue)) { queueFound = true; break; } } if (!queueFound) { String queueNames = ""; for (QueueInfo queue : queues) { queueNames += queue.getQueueName() + ", "; } LOG.warn("The specified queue '" + this.yarnQueue + "' does not exist. " + "Available queues: " + queueNames); } } else { LOG.debug("The YARN cluster does not have any queues configured"); } } catch (Throwable e) { LOG.warn("Error while getting queue information from YARN: " + e.getMessage()); if (LOG.isDebugEnabled()) { LOG.debug("Error details", e); } } // ------------------ Check if the YARN Cluster has the requested resources -------------- // the yarnMinAllocationMB specifies the smallest possible container allocation size. // all allocations below this value are automatically set to this value. final int yarnMinAllocationMB = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0); if (jobManagerMemoryMb < yarnMinAllocationMB || taskManagerMemoryMb < yarnMinAllocationMB) { LOG.warn("The JobManager or TaskManager memory is below the smallest possible YARN Container size. " + "The value of 'yarn.scheduler.minimum-allocation-mb' is '" + yarnMinAllocationMB + "'. Please increase the memory size." + "YARN will allocate the smaller containers but the scheduler will account for the minimum-allocation-mb, maybe not all instances " + "you requested will start."); } // set the memory to minAllocationMB to do the next checks correctly if (jobManagerMemoryMb < yarnMinAllocationMB) { jobManagerMemoryMb = yarnMinAllocationMB; } if (taskManagerMemoryMb < yarnMinAllocationMB) { taskManagerMemoryMb = yarnMinAllocationMB; } Resource maxRes = appResponse.getMaximumResourceCapability(); final String NOTE = "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n"; if (jobManagerMemoryMb > maxRes.getMemory()) { failSessionDuringDeployment(); throw new YarnDeploymentException( "The cluster does not have the requested resources for the JobManager available!\n" + "Maximum Memory: " + maxRes.getMemory() + "MB Requested: " + jobManagerMemoryMb + "MB. " + NOTE); } if (taskManagerMemoryMb > maxRes.getMemory()) { failSessionDuringDeployment(); throw new YarnDeploymentException( "The cluster does not have the requested resources for the TaskManagers available!\n" + "Maximum Memory: " + maxRes.getMemory() + " Requested: " + taskManagerMemoryMb + "MB. " + NOTE); } final String NOTE_RSC = "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are " + "connecting from the beginning because the resources are currently not available in the cluster. " + "The allocation might take more time than usual because the Flink YARN client needs to wait until " + "the resources become available."; int totalMemoryRequired = jobManagerMemoryMb + taskManagerMemoryMb * taskManagerCount; ClusterResourceDescription freeClusterMem = getCurrentFreeClusterResources(yarnClient); if (freeClusterMem.totalFreeMemory < totalMemoryRequired) { LOG.warn("This YARN session requires " + totalMemoryRequired + "MB of memory in the cluster. " + "There are currently only " + freeClusterMem.totalFreeMemory + "MB available." + NOTE_RSC); } if (taskManagerMemoryMb > freeClusterMem.containerLimit) { LOG.warn("The requested amount of memory for the TaskManagers (" + taskManagerMemoryMb + "MB) is more than " + "the largest possible YARN container: " + freeClusterMem.containerLimit + NOTE_RSC); } if (jobManagerMemoryMb > freeClusterMem.containerLimit) { LOG.warn( "The requested amount of memory for the JobManager (" + jobManagerMemoryMb + "MB) is more than " + "the largest possible YARN container: " + freeClusterMem.containerLimit + NOTE_RSC); } // ----------------- check if the requested containers fit into the cluster. int[] nmFree = Arrays.copyOf(freeClusterMem.nodeManagersFree, freeClusterMem.nodeManagersFree.length); // first, allocate the jobManager somewhere. if (!allocateResource(nmFree, jobManagerMemoryMb)) { LOG.warn("Unable to find a NodeManager that can fit the JobManager/Application master. " + "The JobManager requires " + jobManagerMemoryMb + "MB. NodeManagers available: " + Arrays.toString(freeClusterMem.nodeManagersFree) + NOTE_RSC); } // allocate TaskManagers for (int i = 0; i < taskManagerCount; i++) { if (!allocateResource(nmFree, taskManagerMemoryMb)) { LOG.warn("There is not enough memory available in the YARN cluster. " + "The TaskManager(s) require " + taskManagerMemoryMb + "MB each. " + "NodeManagers available: " + Arrays.toString(freeClusterMem.nodeManagersFree) + "\n" + "After allocating the JobManager (" + jobManagerMemoryMb + "MB) and (" + i + "/" + taskManagerCount + ") TaskManagers, " + "the following NodeManagers are available: " + Arrays.toString(nmFree) + NOTE_RSC); } } // ------------------ Prepare Application Master Container ------------------------------ // respect custom JVM options in the YAML file final String javaOpts = flinkConfiguration.getString(ConfigConstants.FLINK_JVM_OPTIONS, ""); String logbackFile = configurationDirectory + File.separator + FlinkYarnSessionCli.CONFIG_FILE_LOGBACK_NAME; boolean hasLogback = new File(logbackFile).exists(); String log4jFile = configurationDirectory + File.separator + FlinkYarnSessionCli.CONFIG_FILE_LOG4J_NAME; boolean hasLog4j = new File(log4jFile).exists(); if (hasLogback) { shipFiles.add(new File(logbackFile)); } if (hasLog4j) { shipFiles.add(new File(log4jFile)); } // Set up the container launch context for the application master ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); String amCommand = "$JAVA_HOME/bin/java" + " -Xmx" + Utils.calculateHeapSize(jobManagerMemoryMb, flinkConfiguration) + "M " + javaOpts; if (hasLogback || hasLog4j) { amCommand += " -Dlog.file=\"" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/jobmanager.log\""; if (hasLogback) { amCommand += " -Dlogback.configurationFile=file:" + FlinkYarnSessionCli.CONFIG_FILE_LOGBACK_NAME; } if (hasLog4j) { amCommand += " -Dlog4j.configuration=file:" + FlinkYarnSessionCli.CONFIG_FILE_LOG4J_NAME; } } amCommand += " " + getApplicationMasterClass().getName() + " " + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/jobmanager.out" + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/jobmanager.err"; amContainer.setCommands(Collections.singletonList(amCommand)); LOG.debug("Application Master start command: " + amCommand); // intialize HDFS // Copy the application master jar to the filesystem // Create a local resource to point to the destination jar path final FileSystem fs = FileSystem.get(conf); // hard coded check for the GoogleHDFS client because its not overriding the getScheme() method. if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") && fs.getScheme().startsWith("file")) { LOG.warn("The file system scheme is '" + fs.getScheme() + "'. This indicates that the " + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values." + "The Flink YARN client needs to store its files in a distributed file system"); } // Set-up ApplicationSubmissionContext for the application ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext(); if (RecoveryMode.isHighAvailabilityModeActivated(flinkConfiguration)) { // activate re-execution of failed applications appContext.setMaxAppAttempts(flinkConfiguration.getInteger(ConfigConstants.YARN_APPLICATION_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); activateHighAvailabilitySupport(appContext); } else { // set number of application retries to 1 in the default case appContext .setMaxAppAttempts(flinkConfiguration.getInteger(ConfigConstants.YARN_APPLICATION_ATTEMPTS, 1)); } final ApplicationId appId = appContext.getApplicationId(); // Setup jar for ApplicationMaster LocalResource appMasterJar = Records.newRecord(LocalResource.class); LocalResource flinkConf = Records.newRecord(LocalResource.class); Path remotePathJar = Utils.setupLocalResource(fs, appId.toString(), flinkJarPath, appMasterJar, fs.getHomeDirectory()); Path remotePathConf = Utils.setupLocalResource(fs, appId.toString(), flinkConfigurationPath, flinkConf, fs.getHomeDirectory()); Map<String, LocalResource> localResources = new HashMap<>(2); localResources.put("flink.jar", appMasterJar); localResources.put("flink-conf.yaml", flinkConf); // setup security tokens (code from apache storm) final Path[] paths = new Path[2 + shipFiles.size()]; StringBuilder envShipFileList = new StringBuilder(); // upload ship files for (int i = 0; i < shipFiles.size(); i++) { File shipFile = shipFiles.get(i); LocalResource shipResources = Records.newRecord(LocalResource.class); Path shipLocalPath = new Path("file://" + shipFile.getAbsolutePath()); paths[2 + i] = Utils.setupLocalResource(fs, appId.toString(), shipLocalPath, shipResources, fs.getHomeDirectory()); localResources.put(shipFile.getName(), shipResources); envShipFileList.append(paths[2 + i]); if (i + 1 < shipFiles.size()) { envShipFileList.append(','); } } paths[0] = remotePathJar; paths[1] = remotePathConf; sessionFilesDir = new Path(fs.getHomeDirectory(), ".flink/" + appId.toString() + "/"); FsPermission permission = new FsPermission(FsAction.ALL, FsAction.NONE, FsAction.NONE); fs.setPermission(sessionFilesDir, permission); // set permission for path. Utils.setTokensFor(amContainer, paths, conf); amContainer.setLocalResources(localResources); fs.close(); // Setup CLASSPATH for ApplicationMaster Map<String, String> appMasterEnv = new HashMap<>(); // set user specified app master environment variables appMasterEnv.putAll(Utils.getEnvironmentVariables(ConfigConstants.YARN_APPLICATION_MASTER_ENV_PREFIX, flinkConfiguration)); // set classpath from YARN configuration Utils.setupEnv(conf, appMasterEnv); // set Flink on YARN internal configuration values appMasterEnv.put(YarnConfigKeys.ENV_TM_COUNT, String.valueOf(taskManagerCount)); appMasterEnv.put(YarnConfigKeys.ENV_TM_MEMORY, String.valueOf(taskManagerMemoryMb)); appMasterEnv.put(YarnConfigKeys.FLINK_JAR_PATH, remotePathJar.toString()); appMasterEnv.put(YarnConfigKeys.ENV_APP_ID, appId.toString()); appMasterEnv.put(YarnConfigKeys.ENV_CLIENT_HOME_DIR, fs.getHomeDirectory().toString()); appMasterEnv.put(YarnConfigKeys.ENV_CLIENT_SHIP_FILES, envShipFileList.toString()); appMasterEnv.put(YarnConfigKeys.ENV_CLIENT_USERNAME, UserGroupInformation.getCurrentUser().getShortUserName()); appMasterEnv.put(YarnConfigKeys.ENV_SLOTS, String.valueOf(slots)); appMasterEnv.put(YarnConfigKeys.ENV_DETACHED, String.valueOf(detached)); if (dynamicPropertiesEncoded != null) { appMasterEnv.put(YarnConfigKeys.ENV_DYNAMIC_PROPERTIES, dynamicPropertiesEncoded); } amContainer.setEnvironment(appMasterEnv); // Set up resource type requirements for ApplicationMaster Resource capability = Records.newRecord(Resource.class); capability.setMemory(jobManagerMemoryMb); capability.setVirtualCores(1); String name; if (customName == null) { name = "Flink session with " + taskManagerCount + " TaskManagers"; if (detached) { name += " (detached)"; } } else { name = customName; } appContext.setApplicationName(name); // application name appContext.setApplicationType("Apache Flink"); appContext.setAMContainerSpec(amContainer); appContext.setResource(capability); if (yarnQueue != null) { appContext.setQueue(yarnQueue); } // add a hook to clean up in case deployment fails Runtime.getRuntime().addShutdownHook(deploymentFailureHook); LOG.info("Submitting application master " + appId); yarnClient.submitApplication(appContext); LOG.info("Waiting for the cluster to be allocated"); int waittime = 0; loop: while (true) { ApplicationReport report; try { report = yarnClient.getApplicationReport(appId); } catch (IOException e) { throw new YarnDeploymentException("Failed to deploy the cluster: " + e.getMessage()); } YarnApplicationState appState = report.getYarnApplicationState(); switch (appState) { case FAILED: case FINISHED: case KILLED: throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + appState + " during deployment. \n" + "Diagnostics from YARN: " + report.getDiagnostics() + "\n" + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + "yarn logs -applicationId " + appId); //break .. case RUNNING: LOG.info("YARN application has been deployed successfully."); break loop; default: LOG.info("Deploying cluster, current state " + appState); if (waittime > 60000) { LOG.info( "Deployment took more than 60 seconds. Please check if the requested resources are available in the YARN cluster"); } } waittime += 1000; Thread.sleep(1000); } // print the application id for user to cancel themselves. if (isDetached()) { LOG.info("The Flink YARN client has been started in detached mode. In order to stop " + "Flink on YARN, use the following command or a YARN web interface to stop " + "it:\nyarn application -kill " + appId + "\nPlease also note that the " + "temporary files of the YARN session in the home directoy will not be removed."); } // since deployment was successful, remove the hook try { Runtime.getRuntime().removeShutdownHook(deploymentFailureHook); } catch (IllegalStateException e) { // we're already in the shut down hook. } // the Flink cluster is deployed in YARN. Represent cluster return new FlinkYarnCluster(yarnClient, appId, conf, flinkConfiguration, sessionFilesDir, detached); } /** * Kills YARN application and stops YARN client. * * Use this method to kill the App before it has been properly deployed */ private void failSessionDuringDeployment() { LOG.info("Killing YARN application"); try { yarnClient.killApplication(yarnApplication.getNewApplicationResponse().getApplicationId()); } catch (Exception e) { // we only log a debug message here because the "killApplication" call is a best-effort // call (we don't know if the application has been deployed when the error occured). LOG.debug("Error while killing YARN application", e); } yarnClient.stop(); } private static class ClusterResourceDescription { final public int totalFreeMemory; final public int containerLimit; final public int[] nodeManagersFree; public ClusterResourceDescription(int totalFreeMemory, int containerLimit, int[] nodeManagersFree) { this.totalFreeMemory = totalFreeMemory; this.containerLimit = containerLimit; this.nodeManagersFree = nodeManagersFree; } } private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) throws YarnException, IOException { List<NodeReport> nodes = yarnClient.getNodeReports(NodeState.RUNNING); int totalFreeMemory = 0; int containerLimit = 0; int[] nodeManagersFree = new int[nodes.size()]; for (int i = 0; i < nodes.size(); i++) { NodeReport rep = nodes.get(i); int free = rep.getCapability().getMemory() - (rep.getUsed() != null ? rep.getUsed().getMemory() : 0); nodeManagersFree[i] = free; totalFreeMemory += free; if (free > containerLimit) { containerLimit = free; } } return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); } @Override public String getClusterDescription() throws Exception { ByteArrayOutputStream baos = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(baos); YarnClusterMetrics metrics = yarnClient.getYarnClusterMetrics(); ps.append("NodeManagers in the Cluster " + metrics.getNumNodeManagers()); List<NodeReport> nodes = yarnClient.getNodeReports(NodeState.RUNNING); final String format = "|%-16s |%-16s %n"; ps.printf("|Property |Value %n"); ps.println("+---------------------------------------+"); int totalMemory = 0; int totalCores = 0; for (NodeReport rep : nodes) { final Resource res = rep.getCapability(); totalMemory += res.getMemory(); totalCores += res.getVirtualCores(); ps.format(format, "NodeID", rep.getNodeId()); ps.format(format, "Memory", res.getMemory() + " MB"); ps.format(format, "vCores", res.getVirtualCores()); ps.format(format, "HealthReport", rep.getHealthReport()); ps.format(format, "Containers", rep.getNumContainers()); ps.println("+---------------------------------------+"); } ps.println("Summary: totalMemory " + totalMemory + " totalCores " + totalCores); List<QueueInfo> qInfo = yarnClient.getAllQueues(); for (QueueInfo q : qInfo) { ps.println("Queue: " + q.getQueueName() + ", Current Capacity: " + q.getCurrentCapacity() + " Max Capacity: " + q.getMaximumCapacity() + " Applications: " + q.getApplications().size()); } yarnClient.stop(); return baos.toString(); } @Override public String getSessionFilesDir() { return sessionFilesDir.toString(); } @Override public void setName(String name) { if (name == null) { throw new IllegalArgumentException("The passed name is null"); } customName = name; } private void activateHighAvailabilitySupport(ApplicationSubmissionContext appContext) throws InvocationTargetException, IllegalAccessException { ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); reflector.setKeepContainersAcrossApplicationAttempts(appContext, true); reflector.setAttemptFailuresValidityInterval(appContext, AkkaUtils.getTimeout(flinkConfiguration).toMillis()); } /** * Singleton object which uses reflection to determine whether the {@link ApplicationSubmissionContext} * supports the setKeepContainersAcrossApplicationAttempts and the setAttemptFailuresValidityInterval * methods. Depending on the Hadoop version these methods are supported or not. If the methods * are not supported, then nothing happens when setKeepContainersAcrossApplicationAttempts or * setAttemptFailuresValidityInterval are called. */ private static class ApplicationSubmissionContextReflector { private static final Logger LOG = LoggerFactory.getLogger(ApplicationSubmissionContextReflector.class); private static final ApplicationSubmissionContextReflector instance = new ApplicationSubmissionContextReflector( ApplicationSubmissionContext.class); public static ApplicationSubmissionContextReflector getInstance() { return instance; } private static final String keepContainersMethodName = "setKeepContainersAcrossApplicationAttempts"; private static final String attemptsFailuresValidityIntervalMethodName = "setAttemptFailuresValidityInterval"; private final Method keepContainersMethod; private final Method attemptFailuresValidityIntervalMethod; private ApplicationSubmissionContextReflector(Class<ApplicationSubmissionContext> clazz) { Method keepContainersMethod; Method attemptFailuresValidityIntervalMethod; try { // this method is only supported by Hadoop 2.4.0 onwards keepContainersMethod = clazz.getMethod(keepContainersMethodName, boolean.class); LOG.debug("{} supports method {}.", clazz.getCanonicalName(), keepContainersMethodName); } catch (NoSuchMethodException e) { LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), keepContainersMethodName); // assign null because the Hadoop version apparently does not support this call. keepContainersMethod = null; } this.keepContainersMethod = keepContainersMethod; try { // this method is only supported by Hadoop 2.6.0 onwards attemptFailuresValidityIntervalMethod = clazz.getMethod(attemptsFailuresValidityIntervalMethodName, long.class); LOG.debug("{} supports method {}.", clazz.getCanonicalName(), attemptsFailuresValidityIntervalMethodName); } catch (NoSuchMethodException e) { LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), attemptsFailuresValidityIntervalMethodName); // assign null because the Hadoop version apparently does not support this call. attemptFailuresValidityIntervalMethod = null; } this.attemptFailuresValidityIntervalMethod = attemptFailuresValidityIntervalMethod; } public void setKeepContainersAcrossApplicationAttempts(ApplicationSubmissionContext appContext, boolean keepContainers) throws InvocationTargetException, IllegalAccessException { if (keepContainersMethod != null) { LOG.debug("Calling method {} of {}.", keepContainersMethod.getName(), appContext.getClass().getCanonicalName()); keepContainersMethod.invoke(appContext, keepContainers); } else { LOG.debug("{} does not support method {}. Doing nothing.", appContext.getClass().getCanonicalName(), keepContainersMethodName); } } public void setAttemptFailuresValidityInterval(ApplicationSubmissionContext appContext, long validityInterval) throws InvocationTargetException, IllegalAccessException { if (attemptFailuresValidityIntervalMethod != null) { LOG.debug("Calling method {} of {}.", attemptFailuresValidityIntervalMethod.getName(), appContext.getClass().getCanonicalName()); attemptFailuresValidityIntervalMethod.invoke(appContext, validityInterval); } else { LOG.debug("{} does not support method {}. Doing nothing.", appContext.getClass().getCanonicalName(), attemptsFailuresValidityIntervalMethodName); } } } public static class YarnDeploymentException extends RuntimeException { private static final long serialVersionUID = -812040641215388943L; public YarnDeploymentException() { } public YarnDeploymentException(String message) { super(message); } public YarnDeploymentException(String message, Throwable cause) { super(message, cause); } } private class DeploymentFailureHook extends Thread { @Override public void run() { LOG.info("Cancelling deployment from Deployment Failure Hook"); failSessionDuringDeployment(); LOG.info("Deleting files in " + sessionFilesDir); try { FileSystem fs = FileSystem.get(conf); fs.delete(sessionFilesDir, true); fs.close(); } catch (IOException e) { LOG.error("Failed to delete Flink Jar and conf files in HDFS", e); } } } }