List of usage examples for org.apache.hadoop.yarn.api.records ContainerStatus getContainerId
/**
 * Get the {@code ContainerId} of the container.
 *
 * @return {@code ContainerId} of the container
 */
@Public @Stable public abstract ContainerId getContainerId();
Returns the ContainerId of the container.
From source file: edu.cmu.graphchi.toolkits.collaborative_filtering.yarn.ApplicationMaster.java
License:Apache License
public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { // shell script failed // counts as completed numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason //TODO: Add retry numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); // we do not need to release the container as it would be done // by the RM }/*from w w w. j a v a 2s . c o m*/ } else { //nothing to do // container completed successfully numCompletedContainers.incrementAndGet(); LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId()); } } }
From source file:eu.stratosphere.yarn.ApplicationMaster.java
License:Apache License
private void run() throws Exception { //Utils.logFilesInCurrentDirectory(LOG); // Initialize clients to ResourceManager and NodeManagers Configuration conf = Utils.initializeYarnConfiguration(); FileSystem fs = FileSystem.get(conf); Map<String, String> envs = System.getenv(); final String currDir = envs.get(Environment.PWD.key()); final String logDirs = envs.get(Environment.LOG_DIRS.key()); final String ownHostname = envs.get(Environment.NM_HOST.key()); final String appId = envs.get(Client.ENV_APP_ID); final String clientHomeDir = envs.get(Client.ENV_CLIENT_HOME_DIR); final String applicationMasterHost = envs.get(Environment.NM_HOST.key()); final String remoteStratosphereJarPath = envs.get(Client.STRATOSPHERE_JAR_PATH); final String shipListString = envs.get(Client.ENV_CLIENT_SHIP_FILES); final String yarnClientUsername = envs.get(Client.ENV_CLIENT_USERNAME); final int taskManagerCount = Integer.valueOf(envs.get(Client.ENV_TM_COUNT)); final int memoryPerTaskManager = Integer.valueOf(envs.get(Client.ENV_TM_MEMORY)); final int coresPerTaskManager = Integer.valueOf(envs.get(Client.ENV_TM_CORES)); int heapLimit = Utils.calculateHeapSize(memoryPerTaskManager); if (currDir == null) { throw new RuntimeException("Current directory unknown"); }// w w w.ja v a 2s .c o m if (ownHostname == null) { throw new RuntimeException("Own hostname (" + Environment.NM_HOST + ") not set."); } LOG.info("Working directory " + currDir); // load Stratosphere configuration. Utils.getStratosphereConfiguration(currDir); final String localWebInterfaceDir = currDir + "/resources/" + ConfigConstants.DEFAULT_JOB_MANAGER_WEB_PATH_NAME; // Update yaml conf -> set jobManager address to this machine's address. 
FileInputStream fis = new FileInputStream(currDir + "/stratosphere-conf.yaml"); BufferedReader br = new BufferedReader(new InputStreamReader(fis)); Writer output = new BufferedWriter(new FileWriter(currDir + "/stratosphere-conf-modified.yaml")); String line; while ((line = br.readLine()) != null) { if (line.contains(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY)) { output.append(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY + ": " + ownHostname + "\n"); } else if (line.contains(ConfigConstants.JOB_MANAGER_WEB_ROOT_PATH_KEY)) { output.append(ConfigConstants.JOB_MANAGER_WEB_ROOT_PATH_KEY + ": " + "\n"); } else { output.append(line + "\n"); } } // just to make sure. output.append(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY + ": " + ownHostname + "\n"); output.append(ConfigConstants.JOB_MANAGER_WEB_ROOT_PATH_KEY + ": " + localWebInterfaceDir + "\n"); output.append(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY + ": " + logDirs + "\n"); output.close(); br.close(); File newConf = new File(currDir + "/stratosphere-conf-modified.yaml"); if (!newConf.exists()) { LOG.warn("modified yaml does not exist!"); } Utils.copyJarContents("resources/" + ConfigConstants.DEFAULT_JOB_MANAGER_WEB_PATH_NAME, ApplicationMaster.class.getProtectionDomain().getCodeSource().getLocation().getPath()); JobManager jm; { String pathToNepheleConfig = currDir + "/stratosphere-conf-modified.yaml"; String[] args = { "-executionMode", "cluster", "-configDir", pathToNepheleConfig }; // start the job manager jm = JobManager.initialize(args); // Start info server for jobmanager jm.startInfoServer(); } AMRMClient<ContainerRequest> rmClient = AMRMClient.createAMRMClient(); rmClient.init(conf); rmClient.start(); NMClient nmClient = NMClient.createNMClient(); nmClient.init(conf); nmClient.start(); // Register with ResourceManager LOG.info("registering ApplicationMaster"); rmClient.registerApplicationMaster(applicationMasterHost, 0, "http://" + applicationMasterHost + ":" + 
GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, "undefined")); // Priority for worker containers - priorities are intra-application Priority priority = Records.newRecord(Priority.class); priority.setPriority(0); // Resource requirements for worker containers Resource capability = Records.newRecord(Resource.class); capability.setMemory(memoryPerTaskManager); capability.setVirtualCores(coresPerTaskManager); // Make container requests to ResourceManager for (int i = 0; i < taskManagerCount; ++i) { ContainerRequest containerAsk = new ContainerRequest(capability, null, null, priority); LOG.info("Requesting TaskManager container " + i); rmClient.addContainerRequest(containerAsk); } LocalResource stratosphereJar = Records.newRecord(LocalResource.class); LocalResource stratosphereConf = Records.newRecord(LocalResource.class); // register Stratosphere Jar with remote HDFS final Path remoteJarPath = new Path(remoteStratosphereJarPath); Utils.registerLocalResource(fs, remoteJarPath, stratosphereJar); // register conf with local fs. 
Path remoteConfPath = Utils.setupLocalResource(conf, fs, appId, new Path("file://" + currDir + "/stratosphere-conf-modified.yaml"), stratosphereConf, new Path(clientHomeDir)); LOG.info("Prepared localresource for modified yaml: " + stratosphereConf); boolean hasLog4j = new File(currDir + "/log4j.properties").exists(); // prepare the files to ship LocalResource[] remoteShipRsc = null; String[] remoteShipPaths = shipListString.split(","); if (!shipListString.isEmpty()) { remoteShipRsc = new LocalResource[remoteShipPaths.length]; { // scope for i int i = 0; for (String remoteShipPathStr : remoteShipPaths) { if (remoteShipPathStr == null || remoteShipPathStr.isEmpty()) { continue; } remoteShipRsc[i] = Records.newRecord(LocalResource.class); Path remoteShipPath = new Path(remoteShipPathStr); Utils.registerLocalResource(fs, remoteShipPath, remoteShipRsc[i]); i++; } } } // respect custom JVM options in the YAML file final String javaOpts = GlobalConfiguration.getString(ConfigConstants.STRATOSPHERE_JVM_OPTIONS, ""); // Obtain allocated containers and launch int allocatedContainers = 0; int completedContainers = 0; while (allocatedContainers < taskManagerCount) { AllocateResponse response = rmClient.allocate(0); for (Container container : response.getAllocatedContainers()) { LOG.info("Got new Container for TM " + container.getId() + " on host " + container.getNodeId().getHost()); ++allocatedContainers; // Launch container by create ContainerLaunchContext ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); String tmCommand = "$JAVA_HOME/bin/java -Xmx" + heapLimit + "m " + javaOpts; if (hasLog4j) { tmCommand += " -Dlog.file=\"" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-log4j.log\" -Dlog4j.configuration=file:log4j.properties"; } tmCommand += " eu.stratosphere.yarn.YarnTaskManagerRunner -configDir . 
" + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stdout.log" + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stderr.log"; ctx.setCommands(Collections.singletonList(tmCommand)); LOG.info("Starting TM with command=" + tmCommand); // copy resources to the TaskManagers. Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(2); localResources.put("stratosphere.jar", stratosphereJar); localResources.put("stratosphere-conf.yaml", stratosphereConf); // add ship resources if (!shipListString.isEmpty()) { Preconditions.checkNotNull(remoteShipRsc); for (int i = 0; i < remoteShipPaths.length; i++) { localResources.put(new Path(remoteShipPaths[i]).getName(), remoteShipRsc[i]); } } ctx.setLocalResources(localResources); // Setup CLASSPATH for Container (=TaskTracker) Map<String, String> containerEnv = new HashMap<String, String>(); Utils.setupEnv(conf, containerEnv); //add stratosphere.jar to class path. containerEnv.put(Client.ENV_CLIENT_USERNAME, yarnClientUsername); ctx.setEnvironment(containerEnv); UserGroupInformation user = UserGroupInformation.getCurrentUser(); try { Credentials credentials = user.getCredentials(); DataOutputBuffer dob = new DataOutputBuffer(); credentials.writeTokenStorageToStream(dob); ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); ctx.setTokens(securityTokens); } catch (IOException e) { LOG.warn("Getting current user info failed when trying to launch the container" + e.getMessage()); } LOG.info("Launching container " + allocatedContainers); nmClient.startContainer(container, ctx); } for (ContainerStatus status : response.getCompletedContainersStatuses()) { ++completedContainers; LOG.info("Completed container (while allocating) " + status.getContainerId() + ". 
Total Completed:" + completedContainers); LOG.info("Diagnostics " + status.getDiagnostics()); } Thread.sleep(100); } // Now wait for containers to complete while (completedContainers < taskManagerCount) { AllocateResponse response = rmClient.allocate(completedContainers / taskManagerCount); for (ContainerStatus status : response.getCompletedContainersStatuses()) { ++completedContainers; LOG.info("Completed container " + status.getContainerId() + ". Total Completed:" + completedContainers); LOG.info("Diagnostics " + status.getDiagnostics()); } Thread.sleep(5000); } LOG.info("Shutting down JobManager"); jm.shutdown(); // Un-register with ResourceManager rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "", ""); }
From source file:gobblin.yarn.YarnService.java
License:Apache License
/** * Handle the completion of a container. A new container will be requested to replace the one * that just exited. Depending on the exit status and if container host affinity is enabled, * the new container may or may not try to be started on the same node. * * A container completes in either of the following conditions: 1) some error happens in the * container and caused the container to exit, 2) the container gets killed due to some reason, * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster. * A replacement container is needed in all but the last case. *//*from w ww . j a v a 2 s. c om*/ private void handleContainerCompletion(ContainerStatus containerStatus) { Map.Entry<Container, String> completedContainerEntry = this.containerMap .remove(containerStatus.getContainerId()); String completedInstanceName = completedContainerEntry.getValue(); LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d", containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus())); if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { LOGGER.info(String.format("Received the following diagnostics information for container %s: %s", containerStatus.getContainerId(), containerStatus.getDiagnostics())); } if (this.shutdownInProgress) { return; } int retryCount = this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0)) .incrementAndGet(); // Populate event metadata Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent(); if (this.eventSubmitter.isPresent()) { eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus)); eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName); 
eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + ""); } if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) { if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName); return; } // Add the Helix instance name of the completed container to the queue of unused // instance names so they can be reused by a replacement container. this.unusedHelixInstanceNames.offer(completedInstanceName); if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s", containerStatus.getContainerId(), completedInstanceName)); this.eventBus.post(new NewContainerRequest(shouldStickToTheSameNode(containerStatus.getExitStatus()) ? Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent())); }
From source file:gobblin.yarn.YarnService.java
License:Apache License
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata( ContainerStatus containerStatus) { ImmutableMap.Builder<String, String> eventMetadataBuilder = new ImmutableMap.Builder<>(); eventMetadataBuilder.put(GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString()); eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE, containerStatus.getState().toString()); if (ContainerExitStatus.INVALID != containerStatus.getExitStatus()) { eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS, containerStatus.getExitStatus() + ""); }//from w ww. ja va 2 s . c o m if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, containerStatus.getDiagnostics()); } return eventMetadataBuilder; }
From source file:husky.server.HuskyRMCallbackHandler.java
License:Apache License
public void onContainersCompleted(List<ContainerStatus> completedContainerStatus) { LOG.info("Get response from RM for container request, completedCnt = " + completedContainerStatus.size()); mNumCompletedContainers += completedContainerStatus.size(); for (ContainerStatus status : completedContainerStatus) { LOG.info(String.format("Container %s: %s, exit status: %d", status.getContainerId().toString(), status.getState().toString(), status.getExitStatus())); if (status.getExitStatus() == 0) { mNumSuccess += 1;//from w w w . ja va2s. c om } } LOG.info("Total containers: " + mNumContainers + ", completed containers: " + mNumCompletedContainers); if (mNumContainers == mNumCompletedContainers) { // If all workers and master finish synchronized (finalResultLock) { finalResultLock.unlock(); finalResultLock.notifyAll(); } } }
From source file:hws.core.JobMaster.java
License:Apache License
public void onContainersCompleted(List<ContainerStatus> statuses) { for (ContainerStatus status : statuses) { Logger.info("[AM] Completed container " + status.getContainerId()); synchronized (this) { numContainersToWaitFor--;//from w ww . java 2 s .c om } } }
From source file:io.hops.tensorflow.TimelineHandler.java
License:Apache License
/**
 * Publishes a "container end" event for the given container to the timeline client.
 *
 * <p>The entity is identified by the container id and carries a single event with
 * the container's state and exit status. A failure to publish is logged and not
 * propagated.
 *
 * @param container status of the container that has ended
 */
public void publishContainerEndEvent(ContainerStatus container) {
    final TimelineEntity timelineEntity = new TimelineEntity();
    final String containerId = container.getContainerId().toString();
    timelineEntity.setEntityId(containerId);
    timelineEntity.setEntityType(ApplicationMaster.YarntfEntity.YARNTF_CONTAINER.toString());
    timelineEntity.setDomainId(domainId);
    timelineEntity.addPrimaryFilter("user", ugi.getShortUserName());

    final TimelineEvent endEvent = new TimelineEvent();
    endEvent.setTimestamp(System.currentTimeMillis());
    endEvent.setEventType(ApplicationMaster.YarntfEvent.YARNTF_CONTAINER_END.toString());
    endEvent.addEventInfo("State", container.getState().name());
    endEvent.addEventInfo("Exit Status", container.getExitStatus());
    timelineEntity.addEvent(endEvent);

    try {
        timelineClient.putEntities(timelineEntity);
    } catch (YarnException | IOException e) {
        LOG.error("Container end event could not be published for " + containerId, e);
    }
}
From source file:org.apache.drill.yarn.appMaster.ClusterControllerImpl.java
License:Apache License
@Override public synchronized void containersCompleted(List<ContainerStatus> statuses) { EventContext context = new EventContext(this); for (ContainerStatus status : statuses) { Task task = getTask(status.getContainerId()); if (task == null) { if (task == null) { // Will occur if a container was allocated but rejected. // Any other occurrence is unexpected and an error. LOG.warn("Container completed but no associated task state: " + status.getContainerId()); }//from w w w . j a v a2s . c om continue; } context.setTask(task); context.getState().containerCompleted(context, status); } checkStatus(); }
From source file:org.apache.flink.yarn.ApplicationMaster.java
License:Apache License
private void run() throws Exception { //Utils.logFilesInCurrentDirectory(LOG); // Initialize clients to ResourceManager and NodeManagers Configuration conf = Utils.initializeYarnConfiguration(); FileSystem fs = FileSystem.get(conf); Map<String, String> envs = System.getenv(); final String currDir = envs.get(Environment.PWD.key()); final String logDirs = envs.get(Environment.LOG_DIRS.key()); final String ownHostname = envs.get(Environment.NM_HOST.key()); final String appId = envs.get(Client.ENV_APP_ID); final String clientHomeDir = envs.get(Client.ENV_CLIENT_HOME_DIR); final String applicationMasterHost = envs.get(Environment.NM_HOST.key()); final String remoteFlinkJarPath = envs.get(Client.FLINK_JAR_PATH); final String shipListString = envs.get(Client.ENV_CLIENT_SHIP_FILES); final String yarnClientUsername = envs.get(Client.ENV_CLIENT_USERNAME); final int taskManagerCount = Integer.valueOf(envs.get(Client.ENV_TM_COUNT)); final int memoryPerTaskManager = Integer.valueOf(envs.get(Client.ENV_TM_MEMORY)); final int coresPerTaskManager = Integer.valueOf(envs.get(Client.ENV_TM_CORES)); int heapLimit = Utils.calculateHeapSize(memoryPerTaskManager); if (currDir == null) { throw new RuntimeException("Current directory unknown"); }/* www.j av a 2 s. co m*/ if (ownHostname == null) { throw new RuntimeException("Own hostname (" + Environment.NM_HOST + ") not set."); } LOG.info("Working directory " + currDir); // load Flink configuration. Utils.getFlinkConfiguration(currDir); final String localWebInterfaceDir = currDir + "/resources/" + ConfigConstants.DEFAULT_JOB_MANAGER_WEB_PATH_NAME; // Update yaml conf -> set jobManager address to this machine's address. 
FileInputStream fis = new FileInputStream(currDir + "/flink-conf.yaml"); BufferedReader br = new BufferedReader(new InputStreamReader(fis)); Writer output = new BufferedWriter(new FileWriter(currDir + "/flink-conf-modified.yaml")); String line; while ((line = br.readLine()) != null) { if (line.contains(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY)) { output.append(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY + ": " + ownHostname + "\n"); } else if (line.contains(ConfigConstants.JOB_MANAGER_WEB_ROOT_PATH_KEY)) { output.append(ConfigConstants.JOB_MANAGER_WEB_ROOT_PATH_KEY + ": " + "\n"); } else { output.append(line + "\n"); } } // just to make sure. output.append(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY + ": " + ownHostname + "\n"); output.append(ConfigConstants.JOB_MANAGER_WEB_ROOT_PATH_KEY + ": " + localWebInterfaceDir + "\n"); output.append(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY + ": " + logDirs + "\n"); output.close(); br.close(); File newConf = new File(currDir + "/flink-conf-modified.yaml"); if (!newConf.exists()) { LOG.warn("modified yaml does not exist!"); } Utils.copyJarContents("resources/" + ConfigConstants.DEFAULT_JOB_MANAGER_WEB_PATH_NAME, ApplicationMaster.class.getProtectionDomain().getCodeSource().getLocation().getPath()); JobManager jm; { String pathToNepheleConfig = currDir + "/flink-conf-modified.yaml"; String[] args = { "-executionMode", "cluster", "-configDir", pathToNepheleConfig }; // start the job manager jm = JobManager.initialize(args); // Start info server for jobmanager jm.startInfoServer(); } AMRMClient<ContainerRequest> rmClient = AMRMClient.createAMRMClient(); rmClient.init(conf); rmClient.start(); NMClient nmClient = NMClient.createNMClient(); nmClient.init(conf); nmClient.start(); // Register with ResourceManager LOG.info("registering ApplicationMaster"); rmClient.registerApplicationMaster(applicationMasterHost, 0, "http://" + applicationMasterHost + ":" + GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 
"undefined")); // Priority for worker containers - priorities are intra-application Priority priority = Records.newRecord(Priority.class); priority.setPriority(0); // Resource requirements for worker containers Resource capability = Records.newRecord(Resource.class); capability.setMemory(memoryPerTaskManager); capability.setVirtualCores(coresPerTaskManager); // Make container requests to ResourceManager for (int i = 0; i < taskManagerCount; ++i) { ContainerRequest containerAsk = new ContainerRequest(capability, null, null, priority); LOG.info("Requesting TaskManager container " + i); rmClient.addContainerRequest(containerAsk); } LocalResource flinkJar = Records.newRecord(LocalResource.class); LocalResource flinkConf = Records.newRecord(LocalResource.class); // register Flink Jar with remote HDFS final Path remoteJarPath = new Path(remoteFlinkJarPath); Utils.registerLocalResource(fs, remoteJarPath, flinkJar); // register conf with local fs. Path remoteConfPath = Utils.setupLocalResource(conf, fs, appId, new Path("file://" + currDir + "/flink-conf-modified.yaml"), flinkConf, new Path(clientHomeDir)); LOG.info("Prepared localresource for modified yaml: " + flinkConf); boolean hasLog4j = new File(currDir + "/log4j.properties").exists(); // prepare the files to ship LocalResource[] remoteShipRsc = null; String[] remoteShipPaths = shipListString.split(","); if (!shipListString.isEmpty()) { remoteShipRsc = new LocalResource[remoteShipPaths.length]; { // scope for i int i = 0; for (String remoteShipPathStr : remoteShipPaths) { if (remoteShipPathStr == null || remoteShipPathStr.isEmpty()) { continue; } remoteShipRsc[i] = Records.newRecord(LocalResource.class); Path remoteShipPath = new Path(remoteShipPathStr); Utils.registerLocalResource(fs, remoteShipPath, remoteShipRsc[i]); i++; } } } // respect custom JVM options in the YAML file final String javaOpts = GlobalConfiguration.getString(ConfigConstants.FLINK_JVM_OPTIONS, ""); // Obtain allocated containers and launch int 
allocatedContainers = 0; int completedContainers = 0; while (allocatedContainers < taskManagerCount) { AllocateResponse response = rmClient.allocate(0); for (Container container : response.getAllocatedContainers()) { LOG.info("Got new Container for TM " + container.getId() + " on host " + container.getNodeId().getHost()); ++allocatedContainers; // Launch container by create ContainerLaunchContext ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); String tmCommand = "$JAVA_HOME/bin/java -Xmx" + heapLimit + "m " + javaOpts; if (hasLog4j) { tmCommand += " -Dlog.file=\"" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-log4j.log\" -Dlog4j.configuration=file:log4j.properties"; } tmCommand += " org.apache.flink.yarn.YarnTaskManagerRunner -configDir . " + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stdout.log" + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/taskmanager-stderr.log"; ctx.setCommands(Collections.singletonList(tmCommand)); LOG.info("Starting TM with command=" + tmCommand); // copy resources to the TaskManagers. Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(2); localResources.put("flink.jar", flinkJar); localResources.put("flink-conf.yaml", flinkConf); // add ship resources if (!shipListString.isEmpty()) { Preconditions.checkNotNull(remoteShipRsc); for (int i = 0; i < remoteShipPaths.length; i++) { localResources.put(new Path(remoteShipPaths[i]).getName(), remoteShipRsc[i]); } } ctx.setLocalResources(localResources); // Setup CLASSPATH for Container (=TaskTracker) Map<String, String> containerEnv = new HashMap<String, String>(); Utils.setupEnv(conf, containerEnv); //add flink.jar to class path. 
containerEnv.put(Client.ENV_CLIENT_USERNAME, yarnClientUsername); ctx.setEnvironment(containerEnv); UserGroupInformation user = UserGroupInformation.getCurrentUser(); try { Credentials credentials = user.getCredentials(); DataOutputBuffer dob = new DataOutputBuffer(); credentials.writeTokenStorageToStream(dob); ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); ctx.setTokens(securityTokens); } catch (IOException e) { LOG.warn("Getting current user info failed when trying to launch the container" + e.getMessage()); } LOG.info("Launching container " + allocatedContainers); nmClient.startContainer(container, ctx); } for (ContainerStatus status : response.getCompletedContainersStatuses()) { ++completedContainers; LOG.info("Completed container (while allocating) " + status.getContainerId() + ". Total Completed:" + completedContainers); LOG.info("Diagnostics " + status.getDiagnostics()); } Thread.sleep(100); } // Now wait for containers to complete while (completedContainers < taskManagerCount) { AllocateResponse response = rmClient.allocate(completedContainers / taskManagerCount); for (ContainerStatus status : response.getCompletedContainersStatuses()) { ++completedContainers; LOG.info("Completed container " + status.getContainerId() + ". Total Completed:" + completedContainers); LOG.info("Diagnostics " + status.getDiagnostics()); } Thread.sleep(5000); } LOG.info("Shutting down JobManager"); jm.shutdown(); // Un-register with ResourceManager rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "", ""); }
From source file:org.apache.flink.yarn.appMaster.ApplicationMaster.java
License:Apache License
private void run() throws Exception { heapLimit = Utils.calculateHeapSize(memoryPerTaskManager); nmClient = NMClient.createNMClient(); nmClient.init(conf);// w w w . j a v a 2 s. c o m nmClient.start(); nmClient.cleanupRunningContainersOnStop(true); // Register with ResourceManager String url = "http://" + applicationMasterHost + ":" + jobManagerWebPort; LOG.info("Registering ApplicationMaster with tracking url " + url); rmClient.registerApplicationMaster(applicationMasterHost, 0, url); // Priority for worker containers - priorities are intra-application Priority priority = Records.newRecord(Priority.class); priority.setPriority(0); // Resource requirements for worker containers Resource capability = Records.newRecord(Resource.class); capability.setMemory(memoryPerTaskManager); capability.setVirtualCores(coresPerTaskManager); // Make container requests to ResourceManager for (int i = 0; i < taskManagerCount; ++i) { ContainerRequest containerAsk = new ContainerRequest(capability, null, null, priority); LOG.info("Requesting TaskManager container " + i); rmClient.addContainerRequest(containerAsk); } LocalResource flinkJar = Records.newRecord(LocalResource.class); LocalResource flinkConf = Records.newRecord(LocalResource.class); // register Flink Jar with remote HDFS final Path remoteJarPath = new Path(remoteFlinkJarPath); Utils.registerLocalResource(fs, remoteJarPath, flinkJar); // register conf with local fs. 
Utils.setupLocalResource(conf, fs, appId, new Path("file://" + currDir + "/flink-conf-modified.yaml"), flinkConf, new Path(clientHomeDir)); LOG.info("Prepared local resource for modified yaml: " + flinkConf); hasLogback = new File(currDir + "/logback.xml").exists(); // prepare the files to ship LocalResource[] remoteShipRsc = null; String[] remoteShipPaths = shipListString.split(","); if (!shipListString.isEmpty()) { remoteShipRsc = new LocalResource[remoteShipPaths.length]; { // scope for i int i = 0; for (String remoteShipPathStr : remoteShipPaths) { if (remoteShipPathStr == null || remoteShipPathStr.isEmpty()) { continue; } remoteShipRsc[i] = Records.newRecord(LocalResource.class); Path remoteShipPath = new Path(remoteShipPathStr); Utils.registerLocalResource(fs, remoteShipPath, remoteShipRsc[i]); i++; } } } // copy resources to the TaskManagers. taskManagerLocalResources = new HashMap<String, LocalResource>(2); taskManagerLocalResources.put("flink.jar", flinkJar); taskManagerLocalResources.put("flink-conf.yaml", flinkConf); // add ship resources if (!shipListString.isEmpty()) { Preconditions.checkNotNull(remoteShipRsc); for (int i = 0; i < remoteShipPaths.length; i++) { taskManagerLocalResources.put(new Path(remoteShipPaths[i]).getName(), remoteShipRsc[i]); } } completedContainers = 0; // Obtain allocated containers and launch StringBuffer containerDiag = new StringBuffer(); // diagnostics log for the containers. allocateOutstandingContainer(containerDiag); LOG.info("Allocated all initial containers"); // Now wait for containers to complete while (completedContainers < taskManagerCount) { AllocateResponse response = rmClient.allocate(completedContainers / taskManagerCount); for (ContainerStatus status : response.getCompletedContainersStatuses()) { ++completedContainers; LOG.info("Completed container " + status.getContainerId() + ". 
Total Completed:" + completedContainers); LOG.info("Diagnostics " + status.getDiagnostics()); logDeadContainer(status, containerDiag); } Thread.sleep(5000); } if (isClosed) { return; } // Un-register with ResourceManager final String diagnosticsMessage = "Application Master shut down after all " + "containers finished\n" + containerDiag.toString(); LOG.info("Diagnostics message: " + diagnosticsMessage); rmClient.unregisterApplicationMaster(FinalApplicationStatus.FAILED, diagnosticsMessage, ""); this.close(); amRpcServer.stop(); // we need to manually stop the RPC service. Usually, the Client stops the RPC, // but at this point, the AM has been shut down (for some reason). LOG.info("Application Master shutdown completed."); }