Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.giraph.yarn; import static org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR; import com.google.common.collect.ImmutableList; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import java.util.Set; import org.apache.giraph.conf.GiraphConfiguration; import org.apache.giraph.conf.GiraphConstants; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.yarn.api.ApplicationConstants; import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationReport; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import 
org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.Records;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.nio.ByteBuffer;

/**
 * The initial launcher for a YARN-based Giraph job. This class attempts to
 * configure and send a request to the ResourceManager for a single
 * application container to host GiraphApplicationMaster. The RPC connection
 * between the RM and GiraphYarnClient is the YARN ApplicationManager.
 */
public class GiraphYarnClient {
  static {
    Configuration.addDefaultResource("giraph-site.xml");
  }

  /** Class logger */
  private static final Logger LOG = Logger.getLogger(GiraphYarnClient.class);
  /** Sleep time between silent progress checks */
  private static final int JOB_STATUS_INTERVAL_MSECS = 800;
  /** Memory (in MB) to allocate for our ApplicationMaster container */
  private static final int YARN_APP_MASTER_MEMORY_MB = 512;

  /** human-readable job name */
  private final String jobName;
  /** Helper configuration from the job */
  private final GiraphConfiguration giraphConf;
  /** ApplicationId object (needed for RPC to ResourceManager) */
  private ApplicationId appId;
  /** # of sleeps between progress reports to client */
  private int reportCounter;
  /** Yarn client object */
  private YarnClient yarnClient;

  /**
   * Constructor. Requires caller to hand us a GiraphConfiguration.
   * Checks that the job's output directory does not already exist, then
   * creates and initializes (but does not start) the YARN client.
   *
   * @param giraphConf User-defined configuration
   * @param jobName User-defined job name
   * @throws IOException declared for callers; see verifyOutputDirDoesNotExist
   */
  public GiraphYarnClient(GiraphConfiguration giraphConf, String jobName)
    throws IOException {
    this.reportCounter = 0;
    this.jobName = jobName;
    this.appId = null; // can't set this until after start()
    this.giraphConf = giraphConf;
    verifyOutputDirDoesNotExist();
    yarnClient = YarnClient.createYarnClient();
    yarnClient.init(giraphConf);
  }

  /**
   * Submit a request to the Hadoop YARN cluster's ResourceManager
   * to obtain an application container. This will run our ApplicationMaster,
   * which will in turn request app containers for Giraphs' master and all
   * worker tasks. Blocks until the application reaches a terminal state.
   *
   * @param verbose Not implemented yet, to provide compatibility w/GiraphJob
   * @return true if job is successful
   * @throws YarnException if the ResourceManager rejects a status request
   * @throws IOException on I/O failure while preparing the submission
   */
  public boolean run(final boolean verbose) throws YarnException, IOException {
    // init our connection to YARN ResourceManager RPC
    LOG.info("Running Client");
    yarnClient.start();
    try {
      // request a new application id from the RM
      YarnClientApplication app = yarnClient.createApplication();
      GetNewApplicationResponse getNewAppResponse =
        app.getNewApplicationResponse();
      checkPerNodeResourcesAvailable(getNewAppResponse);

      // configure our request for an exec container for
      // GiraphApplicationMaster
      ApplicationSubmissionContext appContext =
        app.getApplicationSubmissionContext();
      appId = appContext.getApplicationId();
      appContext.setApplicationId(appId);
      appContext.setApplicationName(jobName);
      LOG.info("Obtained new Application ID: " + appId);

      // must happen before the configuration is exported for localization
      applyConfigsForYarnGiraphJob();

      ContainerLaunchContext containerContext = buildContainerLaunchContext();
      appContext.setResource(buildContainerMemory());
      appContext.setAMContainerSpec(containerContext);
      LOG.info("ApplicationSumbissionContext for GiraphApplicationMaster " +
        "launch container is populated.");
      // TODO: set the priority for the application master (what is the
      // valid range?) and the RM queue this application is submitted to.

      // make the request, blow up if fail, loop and report progress if not
      try {
        LOG.info("Submitting application to ASM");
        // obtain an "updated copy" of the appId for status checks/job kill
        appId = yarnClient.submitApplication(appContext);
        LOG.info("Got new appId after submission :" + appId);
      } catch (YarnException yre) {
        // TODO: try submitting the same request again?
        throw new RuntimeException(
          "submitApplication(appContext) FAILED.", yre);
      }
      LOG.info("GiraphApplicationMaster container request was submitted to " +
        "ResourceManager for job: " + jobName);
      return awaitGiraphJobCompletion();
    } finally {
      // release the RM connection started above, whatever the outcome
      yarnClient.stop();
    }
  }

  /**
   * Without Hadoop MR to check for us, make sure the output dir doesn't exist!
   * This is a best-effort check: if the file system cannot be queried, a
   * warning is logged and construction continues.
   *
   * @throws IllegalStateException if the output path already exists.
   */
  private void verifyOutputDirDoesNotExist() {
    Path outDir = null;
    try {
      FileSystem fs = FileSystem.get(giraphConf);
      String errorMsg = "__ERROR_NO_OUTPUT_DIR_SET__";
      outDir =
        new Path(fs.getHomeDirectory(), giraphConf.get(OUTDIR, errorMsg));
      if (fs.exists(outDir)) {
        throw new IllegalStateException("Path " + outDir + " already exists.");
      }
      LOG.info("Final output path is: " + outDir);
    } catch (IOException ioe) {
      // keep the historical best-effort behavior, but do not hide the cause
      LOG.warn("Could not verify that output path " + outDir +
        " does not already exist; continuing.", ioe);
    }
  }

  /**
   * Configuration settings we need to customize for a Giraph on YARN
   * job. We need to call this EARLY in the job, before the GiraphConfiguration
   * is exported to HDFS for localization in each task container.
   */
  private void applyConfigsForYarnGiraphJob() {
    GiraphConstants.IS_PURE_YARN_JOB.set(giraphConf, true);
    GiraphConstants.SPLIT_MASTER_WORKER.set(giraphConf, true);
    giraphConf.set("mapred.job.id", "giraph_yarn_" + appId); // ZK app base path
  }

  /**
   * Utility to make sure we have the cluster resources we need to run this
   * job. If they are not available, we should die here before too much setup.
   * Sums the memory capability of every RUNNING node and compares it to the
   * (possibly adjusted) per-task heap times the number of tasks.
   *
   * @param cluster the GetNewApplicationResponse from the YARN RM.
   * @throws IllegalStateException if the job asks for more memory than the
   *         running nodes report.
   */
  private void checkPerNodeResourcesAvailable(
    final GetNewApplicationResponse cluster) throws YarnException, IOException {
    // are there enough containers to go around for our Giraph job?
    final List<NodeReport> nodes;
    try {
      nodes = yarnClient.getNodeReports(NodeState.RUNNING);
    } catch (YarnException yre) {
      throw new RuntimeException("GiraphYarnClient could not connect with " +
        "the YARN ResourceManager to determine the number of available " +
        "application containers.", yre);
    }
    long totalAvailable = 0;
    for (NodeReport node : nodes) {
      LOG.info("Got node report from ASM for" +
        ", nodeId=" + node.getNodeId() +
        ", nodeAddress " + node.getHttpAddress() +
        ", nodeRackName " + node.getRackName() +
        ", nodeNumContainers " + node.getNumContainers());
      totalAvailable += node.getCapability().getMemory();
    }
    // 1 master + all workers in -w command line arg
    final int workers = giraphConf.getMaxWorkers() + 1;
    checkAndAdjustPerTaskHeapSize(cluster);
    // multiply as long: heapMb * workers can exceed Integer.MAX_VALUE
    final long totalAsk = (long) giraphConf.getYarnTaskHeapMb() * workers;
    if (totalAsk > totalAvailable) {
      throw new IllegalStateException("Giraph's estimated cluster heap " +
        totalAsk + "MB ask is greater than the current available cluster " +
        "heap of " + totalAvailable + "MB. Aborting Job.");
    }
  }

  /**
   * Clamp the user-supplied <code>-yh</code> per-task heap setting to the
   * cluster's maximum container memory, and re-record the resulting value
   * in the GiraphConfiguration for export.
   *
   * @param gnar the GetNewAppResponse from the YARN ResourceManager.
   */
  private void checkAndAdjustPerTaskHeapSize(
    final GetNewApplicationResponse gnar) {
    // do we have the right heap size on these cluster nodes to run our job?
    // TODO: also raise the request to the cluster minimum once the response
    // exposes a minimum resource capability.
    final int maxCapacity = gnar.getMaximumResourceCapability().getMemory();
    // make sure heap size is OK for this cluster's available containers
    int giraphMem = giraphConf.getYarnTaskHeapMb();
    if (giraphMem == GiraphConstants.GIRAPH_YARN_TASK_HEAP_MB_DEFAULT) {
      LOG.info("Defaulting per-task heap size to " + giraphMem + "MB.");
    }
    if (giraphMem > maxCapacity) {
      LOG.info("Giraph's request of heap MB per-task is more than the " +
        "maximum; downgrading Giraph to " + maxCapacity + "MB.");
      giraphMem = maxCapacity;
    }
    giraphConf.setYarnTaskHeapMb(giraphMem); // record any changes made
  }

  /**
   * Kill time for the client, report progress occasionally, and otherwise
   * just sleep and wait for the job to finish. If polling fails with an
   * IOException, kill the app.
   *
   * @return true if job run is successful.
   */
  private boolean awaitGiraphJobCompletion() throws YarnException, IOException {
    boolean done;
    ApplicationReport report = null;
    try {
      do {
        try {
          Thread.sleep(JOB_STATUS_INTERVAL_MSECS);
        } catch (InterruptedException ir) {
          // NOTE: the interrupt flag is not restored here; restoring it would
          // make every following sleep() throw immediately (busy polling).
          LOG.info("Progress reporter's sleep was interrupted!", ir);
        }
        report = yarnClient.getApplicationReport(appId);
        done = checkProgress(report);
      } while (!done);
      // the cache is kept when metrics are enabled so it can be examined
      if (!giraphConf.metricsEnabled()) {
        cleanupJarCache();
      }
    } catch (IOException ex) {
      final String diagnostics = (null == report) ? "" :
        "Diagnostics: " + report.getDiagnostics();
      LOG.error("Fatal fault encountered, failing " + jobName + ". " +
        diagnostics, ex);
      try {
        LOG.error("FORCIBLY KILLING Application from AppMaster.");
        yarnClient.killApplication(appId);
      } catch (YarnException yre) {
        LOG.error("Exception raised in attempt to kill application.", yre);
      }
      return false;
    }
    return printFinalJobReport();
  }

  /**
   * Deletes the HDFS cache in YARN, which replaces DistributedCache of Hadoop.
   * If metrics are enabled this will not get called (so you can examine cache.)
   *
   * @throws IOException if bad things happen.
   */
  private void cleanupJarCache() throws IOException {
    FileSystem fs = FileSystem.get(giraphConf);
    Path baseCacheDir = YarnUtils.getFsCachePath(fs, appId);
    if (fs.exists(baseCacheDir)) {
      LOG.info("Cleaning up HDFS distributed cache directory for Giraph job.");
      // a recursive delete removes the contents and the directory itself
      fs.delete(baseCacheDir, true);
    }
  }

  /**
   * Print final formatted job report for local client that initiated this run.
   *
   * @return true for app success (final status SUCCEEDED), false for any
   *         other final status or if the report could not be fetched.
   */
  private boolean printFinalJobReport() throws YarnException, IOException {
    final FinalApplicationStatus finalAppStatus;
    try {
      ApplicationReport report = yarnClient.getApplicationReport(appId);
      finalAppStatus = report.getFinalApplicationStatus();
      final long secs =
        (report.getFinishTime() - report.getStartTime()) / 1000L;
      final String time = String.format("%d minutes, %d seconds.",
        secs / 60L, secs % 60L);
      LOG.info("Completed " + jobName + ": " +
        finalAppStatus.name() + ", total running time: " + time);
    } catch (YarnException yre) {
      LOG.error("Exception encountered while attempting to request " +
        "a final job report for " + jobName, yre);
      return false;
    }
    // FAILED, KILLED and UNDEFINED must not be reported as success
    return finalAppStatus == FinalApplicationStatus.SUCCEEDED;
  }

  /**
   * Compose the ContainerLaunchContext for the Application Master.
   *
   * @return the CLC object populated and configured.
   */
  private ContainerLaunchContext buildContainerLaunchContext()
    throws IOException {
    ContainerLaunchContext appMasterContainer =
      Records.newRecord(ContainerLaunchContext.class);
    appMasterContainer.setEnvironment(buildEnvironment());
    appMasterContainer.setLocalResources(buildLocalResourceMap());
    appMasterContainer.setCommands(buildAppMasterExecCommand());
    setToken(appMasterContainer);
    return appMasterContainer;
  }

  /**
   * Set delegation tokens for AM container. Does nothing when Hadoop
   * security is not enabled.
   *
   * @param amContainer AM container
   * @throws IOException if no RM principal is configured to act as renewer,
   *         or if the tokens cannot be obtained or serialized.
   */
  private void setToken(ContainerLaunchContext amContainer) throws IOException {
    // Setup security tokens
    if (!UserGroupInformation.isSecurityEnabled()) {
      return;
    }
    Credentials credentials = new Credentials();
    String tokenRenewer = giraphConf.get(YarnConfiguration.RM_PRINCIPAL);
    if (tokenRenewer == null || tokenRenewer.length() == 0) {
      throw new IOException(
        "Can't get Master Kerberos principal for the RM to use as renewer");
    }
    FileSystem fs = FileSystem.get(giraphConf);
    // For now, only getting tokens for the default file-system.
    final Token<?>[] tokens = fs.addDelegationTokens(tokenRenewer, credentials);
    if (tokens != null) {
      for (Token<?> token : tokens) {
        LOG.info("Got dt for " + fs.getUri() + "; " + token);
      }
    }
    DataOutputBuffer dob = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(dob);
    // wrap only the bytes actually written, not the whole backing array
    ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    amContainer.setTokens(fsTokens);
  }

  /**
   * Assess whether job is already finished/failed and 'done' flag needs to be
   * set, prints progress display for client if all is going well.
   *
   * @param report the application report to assess.
   * @return true if job report indicates the job run is over.
   */
  private boolean checkProgress(final ApplicationReport report) {
    YarnApplicationState jobState = report.getYarnApplicationState();
    if (jobState == YarnApplicationState.FINISHED ||
      jobState == YarnApplicationState.KILLED) {
      return true;
    }
    if (jobState == YarnApplicationState.FAILED) {
      LOG.error(jobName + " reports FAILED state, diagnostics show: " +
        report.getDiagnostics());
      return true;
    }
    // still running: only print every 5th poll to keep the client log quiet
    if (reportCounter++ % 5 == 0) {
      displayJobReport(report);
    }
    return false;
  }

  /**
   * Display a formatted summary of the job progress report from the AM.
   *
   * @param report the report to display.
   * @throws IllegalStateException if report is null.
   */
  private void displayJobReport(final ApplicationReport report) {
    if (null == report) {
      throw new IllegalStateException(
        "[*] Latest ApplicationReport for job " + jobName +
        " was not received by the local client.");
    }
    final float elapsed =
      (System.currentTimeMillis() - report.getStartTime()) / 1000.0f;
    LOG.info(jobName + ", Elapsed: " + String.format("%.2f secs", elapsed));
    LOG.info(report.getCurrentApplicationAttemptId() + ", State: " +
      report.getYarnApplicationState().name() + ", Containers used: " +
      report.getApplicationResourceUsageReport().getNumUsedContainers());
  }

  /**
   * Utility to produce the command line to activate the AM from the shell.
   *
   * @return A <code>List&lt;String&gt;</code> of shell commands to execute in
   *         the container allocated to us by the RM to host our App Master.
   */
  private List<String> buildAppMasterExecCommand() {
    // 'gam-' prefix is for GiraphApplicationMaster in log file names
    // TODO: REMOVE examples jar! Make the main class name a constant.
    return ImmutableList.of("${JAVA_HOME}/bin/java " +
      "-Xmx" + YARN_APP_MASTER_MEMORY_MB + "M " +
      "-Xms" + YARN_APP_MASTER_MEMORY_MB + "M " +
      "-cp .:${CLASSPATH} org.apache.giraph.yarn.GiraphApplicationMaster " +
      "1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/gam-stdout.log " +
      "2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/gam-stderr.log ");
  }

  /**
   * Check if the job's configuration is for a local run. These can all be
   * removed as we expand the functionality of the "pure YARN" Giraph profile.
   * Currently not invoked from within this class.
   *
   * @throws IllegalArgumentException if ZooKeeper is not external or no
   *         ZooKeeper list is configured.
   */
  private void checkJobLocalZooKeeperSupported() {
    final boolean isZkExternal = giraphConf.isZookeeperExternal();
    final String checkZkList = giraphConf.getZookeeperList();
    if (!isZkExternal || checkZkList.isEmpty()) {
      throw new IllegalArgumentException("Giraph on YARN does not currently " +
        "support Giraph-managed ZK instances: use a standalone ZooKeeper.");
    }
  }

  /**
   * Register all local jar files from GiraphConstants.GIRAPH_YARN_LIBJARS
   * in the LocalResources map, copy to HDFS on that same registered path.
   *
   * @param map the LocalResources list to populate.
   * @throws IOException if a jar cannot be copied or registered.
   */
  private void addLocalJarsToResourceMap(Map<String, LocalResource> map)
    throws IOException {
    LOG.info("LIB JARS :" + giraphConf.getYarnLibJars());
    // a set de-duplicates jars listed more than once
    Set<String> jars = Sets.newHashSet(giraphConf.getYarnLibJars().split(","));
    FileSystem fs = FileSystem.get(giraphConf);
    Path baseDir = YarnUtils.getFsCachePath(fs, appId);
    for (Path jar : YarnUtils.getLocalFiles(jars)) {
      Path dest = new Path(baseDir, jar.getName());
      LOG.info("Made local resource for :" + jar + " to " + dest);
      // keep the local source (delSrc=false), overwrite any stale copy
      fs.copyFromLocalFile(false, true, jar, dest);
      YarnUtils.addFileToResourceMap(map, fs, dest);
    }
  }

  /**
   * Construct the memory requirements for the AppMaster's container request.
   *
   * @return A Resource that wraps the memory request.
   */
  private Resource buildContainerMemory() {
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(YARN_APP_MASTER_MEMORY_MB); //Configurable thru CLI?
    return capability;
  }

  /**
   * Create the mapping of environment vars that will be visible to the
   * ApplicationMaster in its remote app container.
   *
   * @return a map of environment vars to set up for the AppMaster.
   */
  private Map<String, String> buildEnvironment() {
    Map<String, String> environment = Maps.<String, String>newHashMap();
    LOG.info("Set the environment for the application master");
    YarnUtils.addLocalClasspathToEnv(environment, giraphConf);
    //TODO: add the runtime classpath needed for tests to work
    LOG.info("Environment for AM :" + environment);
    return environment;
  }

  /**
   * Create the mapping of files and JARs to send to the GiraphApplicationMaster
   * and from there on to the Giraph tasks.
   *
   * @return the map of jars to local resource paths for transport
   *         to the host container that will run our AppMaster.
   * @throws IllegalStateException if exporting the configuration or copying
   *         the jars fails.
   */
  private Map<String, LocalResource> buildLocalResourceMap() {
    // local files or archives the application master needs; in this
    // scenario the application master's own jar is one of them
    Map<String, LocalResource> localResources =
      Maps.<String, LocalResource>newHashMap();
    LOG.info("buildLocalResourceMap ....");
    try {
      // export the GiraphConfiguration to HDFS for localization to remote
      // tasks (open question: merge the following two calls?)
      YarnUtils.exportGiraphConfiguration(giraphConf, appId);
      YarnUtils.addGiraphConfToLocalResourceMap(
        giraphConf, appId, localResources);
      // add jars from '-yj' cmd-line arg to resource map for localization
      addLocalJarsToResourceMap(localResources);
      //TODO: log4j?
      return localResources;
    } catch (IOException ioe) {
      throw new IllegalStateException("Failed to build LocalResouce map.", ioe);
    }
  }
}