List of usage examples for org.apache.hadoop.yarn.api.records ApplicationAttemptId fromString
@Public @Stable public static ApplicationAttemptId fromString(String appAttemptIdStr)
From source file:com.github.hdl.tensorflow.yarn.app.ApplicationMaster.java
License:Apache License
/** * Parse command line options// w w w. j ava 2 s .co m * * @param args Command line args * @return Whether init successful and run should be invoked * @throws ParseException * @throws IOException */ public boolean init(String[] args) throws ParseException, IOException { Options opts = new Options(); opts.addOption(TFApplication.OPT_TF_APP_ATTEMPT_ID, true, "App Attempt ID. Not to be used unless for testing purposes"); opts.addOption(TFApplication.OPT_TF_CONTAINER_MEMORY, true, "Amount of memory in MB to be requested to run the shell command"); opts.addOption(TFApplication.OPT_TF_CONTAINER_VCORES, true, "Amount of virtual cores to be requested to run the shell command"); opts.addOption(TFApplication.OPT_TF_PRIORITY, true, "Application Priority. Default 0"); opts.addOption(TFApplication.OPT_TF_CONTAINER_RETRY_POLICY, true, "Retry policy when container fails to run, " + "0: NEVER_RETRY, 1: RETRY_ON_ALL_ERRORS, " + "2: RETRY_ON_SPECIFIC_ERROR_CODES"); opts.addOption(TFApplication.OPT_TF_CONTAINER_RETRY_ERROR_CODES, true, "When retry policy is set to RETRY_ON_SPECIFIC_ERROR_CODES, error " + "codes is specified with this option, " + "e.g. --container_retry_error_codes 1,2,3"); opts.addOption(TFApplication.OPT_TF_CONTAINER_MAX_RETRIES, true, "If container could retry, it specifies max retires"); opts.addOption(TFApplication.OPT_TF_CONTAINER_RETRY_INTERVAL, true, "Interval between each retry, unit is milliseconds"); opts.addOption(TFApplication.OPT_TF_SERVER_JAR, true, "Provide container jar of tensorflow"); opts.addOption(TFApplication.OPT_TF_JNI_SO, true, "jni so of tensorflow"); opts.addOption(TFApplication.OPT_TF_WORKER_NUM, true, "Provide worker server number of tensorflow"); opts.addOption(TFApplication.OPT_TF_PS_NUM, true, "Provide ps server number of tensorflow"); CommandLine cliParser = new GnuParser().parse(opts, args); if (args.length == 0) { printUsage(opts); throw new IllegalArgumentException("No args specified for application master to initialize"); } if (fileExist(log4jPath)) { try { Log4jPropertyHelper.updateLog4jConfiguration(ApplicationMaster.class, log4jPath); } catch (Exception e) { LOG.warn("Can not set up custom log4j properties. " + e); } } Map<String, String> envs = System.getenv(); if (!envs.containsKey(Environment.CONTAINER_ID.name())) { if (cliParser.hasOption(TFApplication.OPT_TF_APP_ATTEMPT_ID)) { String appIdStr = cliParser.getOptionValue(TFApplication.OPT_TF_APP_ATTEMPT_ID, ""); appAttemptID = ApplicationAttemptId.fromString(appIdStr); } else { throw new IllegalArgumentException("Application Attempt Id not set in the environment"); } } else { ContainerId containerId = ContainerId.fromString(envs.get(Environment.CONTAINER_ID.name())); appAttemptID = containerId.getApplicationAttemptId(); } if (!envs.containsKey(ApplicationConstants.APP_SUBMIT_TIME_ENV)) { throw new RuntimeException(ApplicationConstants.APP_SUBMIT_TIME_ENV + " not set in the environment"); } if (!envs.containsKey(Environment.NM_HOST.name())) { throw new RuntimeException(Environment.NM_HOST.name() + " not set in the environment"); } if (!envs.containsKey(Environment.NM_HTTP_PORT.name())) { throw new RuntimeException(Environment.NM_HTTP_PORT + " not set in the environment"); } if (!envs.containsKey(Environment.NM_PORT.name())) { throw new RuntimeException(Environment.NM_PORT.name() + " not set in the environment"); } LOG.info("Application master for app" + ", appId=" + appAttemptID.getApplicationId().getId() + ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp() + ", attemptId=" + appAttemptID.getAttemptId()); containerMemory = Integer.parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_CONTAINER_MEMORY, "256")); containerVirtualCores = Integer .parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_CONTAINER_VCORES, "1")); numTotalWokerContainers = Integer.parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_WORKER_NUM, "1")); if (numTotalWokerContainers == 0) { throw new IllegalArgumentException("Cannot run tensroflow application with no worker containers"); } numTotalParamServerContainer = Integer.parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_PS_NUM, "0")); numTotalContainers = numTotalWokerContainers + numTotalParamServerContainer; if (numTotalContainers == 0) { throw new IllegalArgumentException("Cannot run distributed shell with no containers"); } requestPriority = Integer.parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_PRIORITY, "0")); containerRetryPolicy = ContainerRetryPolicy.values()[Integer .parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_CONTAINER_RETRY_POLICY, "0"))]; if (cliParser.hasOption(TFApplication.OPT_TF_CONTAINER_RETRY_ERROR_CODES)) { containerRetryErrorCodes = new HashSet<>(); for (String errorCode : cliParser.getOptionValue(TFApplication.OPT_TF_CONTAINER_RETRY_ERROR_CODES) .split(",")) { containerRetryErrorCodes.add(Integer.parseInt(errorCode)); } } containerMaxRetries = Integer .parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_CONTAINER_MAX_RETRIES, "0")); containrRetryInterval = Integer .parseInt(cliParser.getOptionValue(TFApplication.OPT_TF_CONTAINER_RETRY_INTERVAL, "0")); tfServerJar = cliParser.getOptionValue(TFApplication.OPT_TF_SERVER_JAR, TFAmContainer.APPMASTER_JAR_PATH); jniSoDfsPath = cliParser.getOptionValue(TFApplication.OPT_TF_JNI_SO, ""); clusterSpec = ClusterSpec.makeClusterSpec(numTotalWokerContainers, numTotalParamServerContainer); return true; }
From source file:org.hdl.caffe.yarn.app.ApplicationMaster.java
License:Apache License
/** * Parse command line options//from w ww.j ava2 s .c om * * @param args Command line args * @return Whether init successful and run should be invoked * @throws ParseException * @throws IOException */ public boolean init(String[] args) throws ParseException, IOException { Options opts = new Options(); opts.addOption(CaffeApplication.OPT_CAFFE_APP_ATTEMPT_ID, true, "App Attempt ID. Not to be used unless for testing purposes"); opts.addOption(CaffeApplication.OPT_CAFFE_CONTAINER_MEMORY, true, "Amount of memory in MB to be requested to run the shell command"); opts.addOption(CaffeApplication.OPT_CAFFE_CONTAINER_VCORES, true, "Amount of virtual cores to be requested to run the shell command"); opts.addOption(CaffeApplication.OPT_CAFFE_PRIORITY, true, "Application Priority. Default 0"); opts.addOption(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_POLICY, true, "Retry policy when container fails to run, " + "0: NEVER_RETRY, 1: RETRY_ON_ALL_ERRORS, " + "2: RETRY_ON_SPECIFIC_ERROR_CODES"); opts.addOption(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_ERROR_CODES, true, "When retry policy is set to RETRY_ON_SPECIFIC_ERROR_CODES, error " + "codes is specified with this option, " + "e.g. --container_retry_error_codes 1,2,3"); opts.addOption(CaffeApplication.OPT_CAFFE_CONTAINER_MAX_RETRIES, true, "If container could retry, it specifies max retires"); opts.addOption(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_INTERVAL, true, "Interval between each retry, unit is milliseconds"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_JAR, true, "Provide container jar of caffe"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_NUM, true, "Provide processor number of caffe"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_SOLVER, true, "solver_configuration"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_TRAIN, true, "training_mode"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_FEATURES, true, "name_of_output_blobs"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_LABEL, true, "name of label blobs to be included in features"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_MODEL, true, "model path"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_OUTPUT, true, "output path"); opts.addOption(CaffeApplication.OPT_CAFFE_PROCESSOR_CONNECTION, true, "network mode"); CommandLine cliParser = new GnuParser().parse(opts, args); if (args.length == 0) { printUsage(opts); throw new IllegalArgumentException("No args specified for application master to initialize"); } if (fileExist(log4jPath)) { try { Log4jPropertyHelper.updateLog4jConfiguration(ApplicationMaster.class, log4jPath); } catch (Exception e) { LOG.warn("Can not set up custom log4j properties. " + e); } } Map<String, String> envs = System.getenv(); if (!envs.containsKey(Environment.CONTAINER_ID.name())) { if (cliParser.hasOption(CaffeApplication.OPT_CAFFE_APP_ATTEMPT_ID)) { String appIdStr = cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_APP_ATTEMPT_ID, ""); appAttemptID = ApplicationAttemptId.fromString(appIdStr); } else { throw new IllegalArgumentException("Application AttemptId not set in the environment"); } } else { ContainerId containerId = ContainerId.fromString(envs.get(Environment.CONTAINER_ID.name())); appAttemptID = containerId.getApplicationAttemptId(); } if (!envs.containsKey(ApplicationConstants.APP_SUBMIT_TIME_ENV)) { throw new RuntimeException(ApplicationConstants.APP_SUBMIT_TIME_ENV + " not set in the environment"); } if (!envs.containsKey(Environment.NM_HOST.name())) { throw new RuntimeException(Environment.NM_HOST.name() + " not set in the environment"); } if (!envs.containsKey(Environment.NM_HTTP_PORT.name())) { throw new RuntimeException(Environment.NM_HTTP_PORT + " not set in the environment"); } if (!envs.containsKey(Environment.NM_PORT.name())) { throw new RuntimeException(Environment.NM_PORT.name() + " not set in the environment"); } LOG.info("Application master for app" + ", appId=" + appAttemptID.getApplicationId().getId() + ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp() + ", attemptId=" + appAttemptID.getAttemptId()); containerMemory = Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_CONTAINER_MEMORY, "256")); containerVirtualCores = Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_CONTAINER_VCORES, "1")); numTotalProcessorContainers = Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_NUM, "1")); if (numTotalProcessorContainers == 0) { throw new IllegalArgumentException("Cannot run caffe application with no containers"); } numTotalContainers = numTotalProcessorContainers; if (numTotalContainers == 0) { throw new IllegalArgumentException("Cannot run distributed shell with no containers"); } requestPriority = Integer.parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PRIORITY, "0")); containerRetryPolicy = ContainerRetryPolicy.values()[Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_POLICY, "0"))]; if (cliParser.hasOption(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_ERROR_CODES)) { containerRetryErrorCodes = new HashSet<>(); for (String errorCode : cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_ERROR_CODES) .split(",")) { containerRetryErrorCodes.add(Integer.parseInt(errorCode)); } } containerMaxRetries = Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_CONTAINER_MAX_RETRIES, "0")); containrRetryInterval = Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_CONTAINER_RETRY_INTERVAL, "0")); caffeProcessorJar = cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_JAR, CaffeAmContainer.APPMASTER_JAR_PATH); solver = cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_SOLVER, ""); train = cliParser.hasOption(CaffeApplication.OPT_CAFFE_PROCESSOR_TRAIN); feature = cliParser.hasOption(CaffeApplication.OPT_CAFFE_PROCESSOR_FEATURES); label = cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_LABEL, ""); model = cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_MODEL, ""); output = cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_OUTPUT, ""); connection = Integer .parseInt(cliParser.getOptionValue(CaffeApplication.OPT_CAFFE_PROCESSOR_CONNECTION, "2")); clusterSpec = ClusterSpec.makeClusterSpec(numTotalProcessorContainers); return true; }