List of usage examples for org.apache.hadoop.fs FileSystem getHomeDirectory
public Path getHomeDirectory()
From source file:com.yss.util.YarnUtil.java
License:Open Source License
@SuppressWarnings("rawtypes") public static Path createConfigurationFileInFs(FileSystem fs, String appHome, Map stormConf, YarnConfiguration yarnConf) throws IOException { // dump stringwriter's content into FS conf/storm.yaml Path confDst = new Path(fs.getHomeDirectory(), appHome + Path.SEPARATOR + STORM_CONF_PATH_STRING); Path dirDst = confDst.getParent(); fs.mkdirs(dirDst);//w w w . j a va 2 s.com //storm.yaml FSDataOutputStream out = fs.create(confDst); Yaml yaml = new Yaml(); OutputStreamWriter writer = new OutputStreamWriter(out); rmNulls(stormConf); yaml.dump(stormConf, writer); writer.close(); out.close(); //yarn-site.xml Path yarn_site_xml = new Path(dirDst, "yarn-site.xml"); out = fs.create(yarn_site_xml); writer = new OutputStreamWriter(out); yarnConf.writeXml(writer); writer.close(); out.close(); return dirDst; }
From source file:com.yss.yarn.launch.APPLaunch.java
License:Open Source License
private void launchApp(String appName, String queue, int amMB, String appMasterJar, String classPath, String lanchMainClass, Map<String, String> runenv) throws Exception { YarnClientApplication client_app = _yarn.createApplication(); GetNewApplicationResponse app = client_app.getNewApplicationResponse(); _appId = app.getApplicationId();/* w w w .j a v a 2s. c om*/ LOG.debug("_appId:" + _appId); ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class); appContext.setApplicationId(app.getApplicationId()); appContext.setApplicationName(appName); appContext.setQueue(queue); ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); LOG.info("Copy App Master jar from local filesystem and add to local environment"); LOG.info("load " + appMasterJar); FileSystem fs = FileSystem.get(_hadoopConf); Path src = new Path(appMasterJar); String appHome = YarnUtil.getApplicationHomeForId(_appId.toString()); Path dst = new Path(fs.getHomeDirectory(), appHome + Path.SEPARATOR + "AppMaster.jar"); fs.copyFromLocalFile(false, true, src, dst); localResources.put("AppMaster.jar", YarnUtil.newYarnAppResource(fs, dst)); amContainer.setLocalResources(localResources); LOG.info("Set the environment for the application master"); Map<String, String> env = new HashMap<String, String>(); Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./AppMaster.jar"); List<String> yarn_classpath_cmd = java.util.Arrays.asList("yarn", "classpath"); LOG.info("YARN CLASSPATH COMMAND = [" + yarn_classpath_cmd + "]"); String yarn_class_path = classPath; LOG.info("YARN CLASSPATH = [" + yarn_class_path + "]"); Apps.addToEnvironment(env, Environment.CLASSPATH.name(), yarn_class_path); String java_home = System.getenv("JAVA_HOME"); if ((java_home != null) && !java_home.isEmpty()) { env.put("JAVA_HOME", java_home); } LOG.info("Using JAVA_HOME = [" + env.get("JAVA_HOME") + "]"); env.put("appJar", appMasterJar); env.put("appName", appName); env.put("appId", _appId.toString()); env.put("STORM_LOG_DIR", ApplicationConstants.LOG_DIR_EXPANSION_VAR); env.putAll(runenv); amContainer.setEnvironment(env); // Set the necessary command to execute the application master Vector<String> vargs = new Vector<String>(); if ((java_home != null) && !java_home.isEmpty()) { vargs.add(env.get("JAVA_HOME") + "/bin/java"); } else { vargs.add("java"); } vargs.add("-Dlogfile.name=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/master.log"); vargs.add(lanchMainClass); vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"); vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"); // Set java executable command LOG.info("Setting up app master command:" + vargs); amContainer.setCommands(vargs); Resource capability = Records.newRecord(Resource.class); appContext.setResource(capability); appContext.setAMContainerSpec(amContainer); appContext.setApplicationName(appName); _yarn.submitApplication(appContext); }
From source file:com.zqh.hadoop.moya.core.yarn.Client.java
License:Apache License
/** * Main run function for the client/*from w w w . ja va2 s .com*/ * * @return true if application completed successfully * @throws java.io.IOException * @throws org.apache.hadoop.yarn.exceptions.YarnException */ public boolean run() throws IOException, YarnException { LOG.info("Running Client"); yarnClient.start(); YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics(); LOG.info("Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers()); List<NodeReport> clusterNodeReports = yarnClient.getNodeReports(); LOG.info("Got Cluster node info from ASM"); for (NodeReport node : clusterNodeReports) { LOG.info("Got node report from ASM for" + ", nodeId=" + node.getNodeId() + ", nodeAddress" + node.getHttpAddress() + ", nodeRackName" + node.getRackName() + ", nodeNumContainers" + node.getNumContainers()); } QueueInfo queueInfo = yarnClient.getQueueInfo(this.amQueue); LOG.info("Queue info" + ", queueName=" + queueInfo.getQueueName() + ", queueCurrentCapacity=" + queueInfo.getCurrentCapacity() + ", queueMaxCapacity=" + queueInfo.getMaximumCapacity() + ", queueApplicationCount=" + queueInfo.getApplications().size() + ", queueChildQueueCount=" + queueInfo.getChildQueues().size()); List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo(); for (QueueUserACLInfo aclInfo : listAclInfo) { for (QueueACL userAcl : aclInfo.getUserAcls()) { LOG.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl=" + userAcl.name()); } } // Get a new application id YarnClientApplication app = yarnClient.createApplication(); GetNewApplicationResponse appResponse = app.getNewApplicationResponse(); // TODO get min/max resource capabilities from RM and change memory ask // if needed // If we do not have min/max, we may not be able to correctly request // the required resources from the RM for the app master // Memory ask has to be a multiple of min and less than max. // Dump out information about cluster capability as seen by the resource // manager int maxMem = appResponse.getMaximumResourceCapability().getMemory(); LOG.info("Max mem capabililty of resources in this cluster " + maxMem); // A resource ask cannot exceed the max. if (amMemory > maxMem) { LOG.info("AM memory specified above max threshold of cluster. Using max value." + ", specified=" + amMemory + ", max=" + maxMem); amMemory = maxMem; } // set the application name ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext(); ApplicationId appId = appContext.getApplicationId(); appContext.setApplicationName(appName); // Set up the container launch context for the application master ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); // set local resources for the application master // local files or archives as needed // In this scenario, the jar file for the application master is part of // the local resources Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); LOG.info("Copy App Master jar from local filesystem and add to local environment"); // Copy the application master jar to the filesystem // Create a local resource to point to the destination jar path FileSystem fs = FileSystem.get(conf); Path src = new Path(appMasterJar); String pathSuffix = appName + "/" + appId.getId() + "/AppMaster.jar"; Path dst = new Path(fs.getHomeDirectory(), pathSuffix); fs.copyFromLocalFile(false, true, src, dst); FileStatus destStatus = fs.getFileStatus(dst); LocalResource amJarRsrc = Records.newRecord(LocalResource.class); // Set the type of resource - file or archive // archives are untarred at destination // we don't need the jar file to be untarred amJarRsrc.setType(LocalResourceType.FILE); // Set visibility of the resource // Setting to most private option amJarRsrc.setVisibility(LocalResourceVisibility.APPLICATION); // Set the resource to be copied over amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst)); // Set timestamp and length of file so that the framework // can do basic sanity checks for the local resource // after it has been copied over to ensure it is the same // resource the client intended to use with the application amJarRsrc.setTimestamp(destStatus.getModificationTime()); amJarRsrc.setSize(destStatus.getLen()); localResources.put("AppMaster.jar", amJarRsrc); // Setup App Master Constants String amJarLocation = ""; long amJarLen = 0; long amJarTimestamp = 0; // adding info so we can add the jar to the App master container path amJarLocation = dst.toUri().toString(); FileStatus shellFileStatus = fs.getFileStatus(dst); amJarLen = shellFileStatus.getLen(); amJarTimestamp = shellFileStatus.getModificationTime(); // ADD libs needed that will be untared // Keep it all archived for now so add it as a file... src = new Path(localLibJar); pathSuffix = appName + "/" + appId.getId() + "/Runnable.jar"; dst = new Path(fs.getHomeDirectory(), pathSuffix); fs.copyFromLocalFile(false, true, src, dst); destStatus = fs.getFileStatus(dst); LocalResource libsJarRsrc = Records.newRecord(LocalResource.class); libsJarRsrc.setType(LocalResourceType.FILE); libsJarRsrc.setVisibility(LocalResourceVisibility.APPLICATION); libsJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst)); libsJarRsrc.setTimestamp(destStatus.getModificationTime()); localResources.put("Runnable.jar", libsJarRsrc); // Setup Libs Constants String libsLocation = ""; long libsLen = 0; long libsTimestamp = 0; // adding info so we can add the jar to the App master container path libsLocation = dst.toUri().toString(); FileStatus libsFileStatus = fs.getFileStatus(dst); libsLen = libsFileStatus.getLen(); libsTimestamp = libsFileStatus.getModificationTime(); // Set the log4j properties if needed if (!log4jPropFile.isEmpty()) { Path log4jSrc = new Path(log4jPropFile); Path log4jDst = new Path(fs.getHomeDirectory(), "log4j.props"); fs.copyFromLocalFile(false, true, log4jSrc, log4jDst); FileStatus log4jFileStatus = fs.getFileStatus(log4jDst); LocalResource log4jRsrc = Records.newRecord(LocalResource.class); log4jRsrc.setType(LocalResourceType.FILE); log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION); log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri())); log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime()); log4jRsrc.setSize(log4jFileStatus.getLen()); localResources.put("log4j.properties", log4jRsrc); } // Set local resource info into app master container launch context amContainer.setLocalResources(localResources); // Set the env variables to be setup in the env where the application // master will be run LOG.info("Set the environment for the application master"); Map<String, String> env = new HashMap<String, String>(); // put the AM jar into env and MOYA Runnable // using the env info, the application master will create the correct // local resource for the // eventual containers that will be launched to execute the shell // scripts env.put(MConstants.APPLICATIONMASTERJARLOCATION, amJarLocation); env.put(MConstants.APPLICATIONMASTERJARTIMESTAMP, Long.toString(amJarTimestamp)); env.put(MConstants.APPLICATIONMASTERJARLEN, Long.toString(amJarLen)); env.put(MConstants.LIBSLOCATION, libsLocation); env.put(MConstants.LIBSTIMESTAMP, Long.toString(libsTimestamp)); env.put(MConstants.LIBSLEN, Long.toString(libsLen)); env.put(MConstants.ZOOKEEPERHOSTS, ZKHosts); // Add AppMaster.jar location to classpath // At some point we should not be required to add // the hadoop specific classpaths to the env. // It should be provided out of the box. // For now setting all required classpaths including // the classpath to "." for the application jar StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$()).append(File.pathSeparatorChar) .append("./*"); for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH, YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) { classPathEnv.append(File.pathSeparatorChar); classPathEnv.append(c.trim()); } classPathEnv.append(File.pathSeparatorChar).append("./log4j.properties"); // add the runtime classpath needed for tests to work if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) { classPathEnv.append(':'); classPathEnv.append(System.getProperty("java.class.path")); } env.put("CLASSPATH", classPathEnv.toString()); amContainer.setEnvironment(env); // Set the necessary command to execute the application master Vector<CharSequence> vargs = new Vector<CharSequence>(30); // Set java executable command LOG.info("Setting up app master command"); vargs.add(Environment.JAVA_HOME.$() + "/bin/java"); // Set Xmx based on am memory size vargs.add("-Xmx" + amMemory + "m"); // Set class name vargs.add(appMasterMainClass); // Set params for Application Master vargs.add("--container_memory " + String.valueOf(containerMemory)); vargs.add("--num_containers " + String.valueOf(numContainers)); vargs.add("--priority " + String.valueOf(moyaPriority)); if (!localLibJar.isEmpty()) { vargs.add("--lib " + localLibJar + ""); } if (debugFlag) { vargs.add("--debug"); } vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout"); vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr"); // Get final commmand StringBuilder command = new StringBuilder(); for (CharSequence str : vargs) { command.append(str).append(" "); } LOG.info("Completed setting up app master command " + command.toString()); List<String> commands = new ArrayList<String>(); commands.add(command.toString()); amContainer.setCommands(commands); // Set up resource type requirements // For now, only memory is supported so we set memory requirements Resource capability = Records.newRecord(Resource.class); capability.setMemory(amMemory); appContext.setResource(capability); // Service data is a binary blob that can be passed to the application // Not needed in this scenario // amContainer.setServiceData(serviceData); // The following are not required for launching an application master // amContainer.setContainerId(containerId); appContext.setAMContainerSpec(amContainer); // Set the priority for the application master Priority pri = Records.newRecord(Priority.class); // TODO - what is the range for priority? how to decide? pri.setPriority(amPriority); appContext.setPriority(pri); // Set the queue to which this application is to be submitted in the RM appContext.setQueue(amQueue); // Submit the application to the applications manager // SubmitApplicationResponse submitResp = // applicationsManager.submitApplication(appRequest); // Ignore the response as either a valid response object is returned on // success // or an exception thrown to denote some form of a failure LOG.info("Submitting application to ASM"); yarnClient.submitApplication(appContext); // TODO // Try submitting the same request again // app submission failure? // Monitor the application return monitorApplication(appId); }
From source file:distributed.hadoop.HDFSUtils.java
License:Open Source License
/** * Adds a file in HDFS to the classpath for hadoop nodes (via the * DistributedCache)//from w w w. j av a2 s .co m * * @param hdfsConfig the HDFSConfig object with host and port set * @param conf the Configuration object that will be changed by this operation * @param path the path to the file (in HDFS) to be added to the classpath for * hadopp nodes * @param env any environment variables * @throws IOException if a problem occurs */ public static void addFileToClasspath(HDFSConfig hdfsConfig, Configuration conf, String path, Environment env) throws IOException { // conf.set(HDFSConfig.FS_DEFAULT_NAME, // HDFSConfig.constructHostURL(hdfsConfig, env)); hdfsConfig.configureForHadoop(conf, env); FileSystem fs = FileSystem.get(conf); if (path.startsWith("hdfs://")) { throw new IOException("Path should not include 'hdfs://host:port'"); } if (env != null) { try { path = env.substitute(path); } catch (Exception ex) { } } // if (!path.startsWith("/")) { // path = "/" + path; // } // We know that all job-specific jars are installed in the user's home // directory Path destPath = new Path(path); String userHome = fs.getHomeDirectory().toString(); String absolutePath = userHome + "/" + destPath.toString(); if (absolutePath.startsWith("hdfs://")) { // strip this off - for some reason under CDH4 // DistributedCache.addFileToClassPath() keeps the hdfs:// part // of the URL in the classpath spec! Apache does not do this. absolutePath = absolutePath.replace("hdfs://", ""); absolutePath = absolutePath.substring(absolutePath.indexOf("/"), absolutePath.length()); } destPath = new Path(absolutePath); DistributedCache.addFileToClassPath(destPath, conf, fs); checkForWindowsAccessingHadoopOnLinux(conf); }
From source file:edu.cmu.graphchi.toolkits.collaborative_filtering.yarn.ApplicationMaster.java
License:Apache License
public static LocalResource addLocalResource(String filePath) throws Exception { File file = new File(filePath); String fullFilePath = file.getAbsolutePath(); FileSystem fs = FileSystem.get(IO.getConf()); Path src = new Path(fullFilePath); String pathSuffix = "local-tmp/" + file.getName(); Path dst = new Path(fs.getHomeDirectory(), pathSuffix); fs.copyFromLocalFile(false, true, src, dst); FileStatus destStatus = fs.getFileStatus(dst); LocalResource resource = Records.newRecord(LocalResource.class); resource.setType(LocalResourceType.FILE); resource.setVisibility(LocalResourceVisibility.APPLICATION); resource.setResource(ConverterUtils.getYarnUrlFromPath(dst)); resource.setTimestamp(destStatus.getModificationTime()); resource.setSize(destStatus.getLen()); return resource; }
From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java
License:BSD License
/** * @param args/* w w w . j a v a 2 s . co m*/ */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the index and events " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option thresholdOption = new Option("t", "use-custom-thresholds", false, "use custom thresholds for regular and rare events, defined in HDFS_HOME/" + FrameworkUtils.thresholdDir + " file"); thresholdOption.setRequired(false); options.addOption(thresholdOption); Option gOption = new Option("g", "group", true, "set group of datasets for which the indices and events" + " will be computed"); gOption.setRequired(true); gOption.setArgName("GROUP"); gOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(gOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } String datasetNames = ""; String datasetIds = ""; ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> shortDatasetIndex = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); HashMap<String, String> datasetId = new HashMap<String, String>(); HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>(); HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>(); Path path = null; FileSystem fs = FileSystem.get(new Configuration()); BufferedReader br; boolean removeExistingFiles = cmd.hasOption("f"); boolean isThresholdUserDefined = cmd.hasOption("t"); for (String dataset : cmd.getOptionValues("g")) { // getting aggregates String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3); if (aggregate.length == 0) { System.out.println("No aggregates found for " + dataset + "."); continue; } // getting aggregates header String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3); if (aggregatesHeaderFileName == null) { System.out.println("No aggregate header for " + dataset); continue; } String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName; shortDataset.add(dataset); datasetId.put(dataset, null); if (s3) { path = new Path(aggregatesHeader); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader); } br = new BufferedReader(new InputStreamReader(fs.open(path))); datasetAgg.put(dataset, br.readLine().split("\t")[1]); br.close(); if (s3) fs.close(); } if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } // getting dataset id if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } // getting user defined thresholds if (isThresholdUserDefined) { if (s3) { path = new Path(s3bucket + FrameworkUtils.thresholdDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); line = br.readLine(); while (line != null) { // getting dataset name String dataset = line.trim(); HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>(); HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>(); line = br.readLine(); while ((line != null) && (line.split("\t").length > 1)) { // getting attribute ids and thresholds String[] keyVals = line.trim().split("\t"); int att = Integer.parseInt(keyVals[0].trim()); regThresholds.put(att, Double.parseDouble(keyVals[1].trim())); rareThresholds.put(att, Double.parseDouble(keyVals[2].trim())); line = br.readLine(); } datasetRegThreshold.put(dataset, regThresholds); datasetRareThreshold.put(dataset, rareThresholds); } br.close(); } if (s3) fs.close(); // datasets that will use existing merge tree ArrayList<String> useMergeTree = new ArrayList<String>(); // creating index for each spatio-temporal resolution FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3); HashSet<String> input = new HashSet<String>(); for (String dataset : shortDataset) { String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/"; String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3); FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3); FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3); } else if (datasetRegThreshold.containsKey(dataset)) { FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3); if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) { useMergeTree.add(dataset); } } if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) { input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset); shortDatasetIndex.add(dataset); } } if (input.isEmpty()) { System.out.println("All the input datasets have indices."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } String aggregateDatasets = ""; it = input.iterator(); while (it.hasNext()) { aggregateDatasets += it.next() + ","; } Job icJob = null; Configuration icConf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "index"; String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/"; FrameworkUtils.removeFile(indexOutputDir, s3conf, s3); icConf.set("dataset-name", datasetNames); icConf.set("dataset-id", datasetIds); if (!useMergeTree.isEmpty()) { String useMergeTreeStr = ""; for (String dt : useMergeTree) { useMergeTreeStr += dt + ","; } icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1)); } for (int i = 0; i < shortDataset.size(); i++) { String dataset = shortDataset.get(i); String id = datasetId.get(dataset); icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset)); if (datasetRegThreshold.containsKey(dataset)) { HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset); String thresholds = ""; for (int att : regThresholds.keySet()) { thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ","; } icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1)); } if (datasetRareThreshold.containsKey(dataset)) { HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset); String thresholds = ""; for (int att : rareThresholds.keySet()) { thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ","; } icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1)); } } icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); icConf.set("mapreduce.input.fileinputformat.split.minsize", "0"); icConf.set("mapreduce.task.io.sort.mb", "200"); icConf.set("mapreduce.task.io.sort.factor", "100"); //icConf.set("mapreduce.task.timeout", "1800000"); machineConf.setMachineConfiguration(icConf); if (s3) { machineConf.setMachineConfiguration(icConf); icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); icConf.set("bucket", s3bucket); } if (snappyCompression) { icConf.set("mapreduce.map.output.compress", "true"); icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { icConf.set("mapreduce.map.output.compress", "true"); icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } icJob = new Job(icConf); icJob.setJobName(jobName); icJob.setMapOutputKeyClass(AttributeResolutionWritable.class); icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class); icJob.setOutputKeyClass(AttributeResolutionWritable.class); icJob.setOutputValueClass(TopologyTimeSeriesWritable.class); //icJob.setOutputKeyClass(Text.class); //icJob.setOutputValueClass(Text.class); icJob.setMapperClass(IndexCreationMapper.class); icJob.setReducerClass(IndexCreationReducer.class); icJob.setNumReduceTasks(machineConf.getNumberReduces()); icJob.setInputFormatClass(SequenceFileInputFormat.class); //icJob.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class); //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(icJob, true); SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK); FileInputFormat.setInputDirRecursive(icJob, true); FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1)); FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir)); icJob.setJarByClass(IndexCreation.class); long start = System.currentTimeMillis(); icJob.submit(); icJob.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (String dataset : shortDatasetIndex) { String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/"; String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } }
From source file:edu.nyu.vida.data_polygamy.pre_processing.PreProcessingMapper.java
License:BSD License
@Override public void cleanup(Context context) throws IOException { if (records > 0) { conf = context.getConfiguration(); Path headerFile = null;//from ww w .j a v a2 s . c o m FileSystem fs = null; if (s3) { headerFile = new Path(conf.get("aggregates", "")); fs = FileSystem.get(headerFile.toUri(), conf); } else { fs = FileSystem.get(new Configuration()); headerFile = new Path( fs.getHomeDirectory() + "/" + context.getConfiguration().get("aggregates", "")); } // we cannot have multiple mappers writing the same file if (fs.exists(headerFile)) { if (s3) fs.close(); return; } FSDataOutputStream fsDataOutputStream = fs.create(headerFile); String output = ""; for (int i = 0; i < keyNames.length; i++) { if (keyNames[i] != null) output += keyNames[i] + ","; } output = output.substring(0, output.length() - 1) + "\t"; Iterator<Integer> it = aggregatesIndex.iterator(); while (it.hasNext()) { Integer index = it.next(); output += aggregates.get(index) + ","; } BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream)); bw.write(output.substring(0, output.length() - 1) + "\n"); bw.write(String.valueOf(nbParameters)); bw.close(); if (s3) fs.close(); } }
From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java
License:BSD License
public static void addDatasets(String[] groupCmd, ArrayList<String> group, ArrayList<String> datasets, HashMap<String, String> datasetAgg, Path path, FileSystem fs, Configuration s3conf, boolean s3, String s3bucket) throws IOException { for (String dataset : groupCmd) { // getting indices String[] index = FrameworkUtils.searchIndex(dataset, s3conf, s3); if (index.length == 0) { System.out.println("No indices found for " + dataset + "."); continue; }/*from w w w . j a v a 2 s . c o m*/ // getting aggregate headers String aggregatesHeaderFileName1 = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3); if (aggregatesHeaderFileName1 == null) { System.out.println("No aggregate header for " + dataset); continue; } String aggregatesHeader1 = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName1; if (s3) { path = new Path(aggregatesHeader1); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader1); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); datasetAgg.put(dataset, br.readLine().split("\t")[1]); br.close(); if (s3) fs.close(); datasets.add(dataset); group.add(dataset); } }
From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java
License:BSD License
/** * @param args//www. j a v a 2 s. c om * @throws ParseException */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the relationship " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option scoreOption = new Option("sc", "score", true, "set threhsold for relationship score"); scoreOption.setRequired(false); scoreOption.setArgName("SCORE THRESHOLD"); options.addOption(scoreOption); Option strengthOption = new Option("st", "strength", true, "set threhsold for relationship strength"); strengthOption.setRequired(false); strengthOption.setArgName("STRENGTH THRESHOLD"); options.addOption(strengthOption); Option completeRandomizationOption = new Option("c", "complete-randomization", false, "use complete randomization when performing significance tests"); completeRandomizationOption.setRequired(false); options.addOption(completeRandomizationOption); Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes"); idOption.setRequired(false); options.addOption(idOption); Option g1Option = new Option("g1", "first-group", true, "set first group of datasets"); g1Option.setRequired(true); g1Option.setArgName("FIRST GROUP"); g1Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g1Option); Option g2Option = new Option("g2", "second-group", true, "set second group of datasets"); g2Option.setRequired(false); g2Option.setArgName("SECOND GROUP"); g2Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g2Option); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); Option removeOption = new Option("r", "remove-not-significant", false, "remove relationships that are not" + "significant from the final output"); removeOption.setRequired(false); options.addOption(removeOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } Path path = null; FileSystem fs = FileSystem.get(new Configuration()); ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> firstGroup = new ArrayList<String>(); ArrayList<String> secondGroup = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); boolean removeNotSignificant = cmd.hasOption("r"); boolean removeExistingFiles = cmd.hasOption("f"); boolean completeRandomization = cmd.hasOption("c"); boolean hasScoreThreshold = cmd.hasOption("sc"); boolean hasStrengthThreshold = cmd.hasOption("st"); boolean outputIds = cmd.hasOption("id"); String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : ""; String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : ""; // all datasets ArrayList<String> all_datasets = new ArrayList<String>(); if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { all_datasets.add(line.split("\t")[0]); line = br.readLine(); } br.close(); if (s3) fs.close(); String[] all_datasets_array = new String[all_datasets.size()]; all_datasets.toArray(all_datasets_array); String[] firstGroupCmd = cmd.getOptionValues("g1"); String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : all_datasets_array; addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } if (firstGroup.isEmpty()) { System.out.println("No indices from datasets in G1."); System.exit(0); } if (secondGroup.isEmpty()) { System.out.println("No indices from datasets in G2."); System.exit(0); } // getting dataset ids String datasetNames = ""; String datasetIds = ""; HashMap<String, String> datasetId = new HashMap<String, String>(); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { datasetId.put(it.next(), null); } if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); all_datasets.add(dt[0]); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } String firstGroupStr = ""; String secondGroupStr = ""; for (String dataset : firstGroup) { firstGroupStr += datasetId.get(dataset) + ","; } for (String dataset : secondGroup) { secondGroupStr += datasetId.get(dataset) + ","; } firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1); secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1); String relationshipsDir = ""; if (outputIds) { relationshipsDir = FrameworkUtils.relationshipsIdsDir; } else { relationshipsDir = FrameworkUtils.relationshipsDir; } FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3); String random = completeRandomization ? "complete" : "restricted"; String indexInputDirs = ""; String noRelationship = ""; HashSet<String> dirs = new HashSet<String>(); String dataset1; String dataset2; String datasetId1; String datasetId2; for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } datasetId1 = datasetId.get(dataset1); datasetId2 = datasetId.get(dataset2); if (dataset1.equals(dataset2)) continue; String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) { dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1); dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2); } else { noRelationship += datasetId1 + "-" + datasetId2 + ","; } } } if (dirs.isEmpty()) { System.out.println("All the relationships were already computed."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } for (String dir : dirs) { indexInputDirs += dir + ","; } Configuration conf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "relationship" + "-" + random; String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/"; FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3); for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i))); } for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size", Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length)); } conf.set("dataset-keys", datasetIds); conf.set("dataset-names", datasetNames); conf.set("first-group", firstGroupStr); conf.set("second-group", secondGroupStr); conf.set("complete-random", String.valueOf(completeRandomization)); conf.set("output-ids", String.valueOf(outputIds)); conf.set("complete-random-str", random); conf.set("main-dataset-id", datasetId.get(shortDataset.get(0))); conf.set("remove-not-significant", String.valueOf(removeNotSignificant)); if (noRelationship.length() > 0) { conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1)); } if (hasScoreThreshold) { conf.set("score-threshold", scoreThreshold); } if (hasStrengthThreshold) { conf.set("strength-threshold", strengthThreshold); } conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); conf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); conf.set("mapreduce.input.fileinputformat.split.minsize", "0"); conf.set("mapreduce.task.io.sort.mb", "200"); conf.set("mapreduce.task.io.sort.factor", "100"); conf.set("mapreduce.task.timeout", "2400000"); if (s3) { machineConf.setMachineConfiguration(conf); conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); conf.set("bucket", s3bucket); } if (snappyCompression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } Job job = new Job(conf); job.setJobName(jobName); job.setMapOutputKeyClass(PairAttributeWritable.class); job.setMapOutputValueClass(TopologyTimeSeriesWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(CorrelationMapper.class); job.setReducerClass(CorrelationReducer.class); job.setNumReduceTasks(machineConf.getNumberReduces()); job.setInputFormatClass(SequenceFileInputFormat.class); //job.setOutputFormatClass(TextOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); FileInputFormat.setInputDirRecursive(job, true); FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1)); FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir)); job.setJarByClass(Relationship.class); long start = System.currentTimeMillis(); job.submit(); job.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } if (dataset1.equals(dataset2)) continue; String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/"; String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } } }
From source file:edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java
License:BSD License
/** * @param args// w ww . jav a 2 s . co m */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the aggregate functions " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions" + " will be computed, followed by their temporal and spatial attribute indices"); gOption.setRequired(true); gOption.setArgName("GROUP"); gOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(gOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } String datasetNames = ""; String datasetIds = ""; String preProcessingDatasets = ""; ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> shortDatasetAggregation = new ArrayList<String>(); HashMap<String, String> datasetTempAtt = new HashMap<String, String>(); HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>(); HashMap<String, String> preProcessingDataset = new HashMap<String, String>(); HashMap<String, String> datasetId = new HashMap<String, String>(); boolean removeExistingFiles = cmd.hasOption("f"); String[] datasetArgs = cmd.getOptionValues("g"); for (int i = 0; i < datasetArgs.length; i += 3) { String dataset = datasetArgs[i]; // getting pre-processing String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3); if (tempPreProcessing == null) { System.out.println("No pre-processing available for " + dataset); continue; } preProcessingDataset.put(dataset, tempPreProcessing); shortDataset.add(dataset); datasetTempAtt.put(dataset, ((datasetArgs[i + 1] == "null") ? null : datasetArgs[i + 1])); datasetSpatialAtt.put(dataset, ((datasetArgs[i + 2] == "null") ? null : datasetArgs[i + 2])); datasetId.put(dataset, null); } if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } // getting dataset id Path path = null; FileSystem fs = null; if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { fs = FileSystem.get(new Configuration()); path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3); // getting smallest resolution HashMap<String, String> tempResMap = new HashMap<String, String>(); HashMap<String, String> spatialResMap = new HashMap<String, String>(); HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>(); HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>(); HashSet<String> input = new HashSet<String>(); for (String dataset : shortDataset) { String[] datasetArray = preProcessingDataset.get(dataset).split("-"); String datasetTemporalStr = datasetArray[datasetArray.length - 2]; int datasetTemporal = utils.temporalResolution(datasetTemporalStr); String datasetSpatialStr = datasetArray[datasetArray.length - 1]; int datasetSpatial = utils.spatialResolution(datasetSpatialStr); // finding all possible resolutions String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal); String[] spatialResolutions = FrameworkUtils.getAggSpatialResolutions(datasetSpatial); String temporalResolution = ""; String spatialResolution = ""; String tempRes = ""; String spatialRes = ""; boolean dataAdded = false; for (int i = 0; i < temporalResolutions.length; i++) { for (int j = 0; j < spatialResolutions.length; j++) { temporalResolution = temporalResolutions[i]; spatialResolution = spatialResolutions[j]; String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) { dataAdded = true; tempRes += temporalResolution + "-"; spatialRes += spatialResolution + "-"; } } } if (dataAdded) { input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); shortDatasetAggregation.add(dataset); tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1)); spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1)); datasetTemporalStrMap.put(dataset, datasetTemporalStr); datasetSpatialStrMap.put(dataset, datasetSpatialStr); } } if (input.isEmpty()) { System.out.println("All the input datasets have aggregates."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } it = input.iterator(); while (it.hasNext()) { preProcessingDatasets += it.next() + ","; } Job aggJob = null; String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/"; String jobName = "aggregates"; FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3); Configuration aggConf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); aggConf.set("dataset-name", datasetNames); aggConf.set("dataset-id", datasetIds); for (int i = 0; i < shortDatasetAggregation.size(); i++) { String dataset = shortDatasetAggregation.get(i); String id = datasetId.get(dataset); aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset)); aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset)); aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset)); aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset)); aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset)); aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset)); if (s3) aggConf.set("dataset-" + id, s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); else aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/" + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); } aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0"); aggConf.set("mapreduce.task.io.sort.mb", "200"); aggConf.set("mapreduce.task.io.sort.factor", "100"); machineConf.setMachineConfiguration(aggConf); if (s3) { machineConf.setMachineConfiguration(aggConf); aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); } if (snappyCompression) { aggConf.set("mapreduce.map.output.compress", "true"); aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { aggConf.set("mapreduce.map.output.compress", "true"); aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } aggJob = new Job(aggConf); aggJob.setJobName(jobName); aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class); aggJob.setMapOutputValueClass(AggregationArrayWritable.class); aggJob.setOutputKeyClass(SpatioTemporalWritable.class); aggJob.setOutputValueClass(FloatArrayWritable.class); //aggJob.setOutputKeyClass(Text.class); //aggJob.setOutputValueClass(Text.class); aggJob.setMapperClass(AggregationMapper.class); aggJob.setCombinerClass(AggregationCombiner.class); aggJob.setReducerClass(AggregationReducer.class); aggJob.setNumReduceTasks(machineConf.getNumberReduces()); aggJob.setInputFormatClass(SequenceFileInputFormat.class); //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class); //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(aggJob, true); SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK); FileInputFormat.setInputDirRecursive(aggJob, true); FileInputFormat.setInputPaths(aggJob, preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1)); FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir)); aggJob.setJarByClass(Aggregation.class); long start = System.currentTimeMillis(); aggJob.submit(); aggJob.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (String dataset : shortDatasetAggregation) { String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/"; String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } }