List of usage examples for org.apache.hadoop.fs.Path#makeQualified
@Deprecated
public Path makeQualified(FileSystem fs)
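This overload is deprecated in recent Hadoop releases in favor of Path#makeQualified(URI, Path) or FileSystem#makeQualified(Path). For orientation before the examples below, a minimal sketch of the call and its non-deprecated equivalents (the path and namenode address are illustrative):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path("data/input");      // relative and schemeless
        FileSystem fs = p.getFileSystem(conf);

        @SuppressWarnings("deprecation")
        Path q1 = p.makeQualified(fs);        // the deprecated form shown on this page

        // Non-deprecated equivalents:
        Path q2 = fs.makeQualified(p);
        Path q3 = p.makeQualified(fs.getUri(), fs.getWorkingDirectory());

        // All three resolve against the filesystem's URI and working directory,
        // e.g. hdfs://namenode:8020/user/alice/data/input
        System.out.println(q1 + " " + q2 + " " + q3);
    }
}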
From source file:com.splout.db.hadoop.StoreDeployerTool.java
License:Apache License
/** Deploys a collection of already-generated tablespaces in a single deploy request. */
@SuppressWarnings("unchecked")
public void deploy(Collection<TablespaceDepSpec> deployments) throws JSONSerDeException, IOException {
    // Query the alive DNodes and build deploy requests accordingly
    DeployRequest[] deployRequests = new DeployRequest[deployments.size()];
    log.info("Querying Splout QNode for list of DNodes...");
    SploutClient client = new SploutClient(qnode);
    List<String> dnodes = client.dNodeList();
    if (dnodes == null || dnodes.size() == 0) {
        throw new IOException("No available DNodes in Splout cluster.");
    }

    int tIndex = 0;
    for (TablespaceDepSpec tablespace : deployments) {
        Path tablespaceOut = new Path(tablespace.getSourcePath());
        // Define a DeployRequest for this tablespace
        deployRequests[tIndex] = new DeployRequest();

        // Splout only accepts absolute URIs
        FileSystem sourceFs = tablespaceOut.getFileSystem(conf);
        if (!sourceFs.exists(tablespaceOut)) {
            throw new IllegalArgumentException("Folder doesn't exist: " + tablespaceOut);
        }
        @SuppressWarnings("deprecation")
        Path absoluteOutPath = tablespaceOut.makeQualified(sourceFs);

        Path partitionMapPath = new Path(tablespaceOut, TablespaceGenerator.OUT_PARTITION_MAP);
        if (!sourceFs.exists(partitionMapPath)) {
            throw new IllegalArgumentException(
                "Invalid tablespace folder: " + tablespaceOut + " doesn't contain a partition-map file.");
        }

        // Load the partition map
        PartitionMap partitionMap = JSONSerDe.deSer(HadoopUtils.fileToString(sourceFs, partitionMapPath),
            PartitionMap.class);

        // Load the init statements, if they exist
        ArrayList<String> initStatements = new ArrayList<String>();
        Path initStatementsPath = new Path(tablespaceOut, TablespaceGenerator.OUT_INIT_STATEMENTS);
        if (sourceFs.exists(initStatementsPath)) {
            initStatements.addAll(
                JSONSerDe.deSer(HadoopUtils.fileToString(sourceFs, initStatementsPath), ArrayList.class));
        }
        // Add the other init statements coming in the deploy request
        if (tablespace.getInitStatements() != null) {
            initStatements.addAll(tablespace.getInitStatements());
        }

        String engine = DefaultEngine.class.getName();
        // Load the engine id used by the generation tool, if it exists (to maintain backwards compatibility)
        Path engineId = new Path(tablespaceOut, TablespaceGenerator.OUT_ENGINE);
        if (sourceFs.exists(engineId)) {
            engine = HadoopUtils.fileToString(sourceFs, engineId);
            log.info("Using generated engine id: " + engine);
        }

        // Finally, fill in the request
        deployRequests[tIndex].setInitStatements(initStatements);
        deployRequests[tIndex].setEngine(engine);
        deployRequests[tIndex].setTablespace(tablespace.getTablespace());
        deployRequests[tIndex].setData_uri(new Path(absoluteOutPath, "store").toUri().toString());
        deployRequests[tIndex].setPartitionMap(partitionMap.getPartitionEntries());

        // If replication > number of DNodes, that level of replication is impossible to reach
        int repFactor = tablespace.getReplication();
        if (dnodes.size() < repFactor) {
            log.warn("WARNING: Replication factor " + repFactor + " for tablespace "
                + tablespace.getTablespace()
                + " is bigger than the number of serving DNodes. Adjusting replication factor to "
                + dnodes.size());
            repFactor = dnodes.size();
        }
        deployRequests[tIndex].setReplicationMap(ReplicationMap.roundRobinMap(
            partitionMap.getPartitionEntries().size(), repFactor,
            dnodes.toArray(new String[0])).getReplicationEntries());

        tIndex++;
    }

    // Finally, send the deploy request
    DeployInfo dInfo = client.deploy(deployRequests);
    log.info("Deploy request of [" + deployments.size() + "] tablespaces performed. Deploy on [" + qnode
        + "] with version [" + dInfo.getVersion() + "] in progress.");
}
From source file:com.splout.db.integration.HadoopIntegrationTest.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // Validate params
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Hadoop Compatibility Integration Test");
    try {
        jComm.parse(args);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path tmpHdfsPath = new Path(
        "tmp-" + HadoopIntegrationTest.class.getName() + "-" + System.currentTimeMillis());
    FileSystem fS = tmpHdfsPath.getFileSystem(getConf());
    fS.mkdirs(tmpHdfsPath);
    fS.mkdirs(new Path(tmpHdfsPath, "input"));
    fS.mkdirs(new Path(tmpHdfsPath, "output"));

    boolean isLocal = FileSystem.get(conf).equals(FileSystem.getLocal(conf));
    if (!isLocal) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
    }

    // Qualify the temporary dir so the string forms passed to the tools below are absolute URIs
    tmpHdfsPath = tmpHdfsPath.makeQualified(fS);

    Path pageCounts = new Path(input);
    FileUtil.copy(FileSystem.getLocal(getConf()), pageCounts, fS, new Path(tmpHdfsPath, "input"), false,
        getConf());

    SimpleGeneratorCMD generator = new SimpleGeneratorCMD();
    generator.setConf(getConf());
    if (generator.run(new String[] { "-tb", "pagecountsintegration", "-t", "pagecounts", "-i",
        tmpHdfsPath + "/input", "-o", tmpHdfsPath + "/output", "-s",
        "projectcode:string, pagename:string, visits:int, bytes:long", "-pby", "projectcode,pagename",
        "-sep", "\" \"", "-p", "2", "-e", engine }) < 0) {
        throw new RuntimeException("Generator failed!");
    }

    SploutClient client = new SploutClient(qnode);
    QNodeStatus status = client.overview();
    long previousVersion = -1;
    if (status.getTablespaceMap().get("pagecountsintegration") != null) {
        previousVersion = status.getTablespaceMap().get("pagecountsintegration").getVersion();
    }

    DeployerCMD deployer = new DeployerCMD();
    deployer.setConf(getConf());
    if (deployer.run(new String[] { "-r", "2", "-q", qnode, "-root", tmpHdfsPath + "/output", "-ts",
        "pagecountsintegration" }) < 0) {
        throw new RuntimeException("Deployer failed!");
    }

    // Wait until the new version is being served
    long waitedSoFar = 0;
    status = client.overview();
    while (status.getTablespaceMap().get("pagecountsintegration") == null
        || previousVersion == status.getTablespaceMap().get("pagecountsintegration").getVersion()) {
        Thread.sleep(2000);
        waitedSoFar += 2000;
        status = client.overview();
        if (waitedSoFar > 90000) {
            throw new RuntimeException(
                "Deploy must have failed in Splout's server. Waiting too much for it to complete.");
        }
    }
    previousVersion = status.getTablespaceMap().get("pagecountsintegration").getVersion();

    QueryStatus qStatus = client.query("pagecountsintegration", "*", "SELECT * FROM pagecounts;", null);
    System.out.println(qStatus.getResult());
    if (qStatus.getResult() == null) {
        throw new RuntimeException("Something failed as query() is returning null!");
    }
    System.out.println("Everything fine.");
    return 1; // note: the original source returns 1 rather than the conventional 0 on success
}
From source file:com.test.hadoop.JhhSort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the map/reduce job.
 *
 * @throws IOException when there are communication problems with the job tracker.
 */
@SuppressWarnings({ "rawtypes" })
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), JhhSort.class);
    jobConf.setJobName("sorter");
    jobConf.set("mapred.job.tracker", "192.168.12.200:9001");
    jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000");
    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    Class<? extends InputFormat> inputFormatClass = TextInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = LongWritable.class;
    Class<? extends Writable> outputValueClass = LongWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);
    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);
    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        // Qualify the input dir so the partition file below gets an absolute URI
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
        + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
        + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
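Distilled from the -totalOrder branch above: the partition file is placed under the qualified input directory so that the URI added to the DistributedCache is absolute and unambiguous. A minimal sketch of that pattern (the class name is hypothetical; jobConf is assumed to already carry its input paths):

import java.net.URI;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

// Sketch: make the partition file fully qualified before caching it.
public class PartitionFileSetup {
    static void registerPartitionFile(JobConf jobConf) throws Exception {
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf)); // absolute, with scheme
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        // The "#_sortPartitioning" fragment gives each task a symlink of that name in its cwd
        DistributedCache.addCacheFile(new URI(partitionFile + "#_sortPartitioning"), jobConf);
        DistributedCache.createSymlink(jobConf);
    }
}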
From source file:com.yahoo.storm.yarn.StormOnYarn.java
License:Open Source License
private void launchApp(String appName, String queue, int amMB, String storm_zip_location) throws Exception {
    LOG.debug("StormOnYarn:launchApp() ...");
    YarnClientApplication client_app = _yarn.createApplication();
    GetNewApplicationResponse app = client_app.getNewApplicationResponse();
    _appId = app.getApplicationId();
    LOG.debug("_appId:" + _appId);

    if (amMB > app.getMaximumResourceCapability().getMemory()) {
        // TODO need some sanity checks
        amMB = app.getMaximumResourceCapability().getMemory();
    }
    ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class);
    appContext.setApplicationId(app.getApplicationId());
    appContext.setApplicationName(appName);
    appContext.setQueue(queue);

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

    // Set local resources (files or archives) for the application master.
    // In this scenario, the jar file for the application master is part of the local resources.
    LOG.info("Copy App Master jar from local filesystem and add to local environment");
    // Copy the application master jar to the filesystem and create a local resource
    // pointing to the destination jar path
    String appMasterJar = findContainingJar(MasterServer.class);
    FileSystem fs = FileSystem.get(_hadoopConf);
    Path src = new Path(appMasterJar);
    String appHome = Util.getApplicationHomeForId(_appId.toString());
    Path dst = new Path(fs.getHomeDirectory(), appHome + Path.SEPARATOR + "AppMaster.jar");
    fs.copyFromLocalFile(false, true, src, dst);
    localResources.put("AppMaster.jar", Util.newYarnAppResource(fs, dst));

    String stormVersion = Util.getStormVersion();
    Path zip;
    if (storm_zip_location != null) {
        zip = new Path(storm_zip_location);
    } else {
        zip = new Path("/lib/storm/" + stormVersion + "/storm.zip");
    }
    // Qualify the zip against the filesystem, then store only the path component of the URI
    _stormConf.put("storm.zip.path", zip.makeQualified(fs).toUri().getPath());
    LocalResourceVisibility visibility = LocalResourceVisibility.PUBLIC;
    _stormConf.put("storm.zip.visibility", "PUBLIC");
    if (!Util.isPublic(fs, zip)) {
        visibility = LocalResourceVisibility.APPLICATION;
        _stormConf.put("storm.zip.visibility", "APPLICATION");
    }
    localResources.put("storm", Util.newYarnAppResource(fs, zip, LocalResourceType.ARCHIVE, visibility));

    Path confDst = Util.createConfigurationFileInFs(fs, appHome, _stormConf, _hadoopConf);
    // Establish a symbolic link to the conf directory
    localResources.put("conf", Util.newYarnAppResource(fs, confDst));

    // Set up security tokens
    Path[] paths = new Path[3];
    paths[0] = dst;
    paths[1] = zip;
    paths[2] = confDst;
    Credentials credentials = new Credentials();
    TokenCache.obtainTokensForNamenodes(credentials, paths, _hadoopConf);
    DataOutputBuffer dob = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(dob);
    ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    // Security tokens for the HDFS distributed cache
    amContainer.setTokens(securityTokens);

    // Set local resource info into the app master container launch context
    amContainer.setLocalResources(localResources);

    // Set the env variables to be set up in the env where the application master will run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<String, String>();
    // Add the runtime classpath needed for tests to work
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./conf");
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./AppMaster.jar");

    // Make sure that the AppMaster has access to all YARN JARs
    List<String> yarn_classpath_cmd = java.util.Arrays.asList("yarn", "classpath");
    ProcessBuilder pb = new ProcessBuilder(yarn_classpath_cmd).redirectError(Redirect.INHERIT);
    LOG.info("YARN CLASSPATH COMMAND = [" + yarn_classpath_cmd + "]");
    pb.environment().putAll(System.getenv());
    Process proc = pb.start();
    BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
    String line = "";
    String yarn_class_path = (String) _stormConf.get("storm.yarn.yarn_classpath");
    if (yarn_class_path == null) {
        StringBuilder yarn_class_path_builder = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            yarn_class_path_builder.append(line);
        }
        yarn_class_path = yarn_class_path_builder.toString();
    }
    LOG.info("YARN CLASSPATH = [" + yarn_class_path + "]");
    proc.waitFor();
    reader.close();
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), yarn_class_path);

    String stormHomeInZip = Util.getStormHomeInZip(fs, zip, stormVersion);
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./storm/" + stormHomeInZip + "/*");
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./storm/" + stormHomeInZip + "/lib/*");

    String java_home = (String) _stormConf.get("storm.yarn.java_home");
    if (java_home == null)
        java_home = System.getenv("JAVA_HOME");
    if (java_home != null && !java_home.isEmpty())
        env.put("JAVA_HOME", java_home);
    LOG.info("Using JAVA_HOME = [" + env.get("JAVA_HOME") + "]");

    env.put("appJar", appMasterJar);
    env.put("appName", appName);
    env.put("appId", new Integer(_appId.getId()).toString());
    env.put("STORM_LOG_DIR", ApplicationConstants.LOG_DIR_EXPANSION_VAR);
    amContainer.setEnvironment(env);

    // Set the necessary command to execute the application master
    Vector<String> vargs = new Vector<String>();
    if (java_home != null && !java_home.isEmpty())
        vargs.add(env.get("JAVA_HOME") + "/bin/java");
    else
        vargs.add("java");
    vargs.add("-Dstorm.home=./storm/" + stormHomeInZip + "/");
    vargs.add("-Dlogfile.name=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/master.log");
    // vargs.add("-verbose:class");
    vargs.add("com.yahoo.storm.yarn.MasterServer");
    // Note: the original source redirects stdout (1>) to "stderr" and stderr (2>) to "stdout"
    vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");
    vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");

    // Set the java executable command
    LOG.info("Setting up app master command:" + vargs);
    amContainer.setCommands(vargs);

    // Set up resource type requirements.
    // For now, only memory is supported, so we set memory requirements.
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(amMB);
    appContext.setResource(capability);
    appContext.setAMContainerSpec(amContainer);

    _yarn.submitApplication(appContext);
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
        return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1,
        10000, 10);

    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.HdfsResource.java
License:Apache License
@SuppressWarnings("deprecation") HdfsResource(Path path, FileSystem fs) { Assert.notNull(path, "a valid path is required"); Assert.notNull(fs, "non null file system required"); this.location = path.toString(); this.fs = fs; this.path = path.makeQualified(fs); boolean exists = false; try {//from w ww .ja va 2 s . c o m exists = fs.exists(path); } catch (final Exception ex) { } this.exists = exists; FileStatus status = null; try { status = fs.getFileStatus(path); } catch (final Exception ex) { } this.status = status; }
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Ensures that the given class is in the class path of running jobs.
 * If the jar is not already in the class path, it is added to the
 * DistributedCache of the given job to ensure the associated job works fine.
 * @param conf
 * @param klass
 */
public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to the class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
        return;
    Path containingJar = new Path(findContainingJar(klass));
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
        for (Path existingClassPath : existingClassPaths) {
            if (containingJar.getName().equals(existingClassPath.getName()))
                return;
        }
    }

    // The containing jar is a new one and needs to be copied to the class path
    try {
        LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
        FileSystem defaultFS = FileSystem.get(conf);
        Path libFolder;
        if (existingClassPaths != null && existingClassPaths.length > 0) {
            libFolder = existingClassPaths[0].getParent();
        } else {
            // First jar to be added like this; create a new lib folder
            do {
                libFolder = new Path("lib_" + (int) (Math.random() * 100000));
            } while (defaultFS.exists(libFolder));
            defaultFS.mkdirs(libFolder);
            defaultFS.deleteOnExit(libFolder);
        }
        defaultFS.copyFromLocalFile(containingJar, libFolder);
        Path jarFullPath = new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
        // makeQualified is idempotent, so this second qualification is a no-op
        jarFullPath = jarFullPath.makeQualified(defaultFS);
        DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Creates a full spatio-temporal hierarchy for a source folder.
 * @throws ParseException
 * @throws InterruptedException
 */
public static void directoryIndexer(final OperationsParams params)
    throws IOException, ParseException, InterruptedException {
    Path inputDir = params.getInputPath();
    FileSystem sourceFs = inputDir.getFileSystem(params);
    final Path sourceDir = inputDir.makeQualified(sourceFs);
    Path destDir = params.getOutputPath();
    final FileSystem destFs = destDir.getFileSystem(params);
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;

    // Create daily indexes that do not exist
    final Path dailyIndexDir = new Path(destDir, "daily");
    FileStatus[] matchingDays = timeRange == null ? sourceFs.listStatus(inputDir)
        : sourceFs.listStatus(inputDir, timeRange);
    final Vector<Path> sourceFiles = new Vector<Path>();
    for (FileStatus matchingDay : matchingDays) {
        for (FileStatus matchingTile : sourceFs.listStatus(matchingDay.getPath())) {
            sourceFiles.add(matchingTile.getPath());
        }
    }
    // Shuffle the array for better load balancing across threads
    Collections.shuffle(sourceFiles);
    final String datasetName = params.get("dataset");

    Parallel.forEach(sourceFiles.size(), new RunnableRange<Object>() {
        @Override
        public Object run(int i1, int i2) {
            LOG.info("Worker [" + i1 + "," + i2 + ") started");
            for (int i = i1; i < i2; i++) {
                Path sourceFile = sourceFiles.get(i);
                try {
                    Path relativeSourceFile = makeRelative(sourceDir, sourceFile);
                    Path destFilePath = new Path(dailyIndexDir, relativeSourceFile);
                    if (!destFs.exists(destFilePath)) {
                        LOG.info("Worker [" + i1 + "," + i2 + ") indexing: " + sourceFile.getName());
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (destFs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(destFs);
                        if (datasetName == null)
                            throw new RuntimeException(
                                "Please provide the name of the dataset you would like to index");
                        AggregateQuadTree.build(params, sourceFile, datasetName, tmpFile);
                        synchronized (destFs) {
                            Path destDir = destFilePath.getParent();
                            if (!destFs.exists(destDir))
                                destFs.mkdirs(destDir);
                        }
                        destFs.rename(tmpFile, destFilePath);
                    }
                } catch (IOException e) {
                    throw new RuntimeException("Error building an index for " + sourceFile, e);
                }
            }
            LOG.info("Worker [" + i1 + "," + i2 + ") finished");
            return null;
        }
    });
    LOG.info("Done generating daily indexes");

    // Merge daily indexes into monthly indexes
    Path monthlyIndexDir = new Path(destDir, "monthly");
    final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy.MM.dd");
    final SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy.MM");
    mergeIndexes(destFs, dailyIndexDir, monthlyIndexDir, dayFormat, monthFormat, params);
    LOG.info("Done generating monthly indexes");

    // Merge monthly indexes into yearly indexes
    Path yearlyIndexDir = new Path(destDir, "yearly");
    final SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    mergeIndexes(destFs, monthlyIndexDir, yearlyIndexDir, monthFormat, yearFormat, params);
    LOG.info("Done generating yearly indexes");
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Merges a set of indexes into larger indexes.
 * @param fs
 * @param srcIndexDir
 * @param dstIndexDir
 * @param srcFormat
 * @param dstFormat
 * @param params
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
    SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
    throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
        : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    // Scan the source indexes and merge each consecutive run belonging to the same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
            .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;

        // Merge all source indexes in the range [i1, i2) into one dest index.
        // Copy i1, i2 to final variables so they are accessible from threads.
        final int firstIndex = i1;
        final int lastIndex = i2;
        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes.
        // A regular expression to catch the tile identifier of a MODIS grid cell.
        final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with any following entry
            int j = i + rand.nextInt(tilesInFirstDay.length - i - 1);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }

        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    try {
                        FileStatus tileInFirstDay = tilesInFirstDay[i_file];

                        // Extract the tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }
                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                    + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // The destination file already exists. Compare the modification
                            // times of the destination and source files to see whether it
                            // needs to be updated.
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            else
                                LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                            tmpFile);
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        fs.rename(tmpFile, destIndexFile);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}
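Both StockQuadTree methods use the same scratch-file idiom: pick an unused relative name, qualify it against the destination filesystem, build into it, then rename into place. A minimal sketch of that idiom (the class name and writeIndex() stand-in are hypothetical):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch of the tmp-file-then-rename idiom.
public class TmpFileIdiom {
    @SuppressWarnings("deprecation")
    static void buildAtomically(FileSystem destFs, Path destFilePath) throws IOException {
        Path tmpFile;
        do {
            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
        } while (destFs.exists(tmpFile));
        // Qualify so every later exists()/rename() call unambiguously targets destFs
        tmpFile = tmpFile.makeQualified(destFs);

        writeIndex(tmpFile); // hypothetical stand-in for the real build/merge step

        Path parent = destFilePath.getParent();
        if (!destFs.exists(parent))
            destFs.mkdirs(parent);
        destFs.rename(tmpFile, destFilePath); // publish the finished file
    }

    private static void writeIndex(Path p) { /* stand-in for the real build/merge step */ }
}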
From source file:edu.umn.cs.spatialHadoop.nasa.HTTPFileSystem.java
License:Open Source License
/**
 * Lists all files and directories in a given Path that points to a directory.
 * While this function is written in a generic way, it was designed and tested
 * only with LP DAAC archives.
 */
@Override
public FileStatus[] listStatus(Path f) throws IOException {
    Vector<FileStatus> statuses = new Vector<FileStatus>();
    final Pattern httpEntryPattern = Pattern
        .compile("<a href=\"[^\"]+\">(.+)</a>\\s*(\\d+-\\w+-\\d+)\\s+(\\d+:\\d+)\\s+([\\d\\.]+[KMG]|-)");
    // Qualify against this FileSystem so the entry paths built below are fully qualified
    f = f.makeQualified(this);
    URL url = f.toUri().toURL();

    int retryCount = HTTPFileSystem.retries;
    BufferedReader inBuffer = null;
    try {
        while (inBuffer == null && retryCount-- > 0) {
            try {
                inBuffer = new BufferedReader(new InputStreamReader(url.openStream()));
            } catch (java.net.SocketException e) {
                if (retryCount == 0)
                    throw e;
                LOG.info("Error accessing file '" + url + "'. Trials left: " + retryCount);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e1) {
                }
            } catch (java.net.UnknownHostException e) {
                if (retryCount == 0)
                    throw e;
                LOG.info("Error accessing file '" + url + "'. Trials left: " + retryCount);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e1) {
                }
            }
        }
        if (inBuffer == null)
            throw new RuntimeException("Could not access URL " + f);

        String line;
        while ((line = inBuffer.readLine()) != null) {
            Matcher matcher = httpEntryPattern.matcher(line);
            while (matcher.find()) {
                String entryName = matcher.group(1);
                Path entryPath = new Path(f, entryName);

                String entryDate = matcher.group(2);
                String entryTime = matcher.group(3);
                long modificationTime = parseDateTime(entryDate, entryTime);

                String size = matcher.group(4);
                boolean isDir = size.equals("-");
                long length = isDir ? 0 : parseSize(size);

                FileStatus fstatus = new FileStatus(length, isDir, 1, 4096, modificationTime,
                    modificationTime, null, null, null, entryPath);
                statuses.add(fstatus);
            }
        }
    } finally {
        if (inBuffer != null)
            inBuffer.close();
    }
    return statuses.toArray(new FileStatus[statuses.size()]);
}
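Inside a FileSystem implementation, passing this to makeQualified resolves the caller's path against the filesystem's own URI before child paths are derived from it, so every Path handed back carries the right scheme and authority. A minimal sketch of that pattern (the class and scheme are hypothetical):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical FileSystem fragment illustrating makeQualified(this).
// Declared abstract so the remaining FileSystem methods need not be shown here.
public abstract class ReadOnlyListingFileSystem extends FileSystem {
    @Override
    @SuppressWarnings("deprecation")
    public FileStatus[] listStatus(Path f) throws IOException {
        f = f.makeQualified(this);         // e.g. "dir/sub" -> "myfs://host/dir/sub"
        Path child = new Path(f, "entry"); // inherits scheme and authority from f
        FileStatus status = new FileStatus(0L, true, 1, 4096, 0L, child);
        return new FileStatus[] { status };
    }
}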