List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
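For orientation before the examples: getName() returns the final component of the path, i.e. everything after the last '/', which is why the code below uses it for extension checks and name-based filtering. A minimal sketch; the path strings are made up for illustration:

import org.apache.hadoop.fs.Path;

public class GetNameDemo {
    public static void main(String[] args) {
        // getName() keeps only the last path component
        System.out.println(new Path("/user/data/MOD11A1.h08v05.hdf").getName()); // MOD11A1.h08v05.hdf
        System.out.println(new Path("hdfs://namenode:8020/user/data").getName()); // data

        // Common idiom in the examples below: a case-insensitive extension check
        Path p = new Path("/imagery/scene.HDF");
        System.out.println(p.getName().toLowerCase().endsWith(".hdf")); // true
    }
}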
From source file: edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(FileSystem fs, Path file) {
    // HDF files are not splittable
    if (file.getName().toLowerCase().endsWith(".hdf"))
        return false;
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null && !(codec instanceof SplittableCompressionCodec))
        return false;
    try {
        // To avoid opening the file and checking the first 8 bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file smaller than 150MB, which performs better
        // with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        return super.isSplitable(fs, file);
    }
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Create compressionCodecs to be used by the isSplitable method
        if (compressionCodecs == null)
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // HDF files are not splittable
        if (file.getName().toLowerCase().endsWith(".hdf"))
            return false;
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null && !(codec instanceof SplittableCompressionCodec))
            return false;
        // To avoid opening the file and checking the first 8 bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file smaller than 150MB, which performs better
        // with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer not to split it
    }
}
From source file: edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License: Open Source License
/**
 * Creates a full spatio-temporal hierarchy for a source folder.
 * @throws ParseException
 * @throws InterruptedException
 */
public static void directoryIndexer(final OperationsParams params)
        throws IOException, ParseException, InterruptedException {
    Path inputDir = params.getInputPath();
    FileSystem sourceFs = inputDir.getFileSystem(params);
    final Path sourceDir = inputDir.makeQualified(sourceFs);
    Path destDir = params.getOutputPath();
    final FileSystem destFs = destDir.getFileSystem(params);
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;

    // Create daily indexes that do not exist
    final Path dailyIndexDir = new Path(destDir, "daily");
    FileStatus[] matchingDays = timeRange == null ? sourceFs.listStatus(inputDir)
            : sourceFs.listStatus(inputDir, timeRange);
    final Vector<Path> sourceFiles = new Vector<Path>();
    for (FileStatus matchingDay : matchingDays) {
        for (FileStatus matchingTile : sourceFs.listStatus(matchingDay.getPath())) {
            sourceFiles.add(matchingTile.getPath());
        }
    }
    // Shuffle the array for better load balancing across threads
    Collections.shuffle(sourceFiles);
    final String datasetName = params.get("dataset");
    Parallel.forEach(sourceFiles.size(), new RunnableRange<Object>() {
        @Override
        public Object run(int i1, int i2) {
            LOG.info("Worker [" + i1 + "," + i2 + ") started");
            for (int i = i1; i < i2; i++) {
                Path sourceFile = sourceFiles.get(i);
                try {
                    Path relativeSourceFile = makeRelative(sourceDir, sourceFile);
                    Path destFilePath = new Path(dailyIndexDir, relativeSourceFile);
                    if (!destFs.exists(destFilePath)) {
                        LOG.info("Worker [" + i1 + "," + i2 + ") indexing: " + sourceFile.getName());
                        // Build the index into a unique temporary file first
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (destFs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(destFs);
                        if (datasetName == null)
                            throw new RuntimeException(
                                    "Please provide the name of the dataset you would like to index");
                        AggregateQuadTree.build(params, sourceFile, datasetName, tmpFile);
                        synchronized (destFs) {
                            Path destDir = destFilePath.getParent();
                            if (!destFs.exists(destDir))
                                destFs.mkdirs(destDir);
                        }
                        destFs.rename(tmpFile, destFilePath);
                    }
                } catch (IOException e) {
                    throw new RuntimeException("Error building an index for " + sourceFile, e);
                }
            }
            LOG.info("Worker [" + i1 + "," + i2 + ") finished");
            return null;
        }
    });
    LOG.info("Done generating daily indexes");

    // Merge daily indexes into monthly indexes
    Path monthlyIndexDir = new Path(destDir, "monthly");
    final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy.MM.dd");
    final SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy.MM");
    mergeIndexes(destFs, dailyIndexDir, monthlyIndexDir, dayFormat, monthFormat, params);
    LOG.info("Done generating monthly indexes");

    // Merge monthly indexes into yearly indexes
    Path yearlyIndexDir = new Path(destDir, "yearly");
    final SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    mergeIndexes(destFs, monthlyIndexDir, yearlyIndexDir, monthFormat, yearFormat, params);
    LOG.info("Done generating yearly indexes");
}
From source file: edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License: Open Source License
/**
 * Merges a set of indexes into larger indexes.
 * @param fs
 * @param srcIndexDir
 * @param dstIndexDir
 * @param srcFormat
 * @param dstFormat
 * @param params
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
        SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
        throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
            : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    // Scan the source indexes and merge each consecutive run belonging to the same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
                .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;

        // Merge all source indexes in the range [i1, i2) into one dest index.
        // Copy i1, i2 to final variables to make them accessible from threads
        final int firstIndex = i1;
        final int lastIndex = i2;
        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes.
        // A regular expression to catch the tile identifier of a MODIS grid cell
        final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with any entry at or after it (Fisher-Yates shuffle)
            int j = i + rand.nextInt(tilesInFirstDay.length - i);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }

        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    try {
                        FileStatus tileInFirstDay = tilesInFirstDay[i_file];

                        // Extract tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }
                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                    tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                        + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // Destination file already exists. Compare the modification times of
                            // the destination and source files to see whether it needs to be
                            // updated or not
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge into a unique temporary file first
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                                tmpFile);
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        fs.rename(tmpFile, destIndexFile);
                    } catch (IOException e) {
                        LOG.error("Error merging tile into " + destIndex, e);
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}
From source file: edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License: Open Source License
/**
 * Makes a path relative to another path by removing all common ancestors.
 * @param parent
 * @param descendant
 * @return the path of descendant relative to parent
 */
private static Path makeRelative(Path parent, Path descendant) {
    Stack<String> components = new Stack<String>();
    while (descendant.depth() > parent.depth()) {
        components.push(descendant.getName());
        descendant = descendant.getParent();
    }
    if (!descendant.equals(parent))
        throw new RuntimeException("descendant not a child of parent");
    if (components.isEmpty())
        return new Path(".");
    Path relative = new Path(components.pop());
    while (!components.isEmpty())
        relative = new Path(relative, components.pop());
    return relative;
}
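A quick usage sketch of makeRelative; the paths are hypothetical and only illustrate how getName() peels off one component per iteration:

Path parent = new Path("/data/source");
Path descendant = new Path("/data/source/2000.02.24/h08v05.hdf");
// Components are pushed bottom-up and popped top-down,
// yielding the relative path "2000.02.24/h08v05.hdf"
Path relative = makeRelative(parent, descendant);
// makeRelative(parent, new Path("/other/dir")) would throw a RuntimeException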
From source file: edu.umn.cs.spatialHadoop.nasa.HDFPlot.java
License: Open Source License
public static Job plotHeatMap(Path[] inFiles, Path outFile, OperationsParams params)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (params.get("shape") == null) {
        // Set the default shape value
        params.setClass("shape", NASARectangle.class, Shape.class);
    } else if (!(params.getShape("shape") instanceof NASAShape)) {
        System.err.println("The specified shape " + params.get("shape") + " is not an instance of NASAShape");
        System.exit(1);
    }
    if (params.get("mbr") == null) {
        // Set to the same value as the query rectangle or the whole world
        params.set("mbr", params.get("rect", "-180,-90,180,90"));
    }

    // Restrict to HDF files if working on a directory
    for (int i = 0; i < inFiles.length; i++) {
        if (!inFiles[i].getName().toLowerCase().endsWith(".hdf"))
            inFiles[i] = new Path(inFiles[i], "*.hdf");
    }

    String recover = params.get("recover", "none").toLowerCase();
    if (recover.equals("none")) {
        // Don't recover holes
        params.setBoolean("recoverholes", false);
    } else if (recover.equals("read")) {
        // Recover holes on read
        params.setBoolean("recoverholes", true);
    } else if (recover.equals("write")) {
        // Recover holes upon writing the final image
        params.setBoolean("recoverholes", false);
        if (params.get(PREPROCESSED_WATERMARK) == null) {
            OperationsParams params2 = new OperationsParams(params);
            params2.setBoolean("background", false);
            Path wmImage = new Path(outFile.getParent(), outFile.getName() + "_WaterMask");
            generateWaterMask(wmImage, params2);
            params.set(PREPROCESSED_WATERMARK, wmImage.toString());
        }
    }
    if (params.getBoolean("pyramid", false))
        return MultilevelPlot.plot(inFiles, outFile, HDFRasterizer.class, params);
    return SingleLevelPlot.plot(inFiles, outFile, HDFRasterizer.class, params);
}
From source file: edu.umn.cs.spatialHadoop.nasa.HDFRecordReader.java
License: Open Source License
/**
 * Recovers fill values in the array {@link Values}.
 * @param conf
 * @throws IOException
 */
private void recoverFillValues(Configuration conf) throws IOException {
    // For now, we can only recover values of type short
    HDFFile waterMaskFile = null;
    try {
        // Read the water mask
        Path wmPath = new Path(
                conf.get(WATER_MASK_PATH, "http://e4ftl01.cr.usgs.gov/MOLT/MOD44W.005/2000.02.24/"));
        final String tileIdentifier = String.format("h%02dv%02d", nasaDataset.h, nasaDataset.v);
        FileSystem wmFs = wmPath.getFileSystem(conf);
        FileStatus[] wmFile = wmFs.listStatus(wmPath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().contains(tileIdentifier);
            }
        });
        if (wmFile.length == 0) {
            LOG.warn("Could not find water mask for tile '" + tileIdentifier + "'");
            return;
        }
        Path wmFileToLoad = wmFile[0].getPath();
        if (wmFs instanceof HTTPFileSystem) {
            wmFileToLoad = new Path(FileUtil.copyFile(conf, wmFileToLoad));
            wmFs = FileSystem.getLocal(conf);
        }
        waterMaskFile = new HDFFile(wmFs.open(wmFileToLoad));
        DDVGroup waterMaskGroup = waterMaskFile.findGroupByName("water_mask");
        if (waterMaskGroup == null) {
            LOG.warn("Water mask dataset 'water_mask' not found in file " + wmFile[0]);
            return;
        }
        byte[] waterMask = null;
        for (DataDescriptor dd : waterMaskGroup.getContents()) {
            if (dd instanceof DDNumericDataGroup) {
                DDNumericDataGroup numericDataGroup = (DDNumericDataGroup) dd;
                waterMask = (byte[]) numericDataGroup.getAsByteArray();
            }
        }
        // Convert the water mask to a BitArray of the right size
        int size = 4800 / nasaDataset.resolution;
        BitArray waterMaskBits = convertWaterMaskToBits(ByteBuffer.wrap(waterMask), size);
        short fillValueShort = (short) HDFConstants.readAsInteger(fillValueBytes, 0, fillValueBytes.length);
        recoverXYShorts(ByteBuffer.wrap(unparsedDataArray), fillValueShort, waterMaskBits);
    } finally {
        if (waterMaskFile != null)
            waterMaskFile.close();
    }
}
From source file: edu.umn.cs.spatialHadoop.nasa.HDFRecordReader3.java
License: Open Source License
/**
 * Recovers fill values in the array {@link Values}.
 * @param conf
 * @throws IOException
 */
private void recoverFillValues(Configuration conf) throws IOException {
    HDFFile waterMaskFile = null;
    try {
        // Read the water mask
        Path wmPath = new Path(
                conf.get(WATER_MASK_PATH, "http://e4ftl01.cr.usgs.gov/MOLT/MOD44W.005/2000.02.24/"));
        final String tileIdentifier = String.format("h%02dv%02d", nasaDataset.h, nasaDataset.v);
        FileSystem wmFs = wmPath.getFileSystem(conf);
        FileStatus[] wmFile = wmFs.listStatus(wmPath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().contains(tileIdentifier);
            }
        });
        if (wmFile.length == 0) {
            LOG.warn("Could not find water mask for tile '" + tileIdentifier + "'");
            return;
        }
        Path wmFileToLoad = wmFile[0].getPath();
        if (wmFs instanceof HTTPFileSystem) {
            wmFileToLoad = new Path(FileUtil.copyFile(conf, wmFileToLoad));
            wmFs = FileSystem.getLocal(conf);
        }
        waterMaskFile = new HDFFile(wmFs.open(wmFileToLoad));
        DDVGroup waterMaskGroup = waterMaskFile.findGroupByName("water_mask");
        if (waterMaskGroup == null) {
            LOG.warn("Water mask dataset 'water_mask' not found in file " + wmFile[0]);
            return;
        }
        byte[] waterMask = null;
        for (DataDescriptor dd : waterMaskGroup.getContents()) {
            if (dd instanceof DDNumericDataGroup) {
                DDNumericDataGroup numericDataGroup = (DDNumericDataGroup) dd;
                waterMask = (byte[]) numericDataGroup.getAsAnArray();
            }
        }
        // Stores which values have been recovered by copying a single value
        // without interpolation in the x-direction
        byte[] valueStatus = new byte[dataArray.length];
        recoverXDirection(waterMask, valueStatus);
        recoverYDirection(waterMask, valueStatus);
    } finally {
        if (waterMaskFile != null)
            waterMaskFile.close();
    }
}
From source file: edu.umn.cs.spatialHadoop.nasa.HTTPFileSystem.java
License: Open Source License
/**
 * Returns the status of a file. This method is designed specifically to work
 * with the LP DAAC archive and will not work correctly with other web sites.
 * Since HTTP does not tell whether a URL points to a file or a directory,
 * we assume that URLs ending with HDF, XML and JPG are files while anything
 * else is considered a directory.
 */
@Override
public FileStatus getFileStatus(Path f) throws IOException {
    f = f.makeQualified(this);
    URL url = f.toUri().toURL();
    int retryCount = HTTPFileSystem.retries;
    HttpURLConnection connection = null;
    try {
        while (connection == null && retryCount-- > 0) {
            try {
                connection = (HttpURLConnection) url.openConnection();
            } catch (java.net.SocketException e) {
                if (retryCount == 0)
                    throw e;
                LOG.info("Error accessing file '" + url + "'. Trials left: " + retryCount);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e1) {
                }
            } catch (java.net.UnknownHostException e) {
                if (retryCount == 0)
                    throw e;
                LOG.info("Error accessing file '" + url + "'. Trials left: " + retryCount);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e1) {
                }
            }
        }
        if (connection == null)
            throw new RuntimeException("Could not connect to " + f);
        String lengthStr = connection.getHeaderField("Content-Length");
        long length = lengthStr == null ? -1 : Long.parseLong(lengthStr);
        if (length == -1)
            LOG.info("Unknown HTTP file length for " + f);
        long modificationTime = connection.getLastModified();
        if (modificationTime == 0)
            modificationTime = connection.getDate();
        // Hard-coded to work with LP DAAC archives
        boolean isdir = !f.getName().matches("(?i:([^*\\?])*\\.(hdf|xml|jpg|gz|bz2|zip|txt|csv|tsv)$)");
        return new FileStatus(length, isdir, 1, BLOCK_SIZE, modificationTime, 0, null, null, null, f);
    } finally {
        if (connection != null)
            connection.disconnect();
    }
}
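To make the directory heuristic concrete, here is a minimal sketch of how the regex classifies sample names; the URLs are illustrative, not real archive entries:

// matches() anchors to the whole name, so only the final component's extension matters
String fileLikeRegex = "(?i:([^*\\?])*\\.(hdf|xml|jpg|gz|bz2|zip|txt|csv|tsv)$)";
Path hdf = new Path("http://e4ftl01.cr.usgs.gov/MOLT/MOD11A1.A2000065.h08v05.hdf");
Path dir = new Path("http://e4ftl01.cr.usgs.gov/MOLT/MOD44W.005/2000.02.24");
System.out.println(!hdf.getName().matches(fileLikeRegex)); // false -> treated as a file
System.out.println(!dir.getName().matches(fileLikeRegex)); // true  -> treated as a directory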
From source file: edu.umn.cs.spatialHadoop.nasa.ImageCompare.java
License: Open Source License
/**
 * Compares two directories for similar images with matching names.
 * @param dir1
 * @param dir2
 * @throws IOException
 */
public static void compareFolders(Path dir1, Path dir2) throws IOException {
    final PathFilter png_filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().toLowerCase().endsWith(".png");
        }
    };
    // Retrieve all images in dir1
    FileStatus[] images1 = dir1.getFileSystem(new Configuration()).listStatus(dir1, png_filter);
    Map<String, Path> images1ByName = new HashMap<String, Path>();
    for (FileStatus fstatus : images1)
        images1ByName.put(fstatus.getPath().getName(), fstatus.getPath());
    // Retrieve all images in dir2
    FileStatus[] images2 = dir2.getFileSystem(new Configuration()).listStatus(dir2, png_filter);
    Map<String, Path> images2ByName = new HashMap<String, Path>();
    for (FileStatus fstatus : images2)
        images2ByName.put(fstatus.getPath().getName(), fstatus.getPath());

    final Vector<Double> similarities = new Vector<Double>();
    final Vector<String> names = new Vector<String>();
    // Compare every pair of images with matching names
    for (String imageName : images2ByName.keySet()) {
        Path image1 = images1ByName.get(imageName);
        if (image1 == null)
            continue;
        Path image2 = images2ByName.get(imageName);
        double similarity = compareImages(image1, image2);
        if (similarity > 0.1) {
            System.out.println(image1 + "," + image2 + "," + similarity);
            similarities.add(similarity);
            names.add(imageName);
        }
    }

    /*
    // Sort images by similarity
    IndexedSortable sortable = new IndexedSortable() {
        @Override
        public int compare(int i, int j) {
            double diff = similarities.get(i) - similarities.get(j);
            if (diff < 0)
                return -1;
            if (diff > 0)
                return 1;
            return 0;
        }

        @Override
        public void swap(int i, int j) {
            double tempSim = similarities.get(i);
            similarities.set(i, similarities.get(j));
            similarities.set(j, tempSim);
            String tempName = names.get(i);
            names.set(i, names.get(j));
            names.set(j, tempName);
        }
    };
    final IndexedSorter sorter = new QuickSort();
    sorter.sort(sortable, 0, names.size());

    final float threshold = 0.0f;
    // Display the matches, skipping the lowest `threshold` fraction
    for (int i = (int) (names.size() * threshold); i < names.size(); i++) {
        System.out.println(similarities.get(i) + " ... " + names.get(i));
    }
    */
}