List of usage examples for org.apache.hadoop.fs FileSystem getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
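Before the sources below, a minimal sketch of the typical call pattern. This is an illustrative example, not taken from any of the source files; the class name and path are assumptions:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GetFileStatusExample {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            Path path = new Path("/user/hadoop/data.txt"); // hypothetical path
            FileSystem fs = path.getFileSystem(conf);
            // getFileStatus returns the metadata of a single path; it throws
            // FileNotFoundException (an IOException) if the path does not exist
            FileStatus status = fs.getFileStatus(path);
            System.out.println("Length: " + status.getLen());
            // isDirectory() replaces the deprecated isDir() used in the older examples below
            System.out.println("Is directory: " + status.isDirectory());
            System.out.println("Modified: " + status.getModificationTime());
        }
    }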
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speed up the processing if it exists.
 * This function opens the specified file and reads the first eight bytes,
 * which include the R-tree signature. If the signature matches the
 * R-tree signature, true is returned; otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file in that
 * directory is tested.
 * @param fs
 * @param path
 * @return
 * @throws IOException
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;
    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);
    // Check if file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        synchronized (compressionCodecs) {
            // CodecPool is not thread-safe
            decompressor = CodecPool.getDecompressor(codec);
        }
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature);
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}
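As a usage note, a caller would typically gate its processing path on this check. A minimal sketch; the method name, parameters, and branch bodies are assumptions for illustration (imports omitted):

    // Hypothetical call site for SpatialSite.isRTree
    static void process(FileSystem fs, Path inputPath) throws IOException {
        if (SpatialSite.isRTree(fs, inputPath)) {
            // the file carries an R-tree index; use it to prune the search space
        } else {
            // no index is present; fall back to a full scan of the file
        }
    }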
From source file:edu.umn.cs.spatialHadoop.indexing.Indexer.java
License:Open Source License
private static void indexLocal(Path inPath, final Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(params);
    final Configuration conf = job.getConfiguration();
    final String sindex = conf.get("sindex");
    // Start reading input file
    List<InputSplit> splits = new ArrayList<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    FileSystem inFs = inPath.getFileSystem(conf);
    FileStatus inFStatus = inFs.getFileStatus(inPath);
    if (inFStatus != null && !inFStatus.isDir()) {
        // One file, retrieve it immediately.
        // This is useful if the input is a hidden file which is automatically
        // skipped by FileInputFormat. We need to plot a hidden file for the case
        // of plotting partition boundaries of a spatial index
        splits.add(new FileSplit(inPath, 0, inFStatus.getLen(), new String[0]));
    } else {
        SpatialInputFormat3.setInputPaths(job, inPath);
        for (InputSplit s : inputFormat.getSplits(job))
            splits.add(s);
    }
    // Copy splits to a final array to be used in parallel
    final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
    boolean replicate = PartitionerReplicate.get(sindex);

    // Set input file MBR if not already set
    Rectangle inputMBR = (Rectangle) OperationsParams.getShape(conf, "mbr");
    if (inputMBR == null) {
        inputMBR = FileMBR.fileMBR(inPath, new OperationsParams(conf));
        OperationsParams.setShape(conf, "mbr", inputMBR);
    }

    setLocalIndexer(conf, sindex);
    final Partitioner partitioner = createPartitioner(inPath, outPath, conf, sindex);
    final IndexRecordWriter<Shape> recordWriter =
            new IndexRecordWriter<Shape>(partitioner, replicate, sindex, outPath, conf);

    for (FileSplit fsplit : fsplits) {
        RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
        if (reader instanceof SpatialRecordReader3) {
            ((SpatialRecordReader3) reader).initialize(fsplit, conf);
        } else if (reader instanceof RTreeRecordReader3) {
            ((RTreeRecordReader3) reader).initialize(fsplit, conf);
        } else if (reader instanceof HDFRecordReader) {
            ((HDFRecordReader) reader).initialize(fsplit, conf);
        } else {
            throw new RuntimeException("Unknown record reader");
        }
        final IntWritable partitionID = new IntWritable();
        while (reader.nextKeyValue()) {
            Iterable<Shape> shapes = reader.getCurrentValue();
            if (replicate) {
                for (final Shape s : shapes) {
                    partitioner.overlapPartitions(s, new ResultCollector<Integer>() {
                        @Override
                        public void collect(Integer id) {
                            partitionID.set(id);
                            try {
                                recordWriter.write(partitionID, s);
                            } catch (IOException e) {
                                throw new RuntimeException(e);
                            }
                        }
                    });
                }
            } else {
                for (final Shape s : shapes) {
                    int pid = partitioner.overlapPartition(s);
                    if (pid != -1) {
                        partitionID.set(pid);
                        recordWriter.write(partitionID, s);
                    }
                }
            }
        }
        reader.close();
    }
    recordWriter.close(null);

    // Write the WKT formatted master file
    Path masterPath = new Path(outPath, "_master." + sindex);
    FileSystem outFs = outPath.getFileSystem(params);
    Path wktPath = new Path(outPath, "_" + sindex + ".wkt");
    PrintStream wktOut = new PrintStream(outFs.create(wktPath));
    wktOut.println("ID\tBoundaries\tRecord Count\tSize\tFile name");
    Text tempLine = new Text2();
    Partition tempPartition = new Partition();
    LineReader in = new LineReader(outFs.open(masterPath));
    while (in.readLine(tempLine) > 0) {
        tempPartition.fromText(tempLine);
        wktOut.println(tempPartition.toWKT());
    }
    in.close();
    wktOut.close();
}
From source file:edu.umn.cs.spatialHadoop.io.RandomCompressedInputStream.java
License:Open Source License
public RandomCompressedInputStream(FileSystem fs, Path p) throws IOException {
    this(fs.open(p), fs.getFileStatus(p).getLen());
}
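This one-liner shows a common idiom: getFileStatus(p).getLen() supplies the file's total length to the delegating constructor so the stream knows where the compressed data ends. A hedged usage sketch; the file path is an assumption, and the assignment to java.io.InputStream assumes the class extends it:

    import java.io.InputStream;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import edu.umn.cs.spatialHadoop.io.RandomCompressedInputStream;

    public class RandomCompressedRead {
        public static void main(String[] args) throws Exception {
            Path p = new Path(args[0]); // a file written by the matching compressed output stream
            FileSystem fs = p.getFileSystem(new Configuration());
            // The constructor above sizes the stream via fs.getFileStatus(p).getLen()
            InputStream in = new RandomCompressedInputStream(fs, p);
            System.out.println("First byte: " + in.read());
            in.close();
        }
    }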
From source file:edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat.java
License:Apache License
public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;
        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) {
        // Note: this branch is unreachable as written. In Hadoop's FileInputFormat,
        // which this method mirrors, the first condition also checks isSplitable(fs, path),
        // making this the single-split case for non-splittable files.
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java
License:Open Source License
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null) {
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else if (status.getPath().getName().toLowerCase().endsWith(".list")) {
                LineRecordReader in = new LineRecordReader(fs.open(status.getPath()), 0, status.getLen(),
                        Integer.MAX_VALUE);
                LongWritable key = in.createKey();
                Text value = in.createValue();
                while (in.next(key, value)) {
                    result.add(fs.getFileStatus(new Path(status.getPath().getParent(), value.toString())));
                }
                in.close();
            } else {
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(indexDir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java
License:Open Source License
@Override
protected boolean isSplitable(FileSystem fs, Path file) {
    // HDF files are not splittable
    if (file.getName().toLowerCase().endsWith(".hdf"))
        return false;
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null && !(codec instanceof SplittableCompressionCodec))
        return false;
    try {
        // To avoid opening the file and checking the first 8 bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file smaller than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        return super.isSplitable(fs, file);
    }
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License:Open Source License
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null || filter == null) {
        // No global index, which means we cannot use the filter function
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                // Recursively go into the subdirectory
                listStatus(fs, status.getPath(), result, filter);
            } else {
                // A file, just add it
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(indexDir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License:Open Source License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Create compressionCodecs to be used by the isSplitable method
        if (compressionCodecs == null)
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // HDF files are not splittable
        if (file.getName().toLowerCase().endsWith(".hdf"))
            return false;
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null && !(codec instanceof SplittableCompressionCodec))
            return false;
        // To avoid opening the file and checking the first 8 bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file smaller than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer not to split it
    }
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Merges a set of indexes into larger indexes
 * @param fs
 * @param srcIndexDir
 * @param dstIndexDir
 * @param srcFormat
 * @param dstFormat
 * @param params
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
        SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
        throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
            : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    // Scan the source indexes and merge each consecutive run belonging to the same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
                .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;
        // Merge all source indexes in the range [i1, i2) into one dest index
        // Copy i1, i2 to other variables as final to be accessible from threads
        final int firstIndex = i1;
        final int lastIndex = i2;
        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes
        /* A regular expression to catch the tile identifier of a MODIS grid cell */
        final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with any following entry
            int j = i + rand.nextInt(tilesInFirstDay.length - i - 1);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }
        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    try {
                        FileStatus tileInFirstDay = tilesInFirstDay[i_file];

                        // Extract tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }
                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                    tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                        + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // Destination file already exists
                            // Check the dates of the destination and source files to see
                            // whether it needs to be updated or not
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            else
                                LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                                tmpFile);
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        fs.rename(tmpFile, destIndexFile);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java
License:Open Source License
/**
 * Calculates the union of a set of shapes categorized by some user-defined
 * category.
 * @param shapeFile
 * @param categoryFile
 * @return
 * @throws IOException
 */
public static Map<Integer, OGCGeometry> unionLocal(Path shapeFile, Path categoryFile) throws IOException {
    long t1 = System.currentTimeMillis();
    // 1- Build a hashtable of categories (given their size is small)
    Map<Integer, Integer> idToCategory = new HashMap<Integer, Integer>();
    readCategories(categoryFile, idToCategory);
    long t2 = System.currentTimeMillis();

    // 2- Read shapes from the shape file and relate each one to a category

    // Prepare a hash that stores shapes in each category
    Map<Integer, Vector<OGCGeometry>> categoryShapes = new HashMap<Integer, Vector<OGCGeometry>>();
    FileSystem fs1 = shapeFile.getFileSystem(new Configuration());
    long file_size1 = fs1.getFileStatus(shapeFile).getLen();
    ShapeIterRecordReader shapeReader = new ShapeIterRecordReader(fs1.open(shapeFile), 0, file_size1);
    Rectangle cellInfo = shapeReader.createKey();
    ShapeIterator shapes = shapeReader.createValue();

    while (shapeReader.next(cellInfo, shapes)) {
        for (Shape shape : shapes) {
            //int shape_zip = Integer.parseInt(shape.extra.split(",", 7)[5]);
            //Integer category = idToCategory.get(shape_zip);
            // Note: with the lookup above commented out, category is always null
            // here, so no shapes are collected as this code stands
            Integer category = null;
            if (category != null) {
                Vector<OGCGeometry> geometries = categoryShapes.get(category);
                if (geometries == null) {
                    geometries = new Vector<OGCGeometry>();
                    categoryShapes.put(category, geometries);
                }
                geometries.add(((OGCESRIShape) shape).geom);
            }
        }
    }
    shapeReader.close();
    long t3 = System.currentTimeMillis();

    // 3- Find the union of each category
    Map<Integer, OGCGeometry> final_result = new HashMap<Integer, OGCGeometry>();
    for (Map.Entry<Integer, Vector<OGCGeometry>> category : categoryShapes.entrySet()) {
        if (!category.getValue().isEmpty()) {
            OGCGeometryCollection geom_collection = new OGCConcreteGeometryCollection(category.getValue(),
                    category.getValue().firstElement().esriSR);
            OGCGeometry union = geom_collection.union(category.getValue().firstElement());
            final_result.put(category.getKey(), union);
            // Free up some memory
            category.getValue().clear();
        }
    }
    long t4 = System.currentTimeMillis();

    System.out.println("Time reading categories: " + (t2 - t1) + " millis");
    System.out.println("Time reading records: " + (t3 - t2) + " millis");
    System.out.println("Time union categories: " + (t4 - t3) + " millis");

    return final_result;
}