List of usage examples for org.apache.hadoop.fs FileSystem getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
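Before the sources below, a minimal sketch of the typical call pattern. This is an illustrative example, not taken from any of the source files; the class name and path are assumptions:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GetFileStatusExample {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            Path path = new Path("/user/hadoop/data.txt"); // hypothetical path
            FileSystem fs = path.getFileSystem(conf);
            // getFileStatus returns the metadata of a single path; it throws
            // FileNotFoundException (an IOException) if the path does not exist
            FileStatus status = fs.getFileStatus(path);
            System.out.println("Length: " + status.getLen());
            // isDirectory() replaces the deprecated isDir() used in the older examples below
            System.out.println("Is directory: " + status.isDirectory());
            System.out.println("Modified: " + status.getModificationTime());
        }
    }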
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speed up the processing if it exists.
 * This function opens the specified file and reads the first eight bytes,
 * which include the R-tree signature. If the signature matches the
 * R-tree signature, true is returned; otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file in that
 * directory is tested.
 * @param fs
 * @param path
 * @return
 * @throws IOException
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;
    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);
    // Check if file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        synchronized (compressionCodecs) {
            // CodecPool is not thread-safe
            decompressor = CodecPool.getDecompressor(codec);
        }
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature);
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}
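As a usage note, a caller would typically gate its processing path on this check. A minimal sketch; the method name, parameters, and branch bodies are assumptions for illustration (imports omitted):

    // Hypothetical call site for SpatialSite.isRTree
    static void process(FileSystem fs, Path inputPath) throws IOException {
        if (SpatialSite.isRTree(fs, inputPath)) {
            // the file carries an R-tree index; use it to prune the search space
        } else {
            // no index is present; fall back to a full scan of the file
        }
    }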
From source file:edu.umn.cs.spatialHadoop.indexing.Indexer.java
License:Open Source License
private static void indexLocal(Path inPath, final Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(params);
    final Configuration conf = job.getConfiguration();
    final String sindex = conf.get("sindex");
    // Start reading input file
    List<InputSplit> splits = new ArrayList<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    FileSystem inFs = inPath.getFileSystem(conf);
    FileStatus inFStatus = inFs.getFileStatus(inPath);
    if (inFStatus != null && !inFStatus.isDir()) {
        // One file, retrieve it immediately.
        // This is useful if the input is a hidden file which is automatically
        // skipped by FileInputFormat. We need to plot a hidden file for the case
        // of plotting partition boundaries of a spatial index
        splits.add(new FileSplit(inPath, 0, inFStatus.getLen(), new String[0]));
    } else {
        SpatialInputFormat3.setInputPaths(job, inPath);
        for (InputSplit s : inputFormat.getSplits(job))
            splits.add(s);
    }
    // Copy splits to a final array to be used in parallel
    final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
    boolean replicate = PartitionerReplicate.get(sindex);

    // Set input file MBR if not already set
    Rectangle inputMBR = (Rectangle) OperationsParams.getShape(conf, "mbr");
    if (inputMBR == null) {
        inputMBR = FileMBR.fileMBR(inPath, new OperationsParams(conf));
        OperationsParams.setShape(conf, "mbr", inputMBR);
    }

    setLocalIndexer(conf, sindex);
    final Partitioner partitioner = createPartitioner(inPath, outPath, conf, sindex);
    final IndexRecordWriter<Shape> recordWriter =
            new IndexRecordWriter<Shape>(partitioner, replicate, sindex, outPath, conf);

    for (FileSplit fsplit : fsplits) {
        RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
        if (reader instanceof SpatialRecordReader3) {
            ((SpatialRecordReader3) reader).initialize(fsplit, conf);
        } else if (reader instanceof RTreeRecordReader3) {
            ((RTreeRecordReader3) reader).initialize(fsplit, conf);
        } else if (reader instanceof HDFRecordReader) {
            ((HDFRecordReader) reader).initialize(fsplit, conf);
        } else {
            throw new RuntimeException("Unknown record reader");
        }
        final IntWritable partitionID = new IntWritable();
        while (reader.nextKeyValue()) {
            Iterable<Shape> shapes = reader.getCurrentValue();
            if (replicate) {
                for (final Shape s : shapes) {
                    partitioner.overlapPartitions(s, new ResultCollector<Integer>() {
                        @Override
                        public void collect(Integer id) {
                            partitionID.set(id);
                            try {
                                recordWriter.write(partitionID, s);
                            } catch (IOException e) {
                                throw new RuntimeException(e);
                            }
                        }
                    });
                }
            } else {
                for (final Shape s : shapes) {
                    int pid = partitioner.overlapPartition(s);
                    if (pid != -1) {
                        partitionID.set(pid);
                        recordWriter.write(partitionID, s);
                    }
                }
            }
        }
        reader.close();
    }
    recordWriter.close(null);

    // Write the WKT formatted master file
    Path masterPath = new Path(outPath, "_master." + sindex);
    FileSystem outFs = outPath.getFileSystem(params);
    Path wktPath = new Path(outPath, "_" + sindex + ".wkt");
    PrintStream wktOut = new PrintStream(outFs.create(wktPath));
    wktOut.println("ID\tBoundaries\tRecord Count\tSize\tFile name");
    Text tempLine = new Text2();
    Partition tempPartition = new Partition();
    LineReader in = new LineReader(outFs.open(masterPath));
    while (in.readLine(tempLine) > 0) {
        tempPartition.fromText(tempLine);
        wktOut.println(tempPartition.toWKT());
    }
    in.close();
    wktOut.close();
}
From source file:edu.umn.cs.spatialHadoop.io.RandomCompressedInputStream.java
License:Open Source License
public RandomCompressedInputStream(FileSystem fs, Path p) throws IOException {
    this(fs.open(p), fs.getFileStatus(p).getLen());
}
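This one-liner shows a common idiom: getFileStatus(p).getLen() supplies the file's total length to the delegating constructor so the stream knows where the compressed data ends. A hedged usage sketch; the file path is an assumption, and the assignment to java.io.InputStream assumes the class extends it:

    import java.io.InputStream;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import edu.umn.cs.spatialHadoop.io.RandomCompressedInputStream;

    public class RandomCompressedRead {
        public static void main(String[] args) throws Exception {
            Path p = new Path(args[0]); // a file written by the matching compressed output stream
            FileSystem fs = p.getFileSystem(new Configuration());
            // The constructor above sizes the stream via fs.getFileStatus(p).getLen()
            InputStream in = new RandomCompressedInputStream(fs, p);
            System.out.println("First byte: " + in.read());
            in.close();
        }
    }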
From source file:edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat.java
License:Apache License
public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;
        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) {
        // Note: this branch is unreachable as written. In Hadoop's FileInputFormat,
        // which this method mirrors, the first condition also checks isSplitable(fs, path),
        // making this the single-split case for non-splittable files.
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java
License:Open Source License
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null) {
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else if (status.getPath().getName().toLowerCase().endsWith(".list")) {
                LineRecordReader in = new LineRecordReader(fs.open(status.getPath()), 0, status.getLen(),
                        Integer.MAX_VALUE);
                LongWritable key = in.createKey();
                Text value = in.createValue();
                while (in.next(key, value)) {
                    result.add(fs.getFileStatus(new Path(status.getPath().getParent(), value.toString())));
                }
                in.close();
            } else {
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(indexDir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java
License:Open Source License
@Override
protected boolean isSplitable(FileSystem fs, Path file) {
    // HDF files are not splittable
    if (file.getName().toLowerCase().endsWith(".hdf"))
        return false;
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null && !(codec instanceof SplittableCompressionCodec))
        return false;
    try {
        // To avoid opening the file and checking the first 8 bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file smaller than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        return super.isSplitable(fs, file);
    }
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License:Open Source License
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null || filter == null) {
        // No global index, which means we cannot use the filter function
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                // Recursively go into the subdirectory
                listStatus(fs, status.getPath(), result, filter);
            } else {
                // A file, just add it
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(indexDir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License:Open Source License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Create compressionCodecs to be used by the isSplitable method
        if (compressionCodecs == null)
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // HDF files are not splittable
        if (file.getName().toLowerCase().endsWith(".hdf"))
            return false;
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null && !(codec instanceof SplittableCompressionCodec))
            return false;
        // To avoid opening the file and checking the first 8 bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file smaller than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer not to split it
    }
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Merges a set of indexes into larger indexes
 * @param fs
 * @param srcIndexDir
 * @param dstIndexDir
 * @param srcFormat
 * @param dstFormat
 * @param params
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
        SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
        throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
            : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    // Scan the source indexes and merge each consecutive run belonging to the same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
                .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;
        // Merge all source indexes in the range [i1, i2) into one dest index
        // Copy i1, i2 to other variables as final to be accessible from threads
        final int firstIndex = i1;
        final int lastIndex = i2;
        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes
        /* A regular expression to catch the tile identifier of a MODIS grid cell */
        final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with any following entry
            int j = i + rand.nextInt(tilesInFirstDay.length - i - 1);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }
        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    try {
                        FileStatus tileInFirstDay = tilesInFirstDay[i_file];

                        // Extract tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }
                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                    tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                        + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // Destination file already exists
                            // Check the dates of the destination and source files to see
                            // whether it needs to be updated or not
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            else
                                LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                                tmpFile);
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        fs.rename(tmpFile, destIndexFile);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java
License:Open Source License
/**
 * Calculates the union of a set of shapes categorized by some user-defined
 * category.
 * @param shapeFile
 * @param categoryFile
 * @return
 * @throws IOException
 */
public static Map<Integer, OGCGeometry> unionLocal(Path shapeFile, Path categoryFile) throws IOException {
    long t1 = System.currentTimeMillis();
    // 1- Build a hashtable of categories (given their size is small)
    Map<Integer, Integer> idToCategory = new HashMap<Integer, Integer>();
    readCategories(categoryFile, idToCategory);
    long t2 = System.currentTimeMillis();

    // 2- Read shapes from the shape file and relate each one to a category

    // Prepare a hash that stores shapes in each category
    Map<Integer, Vector<OGCGeometry>> categoryShapes = new HashMap<Integer, Vector<OGCGeometry>>();
    FileSystem fs1 = shapeFile.getFileSystem(new Configuration());
    long file_size1 = fs1.getFileStatus(shapeFile).getLen();
    ShapeIterRecordReader shapeReader = new ShapeIterRecordReader(fs1.open(shapeFile), 0, file_size1);
    Rectangle cellInfo = shapeReader.createKey();
    ShapeIterator shapes = shapeReader.createValue();

    while (shapeReader.next(cellInfo, shapes)) {
        for (Shape shape : shapes) {
            //int shape_zip = Integer.parseInt(shape.extra.split(",", 7)[5]);
            //Integer category = idToCategory.get(shape_zip);
            // Note: with the lookup above commented out, category is always null
            // here, so no shapes are collected as this code stands
            Integer category = null;
            if (category != null) {
                Vector<OGCGeometry> geometries = categoryShapes.get(category);
                if (geometries == null) {
                    geometries = new Vector<OGCGeometry>();
                    categoryShapes.put(category, geometries);
                }
                geometries.add(((OGCESRIShape) shape).geom);
            }
        }
    }
    shapeReader.close();
    long t3 = System.currentTimeMillis();

    // 3- Find the union of each category
    Map<Integer, OGCGeometry> final_result = new HashMap<Integer, OGCGeometry>();
    for (Map.Entry<Integer, Vector<OGCGeometry>> category : categoryShapes.entrySet()) {
        if (!category.getValue().isEmpty()) {
            OGCGeometryCollection geom_collection = new OGCConcreteGeometryCollection(category.getValue(),
                    category.getValue().firstElement().esriSR);
            OGCGeometry union = geom_collection.union(category.getValue().firstElement());
            final_result.put(category.getKey(), union);
            // Free up some memory
            category.getValue().clear();
        }
    }
    long t4 = System.currentTimeMillis();

    System.out.println("Time reading categories: " + (t2 - t1) + " millis");
    System.out.println("Time reading records: " + (t3 - t2) + " millis");
    System.out.println("Time union categories: " + (t4 - t3) + " millis");

    return final_result;
}