Example usage for org.apache.hadoop.fs FileSystem getFileStatus

List of usage examples for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Source Link

Document

Return a file status object that represents the path.

Usage

From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java

License:Open Source License

/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speed up the processing if it exists.
 * A path whose extension (ignoring any compression extension) is "rtree" is
 * accepted immediately. Otherwise, this function opens the file, transparently
 * decompressing it if a codec matches its name, and reads as many leading
 * bytes as the R-tree signature is long. If they match the R-tree signature,
 * true is returned. Otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file listed
 * in the global index of that directory is tested; a directory without a
 * global index, or with an empty one, is reported as not R-tree indexed.
 *
 * @param fs the file system that contains the path
 * @param path a file or a directory to test
 * @return true if the tested file starts with the R-tree signature
 * @throws IOException if the file status cannot be retrieved or the file
 *         cannot be opened or read
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;

    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        java.util.Iterator<Partition> cells = gIndex.iterator();
        // An empty index has no data file that could carry the signature
        if (!cells.hasNext())
            return false;
        fileToCheck = new Path(path, cells.next().filename);
    } else {
        fileToCheck = file.getPath();
    }

    InputStream fileIn = fs.open(fileToCheck);
    Decompressor decompressor = null;
    try {
        // Check if file is compressed
        CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
        if (codec != null) {
            synchronized (compressionCodecs) {
                // CodecPool is not thread-safe
                decompressor = CodecPool.getDecompressor(codec);
            }
            fileIn = codec.createInputStream(fileIn, decompressor);
        }

        // A single read() call may return fewer bytes than requested, so keep
        // reading until the signature buffer is full or the stream ends
        byte[] signature = new byte[RTreeFileMarkerB.length];
        int filled = 0;
        while (filled < signature.length) {
            int count = fileIn.read(signature, filled, signature.length - filled);
            if (count < 0)
                break;
            filled += count;
        }
        // A file shorter than the signature cannot be an R-tree file
        return filled == signature.length && Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
    } finally {
        // Release the stream and the pooled decompressor even if reading failed
        try {
            fileIn.close();
        } finally {
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}

From source file:edu.umn.cs.spatialHadoop.indexing.Indexer.java

License:Open Source License

/**
 * Builds an index for the input on the local machine, without launching a
 * distributed job. The input is read split by split, each shape is assigned
 * to one partition (or to every overlapping partition when the index
 * replicates), and the records are written through an
 * {@link IndexRecordWriter}. Finally, the master file of the index is
 * converted into a tab-separated WKT file stored next to it.
 *
 * @param inPath the input file or directory to index
 * @param outPath the directory in which the index is written
 * @param params job parameters; the index type is read from key "sindex"
 * @throws IOException if reading the input or writing the index fails
 * @throws InterruptedException if reading the input is interrupted
 */
private static void indexLocal(Path inPath, final Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(params);
    final Configuration conf = job.getConfiguration();

    final String sindex = conf.get("sindex");

    // Start reading input file
    List<InputSplit> splits = new ArrayList<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    FileSystem inFs = inPath.getFileSystem(conf);
    FileStatus inFStatus = inFs.getFileStatus(inPath);
    if (inFStatus != null && !inFStatus.isDir()) {
        // One file, retrieve it immediately.
        // This is useful if the input is a hidden file which is automatically
        // skipped by FileInputFormat.
        splits.add(new FileSplit(inPath, 0, inFStatus.getLen(), new String[0]));
    } else {
        SpatialInputFormat3.setInputPaths(job, inPath);
        for (InputSplit s : inputFormat.getSplits(job))
            splits.add(s);
    }

    // Copy splits to a final array to be used in parallel
    final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
    boolean replicate = PartitionerReplicate.get(sindex);

    // Set input file MBR if not already set
    Rectangle inputMBR = (Rectangle) OperationsParams.getShape(conf, "mbr");
    if (inputMBR == null) {
        inputMBR = FileMBR.fileMBR(inPath, new OperationsParams(conf));
        OperationsParams.setShape(conf, "mbr", inputMBR);
    }

    setLocalIndexer(conf, sindex);
    final Partitioner partitioner = createPartitioner(inPath, outPath, conf, sindex);

    final IndexRecordWriter<Shape> recordWriter = new IndexRecordWriter<Shape>(partitioner, replicate, sindex,
            outPath, conf);
    for (FileSplit fsplit : fsplits) {
        RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
        try {
            // Only these reader types can be initialized directly from a
            // configuration instead of a task attempt context
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(fsplit, conf);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(fsplit, conf);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(fsplit, conf);
            } else {
                throw new RuntimeException("Unknown record reader");
            }

            final IntWritable partitionID = new IntWritable();

            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                if (replicate) {
                    // Write the shape to every partition it overlaps
                    for (final Shape s : shapes) {
                        partitioner.overlapPartitions(s, new ResultCollector<Integer>() {
                            @Override
                            public void collect(Integer id) {
                                partitionID.set(id);
                                try {
                                    recordWriter.write(partitionID, s);
                                } catch (IOException e) {
                                    // collect() cannot throw a checked exception
                                    throw new RuntimeException(e);
                                }
                            }
                        });
                    }
                } else {
                    // Write the shape to a single partition; -1 means that no
                    // partition was assigned and the shape is skipped
                    for (final Shape s : shapes) {
                        int pid = partitioner.overlapPartition(s);
                        if (pid != -1) {
                            partitionID.set(pid);
                            recordWriter.write(partitionID, s);
                        }
                    }
                }
            }
        } finally {
            // Release the reader even if reading or writing a record failed
            reader.close();
        }
    }
    recordWriter.close(null);

    // Write the WKT formatted master file
    Path masterPath = new Path(outPath, "_master." + sindex);
    FileSystem outFs = outPath.getFileSystem(params);
    Path wktPath = new Path(outPath, "_" + sindex + ".wkt");
    PrintStream wktOut = new PrintStream(outFs.create(wktPath));
    try {
        wktOut.println("ID\tBoundaries\tRecord Count\tSize\tFile name");
        Text tempLine = new Text2();
        Partition tempPartition = new Partition();
        LineReader in = new LineReader(outFs.open(masterPath));
        try {
            while (in.readLine(tempLine) > 0) {
                tempPartition.fromText(tempLine);
                wktOut.println(tempPartition.toWKT());
            }
        } finally {
            in.close();
        }
    } finally {
        wktOut.close();
    }
}

From source file:edu.umn.cs.spatialHadoop.io.RandomCompressedInputStream.java

License:Open Source License

/**
 * Creates a stream over the file at the given path. The file is opened with
 * {@code fs.open(p)} and its length is obtained from
 * {@code fs.getFileStatus(p)}; both values are forwarded to another
 * constructor of this class.
 *
 * @param fs the file system used to open the file and to look up its length
 * @param p the path of the file to read
 * @throws IOException if the file cannot be opened or its status cannot be
 *         retrieved
 */
public RandomCompressedInputStream(FileSystem fs, Path p) throws IOException {
    // NOTE(review): the stream is opened before the status lookup, so a failure
    // in getFileStatus appears to leave the opened stream unclosed - confirm
    this(fs.open(p), fs.getFileStatus(p).getLen());
}

From source file:edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat.java

License:Apache License

/**
 * Splits one file into block-sized {@link FileSplit}s and appends them to the
 * given list. An empty file produces a single zero-length split with no
 * hosts. For a non-empty file, splits of one block each are created as long
 * as the remaining bytes exceed {@code SPLIT_SLOP} blocks; whatever remains
 * becomes the last split, located on the hosts of the last block.
 *
 * @param job the job configuration used to resolve the file system
 * @param path the file to split
 * @param splits the list to which the created splits are added
 * @throws IOException if the file status or block locations cannot be
 *         retrieved
 */
public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

    if (length == 0) {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
        return;
    }

    // Each split covers exactly one block of the file
    long splitSize = file.getBlockSize();

    long bytesRemaining = length;
    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
        splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
        bytesRemaining -= splitSize;
    }

    // The tail of the file goes into one last split
    if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                blkLocations[blkLocations.length - 1].getHosts()));
    }
}

From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java

License:Open Source License

/**
 * Collects the status of all files to be read under the given path.
 * Without a global index (or without a filter to apply to it), the path is
 * listed recursively; a file whose name ends with ".list" is not added
 * itself but is read line by line, each line naming a file relative to the
 * directory of the list file. With a global index, only the files of the
 * cells selected by the filter are added.
 *
 * @param fs the file system that contains the path
 * @param dir a directory, or a wildcard pattern, to list
 * @param result the list to which matching file statuses are added
 * @param filter selects the cells of the global index to read; if null, all
 *        files are listed
 * @throws IOException if listing a directory or reading a list file fails
 */
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null || filter == null) {
        // No global index or no filter, which means all files have to be listed
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Nothing matched; some file systems report this as null
        if (listStatus == null)
            return;
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else if (status.getPath().getName().toLowerCase(java.util.Locale.ROOT).endsWith(".list")) {
                // A list file: add the files it names instead of the file itself
                LineRecordReader in = new LineRecordReader(fs.open(status.getPath()), 0, status.getLen(),
                        Integer.MAX_VALUE);
                try {
                    LongWritable key = in.createKey();
                    Text value = in.createValue();
                    while (in.next(key, value)) {
                        result.add(fs.getFileStatus(new Path(status.getPath().getParent(), value.toString())));
                    }
                } finally {
                    in.close();
                }
            } else {
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                Path cell_path = new Path(indexDir, partition.filename);
                try {
                    if (!fs.exists(cell_path)) {
                        // Skip the missing cell instead of failing on its status
                        LOG.warn("Matched file not found: " + cell_path);
                        return;
                    }
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    LOG.warn("Error retrieving the status of matched file: " + cell_path, e);
                }
            }
        });
    }
}

From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java

License:Open Source License

/**
 * Decides whether a file may be divided into several splits. A file is kept
 * whole if it is an HDF file, if it is compressed with a codec that is not
 * splittable, if it is read over HTTP, if it is smaller than 150MB, or if
 * it is indexed with an R-tree. If the file cannot be inspected, the
 * decision is delegated to the parent class.
 */
@Override
protected boolean isSplitable(FileSystem fs, Path file) {
    // HDF files are not splittable
    final String lowerCaseName = file.getName().toLowerCase();
    if (lowerCaseName.endsWith(".hdf")) {
        return false;
    }

    final CompressionCodec fileCodec = compressionCodecs.getCodec(file);
    final boolean unsplittableCodec = fileCodec != null && !(fileCodec instanceof SplittableCompressionCodec);
    // A file read over HTTP is never split so that it does not have to be
    // opened to look for an R-tree signature in its first 8 bytes
    if (unsplittableCodec || fs instanceof HTTPFileSystem) {
        return false;
    }

    // Files below this size are kept whole to perform better with many small files
    final long minSplittableLength = 150 * 1024 * 1024;
    try {
        final long fileLength = fs.getFileStatus(file).getLen();
        return fileLength >= minSplittableLength && !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        return super.isSplitable(fs, file);
    }
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java

License:Open Source License

/**
 * Collects the status of all files to be read under the given path.
 * Without a global index, or without a filter to apply to it, the path is
 * listed recursively and every file found is added. With both, only the
 * files of the cells selected by the filter are added.
 *
 * @param fs the file system that contains the path
 * @param dir a directory, or a wildcard pattern, to list
 * @param result the list to which matching file statuses are added
 * @param filter selects the cells of the global index to read; if null, all
 *        files are listed
 * @throws IOException if listing a directory fails
 */
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null || filter == null) {
        // No global index which means we cannot use the filter function
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Nothing matched; some file systems report this as null
        if (listStatus == null)
            return;
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                // Recursively go in subdir
                listStatus(fs, status.getPath(), result, filter);
            } else {
                // A file, just add it
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                Path cell_path = new Path(indexDir, partition.filename);
                try {
                    if (!fs.exists(cell_path)) {
                        // Skip the missing cell instead of failing on its status
                        LOG.warn("Matched file not found: " + cell_path);
                        return;
                    }
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    LOG.warn("Error retrieving the status of matched file: " + cell_path, e);
                }
            }
        });
    }
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java

License:Open Source License

/**
 * Decides whether a file may be divided into several splits. A file is kept
 * whole if it is an HDF file, if it is compressed with a codec that is not
 * splittable, if it is read over HTTP, if it is smaller than 150MB, or if
 * it is indexed with an R-tree. If the file cannot be inspected, a warning
 * is logged and the file is not split.
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Lazily create the codec factory needed to detect compressed files
        if (compressionCodecs == null) {
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        }
        final FileSystem fileSystem = file.getFileSystem(context.getConfiguration());

        // HDF files are not splittable
        final boolean hdfFile = file.getName().toLowerCase().endsWith(".hdf");
        if (hdfFile) {
            return false;
        }

        final CompressionCodec fileCodec = compressionCodecs.getCodec(file);
        final boolean unsplittableCodec = fileCodec != null
                && !(fileCodec instanceof SplittableCompressionCodec);
        // A file read over HTTP is never split so that it does not have to be
        // opened to look for an R-tree signature in its first 8 bytes
        if (unsplittableCodec || fileSystem instanceof HTTPFileSystem) {
            return false;
        }

        // Files below 150MB are kept whole to perform better with many small files
        final long fileLength = fileSystem.getFileStatus(file).getLen();
        return fileLength >= 150 * 1024 * 1024 && !SpatialSite.isRTree(fileSystem, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer to not split it
    }
}

From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java

License:Open Source License

/**
 * Merges a set of indexes into larger indexes. The source indexes are the
 * entries of the source directory, optionally limited by the "time"
 * parameter. Consecutive source indexes whose names map to the same
 * destination name are merged tile by tile into one destination index.
 * An existing destination tile is rebuilt only if one of its source tiles
 * has a newer modification time.
 *
 * @param fs the file system that holds both source and destination indexes
 * @param srcIndexDir the directory that contains the source indexes
 * @param dstIndexDir the directory in which merged indexes are created
 * @param srcFormat parses the name of a source index into a date
 * @param dstFormat formats that date into the name of the destination index
 * @param params operation parameters; key "time" optionally limits the
 *        source indexes that are considered
 * @throws IOException if the source indexes cannot be listed
 * @throws ParseException if the name of a source index cannot be parsed
 * @throws InterruptedException if the parallel merge is interrupted
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
        SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
        throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
            : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    /*A regular expression to catch the tile identifier of a MODIS grid cell*/
    final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");

    // Scan the source indexes and merge each consecutive run belonging to the
    // same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
                .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;

        // Merge all source indexes in the range [i1, i2) into one dest index

        // Copy i1, i2 to other variables as final to be accessible from threads
        final int firstIndex = i1;
        final int lastIndex = i2;

        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with itself or any following entry. The bound
            // includes the last entry so that every position can be chosen.
            int j = i + rand.nextInt(tilesInFirstDay.length - i);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }
        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    FileStatus tileInFirstDay = tilesInFirstDay[i_file];
                    try {
                        // Extract tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }

                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                    tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                        + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                            else
                                // Ambiguous match; none of the files is merged
                                LOG.warn("Found " + matchedTileFile.length + " files for tile " + tileID
                                        + " in dir " + sourceIndexes[iDailyIndex].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // Destination file already exists
                            // Check the date of the destination and source files to see
                            // whether it needs to be updated or not
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            else
                                LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge into a temporary file with a random unused
                        // name, then move it to its final location
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                                tmpFile);
                        // Several threads may try to create the same directory
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        // rename reports some failures through its return value only
                        if (!fs.rename(tmpFile, destIndexFile))
                            LOG.warn("Could not rename " + tmpFile + " to " + destIndexFile);
                    } catch (IOException e) {
                        // Keep merging the remaining tiles
                        LOG.warn("Error merging tile " + tileInFirstDay.getPath(), e);
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java

License:Open Source License

/**
 * Calculates the union of a set of shapes categorized by some user defined
 * category./*from w  w w . ja  v  a  2  s  .  com*/
 * @param shapeFile
 * @param categoryFile
 * @return
 * @throws IOException
 */
public static Map<Integer, OGCGeometry> unionLocal(Path shapeFile, Path categoryFile) throws IOException {
    long t1 = System.currentTimeMillis();
    // 1- Build a hashtable of categories (given their size is small)
    Map<Integer, Integer> idToCategory = new HashMap<Integer, Integer>();
    readCategories(categoryFile, idToCategory);
    long t2 = System.currentTimeMillis();

    // 2- Read shapes from the shape file and relate each one to a category

    // Prepare a hash that stores shapes in each category
    Map<Integer, Vector<OGCGeometry>> categoryShapes = new HashMap<Integer, Vector<OGCGeometry>>();

    FileSystem fs1 = shapeFile.getFileSystem(new Configuration());
    long file_size1 = fs1.getFileStatus(shapeFile).getLen();

    ShapeIterRecordReader shapeReader = new ShapeIterRecordReader(fs1.open(shapeFile), 0, file_size1);
    Rectangle cellInfo = shapeReader.createKey();
    ShapeIterator shapes = shapeReader.createValue();

    while (shapeReader.next(cellInfo, shapes)) {
        for (Shape shape : shapes) {
            //int shape_zip = Integer.parseInt(shape.extra.split(",", 7)[5]);
            //Integer category = idToCategory.get(shape_zip);
            Integer category = null;
            if (category != null) {
                Vector<OGCGeometry> geometries = categoryShapes.get(category);
                if (geometries == null) {
                    geometries = new Vector<OGCGeometry>();
                    categoryShapes.put(category, geometries);
                }
                geometries.add(((OGCESRIShape) shape).geom);
            }
        }
    }

    shapeReader.close();
    long t3 = System.currentTimeMillis();

    // 3- Find the union of each category
    Map<Integer, OGCGeometry> final_result = new HashMap<Integer, OGCGeometry>();
    for (Map.Entry<Integer, Vector<OGCGeometry>> category : categoryShapes.entrySet()) {
        if (!category.getValue().isEmpty()) {
            OGCGeometryCollection geom_collection = new OGCConcreteGeometryCollection(category.getValue(),
                    category.getValue().firstElement().esriSR);
            OGCGeometry union = geom_collection.union(category.getValue().firstElement());
            final_result.put(category.getKey(), union);
            // Free up some memory
            category.getValue().clear();
        }
    }
    long t4 = System.currentTimeMillis();

    System.out.println("Time reading categories: " + (t2 - t1) + " millis");
    System.out.println("Time reading records: " + (t3 - t2) + " millis");
    System.out.println("Time union categories: " + (t4 - t3) + " millis");

    return final_result;
}