Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException

Source Link

Document

Return all the files that match pathPattern and are not checksum files.

Usage

From source file:test.transfer.parse.LoadSemanticAnalyzer.java

License:Apache License

/**
 * Resolves a path or glob pattern to the file statuses it matches.
 * <p>
 * When the pattern matches exactly one entry and that entry is a directory,
 * the directory's own listing is returned instead of the directory itself.
 * In every other case the glob result is returned untouched, which means the
 * result is {@code null} whenever {@code globStatus} returned {@code null}.
 *
 * @param fs the filesystem to query
 * @param path a concrete path or a glob pattern
 * @return the matched statuses, or the children of a single matched directory
 * @throws IOException if the filesystem calls fail
 */
public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path) throws IOException {
    FileStatus[] matches = fs.globStatus(path);
    boolean singleMatch = matches != null && matches.length == 1;
    if (singleMatch && matches[0].isDir()) {
        // A lone directory match is replaced by its direct children.
        return fs.listStatus(matches[0].getPath());
    }
    return matches;
}

From source file:voldemort.store.readonly.mr.utils.AvroUtils.java

License:Apache License

/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception/*w w w.j a v a2 s.  co m*/
 * 
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */

@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {

    try {
        if (fs.isFile(path)) {
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            GenericDatumReader datum = new GenericDatumReader();

            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            return reader.getSchema();
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }

            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogenous schemas: found both '" + schema.toString() + "' and '"
                                + schemas.get(i).toString() + "'.");

                return schema;
            } else {
                throw new IllegalArgumentException("No Valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        // logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }

}

From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java

License:Apache License

/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception/*w ww  .jav a  2  s .  c  o  m*/
 * 
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema)
        throws IOException {
    try {
        if (fs.isFile(path)) {
            // this is a normal file, get a schema from it
            Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
            if (!m.containsKey("value.schema") || !m.containsKey("key.schema"))
                throw new IllegalArgumentException("No schema found on file " + path.toString());
            return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
                    JsonTypeDefinition.fromJson(m.get("value.schema")));
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<JsonSchema> schemas = new ArrayList<JsonSchema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }

            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                JsonSchema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogenous schemas: found both '" + schema.toString() + "' and '"
                                + schemas.get(i).toString() + "'.");

                return schema;
            } else {
                throw new IllegalArgumentException("No Valid metedata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}