Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
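
A minimal sketch of a typical call, assuming a configured FileSystem and a purely illustrative /data/logs/*.avro pattern:

FileSystem fs = FileSystem.get(new Configuration());
// A glob that matches nothing yields an empty array; null is returned
// only when a non-glob path does not exist.
FileStatus[] matches = fs.globStatus(new Path("/data/logs/*.avro"));
if (matches != null) {
    for (FileStatus match : matches) {
        System.out.println(match.getPath());
    }
}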

Usage

From source file:test.transfer.parse.LoadSemanticAnalyzer.java

License:Apache License

public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path) throws IOException {
    FileStatus[] srcs = fs.globStatus(path);
    if ((srcs != null) && srcs.length == 1) {
        if (srcs[0].isDir()) {
            srcs = fs.listStatus(srcs[0].getPath());
        }
    }
    return (srcs);
}
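
A hypothetical call site for the helper above (the staging path is an assumption, used only for illustration):

FileSystem fs = FileSystem.get(new Configuration());
// expands the glob; if it resolves to a single directory, its children are returned instead
FileStatus[] sources = LoadSemanticAnalyzer.matchFilesOrDir(fs, new Path("/user/hive/staging/part-*"));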

From source file:voldemort.store.readonly.mr.utils.AvroUtils.java

License:Apache License

/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception
 * 
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */

@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {

    try {
        if (fs.isFile(path)) {
            // this is a normal Avro file; read the schema from the container header.
            // Any IOException propagates to the enclosing catch block below.
            BufferedInputStream inStream = new BufferedInputStream(fs.open(path));
            GenericDatumReader datum = new GenericDatumReader();
            DataFileStream reader = new DataFileStream(inStream, datum);
            return reader.getSchema();
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }

            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogenous schemas: found both '" + schema.toString() + "' and '"
                                + schemas.get(i).toString() + "'.");

                return schema;
            } else {
                throw new IllegalArgumentException("No Valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        // logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }

}
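
The file branch above amounts to reading the writer schema out of an Avro container header. A standalone sketch of that step, with the path assumed for illustration and the stream closed via try-with-resources:

FileSystem fs = FileSystem.get(new Configuration());
try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
        new BufferedInputStream(fs.open(new Path("/data/part-00000.avro"))),
        new GenericDatumReader<GenericRecord>())) {
    // the schema lives in the file header, so no records need to be read
    Schema schema = reader.getSchema();
    System.out.println(schema.toString(true));
}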

From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java

License:Apache License

/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception
 * 
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema)
        throws IOException {
    try {
        if (fs.isFile(path)) {
            // this is a normal file, get a schema from it
            Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
            if (!m.containsKey("value.schema") || !m.containsKey("key.schema"))
                throw new IllegalArgumentException("No schema found on file " + path.toString());
            return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
                    JsonTypeDefinition.fromJson(m.get("value.schema")));
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<JsonSchema> schemas = new ArrayList<JsonSchema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }

            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                JsonSchema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogenous schemas: found both '" + schema.toString() + "' and '"
                                + schemas.get(i).toString() + "'.");

                return schema;
            } else {
                throw new IllegalArgumentException("No Valid metedata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}
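
Because this variant is public, it can be called directly; a hedged usage sketch with an assumed store path:

FileSystem fs = FileSystem.get(new Configuration());
// checks every matched file for a consistent key/value schema before returning it
JsonSchema schema = HadoopUtils.getSchemaFromPath(fs, new Path("/data/read-only-store/*"), true);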