Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException 

Source Link

Document

Returns an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.

Usage

From source file:cascading.avro.AvroScheme.java

License:Apache License

/**
 * This method peeks at the source data to get a schema when none has been provided.
 *
 * <p>The tap identifier is expanded with {@code globStatus}. Every matched directory is listed,
 * and every sub-directory found in that listing is listed once more. The first entry that is a
 * regular file is then opened as an Avro data file and its schema is returned.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object; for a CompositeTap the first child tap is used.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none exists.
 * @throws IOException if the file system cannot be queried or the first file cannot be read.
 */
private Schema getSourceSchema(FlowProcess<JobConf> flowProcess, Tap tap) throws IOException {

    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs. globStatus returns null (not an empty array) when the path has no
    // glob and does not exist; passing that null to Arrays.asList would throw a
    // NullPointerException, so it is treated as "nothing matched".
    final FileStatus[] matches = fs.globStatus(p, filter);
    List<FileStatus> statuses = new LinkedList<FileStatus>();
    if (matches != null) {
        statuses.addAll(Arrays.asList(matches));
    }
    // Now get the things that are below the matched directories. The loop runs over a snapshot
    // because new entries are appended to statuses while iterating.
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir()) {
            for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
        }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) {
            // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                // Close the reader when it was created; otherwise close the bare stream so a
                // failure while constructing the reader does not leak it.
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }

        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}

From source file:cascading.scheme.DeprecatedAvroScheme.java

License:Apache License

/**
 * This method peeks at the source data to get a schema when none has been provided.
 *
 * <p>The tap identifier is expanded with {@code globStatus}; matched directories and the
 * sub-directories found directly inside them are listed, and the first regular file found is
 * opened as an Avro data file to read its schema.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object; for a CompositeTap the first child tap is used.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none exists.
 * @throws IOException if the file system cannot be queried or the first file cannot be read.
 */
private Schema getSourceSchema(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException {

    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs. A null result means the path has no glob and does not exist;
    // without this guard Arrays.asList(null) below would throw a NullPointerException instead
    // of yielding the documented Schema.NULL result.
    final FileStatus[] globbed = fs.globStatus(p, filter);
    if (globbed == null) {
        return Schema.create(Schema.Type.NULL);
    }
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(globbed));
    // Now get the things below the matched directories. Iterating the original array keeps the
    // loop independent of the entries appended to statuses.
    for (FileStatus status : globbed) {
        if (status.isDir()) {
            for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
        }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) {
            // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                // Prefer closing the reader; fall back to the raw stream when the reader could
                // not be constructed.
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }

        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression. /*from   w ww  .j av a2s .  c o  m*/
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);

    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
}

From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java

License:Apache License

/**
 * List input directories. Subclasses may override to, e.g., select only files
 * matching a regular expression.
 *
 * <p>Each configured input path is expanded as a glob. A matched directory contributes the
 * entries of its filtered listing; any other match is returned as is.
 *
 * @param job
 *        the job to list input paths for
 * @return list of FileStatus objects
 * @throws IOException
 *         if no input paths are configured, or (as an InvalidInputException) if any input
 *         path does not exist or matches zero files.
 */
protected List<FileStatus> listStatus(BSPJob job) throws IOException {
    final List<FileStatus> collected = new ArrayList<FileStatus>();
    final Path[] inputDirs = getInputPaths(job);
    if (inputDirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    final List<IOException> problems = new ArrayList<IOException>();

    // Wrap the hidden-file filter in a MultiPathFilter; no user-provided filter is added here.
    final List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(HIDDEN_FILE_FILTER);
    final PathFilter combinedFilter = new MultiPathFilter(filterChain);

    for (Path dir : inputDirs) {
        final FileSystem fs = dir.getFileSystem(job.getConf());
        final FileStatus[] globbed = fs.globStatus(dir, combinedFilter);

        // Problems are collected rather than thrown so that all bad paths are reported together.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + dir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + dir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDir()) {
                collected.add(match);
                continue;
            }
            // A matched directory is replaced by its filtered listing.
            for (FileStatus child : fs.listStatus(match.getPath(), combinedFilter)) {
                collected.add(child);
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    LOG.info("Total input paths to process : " + collected.size());
    return collected;
}

From source file:com.datasalt.utils.mapred.joiner.MultiJoiner.java

License:Apache License

/**
 * Adds a CHANNELED input specification. A channeled input specification is a channel associated to a Mapper and a
 * input file or glob. The user will implement a {@link MultiJoinChanneledMapper} which will be tied to a single
 * channel.
 * <p>
 * The user must be consistent with the channel numbers it provides. For instance, in case that two or more different
 * files must belong to the same channel.
 * <p>
 * A location containing {@code *} is expanded as a glob and a directory location is expanded to its listing, both
 * filtered by the hidden-file filter; each resulting path is registered separately. Any other location is
 * registered as is.
 *
 * @param channel      the channel number the input is tied to
 * @param location     a file, a directory, or a glob containing {@code *}
 * @param channelClass the datum class registered for the channel
 * @param inputFormat  the input format class passed on for each registered path
 * @param mapper       the mapper class passed on for each registered path
 * @return this MultiJoiner, to allow call chaining
 *
 * @throws IOException
 */
public MultiJoiner addChanneledInput(Integer channel, Path location, Class<? extends Object> channelClass,
        Class<? extends InputFormat> inputFormat, Class<? extends MultiJoinChanneledMapper> mapper)
        throws IOException {
    /*
     * Configure the MultiJoiner
     */
    setChannelDatumClass(channel, channelClass);
    final FileSystem fileSystem = FileSystem.get(getJob().getConfiguration());

    final boolean looksLikeGlob = location.toString().indexOf('*') >= 0;
    if (looksLikeGlob) {
        // Expand the glob and register every match.
        for (FileStatus match : fileSystem.globStatus(location, hiddenFileFilter)) {
            addChanneledInputInner(channel, match.getPath(), channelClass, inputFormat, mapper);
        }
        return this;
    }

    if (!fileSystem.getFileStatus(location).isDir()) {
        // Not a directory: register the location itself.
        addChanneledInputInner(channel, location, channelClass, inputFormat, mapper);
        return this;
    }

    // Directory: register every entry of its filtered listing.
    for (FileStatus entry : fileSystem.listStatus(location, hiddenFileFilter)) {
        addChanneledInputInner(channel, entry.getPath(), channelClass, inputFormat, mapper);
    }
    return this;
}

From source file:com.datasalt.utils.viewbuilder.SolrAdminCoreUtils.java

License:Apache License

/**
 * Extracts the core names using the subfolder names of the specified folder
 * @param fs/*  ww  w  .  j  av a  2  s  .  c  o m*/
 * @param coreFolder
 * 
 * @throws IOException
 */
public static List<String> findCoresToDeploy(FileSystem fs, Path coreFolder) throws IOException {
    FileStatus[] indexes = fs.globStatus(new Path(coreFolder + "/*"), new IsDirFilter(fs));
    List<String> coreArray = new ArrayList<String>();
    for (FileStatus index : indexes) {
        coreArray.add(index.getPath().getName());
    }
    return coreArray;
}

From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java

License:Apache License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression. /*from   www  .j  a  va  2s .co  m*/
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:com.facebook.hiveio.common.FileSystems.java

License:Apache License

/**
 * List files in directory that are not hidden.
 *
 * @param fs FileSystem/*from   www. j  a  v  a 2s  .com*/
 * @param dir directory to list
 * @return FileStatus[] of non-hidden entries
 * @throws IOException I/O problems
 */
public static FileStatus[] listNonHidden(FileSystem fs, Path dir) throws IOException {
    return fs.globStatus(new Path(dir, "*"), HIDDEN_FILTER);
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

/**
 * Expands the given input paths into the list of input entries, one path at a time.
 *
 * <p>Each path is expanded as a glob. A match that is not a directory is returned as is. For a
 * matched directory, every child accepted by the filter is either returned or, when
 * {@code recursive} is set and the child is itself a directory, handed to
 * {@code addInputPathRecursively}.
 *
 * @param job         the job whose configuration is used to resolve each path's file system
 * @param dirs        the input paths (may contain globs)
 * @param inputFilter filter applied to the glob and to the children of matched directories
 * @param recursive   whether sub-directories of matched directories are descended into
 * @return the collected FileStatus entries
 * @throws IOException as an InvalidInputException if any input path does not exist or
 *         matches zero files, or if the file system cannot be queried
 */
private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    final List<FileStatus> collected = new ArrayList<FileStatus>();
    final List<IOException> problems = new ArrayList<IOException>();

    for (Path dir : dirs) {
        final FileSystem fs = dir.getFileSystem(job.getConfiguration());
        final FileStatus[] globbed = fs.globStatus(dir, inputFilter);

        // Problems are collected rather than thrown so that all bad paths are reported together.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + dir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + dir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDirectory()) {
                collected.add(match);
                continue;
            }
            final RemoteIterator<LocatedFileStatus> children = fs.listLocatedStatus(match.getPath());
            while (children.hasNext()) {
                final LocatedFileStatus child = children.next();
                if (!inputFilter.accept(child.getPath())) {
                    continue;
                }
                if (recursive && child.isDirectory()) {
                    addInputPathRecursively(collected, fs, child.getPath(), inputFilter);
                } else {
                    collected.add(child);
                }
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    return collected;
}

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java

License:Apache License

/**
 * Gets the list of paths from the pathString specified which may contain
 * comma-separated paths and glob style path
 *
 * @throws IOException/*  w w  w.  j  a  v a 2  s  .  c  o m*/
 */
public static Set<Path> getPaths(String pathString, Configuration conf, boolean failIfNotFound)
        throws IOException {
    Set<Path> paths = new HashSet<Path>();
    String[] pathStrs = LoadFunc.getPathStrings(pathString);
    for (String pathStr : pathStrs) {
        FileSystem fs = FileSystem.get(new Path(pathStr).toUri(), conf);
        FileStatus[] matchedFiles = fs.globStatus(new Path(pathStr), PATH_FILTER);
        if (matchedFiles == null || matchedFiles.length == 0) {
            if (failIfNotFound) {
                throw new IOException("Input Pattern " + pathStr + " matches 0 files");
            } else {
                continue;
            }
        }
        for (FileStatus file : matchedFiles) {
            paths.add(file.getPath());
        }
    }
    return paths;
}