Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem.globStatus(Path, PathFilter).

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException

Source Link

Document

Returns an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.

Usage

From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License:Apache License

/**
 * Expands the given input paths into a flat list of file statuses.
 *
 * <p>Each path is globbed with {@code inputFilter}. A match that is not a directory is
 * added as-is; a matching directory contributes its filtered children. When
 * {@code recursive} is set, child directories are handed to
 * {@code simpleAddInputPathRecursively} instead of being added directly.
 *
 * @param job supplies the configuration used to resolve each path's file system
 * @param dirs input paths (may contain glob patterns)
 * @param inputFilter filter applied both to the glob and to directory listings
 * @param recursive whether child directories are descended into
 * @return the collected file statuses
 * @throws IOException if listing fails; an {@code InvalidInputException} carrying one
 *         entry per offending path is thrown when a path does not exist or matches nothing
 */
private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> collected = new ArrayList<FileStatus>();
    List<IOException> problems = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();

    for (Path inputPath : dirs) {
        FileSystem fs = inputPath.getFileSystem(conf);
        FileStatus[] globbed = fs.globStatus(inputPath, inputFilter);

        // Problems are gathered rather than thrown so every bad path is reported at once.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputPath));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputPath + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDirectory()) {
                collected.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath(), inputFilter)) {
                if (recursive && child.isDirectory()) {
                    simpleAddInputPathRecursively(collected, fs, child.getPath(), inputFilter);
                } else {
                    collected.add(child);
                }
            }
        }
    }

    if (problems.isEmpty()) {
        return collected;
    }
    throw new InvalidInputException(problems);
}

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

/**
 * Resolves the job's input paths into the entries to be read.
 *
 * <p>Every input path is globbed with a {@code SmilePathFilter}. Matched directories
 * contribute their filtered direct children; any other match is added unchanged.
 *
 * @param job the job to list input paths for
 * @return list of FileStatus objects found under the job's input paths
 * @throws IOException if no input paths are specified; an {@code InvalidInputException}
 *         is thrown if any input path does not exist or matches no files
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> found = new ArrayList<FileStatus>();
    Path[] inputDirs = getInputPaths(job);
    if (inputDirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Tokens for every file system involved must be obtained before any listing happens.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputDirs, job.getConfiguration());

    List<IOException> problems = new ArrayList<IOException>();
    for (Path inputDir : inputDirs) {
        FileSystem fs = inputDir.getFileSystem(job.getConfiguration());
        final SmilePathFilter smileFilter = new SmilePathFilter();
        FileStatus[] globbed = fs.globStatus(inputDir, smileFilter);

        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputDir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputDir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDir()) {
                found.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath(), smileFilter)) {
                found.add(child);
            }
        }
    }

    if (problems.isEmpty()) {
        return found;
    }
    throw new InvalidInputException(problems);
}

From source file:com.twitter.elephanttwin.retrieval.StatusesIndexManager.java

License:Apache License

/**
 * Creates a <code>StatusesIndexManager</code> covering every day from
 * <code>startDate</code> to <code>endDate</code>, inclusive.
 *
 * <p>For each day, the entries directly under that day's directory in
 * <code>baseDir</code> are globbed and recorded in <code>mapping</code> under the
 * formatted day key. Days without any matching entry only produce a warning.
 *
 * @param baseDir the base directory
 * @param fs handle to the FileSystem; must not be null
 * @param startDate start date (not modified; only day resolution is used)
 * @param endDate end date (not modified; only day resolution is used)
 * @throws IOException if globbing the file system fails
 */
public StatusesIndexManager(Path baseDir, FileSystem fs, Calendar startDate, Calendar endDate)
        throws IOException {

    // Calendars are mutable, so work on private copies and zero out everything below
    // day resolution instead of touching the caller's objects.
    final int[] timeOfDayFields = { Calendar.HOUR_OF_DAY, Calendar.MINUTE, Calendar.SECOND,
            Calendar.MILLISECOND };
    Calendar day = (Calendar) startDate.clone();
    for (int field : timeOfDayFields) {
        day.set(field, 0);
    }
    Calendar lastDay = (Calendar) endDate.clone();
    for (int field : timeOfDayFields) {
        lastDay.set(field, 0);
    }

    // The range must not be inverted.
    Preconditions.checkArgument(day.compareTo(lastDay) <= 0);
    this.fs = Preconditions.checkNotNull(fs);

    do {
        String dayKey = dateFormat.format(day.getTime());
        // Each day's directory is expected to hold one directory per shard.
        Path shardGlob = new Path(baseDir, dayKey + "/*");

        // Filtering out _logs entries in the glob itself means a day that contains nothing
        // but _logs shows up as "no matches", so no extra emptiness check is needed below.
        FileStatus[] shards = fs.globStatus(shardGlob, new RegexExcludePathFilter("^.*_logs$"));
        boolean nothingFound = shards == null || shards.length == 0;
        if (nothingFound) {
            // No directories for this day: warn and move on.
            LOG.warn("Index not found: " + shardGlob);
        } else {
            for (FileStatus shard : shards) {
                Path shardPath = shard.getPath();
                if (HdfsUtils.HIDDEN_FILE_FILTER.accept(shardPath)) {
                    LOG.debug("Index added: " + shardPath.toString());

                    if (!mapping.containsKey(dayKey)) {
                        mapping.put(dayKey, Lists.newArrayList(shardPath));
                    } else {
                        mapping.get(dayKey).add(shardPath);
                    }
                }
            }
        }

        // Advance to the following day.
        day.add(Calendar.DATE, 1);
    } while (day.compareTo(lastDay) <= 0);
}

From source file:com.uber.hoodie.common.util.FSUtils.java

License:Apache License

/**
 * Gets all partition paths assuming date partitioning (year, month, day), i.e. every
 * entry exactly three levels below the base path, each rendered as the last three
 * path segments joined by "/".
 *
 * @param fs file system to glob
 * @param basePath base path under which the three partition levels live
 * @return the relative partition paths of all entries accepted by the filter
 * @throws IOException if globbing fails
 */
public static List<String> getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath)
        throws IOException {
    List<String> partitions = new ArrayList<>();
    // The filter keeps folders under the metafolder out of the listing.
    PathFilter excludeMeta = getExcludeMetaPathFilter();
    Path threeLevelGlob = new Path(basePath + "/*/*/*");
    for (FileStatus entry : fs.globStatus(threeLevelGlob, excludeMeta)) {
        Path dayDir = entry.getPath();
        Path monthDir = dayDir.getParent();
        Path yearDir = monthDir.getParent();
        partitions.add(yearDir.getName() + "/" + monthDir.getName() + "/" + dayDir.getName());
    }
    return partitions;
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * <p>Input paths are globbed with the hidden-file filter combined with the job's own
 * filter (if any). Matched directories contribute their filtered children; child
 * directories are passed to {@code addInputPathRecursively} when
 * {@code mapred.input.dir.recursive} is true.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if no input paths are specified; an {@code InvalidInputException}
 *         is thrown if any input path does not exist or matches no files
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] inputDirs = getInputPaths(job);
    if (inputDirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Tokens for all required file systems are fetched up front.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputDirs, job);

    // Controls whether child directories are walked or simply added to the result.
    boolean descend = job.getBoolean("mapred.input.dir.recursive", false);

    List<FileStatus> collected = new ArrayList<FileStatus>();
    List<IOException> problems = new ArrayList<IOException>();

    // Combine the hidden-file filter with the user-provided filter, when there is one.
    List<PathFilter> combined = new ArrayList<PathFilter>();
    combined.add(hiddenFileFilter);
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        combined.add(userFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(combined);

    for (Path inputDir : inputDirs) {
        FileSystem fs = inputDir.getFileSystem(job);
        FileStatus[] globbed = fs.globStatus(inputDir, inputFilter);

        // Problems are accumulated so that all bad inputs are reported together.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputDir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputDir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDirectory()) {
                collected.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath(), inputFilter)) {
                if (descend && child.isDirectory()) {
                    addInputPathRecursively(collected, fs, child.getPath(), inputFilter);
                } else {
                    collected.add(child);
                }
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    LOG.info("Total input paths to process : " + collected.size());
    return collected.toArray(new FileStatus[collected.size()]);
}

From source file:datafu.hourglass.demo.Examples.java

License:Apache License

/**
 * Counts the entries three levels below {@code path} that are accepted by
 * {@code PathUtils.nonHiddenPathFilter}.
 *
 * @param path root under which to count
 * @return number of matching entries
 * @throws IOException if globbing fails
 */
private int countIntermediateFolders(Path path) throws IOException {
    final FileSystem fileSystem = getFileSystem();
    final Path threeLevelGlob = new Path(path, "*/*/*");
    final FileStatus[] matches = fileSystem.globStatus(threeLevelGlob, PathUtils.nonHiddenPathFilter);
    return matches.length;
}

From source file:datafu.hourglass.fs.PathUtils.java

License:Apache License

/**
 * List all paths matching the "yyyy/MM/dd" format under a given path.
 * /*  w  w  w  .j a v a 2  s. com*/
 * @param fs file system
 * @param input path to search under
 * @return paths
 * @throws IOException
 */
public static List<DatePath> findNestedDatedPaths(FileSystem fs, Path input) throws IOException {
    List<DatePath> inputDates = new ArrayList<DatePath>();

    FileStatus[] pathsStatus = fs.globStatus(new Path(input, "*/*/*"), nonHiddenPathFilter);

    if (pathsStatus == null) {
        return inputDates;
    }

    for (FileStatus pathStatus : pathsStatus) {
        Matcher matcher = dailyPathPattern.matcher(pathStatus.getPath().toString());
        if (matcher.matches()) {
            String datePath = matcher.group(2);
            Date date;
            try {
                date = nestedDatedPathFormat.parse(datePath);
            } catch (ParseException e) {
                continue;
            }

            Calendar cal = Calendar.getInstance(timeZone);

            cal.setTimeInMillis(date.getTime());

            inputDates.add(new DatePath(cal.getTime(), pathStatus.getPath()));
        }
    }

    return inputDates;
}

From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java

License:Apache License

/**
 * Counts the entries three levels below {@code _intermediatePath} that are accepted by
 * {@code PathUtils.nonHiddenPathFilter}.
 *
 * @return number of matching entries
 * @throws IOException if globbing fails
 */
private int countIntermediateFolders() throws IOException {
    final FileSystem fileSystem = getFileSystem();
    final Path threeLevelGlob = new Path(_intermediatePath, "*/*/*");
    final FileStatus[] matches = fileSystem.globStatus(threeLevelGlob, PathUtils.nonHiddenPathFilter);
    return matches.length;
}

From source file:datafu.hourglass.test.FirstPassCountJobTests.java

License:Apache License

/**
 * Counts the entries three levels below {@code _outputPath} that are accepted by
 * {@code PathUtils.nonHiddenPathFilter}.
 *
 * @return number of matching entries
 * @throws IOException if globbing fails
 */
private int countOutputFolders() throws IOException {
    final FileSystem fileSystem = getFileSystem();
    final Path threeLevelGlob = new Path(_outputPath, "*/*/*");
    final FileStatus[] matches = fileSystem.globStatus(threeLevelGlob, PathUtils.nonHiddenPathFilter);
    return matches.length;
}

From source file:edu.ucsb.cs.hadoop.CustomFileInputFormat.java

License:Apache License

/**
 * List input directories. Subclasses may override to, e.g., select only
 * files matching a regular expression.
 *
 * <p>Input paths are globbed with the hidden-file filter combined with the job's own
 * filter (if any). Matched directories contribute their filtered direct children;
 * any other match is added unchanged.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if no input paths are specified; an {@code InvalidInputException}
 *         is thrown if any input path does not exist or matches no files
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] inputDirs = getInputPaths(job);
    if (inputDirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<FileStatus> collected = new ArrayList<FileStatus>();
    List<IOException> problems = new ArrayList<IOException>();

    // Combine the hidden-file filter with the user-provided filter, when there is one.
    List<PathFilter> combined = new ArrayList<PathFilter>();
    combined.add(hiddenFileFilter);
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        combined.add(userFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(combined);

    for (Path inputDir : inputDirs) {
        FileSystem fs = inputDir.getFileSystem(job);
        FileStatus[] globbed = fs.globStatus(inputDir, inputFilter);

        // Problems are accumulated so that all bad inputs are reported together.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputDir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputDir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDir()) {
                collected.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath(), inputFilter)) {
                collected.add(child);
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    LOG.info("Total input paths to process : " + collected.size());
    return collected.toArray(new FileStatus[collected.size()]);
}