Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page collects usage examples for the globStatus method of org.apache.hadoop.fs.FileSystem.

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException 

Source Link

Document

Returns an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.

Usage

From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License:Apache License

/**
 * Expands the given input paths (each may be a glob pattern) into file statuses.
 *
 * <p>Every path is globbed with {@code inputFilter}. A match that is not a directory is
 * added as is. For a matched directory, its filtered direct children are added; when
 * {@code recursive} is set, child directories are handed to
 * {@code simpleAddInputPathRecursively} instead of being added themselves.
 *
 * @param job         job context supplying the configuration used to resolve file systems
 * @param dirs        input paths or glob patterns to expand
 * @param inputFilter filter applied both while globbing and while listing directories
 * @param recursive   whether child directories of a matched directory are descended into
 * @return the collected file statuses
 * @throws IOException an {@code InvalidInputException} carrying one error per path that does
 *                     not exist or matches nothing; errors are collected for all paths first
 */
private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> collected = new ArrayList<FileStatus>();
    List<IOException> problems = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();

    for (Path dir : dirs) {
        FileSystem fs = dir.getFileSystem(conf);
        FileStatus[] globbed = fs.globStatus(dir, inputFilter);

        // Record the problem and keep going so that all bad paths are reported together.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + dir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + dir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDirectory()) {
                collected.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath(), inputFilter)) {
                if (recursive && child.isDirectory()) {
                    simpleAddInputPathRecursively(collected, fs, child.getPath(), inputFilter);
                } else {
                    collected.add(child);
                }
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    return collected;
}

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

/**
 * Resolves the input paths of a job into the file statuses to process.
 *
 * <p>Each input path is globbed through a fresh {@code SmilePathFilter}. A matched
 * directory contributes its filtered direct children; any other match is added as is.
 *
 * @param job the job to list input paths for
 * @return list of FileStatus objects
 * @throws IOException if the job has no input paths; an {@code InvalidInputException}
 *                     carrying all collected errors is thrown when any input path does
 *                     not exist or matches no files
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> found = new ArrayList<FileStatus>();
    Path[] inputPaths = getInputPaths(job);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Tokens are obtained up front for every file system the input paths refer to.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputPaths, job.getConfiguration());

    List<IOException> failures = new ArrayList<IOException>();
    for (int i = 0; i < inputPaths.length; i++) {
        Path input = inputPaths[i];
        FileSystem fs = input.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] globbed = fs.globStatus(input, filter);

        // Failures are accumulated so that every bad path is reported in one exception.
        if (globbed == null) {
            failures.add(new IOException("Input path does not exist: " + input));
            continue;
        }
        if (globbed.length == 0) {
            failures.add(new IOException("Input Pattern " + input + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (match.isDir()) {
                for (FileStatus child : fs.listStatus(match.getPath(), filter)) {
                    found.add(child);
                }
            } else {
                found.add(match);
            }
        }
    }

    if (!failures.isEmpty()) {
        throw new InvalidInputException(failures);
    }

    return found;
}

From source file:com.twitter.elephanttwin.retrieval.StatusesIndexManager.java

License:Apache License

/**
 * Creates a <code>StatusesIndexManager</code> and registers, for every day between the
 * start and end date (inclusive), the index paths found under that day's directory.
 *
 * @param baseDir the base directory
 * @param fs handle to the FileSystem
 * @param startDate start date; only its day is used, the argument itself is not modified
 * @param endDate end date; only its day is used, the argument itself is not modified
 * @throws IOException
 */
public StatusesIndexManager(Path baseDir, FileSystem fs, Calendar startDate, Calendar endDate)
        throws IOException {

    // Work on day-resolution copies; Calendar is mutable and the caller's objects must stay intact.
    Calendar currentDay = startOfDay(startDate);
    Calendar lastDay = startOfDay(endDate);

    // Start date should be before or equal to the end date.
    Preconditions.checkArgument(currentDay.compareTo(lastDay) <= 0);
    this.fs = Preconditions.checkNotNull(fs);

    do {
        String dayKey = dateFormat.format(currentDay.getTime());
        // Each day directory is expected to hold one directory per shard.
        Path shardGlob = new Path(baseDir, dayKey + "/*");

        // Paths ending in _logs are excluded by the glob filter itself, so a day that only
        // contains such directories shows up as "not found" below without extra checks.
        FileStatus[] shards = fs.globStatus(shardGlob, new RegexExcludePathFilter("^.*_logs$"));
        if (shards == null || shards.length == 0) {
            // Warn if we can't find directories for that day.
            LOG.warn("Index not found: " + shardGlob);
        } else {
            for (FileStatus shard : shards) {
                if (HdfsUtils.HIDDEN_FILE_FILTER.accept(shard.getPath())) {
                    LOG.debug("Index added: " + shard.getPath().toString());

                    if (!mapping.containsKey(dayKey)) {
                        mapping.put(dayKey, Lists.newArrayList(shard.getPath()));
                    } else {
                        mapping.get(dayKey).add(shard.getPath());
                    }
                }
            }
        }

        // Roll forward the calendar to the next day.
        currentDay.add(Calendar.DATE, 1);
    } while (currentDay.compareTo(lastDay) <= 0);
}

/** Returns a clone of the given calendar with hour, minute, second and millisecond set to 0. */
private static Calendar startOfDay(Calendar source) {
    Calendar copy = (Calendar) source.clone();
    copy.set(Calendar.HOUR_OF_DAY, 0);
    copy.set(Calendar.MINUTE, 0);
    copy.set(Calendar.SECOND, 0);
    copy.set(Calendar.MILLISECOND, 0);
    return copy;
}

From source file:com.uber.hoodie.common.util.FSUtils.java

License:Apache License

/**
 * Gets all partition paths assuming date partitioning (year, month, day) three levels down.
 *
 * <p>Three single-level wildcards are appended to {@code basePath} and globbed; each match
 * is reported as the last three path components joined by {@code /}.
 *
 * @param fs       file system to glob against
 * @param basePath base path under which the three partition levels are expected
 * @return relative partition paths; empty when the glob yields no result
 * @throws IOException if the glob fails
 */
public static List<String> getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath)
        throws IOException {
    List<String> datePartitions = new ArrayList<>();
    // Avoid listing and including any folders under the metafolder
    PathFilter filter = getExcludeMetaPathFilter();
    FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter);
    if (folders == null) {
        // globStatus may hand back null instead of an empty array when nothing is found
        // (depends on the FileSystem implementation); treat that as "no partitions"
        // rather than failing with a NullPointerException.
        return datePartitions;
    }
    for (FileStatus status : folders) {
        Path path = status.getPath();
        datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(),
                path.getParent().getName(), path.getName()));
    }
    return datePartitions;
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/**
 * Lists the input files of a job.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * <p>Input paths are globbed with a filter combining {@code hiddenFileFilter} and the
 * job's own input path filter (if any). Matched directories contribute their filtered
 * direct children; when {@code mapred.input.dir.recursive} is true, child directories
 * are handed to {@code addInputPathRecursively} instead of being added themselves.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if the job has no input paths; an {@code InvalidInputException}
 *                     carrying all collected errors is thrown when any input path does
 *                     not exist or matches no files
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] inputPaths = getInputPaths(job);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Tokens are obtained up front for every file system the input paths refer to.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputPaths, job);

    // Off by default: only the direct children of a matched directory are listed.
    boolean recursive = job.getBoolean("mapred.input.dir.recursive", false);

    List<FileStatus> found = new ArrayList<FileStatus>();
    List<IOException> failures = new ArrayList<IOException>();

    // Combine the hidden-file filter with the user-provided filter, when there is one.
    List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(hiddenFileFilter);
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filterChain.add(userFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filterChain);

    for (int i = 0; i < inputPaths.length; i++) {
        Path input = inputPaths[i];
        FileSystem fs = input.getFileSystem(job);
        FileStatus[] globbed = fs.globStatus(input, inputFilter);

        // Failures are accumulated so that every bad path is reported in one exception.
        if (globbed == null) {
            failures.add(new IOException("Input path does not exist: " + input));
            continue;
        }
        if (globbed.length == 0) {
            failures.add(new IOException("Input Pattern " + input + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDirectory()) {
                found.add(match);
                continue;
            }
            FileStatus[] children = fs.listStatus(match.getPath(), inputFilter);
            for (FileStatus child : children) {
                if (recursive && child.isDirectory()) {
                    addInputPathRecursively(found, fs, child.getPath(), inputFilter);
                } else {
                    found.add(child);
                }
            }
        }
    }

    if (!failures.isEmpty()) {
        throw new InvalidInputException(failures);
    }
    LOG.info("Total input paths to process : " + found.size());
    return found.toArray(new FileStatus[found.size()]);
}

From source file:datafu.hourglass.demo.Examples.java

License:Apache License

/**
 * Counts the non-hidden entries located exactly three levels below the given path.
 *
 * @param path root under which the three-level glob is evaluated
 * @return number of glob matches accepted by {@code PathUtils.nonHiddenPathFilter}
 * @throws IOException if the glob fails
 */
private int countIntermediateFolders(Path path) throws IOException {
    FileSystem fileSystem = getFileSystem();
    // One wildcard per directory level.
    Path threeLevelsDown = new Path(path, "*/*/*");
    return fileSystem.globStatus(threeLevelsDown, PathUtils.nonHiddenPathFilter).length;
}

From source file:datafu.hourglass.fs.PathUtils.java

License:Apache License

/**
 * List all paths matching the "yyyy/MM/dd" format under a given path.
 * /*  w  w  w  .j a v a 2  s. com*/
 * @param fs file system
 * @param input path to search under
 * @return paths
 * @throws IOException
 */
public static List<DatePath> findNestedDatedPaths(FileSystem fs, Path input) throws IOException {
    List<DatePath> inputDates = new ArrayList<DatePath>();

    FileStatus[] pathsStatus = fs.globStatus(new Path(input, "*/*/*"), nonHiddenPathFilter);

    if (pathsStatus == null) {
        return inputDates;
    }

    for (FileStatus pathStatus : pathsStatus) {
        Matcher matcher = dailyPathPattern.matcher(pathStatus.getPath().toString());
        if (matcher.matches()) {
            String datePath = matcher.group(2);
            Date date;
            try {
                date = nestedDatedPathFormat.parse(datePath);
            } catch (ParseException e) {
                continue;
            }

            Calendar cal = Calendar.getInstance(timeZone);

            cal.setTimeInMillis(date.getTime());

            inputDates.add(new DatePath(cal.getTime(), pathStatus.getPath()));
        }
    }

    return inputDates;
}

From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java

License:Apache License

/**
 * Counts the non-hidden entries located exactly three levels below {@code _intermediatePath}.
 *
 * @return number of glob matches accepted by {@code PathUtils.nonHiddenPathFilter}
 * @throws IOException if the glob fails
 */
private int countIntermediateFolders() throws IOException {
    FileSystem fileSystem = getFileSystem();
    // One wildcard per directory level.
    Path threeLevelsDown = new Path(_intermediatePath, "*/*/*");
    return fileSystem.globStatus(threeLevelsDown, PathUtils.nonHiddenPathFilter).length;
}

From source file:datafu.hourglass.test.FirstPassCountJobTests.java

License:Apache License

/**
 * Counts the non-hidden entries located exactly three levels below {@code _outputPath}.
 *
 * @return number of glob matches accepted by {@code PathUtils.nonHiddenPathFilter}
 * @throws IOException if the glob fails
 */
private int countOutputFolders() throws IOException {
    FileSystem fileSystem = getFileSystem();
    // One wildcard per directory level.
    Path threeLevelsDown = new Path(_outputPath, "*/*/*");
    return fileSystem.globStatus(threeLevelsDown, PathUtils.nonHiddenPathFilter).length;
}

From source file:edu.ucsb.cs.hadoop.CustomFileInputFormat.java

License:Apache License

/**
 * Lists the input files of a job. Subclasses may override to, e.g., select only
 * files matching a regular expression.
 *
 * <p>Input paths are globbed with a filter combining {@code hiddenFileFilter} and the
 * job's own input path filter (if any). A matched directory contributes its filtered
 * direct children; any other match is added as is.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if the job has no input paths; an {@code InvalidInputException}
 *                     carrying all collected errors is thrown when any input path does
 *                     not exist or matches no files
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] inputPaths = getInputPaths(job);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<FileStatus> found = new ArrayList<FileStatus>();
    List<IOException> failures = new ArrayList<IOException>();

    // Combine the hidden-file filter with the user-provided filter, when there is one.
    List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(hiddenFileFilter);
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filterChain.add(userFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filterChain);

    for (int i = 0; i < inputPaths.length; i++) {
        Path input = inputPaths[i];
        FileSystem fs = input.getFileSystem(job);
        FileStatus[] globbed = fs.globStatus(input, inputFilter);

        // Failures are accumulated so that every bad path is reported in one exception.
        if (globbed == null) {
            failures.add(new IOException("Input path does not exist: " + input));
            continue;
        }
        if (globbed.length == 0) {
            failures.add(new IOException("Input Pattern " + input + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDir()) {
                found.add(match);
                continue;
            }
            FileStatus[] children = fs.listStatus(match.getPath(), inputFilter);
            for (int c = 0; c < children.length; c++) {
                found.add(children[c]);
            }
        }
    }

    if (!failures.isEmpty()) {
        throw new InvalidInputException(failures);
    }
    LOG.info("Total input paths to process : " + found.size());
    return found.toArray(new FileStatus[found.size()]);
}