Example usage for org.apache.hadoop.mapred LocatedFileStatusFetcher LocatedFileStatusFetcher

List of usage examples for org.apache.hadoop.mapred LocatedFileStatusFetcher LocatedFileStatusFetcher

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred LocatedFileStatusFetcher LocatedFileStatusFetcher.

Prototype

public LocatedFileStatusFetcher(Configuration conf, Path[] dirs, boolean recursive, PathFilter inputFilter,
        boolean newApi) throws InterruptedException, IOException 

Source Link

Usage

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression. /*from  ww w .  ja  v  a2  s.co m*/
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursive look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;

    int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = Stopwatch.createStarted();
    if (numThreads == 1) {
        result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                    job.getConfiguration(), dirs, recursive, inputFilter, true);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while getting file statuses");
        }
        result = Lists.newArrayList(locatedFiles);
    }

    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:com.sourcecode.FileInputFormat.java

License:Apache License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression. /*w  w w . ja v  a  2s  .c  o m*/
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursive look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;

    int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = new Stopwatch().start();
    if (numThreads == 1) {
        result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                    job.getConfiguration(), dirs, recursive, inputFilter, true);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while getting file statuses");
        }
        result = Lists.newArrayList(locatedFiles);
    }

    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis());
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}