Example usage for org.apache.hadoop.fs PathFilter accept

Introduction

This page collects example usages of org.apache.hadoop.fs.PathFilter#accept from open-source projects.

Prototype

boolean accept(Path path);

Document

Tests whether or not the specified abstract pathname should be included in a pathname list.
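
PathFilter has a single abstract method, so implementing a filter only requires an accept body. A minimal sketch (the class name and the ".avro" suffix are illustrative, not taken from any project on this page):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative filter: accept only paths whose names end in ".avro"
public class AvroPathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return path.getName().endsWith(".avro");
    }
}

Because accept is the interface's only abstract method, Java 8 callers can also pass a lambda, for example fs.listStatus(dir, p -> p.getName().endsWith(".avro")).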

Usage

From source file:com.ibm.stocator.fs.cos.COSAPIClient.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * Prefix based
 * Return everything that starts with the prefix
 * Fill listing
 * Return all objects, even zero size
 * If fileStatus is null, the path is only part of some object's name: neither an
 * object nor a pseudo directory. Typically called by the Globber.
 *
 * @param hostName hostName
 * @param path path
 * @param fullListing Return all objects, even zero size
 * @param prefixBased Return everything that starts with the prefix
 * @return list
 * @throws IOException if error
 */
/*
public FileStatus[] list(String hostName, Path path, boolean fullListing,
    boolean prefixBased) throws IOException {
  String key = pathToKey(hostName, path);
  ArrayList<FileStatus> tmpResult = new ArrayList<FileStatus>();
  ListObjectsRequest request = new ListObjectsRequest().withBucketName(mBucket).withPrefix(key);
        
  String curObj;
  if (path.toString().equals(mBucket)) {
    curObj = "";
  } else if (path.toString().startsWith(mBucket + "/")) {
    curObj = path.toString().substring(mBucket.length() + 1);
  } else if (path.toString().startsWith(hostName)) {
    curObj = path.toString().substring(hostName.length());
  } else {
    curObj = path.toString();
  }
        
  ObjectListing objectList = mClient.listObjects(request);
  List<S3ObjectSummary> objectSummaries = objectList.getObjectSummaries();
  if (objectSummaries.size() == 0) {
    FileStatus[] emptyRes = {};
    LOG.debug("List for bucket {} is empty", mBucket);
    return emptyRes;
  }
  boolean objectScanContinue = true;
  S3ObjectSummary prevObj = null;
  while (objectScanContinue) {
    for (S3ObjectSummary obj : objectSummaries) {
      if (prevObj == null) {
        prevObj = obj;
        continue;
      }
      String objKey = obj.getKey();
      String unifiedObjectName = extractUnifiedObjectName(objKey);
      if (!prefixBased && !curObj.equals("") && !path.toString().endsWith("/")
          && !unifiedObjectName.equals(curObj) && !unifiedObjectName.startsWith(curObj + "/")) {
        LOG.trace("{} does not match {}. Skipped", unifiedObjectName, curObj);
        continue;
      }
      if (isSparkOrigin(unifiedObjectName) && !fullListing) {
        LOG.trace("{} created by Spark", unifiedObjectName);
        if (!isJobSuccessful(unifiedObjectName)) {
          LOG.trace("{} created by failed Spark job. Skipped", unifiedObjectName);
          if (fModeAutomaticDelete) {
            delete(hostName, new Path(objKey), true);
          }
          continue;
        } else {
          // if we got here, the data was created by Spark and the job completed
          // successfully; however, there might be parts of failed tasks that were
          // not aborted, so we need to make sure there are no failed attempts
          if (nameWithoutTaskID(objKey).equals(nameWithoutTaskID(prevObj.getKey()))) {
            // found a failed attempt that was not aborted
            LOG.trace("Collision found between {} and {}", prevObj.getKey(), objKey);
            if (prevObj.getSize() < obj.getSize()) {
              LOG.trace("New candidate is {}. Removed {}", obj.getKey(), prevObj.getKey());
              prevObj = obj;
            }
            continue;
          }
        }
      }
      if (prevObj.getSize() > 0 || fullListing) {
        FileStatus fs = getFileStatusObjSummaryBased(prevObj, hostName, path);
        tmpResult.add(fs);
      }
      prevObj = obj;
    }
    boolean isTruncated = objectList.isTruncated();
    if (isTruncated) {
      objectList = mClient.listNextBatchOfObjects(objectList);
      objectSummaries = objectList.getObjectSummaries();
    } else {
      objectScanContinue = false;
    }
  }
  if (prevObj != null && (prevObj.getSize() > 0 || fullListing)) {
    FileStatus fs = getFileStatusObjSummaryBased(prevObj, hostName, path);
    tmpResult.add(fs);
  }
  if (LOG.isTraceEnabled()) {
    LOG.trace("COS List to return length {}", tmpResult.size());
    for (FileStatus fs : tmpResult) {
      LOG.trace("{}", fs.getPath());
    }
  }
  return tmpResult.toArray(new FileStatus[tmpResult.size()]);
}
*/
@Override
public FileStatus[] list(String hostName, Path path, boolean fullListing, boolean prefixBased,
        Boolean isDirectory, boolean flatListing, PathFilter filter) throws FileNotFoundException, IOException {
    LOG.debug("Native direct list status for {}", path);
    ArrayList<FileStatus> tmpResult = new ArrayList<FileStatus>();
    String key = pathToKey(hostName, path);
    if (isDirectory != null && isDirectory.booleanValue() && !key.endsWith("/")) {
        key = key + "/";
        LOG.debug("listNativeDirect modify key to {}", key);
    }

    Map<String, FileStatus> emptyObjects = new HashMap<String, FileStatus>();
    ListObjectsRequest request = new ListObjectsRequest();
    request.setBucketName(mBucket);
    request.setMaxKeys(5000);
    request.setPrefix(key);
    if (!flatListing) {
        request.setDelimiter("/");
    }

    ObjectListing objectList = mClient.listObjects(request);

    List<S3ObjectSummary> objectSummaries = objectList.getObjectSummaries();
    List<String> commonPrefixes = objectList.getCommonPrefixes();

    boolean objectScanContinue = true;
    S3ObjectSummary prevObj = null;
    // start FTA logic
    boolean stocatorOrigin = isSparkOrigin(key, path.toString());
    if (stocatorOrigin) {
        LOG.debug("Stocator origin is true for {}", key);
        if (!isJobSuccessful(key)) {
            LOG.debug("{} created by failed Spark job. Skipped", key);
            if (fModeAutomaticDelete) {
                delete(hostName, new Path(key), true);
            }
            return new FileStatus[0];
        }
    }
    while (objectScanContinue) {
        for (S3ObjectSummary obj : objectSummaries) {
            if (prevObj == null) {
                prevObj = obj;
                continue;
            }
            String objKey = obj.getKey();
            String unifiedObjectName = extractUnifiedObjectName(objKey);
            LOG.debug("list candidate {}, unified name {}", objKey, unifiedObjectName);
            if (stocatorOrigin && !fullListing) {
                LOG.trace("{} created by Spark", unifiedObjectName);
                // if we got here, the data was created by Spark and the job
                // completed successfully; however, there might be parts of
                // failed tasks that were not aborted, so we need to make sure
                // there are no failed attempts
                if (nameWithoutTaskID(objKey).equals(nameWithoutTaskID(prevObj.getKey()))) {
                    // found a failed attempt that was not aborted
                    LOG.trace("Collision found between {} and {}", prevObj.getKey(), objKey);
                    if (prevObj.getSize() < obj.getSize()) {
                        LOG.trace("New candidate is {}. Removed {}", obj.getKey(), prevObj.getKey());
                        prevObj = obj;
                    }
                    continue;
                }
            }
            FileStatus fs = createFileStatus(prevObj, hostName, path);
            if (fs.getLen() > 0 || fullListing) {
                LOG.debug("Native direct list. Adding {} size {}", fs.getPath(), fs.getLen());
                if (filter == null || filter.accept(fs.getPath())) {
                    tmpResult.add(fs);
                } else {
                    LOG.trace("{} rejected by path filter during list. Filter {}", fs.getPath(), filter);
                }
            } else {
                emptyObjects.put(fs.getPath().toString(), fs);
            }
            prevObj = obj;
        }
        boolean isTruncated = objectList.isTruncated();
        if (isTruncated) {
            objectList = mClient.listNextBatchOfObjects(objectList);
            objectSummaries = objectList.getObjectSummaries();
        } else {
            objectScanContinue = false;
        }
    }

    if (prevObj != null) {
        FileStatus fs = createFileStatus(prevObj, hostName, path);
        LOG.debug("Adding the last object from the list {}", fs.getPath());
        if (fs.getLen() > 0 || fullListing) {
            LOG.debug("Native direct list. Adding {} size {}", fs.getPath(), fs.getLen());
            if (filter == null || filter.accept(fs.getPath())) {
                memoryCache.putFileStatus(fs.getPath().toString(), fs);
                tmpResult.add(fs);
            } else {
                LOG.trace("{} rejected by path filter during list. Filter {}", fs.getPath(), filter);
            }
        } else if (!fs.getPath().getName().equals(HADOOP_SUCCESS)) {
            emptyObjects.put(fs.getPath().toString(), fs);
        }
    }

    // get common prefixes
    for (String comPrefix : commonPrefixes) {
        LOG.debug("Common prefix is {}", comPrefix);
        if (emptyObjects.containsKey(keyToQualifiedPath(hostName, comPrefix).toString())
                || emptyObjects.isEmpty()) {
            FileStatus status = new COSFileStatus(true, false, keyToQualifiedPath(hostName, comPrefix));
            LOG.debug("Match between common prefix and empty object {}. Adding to result", comPrefix);
            if (filter == null || filter.accept(status.getPath())) {
                memoryCache.putFileStatus(status.getPath().toString(), status);
                tmpResult.add(status);
            } else {
                LOG.trace("Common prefix {} rejected by path filter during list. Filter {}", status.getPath(),
                        filter);
            }
        }
    }
    return tmpResult.toArray(new FileStatus[tmpResult.size()]);
}
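
In this method, accept gates three kinds of entries: regular listing candidates, the final object held in prevObj, and common prefixes; a null filter accepts everything. A hedged caller sketch (the helper method, variable names, and filter predicate are assumptions; only the list signature above comes from the source):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Hypothetical caller: "client" stands in for an initialized COSAPIClient.
static FileStatus[] listVisible(COSAPIClient client, String hostName, Path dir) throws IOException {
    // Skip Hadoop's _temporary output directories during listing
    PathFilter noTemp = path -> !path.getName().startsWith("_temporary");
    return client.list(hostName, dir, false /* fullListing */, false /* prefixBased */,
            Boolean.FALSE /* isDirectory */, true /* flatListing */, noTemp);
}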

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                    while (iter.hasNext()) {
                        LocatedFileStatus stat = iter.next();
                        if (inputFilter.accept(stat.getPath())) {
                            if (recursive && stat.isDirectory()) {
                                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                            } else {
                                result.add(stat);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
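
The inputFilter consulted above is normally installed on the job through FileInputFormat.setInputPathFilter; a minimal sketch (the filter class is illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative filter: skip Hadoop's _SUCCESS marker files
public class SkipSuccessFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return !path.getName().equals("_SUCCESS");
    }
}

Registered with FileInputFormat.setInputPathFilter(job, SkipSuccessFilter.class), this is the filter that getInputPathFilter hands back to listStatus and, from there, to singleThreadedListStatus above.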

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

/**
 * Add files in the input path recursively into the results.
 * @param result
 *          The List to store all files.
 * @param fs
 *          The FileSystem.
 * @param path
 *          The input path.
 * @param inputFilter
 *          The input filter that can be used to filter files/dirs. 
 * @throws IOException
 */
protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
    while (iter.hasNext()) {
        LocatedFileStatus stat = iter.next();
        if (inputFilter.accept(stat.getPath())) {
            if (stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
            } else {
                result.add(stat);
            }
        }
    }
}

From source file:com.mongodb.hadoop.BSONFileInputFormat.java

License:Apache License

@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("skipping file %s not matched path filter.", file.getPath()));
            }
            continue;
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("processing file " + file.getPath());
            }
        }

        BSONSplitter splitter = new BSONSplitter();
        splitter.setConf(config);
        splitter.setInputPath(file.getPath());
        Path splitFilePath = new Path(file.getPath().getParent(), "." + file.getPath().getName() + ".splits");
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        splits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d found.", splits.size()));
    }
    return splits;
}

From source file:com.netflix.aegisthus.tools.ChainedPathFilter.java

License:Apache License

@Override
public boolean accept(Path path) {
    for (PathFilter pf : pfs) {
        if (!pf.accept(path)) {
            return false;
        }
    }
    return true;
}
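
Here accept implements a logical AND: every chained filter must accept the path. The constructor is not part of the quoted snippet, so here is a self-contained sketch of the same composition pattern (the names are assumptions, not the Aegisthus API):

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Sketch of an AND-composed filter over an arbitrary set of filters
public class AndPathFilter implements PathFilter {
    private final List<PathFilter> pfs;

    public AndPathFilter(PathFilter... filters) {
        this.pfs = Arrays.asList(filters);
    }

    @Override
    public boolean accept(Path path) {
        for (PathFilter pf : pfs) {
            if (!pf.accept(path)) {
                return false;
            }
        }
        return true;
    }
}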

From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java

License:Apache License

@Override
protected List<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException {
    PathFilter filter = HiddenPathFilter.get();

    // get files on the local FS in the attempt path
    Path attemptPath = getTaskAttemptPath(context);
    FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration());
    RemoteIterator<LocatedFileStatus> iter = attemptFS.listFiles(attemptPath, true /* recursive */);

    List<FileStatus> stats = Lists.newArrayList();
    while (iter.hasNext()) {
        FileStatus stat = iter.next();
        if (filter.accept(stat.getPath())) {
            stats.add(stat);
        }
    }

    return stats;
}
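
HiddenPathFilter.get() comes from the same Netflix project; it presumably mirrors the usual Hadoop convention of treating names that start with "." or "_" as hidden. A minimal equivalent sketch (the class name is illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Accepts only paths that are not "hidden" by the usual Hadoop convention
public class VisiblePathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        return !name.startsWith(".") && !name.startsWith("_");
    }
}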

From source file:com.twitter.elephanttwin.util.HdfsUtils.java

License:Apache License

/**
 * @param result contains the list of FileStatus that passed the filtering conditions
 * @param fs
 * @param path
 * @param dirFilter filter applied to directories only
 * @param fileFilter filter applied to files only
 * @throws IOException
 */
public static void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter dirFilter, PathFilter fileFilter) throws IOException {
    FileStatus[] stats = fs.listStatus(path);
    if (stats != null) {
        for (FileStatus stat : stats) {
            if (stat.isDir() && dirFilter.accept(stat.getPath())) {
                addInputPathRecursively(result, fs, stat.getPath(), dirFilter, fileFilter);
            } else {
                if (fileFilter.accept(stat.getPath())) {
                    result.add(stat);
                }
            }
        }
    }
}
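
Because PathFilter has a single abstract method, both the directory and file filters can be supplied as lambdas. A hedged usage sketch (the root path and the ".lzo" predicate are illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

static List<FileStatus> listLzoFiles(FileSystem fs, Path root) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    HdfsUtils.addInputPathRecursively(result, fs, root,
            dir -> !dir.getName().startsWith("."),    // dirFilter: recurse into visible dirs only
            file -> file.getName().endsWith(".lzo"));  // fileFilter: keep only .lzo files
    return result;
}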

From source file:com.uber.hoodie.common.util.FSUtils.java

License:Apache License

/**
 * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its
 * subdirs are skipped.
 * @param fs           File System
 * @param basePathStr  Base-Path
 * @param consumer     Callback for processing
 * @param excludeMetaFolder Exclude .hoodie folder
 * @throws IOException
 */
@VisibleForTesting
static void processFiles(FileSystem fs, String basePathStr, Function<FileStatus, Boolean> consumer,
        boolean excludeMetaFolder) throws IOException {
    PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER;
    FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr));
    for (int i = 0; i < topLevelStatuses.length; i++) {
        FileStatus child = topLevelStatuses[i];
        if (child.isFile()) {
            boolean success = consumer.apply(child);
            if (!success) {
                throw new HoodieException("Failed to process file-status=" + child);
            }
        } else if (pathFilter.accept(child.getPath())) {
            RemoteIterator<LocatedFileStatus> itr = fs.listFiles(child.getPath(), true);
            while (itr.hasNext()) {
                FileStatus status = itr.next();
                boolean success = consumer.apply(status);
                if (!success) {
                    throw new HoodieException("Failed to process file-status=" + status);
                }
            }
        }
    }
}
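
A hedged sketch of calling processFiles to count all data files while skipping the .hoodie meta-folder (the counting consumer is illustrative):

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.fs.FileSystem;

static long countDataFiles(FileSystem fs, String basePath) throws IOException {
    AtomicLong count = new AtomicLong();
    FSUtils.processFiles(fs, basePath, status -> {
        count.incrementAndGet();
        return true; // returning false makes processFiles throw a HoodieException
    }, true /* excludeMetaFolder */);
    return count.get();
}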

From source file:de.zib.sfs.StatisticsFileSystem.java

License:BSD License

@Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException {
    PathFilter wrappedFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return filter.accept(unwrapPath(path));
        }
    };

    UnwrappedPath unwrappedPathPattern = unwrapPath(pathPattern);
    FileStatus[] fileStatuses = this.wrappedFS.globStatus(unwrappedPathPattern, wrappedFilter);
    if (fileStatuses == null) {
        return null;
    }
    if (unwrappedPathPattern.isUnwrapped()) {
        for (FileStatus fileStatus : fileStatuses) {
            fileStatus
                    .setPath(setAuthority(wrapPath(fileStatus.getPath()), pathPattern.toUri().getAuthority()));
        }
    }
    return fileStatuses;
}
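
Since PathFilter is a single-method interface, the wrapping filter above can also be written as a lambda with identical behavior:

// Equivalent to the anonymous PathFilter class in globStatus above
PathFilter wrappedFilter = path -> filter.accept(unwrapPath(path));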

From source file:de.zib.sfs.StatisticsFileSystem.java

License:BSD License

@Override
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException {
    long startTime = System.nanoTime();
    UnwrappedPath unwrappedPath = unwrapPath(f);
    PathFilter wrappedFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return filter.accept(unwrapPath(path));
        }
    };

    FileStatus[] fileStatuses = this.wrappedFS.listStatus(unwrappedPath, wrappedFilter);
    if (unwrappedPath.isUnwrapped()) {
        for (FileStatus fileStatus : fileStatuses) {
            fileStatus.setPath(setAuthority(wrapPath(fileStatus.getPath()), f.toUri().getAuthority()));
        }
    }
    if (!this.skipOther) {
        int fd = LiveOperationStatisticsAggregator.instance.registerFileDescriptor(f.toString());
        LiveOperationStatisticsAggregator.instance.aggregateOperationStatistics(OperationSource.SFS,
                OperationCategory.OTHER, startTime, System.nanoTime(), fd);
    }
    return fileStatuses;
}