Usage examples for org.apache.hadoop.fs.PathFilter#accept
boolean accept(Path path);
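PathFilter has a single abstract method, so implementing one is short. Before the real-world usages below, here is a minimal illustrative sketch (the filter, which hides "_"- and "."-prefixed outputs such as _SUCCESS and .crc files, is an assumption for demonstration and not taken from any of the sources that follow):

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative filter: skip "hidden" outputs such as _SUCCESS and .crc files.
PathFilter hiddenFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        return !name.startsWith("_") && !name.startsWith(".");
    }
};

// Typical use: pass the filter to a listing call.
// FileStatus[] visible = fs.listStatus(new Path("/data"), hiddenFileFilter);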
From source file:com.ibm.stocator.fs.cos.COSAPIClient.java
License:Apache License
/**
 * {@inheritDoc}
 *
 * Prefix based: return everything that starts with the prefix.
 * Full listing: return all objects, even zero size.
 * If fileStatus is null, the path is part of some name, neither an object
 * nor a pseudo directory. Was called by Globber.
 *
 * @param hostName hostName
 * @param path path
 * @param fullListing return all objects, even zero size
 * @param prefixBased return everything that starts with the prefix
 * @return list
 * @throws IOException if error
 */
/*
public FileStatus[] list(String hostName, Path path, boolean fullListing,
        boolean prefixBased) throws IOException {
    String key = pathToKey(hostName, path);
    ArrayList<FileStatus> tmpResult = new ArrayList<FileStatus>();
    ListObjectsRequest request = new ListObjectsRequest().withBucketName(mBucket).withPrefix(key);
    String curObj;
    if (path.toString().equals(mBucket)) {
        curObj = "";
    } else if (path.toString().startsWith(mBucket + "/")) {
        curObj = path.toString().substring(mBucket.length() + 1);
    } else if (path.toString().startsWith(hostName)) {
        curObj = path.toString().substring(hostName.length());
    } else {
        curObj = path.toString();
    }
    ObjectListing objectList = mClient.listObjects(request);
    List<S3ObjectSummary> objectSummaries = objectList.getObjectSummaries();
    if (objectSummaries.size() == 0) {
        FileStatus[] emptyRes = {};
        LOG.debug("List for bucket {} is empty", mBucket);
        return emptyRes;
    }
    boolean objectScanContinue = true;
    S3ObjectSummary prevObj = null;
    while (objectScanContinue) {
        for (S3ObjectSummary obj : objectSummaries) {
            if (prevObj == null) {
                prevObj = obj;
                continue;
            }
            String objKey = obj.getKey();
            String unifiedObjectName = extractUnifiedObjectName(objKey);
            if (!prefixBased && !curObj.equals("") && !path.toString().endsWith("/")
                    && !unifiedObjectName.equals(curObj) && !unifiedObjectName.startsWith(curObj + "/")) {
                LOG.trace("{} does not match {}. Skipped", unifiedObjectName, curObj);
                continue;
            }
            if (isSparkOrigin(unifiedObjectName) && !fullListing) {
                LOG.trace("{} created by Spark", unifiedObjectName);
                if (!isJobSuccessful(unifiedObjectName)) {
                    LOG.trace("{} created by failed Spark job. Skipped", unifiedObjectName);
                    if (fModeAutomaticDelete) {
                        delete(hostName, new Path(objKey), true);
                    }
                    continue;
                } else {
                    // If we are here, the data was created by Spark and the job completed
                    // successfully. However, there might be parts of failed tasks that
                    // were not aborted, so we need to make sure there are no failed attempts.
                    if (nameWithoutTaskID(objKey).equals(nameWithoutTaskID(prevObj.getKey()))) {
                        // Found a failed attempt that was not aborted.
                        LOG.trace("Collision found between {} and {}", prevObj.getKey(), objKey);
                        if (prevObj.getSize() < obj.getSize()) {
                            LOG.trace("New candidate is {}. Removed {}", obj.getKey(), prevObj.getKey());
                            prevObj = obj;
                        }
                        continue;
                    }
                }
            }
            if (prevObj.getSize() > 0 || fullListing) {
                FileStatus fs = getFileStatusObjSummaryBased(prevObj, hostName, path);
                tmpResult.add(fs);
            }
            prevObj = obj;
        }
        boolean isTruncated = objectList.isTruncated();
        if (isTruncated) {
            objectList = mClient.listNextBatchOfObjects(objectList);
            objectSummaries = objectList.getObjectSummaries();
        } else {
            objectScanContinue = false;
        }
    }
    if (prevObj != null && (prevObj.getSize() > 0 || fullListing)) {
        FileStatus fs = getFileStatusObjSummaryBased(prevObj, hostName, path);
        tmpResult.add(fs);
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("COS List to return length {}", tmpResult.size());
        for (FileStatus fs : tmpResult) {
            LOG.trace("{}", fs.getPath());
        }
    }
    return tmpResult.toArray(new FileStatus[tmpResult.size()]);
}
*/
@Override
public FileStatus[] list(String hostName, Path path, boolean fullListing, boolean prefixBased,
        Boolean isDirectory, boolean flatListing, PathFilter filter)
        throws FileNotFoundException, IOException {
    LOG.debug("Native direct list status for {}", path);
    ArrayList<FileStatus> tmpResult = new ArrayList<FileStatus>();
    String key = pathToKey(hostName, path);
    if (isDirectory != null && isDirectory.booleanValue() && !key.endsWith("/")) {
        key = key + "/";
        LOG.debug("listNativeDirect modify key to {}", key);
    }
    Map<String, FileStatus> emptyObjects = new HashMap<String, FileStatus>();
    ListObjectsRequest request = new ListObjectsRequest();
    request.setBucketName(mBucket);
    request.setMaxKeys(5000);
    request.setPrefix(key);
    if (!flatListing) {
        request.setDelimiter("/");
    }
    ObjectListing objectList = mClient.listObjects(request);
    List<S3ObjectSummary> objectSummaries = objectList.getObjectSummaries();
    List<String> commonPrefixes = objectList.getCommonPrefixes();
    boolean objectScanContinue = true;
    S3ObjectSummary prevObj = null;
    // start FTA logic
    boolean stocatorOrigin = isSparkOrigin(key, path.toString());
    if (stocatorOrigin) {
        LOG.debug("Stocator origin is true for {}", key);
        if (!isJobSuccessful(key)) {
            LOG.debug("{} created by failed Spark job. Skipped", key);
            if (fModeAutomaticDelete) {
                delete(hostName, new Path(key), true);
            }
            return new FileStatus[0];
        }
    }
    while (objectScanContinue) {
        for (S3ObjectSummary obj : objectSummaries) {
            if (prevObj == null) {
                prevObj = obj;
                continue;
            }
            String objKey = obj.getKey();
            String unifiedObjectName = extractUnifiedObjectName(objKey);
            LOG.debug("list candidate {}, unified name {}", objKey, unifiedObjectName);
            if (stocatorOrigin && !fullListing) {
                LOG.trace("{} created by Spark", unifiedObjectName);
                // If we are here, the data was created by Spark and the job completed
                // successfully. However, there might be parts of failed tasks that
                // were not aborted, so we need to make sure there are no failed attempts.
                if (nameWithoutTaskID(objKey).equals(nameWithoutTaskID(prevObj.getKey()))) {
                    // Found a failed attempt that was not aborted.
                    LOG.trace("Collision found between {} and {}", prevObj.getKey(), objKey);
                    if (prevObj.getSize() < obj.getSize()) {
                        LOG.trace("New candidate is {}. Removed {}", obj.getKey(), prevObj.getKey());
                        prevObj = obj;
                    }
                    continue;
                }
            }
            FileStatus fs = createFileStatus(prevObj, hostName, path);
            if (fs.getLen() > 0 || fullListing) {
                LOG.debug("Native direct list. Adding {} size {}", fs.getPath(), fs.getLen());
                if (filter == null) {
                    tmpResult.add(fs);
                } else if (filter.accept(fs.getPath())) {
                    tmpResult.add(fs);
                } else {
                    LOG.trace("{} rejected by path filter during list. Filter {}", fs.getPath(), filter);
                }
            } else {
                emptyObjects.put(fs.getPath().toString(), fs);
            }
            prevObj = obj;
        }
        boolean isTruncated = objectList.isTruncated();
        if (isTruncated) {
            objectList = mClient.listNextBatchOfObjects(objectList);
            objectSummaries = objectList.getObjectSummaries();
        } else {
            objectScanContinue = false;
        }
    }
    if (prevObj != null) {
        FileStatus fs = createFileStatus(prevObj, hostName, path);
        LOG.debug("Adding the last object from the list {}", fs.getPath());
        if (fs.getLen() > 0 || fullListing) {
            LOG.debug("Native direct list. Adding {} size {}", fs.getPath(), fs.getLen());
            if (filter == null) {
                memoryCache.putFileStatus(fs.getPath().toString(), fs);
                tmpResult.add(fs);
            } else if (filter.accept(fs.getPath())) {
                memoryCache.putFileStatus(fs.getPath().toString(), fs);
                tmpResult.add(fs);
            } else {
                LOG.trace("{} rejected by path filter during list. Filter {}", fs.getPath(), filter);
            }
        } else if (!fs.getPath().getName().equals(HADOOP_SUCCESS)) {
            emptyObjects.put(fs.getPath().toString(), fs);
        }
    }
    // get common prefixes
    for (String comPrefix : commonPrefixes) {
        LOG.debug("Common prefix is {}", comPrefix);
        if (emptyObjects.containsKey(keyToQualifiedPath(hostName, comPrefix).toString())
                || emptyObjects.isEmpty()) {
            FileStatus status = new COSFileStatus(true, false, keyToQualifiedPath(hostName, comPrefix));
            LOG.debug("Match between common prefix and empty object {}. Adding to result", comPrefix);
            if (filter == null) {
                memoryCache.putFileStatus(status.getPath().toString(), status);
                tmpResult.add(status);
            } else if (filter.accept(status.getPath())) {
                memoryCache.putFileStatus(status.getPath().toString(), status);
                tmpResult.add(status);
            } else {
                LOG.trace("Common prefix {} rejected by path filter during list. Filter {}",
                        status.getPath(), filter);
            }
        }
    }
    return tmpResult.toArray(new FileStatus[tmpResult.size()]);
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                    while (iter.hasNext()) {
                        LocatedFileStatus stat = iter.next();
                        if (inputFilter.accept(stat.getPath())) {
                            if (recursive && stat.isDirectory()) {
                                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                            } else {
                                result.add(stat);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * Add files in the input path recursively into the results.
 *
 * @param result the List to store all files
 * @param fs the FileSystem
 * @param path the input path
 * @param inputFilter the input filter that can be used to filter files/dirs
 * @throws IOException
 */
protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
    while (iter.hasNext()) {
        LocatedFileStatus stat = iter.next();
        if (inputFilter.accept(stat.getPath())) {
            if (stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
            } else {
                result.add(stat);
            }
        }
    }
}
From source file:com.mongodb.hadoop.BSONFileInputFormat.java
License:Apache License
@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("skipping file %s not matched path filter.", file.getPath()));
            }
            continue;
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("processing file " + file.getPath());
        }
        BSONSplitter splitter = new BSONSplitter();
        splitter.setConf(config);
        splitter.setInputPath(file.getPath());
        Path splitFilePath = new Path(file.getPath().getParent(), "." + file.getPath().getName() + ".splits");
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        splits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d splits found.", splits.size()));
    }
    return splits;
}
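The filter returned by getInputPathFilter(context) above is whatever was registered on the job. A minimal sketch of wiring one up via the standard mapreduce API (the filter class, job name, and helper method are illustrative assumptions, not part of the mongo-hadoop source):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Must be public with a no-arg constructor so the framework can
// instantiate it reflectively.
public class BsonOnlyFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return path.getName().endsWith(".bson");
    }
}

// Job setup sketch: after this, getInputPathFilter(context) in
// getSplits() returns an instance of BsonOnlyFilter.
public static Job newBsonJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "bson-read");
    FileInputFormat.setInputPathFilter(job, BsonOnlyFilter.class);
    return job;
}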
From source file:com.netflix.aegisthus.tools.ChainedPathFilter.java
License:Apache License
@Override
public boolean accept(Path path) {
    for (PathFilter pf : pfs) {
        if (!pf.accept(path)) {
            return false;
        }
    }
    return true;
}
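ChainedPathFilter short-circuits on the first rejection, so it behaves as a logical AND over its delegates. Since its constructor is not shown above, here is a self-contained sketch of the same idea (class and filter names are illustrative; the lambda usage assumes Java 8+, which works because PathFilter has a single abstract method):

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// AND-composition of PathFilters: a path passes only if every delegate accepts it.
public class AllOfPathFilter implements PathFilter {
    private final List<PathFilter> delegates;

    public AllOfPathFilter(PathFilter... delegates) {
        this.delegates = Arrays.asList(delegates);
    }

    @Override
    public boolean accept(Path path) {
        for (PathFilter pf : delegates) {
            if (!pf.accept(path)) {
                return false; // short-circuit on first rejection
            }
        }
        return true;
    }
}

// Usage sketch: hide dot-files AND keep only .parquet data files.
// PathFilter visible = p -> !p.getName().startsWith(".");
// PathFilter parquet = p -> p.getName().endsWith(".parquet");
// boolean ok = new AllOfPathFilter(visible, parquet).accept(new Path("/data/part-0.parquet"));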
From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java
License:Apache License
@Override
protected List<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException {
    PathFilter filter = HiddenPathFilter.get();

    // Get files on the local FS in the attempt path.
    Path attemptPath = getTaskAttemptPath(context);
    FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration());
    RemoteIterator<LocatedFileStatus> iter = attemptFS.listFiles(attemptPath, true /* recursive */);
    List<FileStatus> stats = Lists.newArrayList();
    while (iter.hasNext()) {
        FileStatus stat = iter.next();
        if (filter.accept(stat.getPath())) {
            stats.add(stat);
        }
    }
    return stats;
}
From source file:com.twitter.elephanttwin.util.HdfsUtils.java
License:Apache License
/**
 * @param result contains the list of FileStatus that passed the filtering conditions
 * @param fs the FileSystem
 * @param path the input path
 * @param dirFilter filter applied to directories only
 * @param fileFilter filter applied to files only
 * @throws IOException
 */
public static void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter dirFilter, PathFilter fileFilter) throws IOException {
    FileStatus[] stats = fs.listStatus(path);
    if (stats != null) {
        for (FileStatus stat : stats) {
            if (stat.isDir() && dirFilter.accept(stat.getPath())) {
                addInputPathRecursively(result, fs, stat.getPath(), dirFilter, fileFilter);
            } else if (fileFilter.accept(stat.getPath())) {
                result.add(stat);
            }
        }
    }
}
From source file:com.uber.hoodie.common.util.FSUtils.java
License:Apache License
/**
 * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder
 * and all its subdirectories are skipped.
 *
 * @param fs File System
 * @param basePathStr Base-Path
 * @param consumer Callback for processing
 * @param excludeMetaFolder Exclude .hoodie folder
 * @throws IOException
 */
@VisibleForTesting
static void processFiles(FileSystem fs, String basePathStr, Function<FileStatus, Boolean> consumer,
        boolean excludeMetaFolder) throws IOException {
    PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER;
    FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr));
    for (int i = 0; i < topLevelStatuses.length; i++) {
        FileStatus child = topLevelStatuses[i];
        if (child.isFile()) {
            boolean success = consumer.apply(child);
            if (!success) {
                throw new HoodieException("Failed to process file-status=" + child);
            }
        } else if (pathFilter.accept(child.getPath())) {
            RemoteIterator<LocatedFileStatus> itr = fs.listFiles(child.getPath(), true);
            while (itr.hasNext()) {
                FileStatus status = itr.next();
                boolean success = consumer.apply(status);
                if (!success) {
                    throw new HoodieException("Failed to process file-status=" + status);
                }
            }
        }
    }
}
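The bodies of getExcludeMetaPathFilter() and ALLOW_ALL_FILTER are not shown in this excerpt. A plausible sketch of what they could look like, assuming the Hudi meta-folder is named ".hoodie" and that the filter only needs to reject top-level children with that name (both implementations below are guesses for illustration, and the lambdas assume Java 8+):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Hypothetical stand-in for getExcludeMetaPathFilter(): rejects the ".hoodie"
// meta-folder so the recursive listing above never descends into it.
static PathFilter excludeMetaPathFilter() {
    return path -> !path.getName().equals(".hoodie");
}

// A trivial allow-all filter in the spirit of ALLOW_ALL_FILTER.
static final PathFilter ALLOW_ALL = path -> true;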
From source file:de.zib.sfs.StatisticsFileSystem.java
License:BSD License
@Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException {
    PathFilter wrappedFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return filter.accept(unwrapPath(path));
        }
    };

    UnwrappedPath unwrappedPathPattern = unwrapPath(pathPattern);
    FileStatus[] fileStatuses = this.wrappedFS.globStatus(unwrappedPathPattern, wrappedFilter);
    if (fileStatuses == null) {
        return null;
    }
    if (unwrappedPathPattern.isUnwrapped()) {
        for (FileStatus fileStatus : fileStatuses) {
            fileStatus.setPath(setAuthority(wrapPath(fileStatus.getPath()),
                    pathPattern.toUri().getAuthority()));
        }
    }
    return fileStatuses;
}
From source file:de.zib.sfs.StatisticsFileSystem.java
License:BSD License
@Override
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException {
    long startTime = System.nanoTime();
    UnwrappedPath unwrappedPath = unwrapPath(f);
    PathFilter wrappedFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return filter.accept(unwrapPath(path));
        }
    };

    FileStatus[] fileStatuses = this.wrappedFS.listStatus(unwrappedPath, wrappedFilter);
    if (unwrappedPath.isUnwrapped()) {
        for (FileStatus fileStatus : fileStatuses) {
            fileStatus.setPath(setAuthority(wrapPath(fileStatus.getPath()), f.toUri().getAuthority()));
        }
    }
    if (!this.skipOther) {
        int fd = LiveOperationStatisticsAggregator.instance.registerFileDescriptor(f.toString());
        LiveOperationStatisticsAggregator.instance.aggregateOperationStatistics(OperationSource.SFS,
                OperationCategory.OTHER, startTime, System.nanoTime(), fd);
    }
    return fileStatuses;
}