List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); List<IOException> errors = new ArrayList<IOException>(); Configuration conf = job.getConfiguration(); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i];// w w w. j av a 2s . c o m FileSystem fs = p.getFileSystem(conf); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDirectory()) { FileStatus[] files = fs.listStatus(globStat.getPath(), inputFilter); for (int j = 0; j < files.length; j++) { if (recursive && files[j].isDirectory()) { simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter); } else { result.add(files[j]); } } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } return result; }
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/** * List input directories.//ww w . ja v a2 s. co m * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // Get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); List<IOException> errors = new ArrayList<IOException>(); for (Path p : dirs) { FileSystem fs = p.getFileSystem(job.getConfiguration()); final SmilePathFilter filter = new SmilePathFilter(); FileStatus[] matches = fs.globStatus(p, filter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { Collections.addAll(result, fs.listStatus(globStat.getPath(), filter)); } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } return result; }
From source file:com.twitter.elephanttwin.retrieval.StatusesIndexManager.java
License:Apache License
/** * Creates a <code>StatusesIndexManager</code> * * @param baseDir the base directory// ww w.java2 s . c om * @param fs handle to the FileSystem * @param startDate start date * @param endDate end date * @throws IOException */ public StatusesIndexManager(Path baseDir, FileSystem fs, Calendar startDate, Calendar endDate) throws IOException { // copy the Calendar objects (they are mutable and we don't want to mess with the originals) // and clear their time related fields, we only care about day resolution startDate = (Calendar) startDate.clone(); startDate.set(Calendar.HOUR_OF_DAY, 0); startDate.set(Calendar.MINUTE, 0); startDate.set(Calendar.SECOND, 0); startDate.set(Calendar.MILLISECOND, 0); endDate = (Calendar) endDate.clone(); endDate.set(Calendar.HOUR_OF_DAY, 0); endDate.set(Calendar.MINUTE, 0); endDate.set(Calendar.SECOND, 0); endDate.set(Calendar.MILLISECOND, 0); // Start date should be before or equal to the end date. Preconditions.checkArgument(startDate.compareTo(endDate) <= 0); this.fs = Preconditions.checkNotNull(fs); do { String key = dateFormat.format(startDate.getTime()); // Under directory for each day, there should be a directory for each shard. Path pattern = new Path(baseDir, key + "/*"); // Remember to exclude _log directories. Using a regex here saves us from having to check of // empty directories that only contain _log directories, simplifying checks below. FileStatus[] statuses = fs.globStatus(pattern, new RegexExcludePathFilter("^.*_logs$")); if (statuses == null || statuses.length == 0) { // Warn if we can't find directories for that day. LOG.warn("Index not found: " + pattern); } else { for (FileStatus status : statuses) { if (!HdfsUtils.HIDDEN_FILE_FILTER.accept(status.getPath())) { continue; } LOG.debug("Index added: " + status.getPath().toString()); if (mapping.containsKey(key)) { mapping.get(key).add(status.getPath()); } else { mapping.put(key, Lists.newArrayList(status.getPath())); } } } // Roll forward the calendar to the next day. 
startDate.add(Calendar.DATE, 1); } while (startDate.compareTo(endDate) <= 0); }
From source file:com.uber.hoodie.common.util.FSUtils.java
License:Apache License
/** * Gets all partition paths assuming date partitioning (year, month, day) three levels down. *//*from w ww. j a v a 2 s . co m*/ public static List<String> getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath) throws IOException { List<String> datePartitions = new ArrayList<>(); // Avoid listing and including any folders under the metafolder PathFilter filter = getExcludeMetaPathFilter(); FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter); for (FileStatus status : folders) { Path path = status.getPath(); datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), path.getParent().getName(), path.getName())); } return datePartitions; }
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/** List input directories. * Subclasses may override to, e.g., select only files matching a regular * expression. // w w w . j av a2 s . c om * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected FileStatus[] listStatus(JobConf job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job); // Whether we need to recursive look into the directory structure boolean recursive = job.getBoolean("mapred.input.dir.recursive", false); List<FileStatus> result = new ArrayList<FileStatus>(); List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (Path p : dirs) { FileSystem fs = p.getFileSystem(job); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDirectory()) { for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) { if (recursive && stat.isDirectory()) { addInputPathRecursively(result, fs, stat.getPath(), inputFilter); } else { result.add(stat); } } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total input paths to process : " + result.size()); return result.toArray(new FileStatus[result.size()]); }
From source file:datafu.hourglass.demo.Examples.java
License:Apache License
/**
 * Counts the entries three levels below {@code path} that
 * {@code PathUtils.nonHiddenPathFilter} accepts.
 *
 * @param path root to search under
 * @return number of glob matches
 * @throws IOException if the file system cannot be obtained or the glob fails
 */
private int countIntermediateFolders(Path path) throws IOException {
  final FileSystem fileSystem = getFileSystem();
  // Three wildcard components: one per directory level below path.
  final Path threeLevelsDown = new Path(path, "*/*/*");
  return fileSystem.globStatus(threeLevelsDown, PathUtils.nonHiddenPathFilter).length;
}
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/** * List all paths matching the "yyyy/MM/dd" format under a given path. * /* w w w .j a v a 2 s. com*/ * @param fs file system * @param input path to search under * @return paths * @throws IOException */ public static List<DatePath> findNestedDatedPaths(FileSystem fs, Path input) throws IOException { List<DatePath> inputDates = new ArrayList<DatePath>(); FileStatus[] pathsStatus = fs.globStatus(new Path(input, "*/*/*"), nonHiddenPathFilter); if (pathsStatus == null) { return inputDates; } for (FileStatus pathStatus : pathsStatus) { Matcher matcher = dailyPathPattern.matcher(pathStatus.getPath().toString()); if (matcher.matches()) { String datePath = matcher.group(2); Date date; try { date = nestedDatedPathFormat.parse(datePath); } catch (ParseException e) { continue; } Calendar cal = Calendar.getInstance(timeZone); cal.setTimeInMillis(date.getTime()); inputDates.add(new DatePath(cal.getTime(), pathStatus.getPath())); } } return inputDates; }
From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java
License:Apache License
/**
 * Counts the entries three levels below {@code _intermediatePath} that
 * {@code PathUtils.nonHiddenPathFilter} accepts.
 *
 * @return number of glob matches
 * @throws IOException if the file system cannot be obtained or the glob fails
 */
private int countIntermediateFolders() throws IOException {
  final FileSystem fileSystem = getFileSystem();
  // Three wildcard components: one per directory level below the intermediate path.
  final Path threeLevelsDown = new Path(_intermediatePath, "*/*/*");
  return fileSystem.globStatus(threeLevelsDown, PathUtils.nonHiddenPathFilter).length;
}
From source file:datafu.hourglass.test.FirstPassCountJobTests.java
License:Apache License
/**
 * Counts the entries three levels below {@code _outputPath} that
 * {@code PathUtils.nonHiddenPathFilter} accepts.
 *
 * @return number of glob matches
 * @throws IOException if the file system cannot be obtained or the glob fails
 */
private int countOutputFolders() throws IOException {
  final FileSystem fileSystem = getFileSystem();
  // Three wildcard components: one per directory level below the output path.
  final Path threeLevelsDown = new Path(_outputPath, "*/*/*");
  return fileSystem.globStatus(threeLevelsDown, PathUtils.nonHiddenPathFilter).length;
}
From source file:edu.ucsb.cs.hadoop.CustomFileInputFormat.java
License:Apache License
/** * List input directories. Subclasses may override to, e.g., select only * files matching a regular expression.// w w w. j av a 2 s . c o m * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected FileStatus[] listStatus(JobConf job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } List<FileStatus> result = new ArrayList<FileStatus>(); List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (Path p : dirs) { FileSystem fs = p.getFileSystem(job); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) { result.add(stat); } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total input paths to process : " + result.size()); return result.toArray(new FileStatus[result.size()]); }