List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException
From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License:Open Source License
/** List input directories. * Subclasses may override to, e.g., select only files matching a regular * expression. /*from ww w. j a v a 2 s . c o m*/ * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) { result.add(stat); } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:hitune.analysis.mapreduce.processor.AnalysisProcessor.java
License:Apache License
/** * Move the TEMP output folder to final one(user defined one); * If there are multiple files under one job's output folder, it should merge the output into one file. * Then rename the folder to the final one. * @param job/*from w w w . j ava 2s. c o m*/ * @param output * @param result */ protected void moveResults(JobConf job, String output, String result) { try { FileSystem fs = FileSystem.get(job); log.debug("move results: " + result); Path src = new Path(result + "/" + "*.csv*"); Path dst = new Path(output); if (!fs.exists(dst)) { fs.mkdirs(dst); } FileStatus[] matches = fs.globStatus(src, new PathFilter() { @Override public boolean accept(Path path) { // TODO Auto-generated method stub return true; } }); if (matches != null && matches.length != 0) { if (matches.length > 1) { //multiple output files String[] args = new String[2]; args[0] = result; args[1] = "_" + result; fs.delete(new Path("_" + result)); //merge multiple output files into one file ToolRunner.run(new MergeOutput(this.conf), args); fs.delete(new Path(result)); fs.rename(new Path("_" + result), new Path(result)); } matches = fs.globStatus(src, new PathFilter() { @Override public boolean accept(Path path) { // TODO Auto-generated method stub return true; } }); for (FileStatus file : matches) { String filename = file.getPath().getName(); filename = filename.substring(0, filename.indexOf("-")); log.debug("move file:" + filename); Path toFile = new Path(output + "/" + filename); if (fs.exists(toFile)) { fs.delete(toFile); } fs.rename(file.getPath(), toFile); fs.delete(file.getPath().getParent(), true); FileStatus[] tmpDirs = fs.listStatus(file.getPath().getParent().getParent()); if (tmpDirs == null || tmpDirs.length == 0) { fs.delete(file.getPath().getParent().getParent(), true); } break; } } else { MOVE_DONE = false; } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); MOVE_DONE = false; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } MOVE_DONE = true; }
From source file:hitune.analysis.mapreduce.processor.FileFilter.ChukwaFileFilter.java
License:Apache License
protected boolean inputValidation(Configuration job, String dir, PathFilter filter) { boolean result = false; if (filter == null) { filter = new PathFilter() { @Override//from w w w .ja va2 s . c o m public boolean accept(Path path) { // TODO Auto-generated method stub return true; } }; } Path[] p = StringUtils.stringToPath(new String[] { dir }); try { FileSystem fs = p[0].getFileSystem(job); FileStatus[] matches = fs.globStatus(p[0], filter); if (matches != null && matches.length != 0) { result = true; } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return result; }
From source file:hydrograph.engine.cascading.scheme.avro.CustomAvroScheme.java
License:Apache License
/** * This method peeks at the source data to get a schema when none has been * provided.//from w ww . jav a 2s . co m * * @param flowProcess * The cascading FlowProcess object for this flow. * @param tap * The cascading Tap object. * @return Schema The schema of the peeked at data, or Schema.NULL if none * exists. */ private Schema getSourceSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) throws IOException { if (tap instanceof CompositeTap) { tap = (Tap) ((CompositeTap) tap).getChildTaps().next(); } final String path = tap.getIdentifier(); Path p = new Path(path); final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy()); // Get all the input dirs List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter))); // Now get all the things that are one level down for (FileStatus status : new LinkedList<FileStatus>(statuses)) { if (status.isDir()) for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) { if (child.isDir()) { statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter))); } else if (fs.isFile(child.getPath())) { statuses.add(child); } } } for (FileStatus status : statuses) { Path statusPath = status.getPath(); if (fs.isFile(statusPath)) { // no need to open them all InputStream stream = null; DataFileStream reader = null; try { stream = new BufferedInputStream(fs.open(statusPath)); reader = new DataFileStream(stream, new GenericDatumReader()); return reader.getSchema(); } finally { if (reader == null) { if (stream != null) { stream.close(); } } else { reader.close(); } } } } // couldn't find any Avro files, return null schema return Schema.create(Schema.Type.NULL); }
From source file:it.crs4.seal.read_sort.MergeAlignments.java
License:Open Source License
private Path[] getSourcePaths() throws Exception { Path srcPath = new Path(userInput); FileSystem srcFs = srcPath.getFileSystem(getConf()); if (srcFs.exists(srcPath)) { FileStatus stat = srcFs.getFileStatus(srcPath); if (stat.isDir()) { String msg = "source path " + srcPath + " is a directory. Globbing with "; srcPath = new Path(srcPath, "*"); log.info(msg + srcPath);/*from ww w . j ava 2 s.c o m*/ } } // Glob source path. The returned paths are already sorted. We filter out paths starting // with '_' (see SourcePathFilter). // If the path doesn't contain a glob patter, and it doesn't exist, the function will return null. Path[] sources = FileUtil.stat2Paths(srcFs.globStatus(srcPath, new SourcePathFilter())); if (sources == null) throw new IllegalArgumentException("Source path " + srcPath.makeQualified(srcFs) + " doesn't exist"); if (log.isDebugEnabled()) { log.debug("Sources:"); for (int i = 0; i < sources.length; ++i) log.debug(sources[i]); } if (sources.length == 0) throw new IllegalArgumentException("no source files selected"); log.info("Merging " + sources.length + " files."); return sources; }
From source file:ml.shifu.guagua.hadoop.GuaguaMRUnitDriver.java
License:Apache License
/** * List input directories.// www. j a va 2 s. co m * Subclasses may override to, e.g., select only files matching a regular expression. * * @param job * the job to list input paths for * @return array of FileStatus objects * @throws IOException * if zero items. * @throws InvalidInputException * If any IOException for input files. */ protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(input); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(conf); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDirectory()) { for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) { result.add(stat); } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new IOException(errors.toString()); } return result; }
From source file:ml.shifu.guagua.mapreduce.GuaguaMRUnitDriver.java
License:Apache License
/** * List input directories.//from w ww .j av a 2s .c o m * Subclasses may override to, e.g., select only files matching a regular expression. * * @param job * the job to list input paths for * @return array of FileStatus objects * @throws IOException * if zero items or any IOException for input files. */ protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(input); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(conf); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) { result.add(stat); } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new IOException(errors.toString()); } return result; }
From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java
License:Apache License
@SuppressWarnings("deprecation") protected List<FileStatus> listCrossValidationStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }//ww w .jav a2 s .c o m // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // Whether we need to recursive look into the directory structure boolean recursive = job.getConfiguration().getBoolean("mapreduce.input.fileinputformat.input.dir.recursive", false); List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { FileStatus[] fss = fs.listStatus(globStat.getPath()); for (FileStatus fileStatus : fss) { if (inputFilter.accept(fileStatus.getPath())) { if (recursive && fileStatus.isDir()) { addInputPathRecursive(result, fs, fileStatus.getPath(), inputFilter); } else { result.add(fileStatus); } } } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total validation paths to process : " + result.size()); return result; }
From source file:org.apache.falcon.catalog.CatalogPartitionHandler.java
License:Apache License
private void registerPartitions(Configuration conf, CatalogStorage storage, Path staticPath, List<String> staticPartition) throws FalconException { try {//from ww w .ja v a2 s.co m FileSystem fs = HadoopClientFactory.get().createProxiedFileSystem(conf); if (!fs.exists(staticPath)) { //Do nothing if the output path doesn't exist return; } List<String> partitionColumns = getPartitionColumns(conf, storage); int dynamicPartCols = partitionColumns.size() - staticPartition.size(); Path searchPath = staticPath; if (dynamicPartCols > 0) { searchPath = new Path(staticPath, StringUtils.repeat("*", "/", dynamicPartCols)); } //Figure out the dynamic partitions from the directories on hdfs FileStatus[] files = fs.globStatus(searchPath, PATH_FILTER); Map<List<String>, String> partitions = new HashMap<List<String>, String>(); for (FileStatus file : files) { List<String> dynamicParts = getDynamicPartitions(file.getPath(), staticPath); List<String> partitionValues = new ArrayList<String>(staticPartition); partitionValues.addAll(dynamicParts); LOG.debug("Final partition - " + partitionValues); partitions.put(partitionValues, file.getPath().toString()); } List<List<String>> existPartitions = listPartitions(conf, storage, staticPartition); Collection<List<String>> targetPartitions = partitions.keySet(); Collection<List<String>> partitionsForDrop = CollectionUtils.subtract(existPartitions, targetPartitions); Collection<List<String>> partitionsForAdd = CollectionUtils.subtract(targetPartitions, existPartitions); Collection<List<String>> partitionsForUpdate = CollectionUtils.intersection(existPartitions, targetPartitions); for (List<String> partition : partitionsForDrop) { dropPartitions(conf, storage, partition); } for (List<String> partition : partitionsForAdd) { addPartition(conf, storage, partition, partitions.get(partition)); } for (List<String> partition : partitionsForUpdate) { updatePartition(conf, storage, partition, partitions.get(partition)); } } catch (IOException e) { throw new FalconException(e); } }
From source file:org.apache.giraph.io.formats.GiraphFileInputFormat.java
License:Apache License
/** * Common method for listing vertex/edge input directories. * * @param job The job/* w ww. ja va2 s.c o m*/ * @param dirs list of vertex/edge input paths * @return Array of FileStatus objects * @throws IOException */ private List<FileStatus> listStatus(JobContext job, Path[] dirs) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } /*if[HADOOP_NON_SECURE] else[HADOOP_NON_SECURE] // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); end[HADOOP_NON_SECURE]*/ List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the HIDDEN_FILE_FILTER and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(HIDDEN_FILE_FILTER); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (Path p : dirs) { FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter)); } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total input paths to process : " + result.size()); return result; }