List of usage examples for org.apache.hadoop.fs.PathFilter
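For orientation: PathFilter declares a single method, boolean accept(Path path), and implementations are passed to methods such as FileSystem.listStatus(Path, PathFilter) and FileSystem.globStatus(Path, PathFilter) to restrict which entries those calls return. The following is a minimal, self-contained sketch; the directory /data/input and the .avro suffix are illustrative only and do not come from the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Keep only entries whose name ends with ".avro" (illustrative suffix)
        FileStatus[] matches = fs.listStatus(new Path("/data/input"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".avro");
            }
        });

        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}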
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
private FileStatus[] getExpiredHoplogs() throws IOException {
    FileStatus files[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All expired hoplogs end with the expired extension and must match the valid file regex
            String fileName = file.getName();
            if (!fileName.endsWith(EXPIRED_HOPLOG_EXTENSION)) {
                return false;
            }
            return true;
        }
    });
    return files;
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
/**
 * Locks the sorted oplog collection, removes the oplogs, and renames them for later deletion.
 *
 * @throws IOException
 */
private void markHoplogsForDeletion() throws IOException {
    ArrayList<IOException> errors = new ArrayList<IOException>();
    FileStatus validHoplogs[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex
            Matcher matcher = HOPLOG_PATTERN.matcher(file.getName());
            return matcher.matches();
        }
    });

    FileStatus[] expired = getExpiredHoplogs();
    validHoplogs = filterValidHoplogs(validHoplogs, expired);

    if (validHoplogs == null || validHoplogs.length == 0) {
        return;
    }

    for (FileStatus fileStatus : validHoplogs) {
        try {
            addExpiryMarkerForAFile(getHoplog(fileStatus.getPath()));
        } catch (IOException e) {
            // Even if there is an IO error, continue removing other hoplogs and notify at the end
            errors.add(e);
        }
    }

    if (!errors.isEmpty()) {
        for (IOException e : errors) {
            logger.warn(LocalizedStrings.HOPLOG_HOPLOG_REMOVE_FAILED, e);
        }
    }
}
From source file:com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
private static Map<Integer, List<String>> readPoints(Path pointsPathDir, Configuration conf) throws IOException {
    Map<Integer, List<String>> result = new TreeMap<Integer, List<String>>();
    FileSystem fs = pointsPathDir.getFileSystem(conf);
    FileStatus[] children = fs.listStatus(pointsPathDir, new PathFilter() {
        public boolean accept(Path path) {
            // Skip checksum files and Hadoop bookkeeping files such as _SUCCESS and _logs
            String name = path.getName();
            return !(name.endsWith(".crc") || name.startsWith("_"));
        }
    });

    for (FileStatus file : children) {
        Path path = file.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            IntWritable key = reader.getKeyClass().asSubclass(IntWritable.class).newInstance();
            WeightedVectorWritable value = reader.getValueClass().asSubclass(WeightedVectorWritable.class)
                    .newInstance();
            while (reader.next(key, value)) {
                // The key is the cluster id; each value contributes one point to that cluster's list
                List<String> pointList = result.get(key.get());
                if (pointList == null) {
                    pointList = new ArrayList<String>();
                    result.put(key.get(), pointList);
                }
                // We know we are dealing with named vectors, because we generated them from the id field
                String name = ((NamedVector) value.getVector()).getName();
                pointList.add(name);
            }
        } catch (InstantiationException e) {
            log.error("Exception", e);
        } catch (IllegalAccessException e) {
            log.error("Exception", e);
        } finally {
            // Close the reader even when instantiation or reading fails
            reader.close();
        }
    }
    return result;
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
/**
 * Returns the output from {@link CrushReducer}. Each reducer writes out a mapping of source files to crush output file.
 */
private List<FileStatus> getOutputMappings() throws IOException {
    FileStatus[] files = fs.listStatus(outDir, new PathFilter() {
        Matcher matcher = Pattern.compile("part-\\d+").matcher("dummy");

        @Override
        public boolean accept(Path path) {
            matcher.reset(path.getName());
            return matcher.matches();
        }
    });
    return asList(files);
}
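Note the design choice here: the filter compiles the part-file pattern once and reuses a single Matcher via reset rather than recompiling the regex for every accept call. Since listStatus invokes the filter sequentially from the calling thread this is safe, but the shared Matcher means the filter itself is not thread-safe.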
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null)
                            return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then
                     * increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();

            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);
                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + counters, e);
        }
    }
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter filter = new PathFilter() {
        public boolean accept(Path file) {
            return file.getName().startsWith("part-");
        }
    };
    FileStatus[] list = fs.listStatus(path, filter);
    for (FileStatus stat : list) {
        fs.delete(stat.getPath(), false);
    }
}
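Because PathFilter has a single abstract method, it can also be written as a lambda on Java 8 and later. A sketch of the filter above in that style (assuming a Java 8+ runtime; the behavior is identical):

// Equivalent lambda form of the anonymous class above
PathFilter filter = file -> file.getName().startsWith("part-");
FileStatus[] list = fs.listStatus(path, filter);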
From source file:com.ibm.jaql.io.hadoop.FileOutputConfigurator.java
License:Apache License
public void setSequential(JobConf conf) throws Exception {
    registerSerializers(conf);

    // For an expression, the location is the final file name
    Path outPath = new Path(location);
    FileSystem fs = outPath.getFileSystem(conf);
    outPath = outPath.makeQualified(fs);

    if (fs.exists(outPath)) {
        // TODO: Jaql currently has overwrite semantics; add flag to control this
        if (fs.isFile(outPath)) {
            fs.delete(outPath, false);
        } else {
            // Look for a map-reduce output directory
            FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() {
                boolean onlyOne = true;

                public boolean accept(Path path) {
                    String name = path.getName();
                    if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) {
                        return false;
                    }
                    if (onlyOne) {
                        onlyOne = false;
                        return true;
                    }
                    return false;
                }
            });
            if (nonMR.length > 0) {
                throw new IOException(
                        "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath());
            }
            fs.delete(outPath, true);
        }
    }

    // In sequential mode, we will write directly to the output file
    // and bypass the _temporary directory and rename of the standard
    // FileOutputCommitter by using our own DirectFileOutputCommitter.
    FileOutputFormat.setOutputPath(conf, outPath.getParent());
    conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class);
}
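Note the design choice in this filter: it is stateful. After the regex check, the onlyOne flag lets at most one non-map-reduce entry through, which is just enough for the error message to report a single offending path. A stateful filter like this works for a one-shot listStatus call, but it cannot safely be reused across calls or shared between threads.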
From source file:com.iflytek.spider.util.HadoopFSUtil.java
License:Apache License
/**
 * Returns a PathFilter that passes all paths through.
 */
public static PathFilter getPassAllFilter() {
    return new PathFilter() {
        public boolean accept(Path arg0) {
            return true;
        }
    };
}
From source file:com.iflytek.spider.util.HadoopFSUtil.java
License:Apache License
/**
 * Returns a PathFilter that passes only directories through.
 */
public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
    return new PathFilter() {
        public boolean accept(final Path path) {
            try {
                return fs.getFileStatus(path).isDir();
            } catch (IOException ioe) {
                return false;
            }
        }
    };
}
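FileStatus.isDir() is deprecated in later Hadoop releases in favor of isDirectory(). A sketch of the same filter against the newer API (assuming Hadoop 2.x or later):

public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
    return new PathFilter() {
        public boolean accept(final Path path) {
            try {
                // isDirectory() replaces the deprecated isDir()
                return fs.getFileStatus(path).isDirectory();
            } catch (IOException ioe) {
                // Treat paths we cannot stat as non-directories
                return false;
            }
        }
    };
}

Either way, calling getFileStatus inside accept issues one filesystem round trip per candidate path, which can be slow on large directories.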
From source file:com.inmobi.conduit.AbstractService.java
License:Apache License
private List<Path> listPartFiles(Path path, FileSystem fs) {
    List<Path> matches = new LinkedList<Path>();
    try {
        FileStatus[] statuses = fs.listStatus(path, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus status : statuses) {
            matches.add(status.getPath());
        }
    } catch (IOException e) {
        LOG.error(e.getMessage(), e);
    }
    return matches;
}