List of usage examples for org.apache.hadoop.fs.PathFilter
PathFilter
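PathFilter is a single-method interface (boolean accept(Path path)) that FileSystem calls such as listStatus and globStatus use to decide which paths to return. Before the project-specific examples below, here is a minimal, self-contained sketch of the typical pattern; the "/data/input" directory and the ".avro" extension are illustrative only, not taken from any of the sources listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // Keep only *.avro files; everything else is skipped by listStatus.
        PathFilter avroOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".avro");
            }
        };
        for (FileStatus status : fs.listStatus(new Path("/data/input"), avroOnly)) {
            System.out.println(status.getPath());
        }
    }
}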
From source file:gobblin.source.DatePartitionedNestedRetriever.java
License:Apache License
/**
 * This method is to filter out files that don't need to be processed by extension
 * @return the pathFilter
 */
private PathFilter getFileFilter() {
    final String extension = (this.expectedExtension.startsWith(".")) ? this.expectedExtension
            : "." + this.expectedExtension;
    return new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(extension);
        }
    };
}
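Because PathFilter declares a single abstract method, the same filter can also be written as a lambda on Java 8+. A minimal equivalent sketch of the anonymous class above, assuming the same effectively-final extension local:

// Java 8+ equivalent of the anonymous class above; `extension` must be effectively final.
PathFilter filter = path -> path.getName().endsWith(extension);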
From source file:gobblin.util.FileListUtilsTest.java
License:Apache License
@Test
public void testListPathsRecursively() throws IOException {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    Path baseDir = new Path(FILE_UTILS_TEST_DIR, "fileListTestDir2");
    try {
        if (localFs.exists(baseDir)) {
            localFs.delete(baseDir, true);
        }
        localFs.mkdirs(baseDir);
        localFs.create(new Path(baseDir, TEST_FILE_NAME1));
        Path subDir = new Path(baseDir, "subDir");
        localFs.mkdirs(subDir);
        localFs.create(new Path(subDir, TEST_FILE_NAME2));

        List<FileStatus> testFiles = FileListUtils.listPathsRecursively(localFs, baseDir, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return true;
            }
        });
        Assert.assertEquals(4, testFiles.size());

        Set<String> fileNames = Sets.newHashSet();
        for (FileStatus testFileStatus : testFiles) {
            fileNames.add(testFileStatus.getPath().getName());
        }

        Set<String> expectedFileNames = Sets.newHashSet();
        expectedFileNames.add(baseDir.getName());
        expectedFileNames.add(subDir.getName());
        expectedFileNames.add(TEST_FILE_NAME1);
        expectedFileNames.add(TEST_FILE_NAME2);

        Assert.assertEquals(fileNames, expectedFileNames);
    } finally {
        localFs.delete(baseDir, true);
    }
}
From source file:gobblin.util.logs.LogCopier.java
License:Apache License
/**
 * Perform a check on new source log files and submit copy tasks for new log files.
 */
private void checkSrcLogFiles() throws IOException {
    List<FileStatus> srcLogFiles = FileListUtils.listFilesRecursively(this.srcFs, this.srcLogDir,
            new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return LogCopier.this.logFileExtensions.contains(Files.getFileExtension(path.getName()));
                }
            });

    if (srcLogFiles.isEmpty()) {
        LOGGER.warn("No log file found under directory " + this.srcLogDir);
        return;
    }

    Set<Path> newLogFiles = Sets.newHashSet();
    for (FileStatus srcLogFile : srcLogFiles) {
        newLogFiles.add(srcLogFile.getPath());
    }

    HashSet<Path> deletedLogFiles = Sets.newHashSet(getSourceFiles());
    // Compute the set of deleted log files since the last check
    deletedLogFiles.removeAll(newLogFiles);
    // Compute the set of new log files since the last check
    newLogFiles.removeAll(getSourceFiles());

    // Schedule a copy task for each new log file
    for (final Path srcLogFile : newLogFiles) {
        String destLogFileName = this.logFileNamePrefix.isPresent()
                ? this.logFileNamePrefix.get() + "." + srcLogFile.getName()
                : srcLogFile.getName();
        final Path destLogFile = new Path(this.destLogDir, destLogFileName);
        this.scheduler.schedule(new LogCopyTask(srcLogFile, destLogFile), this.copyInterval, this.timeUnit);
    }

    // Cancel the copy task for each deleted log file
    for (Path deletedLogFile : deletedLogFiles) {
        Optional<LogCopyTask> logCopyTask = this.scheduler.getScheduledTask(deletedLogFile);
        if (logCopyTask.isPresent()) {
            this.scheduler.cancel(logCopyTask.get());
        }
    }
}
From source file:hitune.analysis.mapreduce.processor.AnalysisProcessor.java
License:Apache License
/**
 * Move the TEMP output folder to the final one (user defined one);
 * If there are multiple files under one job's output folder, it should merge the output into one file.
 * Then rename the folder to the final one.
 * @param job
 * @param output
 * @param result
 */
protected void moveResults(JobConf job, String output, String result) {
    try {
        FileSystem fs = FileSystem.get(job);
        log.debug("move results: " + result);
        Path src = new Path(result + "/" + "*.csv*");
        Path dst = new Path(output);
        if (!fs.exists(dst)) {
            fs.mkdirs(dst);
        }
        FileStatus[] matches = fs.globStatus(src, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return true;
            }
        });
        if (matches != null && matches.length != 0) {
            if (matches.length > 1) {
                // multiple output files
                String[] args = new String[2];
                args[0] = result;
                args[1] = "_" + result;
                fs.delete(new Path("_" + result));
                // merge multiple output files into one file
                ToolRunner.run(new MergeOutput(this.conf), args);
                fs.delete(new Path(result));
                fs.rename(new Path("_" + result), new Path(result));
            }
            matches = fs.globStatus(src, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return true;
                }
            });
            for (FileStatus file : matches) {
                String filename = file.getPath().getName();
                filename = filename.substring(0, filename.indexOf("-"));
                log.debug("move file:" + filename);
                Path toFile = new Path(output + "/" + filename);
                if (fs.exists(toFile)) {
                    fs.delete(toFile);
                }
                fs.rename(file.getPath(), toFile);
                fs.delete(file.getPath().getParent(), true);
                FileStatus[] tmpDirs = fs.listStatus(file.getPath().getParent().getParent());
                if (tmpDirs == null || tmpDirs.length == 0) {
                    fs.delete(file.getPath().getParent().getParent(), true);
                }
                break;
            }
        } else {
            MOVE_DONE = false;
        }
    } catch (IOException e) {
        e.printStackTrace();
        MOVE_DONE = false;
    } catch (Exception e) {
        e.printStackTrace();
    }
    MOVE_DONE = true;
}
From source file:hitune.analysis.mapreduce.processor.FileFilter.ChukwaFileFilter.java
License:Apache License
protected boolean inputValidation(Configuration job, String dir, PathFilter filter) {
    boolean result = false;
    if (filter == null) {
        filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return true;
            }
        };
    }
    Path[] p = StringUtils.stringToPath(new String[] { dir });
    try {
        FileSystem fs = p[0].getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p[0], filter);
        if (matches != null && matches.length != 0) {
            result = true;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}
From source file:io.druid.storage.hdfs.HdfsFileTimestampVersionFinder.java
License:Apache License
private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
    final PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return pattern == null || pattern.matcher(path.getName()).matches();
        }
    };
    long modifiedTime = Long.MIN_VALUE;
    URI mostRecentURI = null;
    final FileSystem fs = dir.getFileSystem(config);
    for (FileStatus status : fs.listStatus(dir, filter)) {
        if (status.isFile()) {
            final long thisModifiedTime = status.getModificationTime();
            if (thisModifiedTime >= modifiedTime) {
                modifiedTime = thisModifiedTime;
                mostRecentURI = status.getPath().toUri();
            }
        }
    }
    return mostRecentURI;
}
From source file:io.spring.batch.workflow.configuration.MainFlowConfiguration.java
License:Apache License
@Bean
public Partitioner partitioner(FileSystem fileSystem) {
    return new Partitioner() {
        @Override
        public Map<String, ExecutionContext> partition(int gridSize) {
            Map<String, ExecutionContext> contexts = new HashMap<>();
            try {
                FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/probes"), new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        try {
                            return fileSystem.isDirectory(path);
                        } catch (IOException e) {
                            return false;
                        }
                    }
                });

                int count = 0;
                for (FileStatus fileStatus : fileStatuses) {
                    ExecutionContext executionContext = new ExecutionContext();
                    executionContext.put("curInputDir", fileStatus.getPath().toString());
                    contexts.put("dir" + count, executionContext);
                    count++;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            return contexts;
        }
    };
}
From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java
License:Open Source License
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);
    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat =
            new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat =
            new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter =
            outputFormat.getRecordWriter(conf, output, CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {
        System.out.println("Processing split: " + split.getPath().toString() + " (" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader =
                inputFormat.createRecordReader(split, ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {
            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();
            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());
            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }

        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }
}
From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java
License:Apache License
/**
 * List all the input files. Better to follow FileInputFormat#listStatus
 */
public static FileStatus[] listStatus(Configuration conf) throws IOException {
    String newPath = expandInputFolder(conf);
    // Get all files except pig or hadoop meta
    FileStatus[] fileStatus = FileSystem.get(conf).globStatus(new Path(newPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !isPigOrHadoopMetaFile(path);
        }
    });
    return fileStatus;
}
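The isPigOrHadoopMetaFile helper is not shown in this snippet; a common convention (the same one FileInputFormat's default hidden-file filter follows) is to skip names starting with "_" or ".". A hedged sketch of a filter written directly against that convention:

// Sketch only: assumes the usual Hadoop/Pig convention that names starting with
// "_" (e.g. _SUCCESS, _logs) or "." are metadata and should be skipped.
PathFilter dataFilesOnly = new PathFilter() {
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        return !name.startsWith("_") && !name.startsWith(".");
    }
};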
From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java
License:Apache License
private void updateAvgScores(SourceType source, String postTrainOutputPath) throws IOException {
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(postTrainOutputPath, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                String value = keyValues[1];
                ColumnConfig config = this.columnConfigList.get(Integer.parseInt(key));
                List<Integer> binAvgScores = new ArrayList<Integer>();
                String[] avgScores = value.split(",");
                for (int i = 0; i < avgScores.length; i++) {
                    binAvgScores.add(Integer.parseInt(avgScores[i]));
                }
                config.setBinAvgScore(binAvgScores);
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
}