Example usage for org.apache.hadoop.fs.PathFilter

Introduction

This page collects example usages of org.apache.hadoop.fs.PathFilter, the interface Hadoop's FileSystem uses to filter the paths returned by listing and globbing operations such as listStatus and globStatus.

Prototype

PathFilter
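
PathFilter is an interface with a single abstract method, so the prototype above is implemented anonymously in every example on this page. The declaration below is a paraphrase of the Hadoop API, not a verbatim copy:

public interface PathFilter {
    /**
     * Tests whether the specified path should be included in a path list.
     */
    boolean accept(Path path);
}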

Usage
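
Before the project-specific examples, here is a minimal, self-contained sketch of the common pattern: an anonymous PathFilter passed to FileSystem.listStatus. It is not taken from any of the projects below; the directory and the ".csv" extension are purely illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListCsvFiles {

    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());

        // Only paths accepted by the filter are returned by listStatus.
        FileStatus[] csvFiles = fs.listStatus(new Path("/tmp/input"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".csv");
            }
        });

        for (FileStatus status : csvFiles) {
            System.out.println(status.getPath());
        }
    }
}

Because accept(Path) is the interface's only abstract method, the same filter can be written as a lambda on Java 8 and later, for example fs.listStatus(dir, path -> path.getName().endsWith(".csv")).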

From source file:gobblin.source.DatePartitionedNestedRetriever.java

License:Apache License

/**
 * Builds a filter that excludes files whose extension does not match the expected one.
 * @return the PathFilter
 */
private PathFilter getFileFilter() {
    final String extension = (this.expectedExtension.startsWith(".")) ? this.expectedExtension
            : "." + this.expectedExtension;

    return new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(extension);
        }
    };
}

From source file:gobblin.util.FileListUtilsTest.java

License:Apache License

@Test
public void testListPathsRecursively() throws IOException {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    Path baseDir = new Path(FILE_UTILS_TEST_DIR, "fileListTestDir2");
    try {
        if (localFs.exists(baseDir)) {
            localFs.delete(baseDir, true);
        }
        localFs.mkdirs(baseDir);
        localFs.create(new Path(baseDir, TEST_FILE_NAME1));
        Path subDir = new Path(baseDir, "subDir");
        localFs.mkdirs(subDir);
        localFs.create(new Path(subDir, TEST_FILE_NAME2));
        List<FileStatus> testFiles = FileListUtils.listPathsRecursively(localFs, baseDir, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return true;
            }
        });
        Assert.assertEquals(4, testFiles.size());

        Set<String> fileNames = Sets.newHashSet();
        for (FileStatus testFileStatus : testFiles) {
            fileNames.add(testFileStatus.getPath().getName());
        }

        Set<String> expectedFileNames = Sets.newHashSet();
        expectedFileNames.add(baseDir.getName());
        expectedFileNames.add(subDir.getName());
        expectedFileNames.add(TEST_FILE_NAME1);
        expectedFileNames.add(TEST_FILE_NAME2);

        Assert.assertEquals(fileNames, expectedFileNames);
    } finally {
        localFs.delete(baseDir, true);
    }
}

From source file:gobblin.util.logs.LogCopier.java

License:Apache License

/**
 * Check for new source log files and submit a copy task for each one.
 */
private void checkSrcLogFiles() throws IOException {
    List<FileStatus> srcLogFiles = FileListUtils.listFilesRecursively(this.srcFs, this.srcLogDir,
            new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return LogCopier.this.logFileExtensions.contains(Files.getFileExtension(path.getName()));
                }
            });

    if (srcLogFiles.isEmpty()) {
        LOGGER.warn("No log file found under directory " + this.srcLogDir);
        return;
    }

    Set<Path> newLogFiles = Sets.newHashSet();
    for (FileStatus srcLogFile : srcLogFiles) {
        newLogFiles.add(srcLogFile.getPath());
    }

    HashSet<Path> deletedLogFiles = Sets.newHashSet(getSourceFiles());
    // Compute the set of deleted log files since the last check
    deletedLogFiles.removeAll(newLogFiles);
    // Compute the set of new log files since the last check
    newLogFiles.removeAll(getSourceFiles());

    // Schedule a copy task for each new log file
    for (final Path srcLogFile : newLogFiles) {
        String destLogFileName = this.logFileNamePrefix.isPresent()
                ? this.logFileNamePrefix.get() + "." + srcLogFile.getName()
                : srcLogFile.getName();
        final Path destLogFile = new Path(this.destLogDir, destLogFileName);

        this.scheduler.schedule(new LogCopyTask(srcLogFile, destLogFile), this.copyInterval, this.timeUnit);
    }

    // Cancel the copy task for each deleted log file
    for (Path deletedLogFile : deletedLogFiles) {
        Optional<LogCopyTask> logCopyTask = this.scheduler.getScheduledTask(deletedLogFile);
        if (logCopyTask.isPresent()) {
            this.scheduler.cancel(logCopyTask.get());
        }
    }
}

From source file:hitune.analysis.mapreduce.processor.AnalysisProcessor.java

License:Apache License

/**
 * Move the temporary output folder to the final (user-defined) location.
 * If a job's output folder contains multiple files, merge them into a single file,
 * then rename the folder to its final name.
 * @param job
 * @param output
 * @param result
 */
protected void moveResults(JobConf job, String output, String result) {
    try {
        FileSystem fs = FileSystem.get(job);
        log.debug("move results: " + result);
        Path src = new Path(result + "/" + "*.csv*");
        Path dst = new Path(output);
        if (!fs.exists(dst)) {
            fs.mkdirs(dst);
        }
        FileStatus[] matches = fs.globStatus(src, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // accept every path matched by the glob
                return true;
            }
        });
        if (matches != null && matches.length != 0) {
            if (matches.length > 1) {
                //multiple output files
                String[] args = new String[2];
                args[0] = result;
                args[1] = "_" + result;
                fs.delete(new Path("_" + result));
                //merge multiple output files into one file
                ToolRunner.run(new MergeOutput(this.conf), args);
                fs.delete(new Path(result));
                fs.rename(new Path("_" + result), new Path(result));
            }

            matches = fs.globStatus(src, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    // accept every path matched by the glob
                    return true;
                }
            });

            for (FileStatus file : matches) {
                String filename = file.getPath().getName();
                filename = filename.substring(0, filename.indexOf("-"));
                log.debug("move file:" + filename);
                Path toFile = new Path(output + "/" + filename);
                if (fs.exists(toFile)) {
                    fs.delete(toFile);
                }
                fs.rename(file.getPath(), toFile);
                fs.delete(file.getPath().getParent(), true);
                FileStatus[] tmpDirs = fs.listStatus(file.getPath().getParent().getParent());
                if (tmpDirs == null || tmpDirs.length == 0) {
                    fs.delete(file.getPath().getParent().getParent(), true);
                }
                break;
            }
            MOVE_DONE = true;
        } else {
            MOVE_DONE = false;
        }
    } catch (IOException e) {
        e.printStackTrace();
        MOVE_DONE = false;
    } catch (Exception e) {
        e.printStackTrace();
        MOVE_DONE = false;
    }
}

From source file:hitune.analysis.mapreduce.processor.FileFilter.ChukwaFileFilter.java

License:Apache License

protected boolean inputValidation(Configuration job, String dir, PathFilter filter) {
    boolean result = false;
    if (filter == null) {
        filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // no filter supplied, so accept every path
                return true;
            }

        };
    }
    Path[] p = StringUtils.stringToPath(new String[] { dir });
    try {
        FileSystem fs = p[0].getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p[0], filter);
        if (matches != null && matches.length != 0) {
            result = true;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}

From source file:io.druid.storage.hdfs.HdfsFileTimestampVersionFinder.java

License:Apache License

private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
    final PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return pattern == null || pattern.matcher(path.getName()).matches();
        }
    };
    long modifiedTime = Long.MIN_VALUE;
    URI mostRecentURI = null;
    final FileSystem fs = dir.getFileSystem(config);
    for (FileStatus status : fs.listStatus(dir, filter)) {
        if (status.isFile()) {
            final long thisModifiedTime = status.getModificationTime();
            if (thisModifiedTime >= modifiedTime) {
                modifiedTime = thisModifiedTime;
                mostRecentURI = status.getPath().toUri();
            }
        }
    }

    return mostRecentURI;
}

From source file:io.spring.batch.workflow.configuration.MainFlowConfiguration.java

License:Apache License

@Bean
public Partitioner partitioner(FileSystem fileSystem) {
    return new Partitioner() {
        @Override
        public Map<String, ExecutionContext> partition(int gridSize) {
            Map<String, ExecutionContext> contexts = new HashMap<>();

            try {
                FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/probes"), new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        try {
                            return fileSystem.isDirectory(path);
                        } catch (IOException e) {
                            return false;
                        }
                    }
                });

                int count = 0;
                for (FileStatus fileStatus : fileStatuses) {
                    ExecutionContext executionContext = new ExecutionContext();

                    executionContext.put("curInputDir", fileStatus.getPath().toString());

                    contexts.put("dir" + count, executionContext);

                    count++;
                }

            } catch (IOException e) {
                e.printStackTrace();
            }

            return contexts;
        }
    };
}

From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java

License:Open Source License

public static void main(String[] args) throws IOException, InterruptedException {

    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);

    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat = new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat = new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(
            ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter = outputFormat.getRecordWriter(conf, output,
            CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {

        System.out.println("Processing split: " + split.getPath().toString() + "(" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader = inputFormat.createRecordReader(split,
                ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {

            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();

            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());

            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }

        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }

}

From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java

License:Apache License

/**
 * List all the input files. Better to follow FileInputFormat#listStatus.
 */
public static FileStatus[] listStatus(Configuration conf) throws IOException {
    String newPath = expandInputFolder(conf);
    // Get all files except pig or hadoop meta
    FileStatus[] fileStatus = FileSystem.get(conf).globStatus(new Path(newPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !isPigOrHadoopMetaFile(path);
        }
    });
    return fileStatus;
}

From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java

License:Apache License

private void updateAvgScores(SourceType source, String postTrainOutputPath) throws IOException {
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(postTrainOutputPath, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });

        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                String value = keyValues[1];
                ColumnConfig config = this.columnConfigList.get(Integer.parseInt(key));
                List<Integer> binAvgScores = new ArrayList<Integer>();
                String[] avgScores = value.split(",");
                for (int i = 0; i < avgScores.length; i++) {
                    binAvgScores.add(Integer.parseInt(avgScores[i]));
                }
                config.setBinAvgScore(binAvgScores);
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
}