Example usage for org.apache.hadoop.fs.PathFilter

Introduction

This page collects example usages of org.apache.hadoop.fs.PathFilter, the interface Hadoop's FileSystem uses to filter the paths returned by listing and globbing operations such as listStatus and globStatus.

Prototype

PathFilter
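
PathFilter is an interface with a single abstract method, so the prototype above is implemented anonymously in every example on this page. The declaration below is a paraphrase of the Hadoop API, not a verbatim copy:

public interface PathFilter {
    /**
     * Tests whether the specified path should be included in a path list.
     */
    boolean accept(Path path);
}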

Usage
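
Before the project-specific examples, here is a minimal, self-contained sketch of the common pattern: an anonymous PathFilter passed to FileSystem.listStatus. It is not taken from any of the projects below; the directory and the ".csv" extension are purely illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListCsvFiles {

    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());

        // Only paths accepted by the filter are returned by listStatus.
        FileStatus[] csvFiles = fs.listStatus(new Path("/tmp/input"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".csv");
            }
        });

        for (FileStatus status : csvFiles) {
            System.out.println(status.getPath());
        }
    }
}

Because accept(Path) is the interface's only abstract method, the same filter can be written as a lambda on Java 8 and later, for example fs.listStatus(dir, path -> path.getName().endsWith(".csv")).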

From source file:gobblin.source.DatePartitionedNestedRetriever.java

License:Apache License

/**
 * Builds a filter that excludes files whose extension does not match the expected one.
 * @return the PathFilter
 */
private PathFilter getFileFilter() {
    final String extension = (this.expectedExtension.startsWith(".")) ? this.expectedExtension
            : "." + this.expectedExtension;

    return new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(extension);
        }
    };
}

From source file:gobblin.util.FileListUtilsTest.java

License:Apache License

@Test
public void testListPathsRecursively() throws IOException {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    Path baseDir = new Path(FILE_UTILS_TEST_DIR, "fileListTestDir2");
    try {
        if (localFs.exists(baseDir)) {
            localFs.delete(baseDir, true);
        }
        localFs.mkdirs(baseDir);
        localFs.create(new Path(baseDir, TEST_FILE_NAME1));
        Path subDir = new Path(baseDir, "subDir");
        localFs.mkdirs(subDir);
        localFs.create(new Path(subDir, TEST_FILE_NAME2));
        List<FileStatus> testFiles = FileListUtils.listPathsRecursively(localFs, baseDir, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return true;
            }
        });
        Assert.assertEquals(4, testFiles.size());

        Set<String> fileNames = Sets.newHashSet();
        for (FileStatus testFileStatus : testFiles) {
            fileNames.add(testFileStatus.getPath().getName());
        }

        Set<String> expectedFileNames = Sets.newHashSet();
        expectedFileNames.add(baseDir.getName());
        expectedFileNames.add(subDir.getName());
        expectedFileNames.add(TEST_FILE_NAME1);
        expectedFileNames.add(TEST_FILE_NAME2);

        Assert.assertEquals(fileNames, expectedFileNames);
    } finally {
        localFs.delete(baseDir, true);
    }
}

From source file:gobblin.util.logs.LogCopier.java

License:Apache License

/**
 * Check for new source log files and submit a copy task for each one.
 */
private void checkSrcLogFiles() throws IOException {
    List<FileStatus> srcLogFiles = FileListUtils.listFilesRecursively(this.srcFs, this.srcLogDir,
            new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return LogCopier.this.logFileExtensions.contains(Files.getFileExtension(path.getName()));
                }
            });

    if (srcLogFiles.isEmpty()) {
        LOGGER.warn("No log file found under directory " + this.srcLogDir);
        return;
    }

    Set<Path> newLogFiles = Sets.newHashSet();
    for (FileStatus srcLogFile : srcLogFiles) {
        newLogFiles.add(srcLogFile.getPath());
    }

    HashSet<Path> deletedLogFiles = Sets.newHashSet(getSourceFiles());
    // Compute the set of deleted log files since the last check
    deletedLogFiles.removeAll(newLogFiles);
    // Compute the set of new log files since the last check
    newLogFiles.removeAll(getSourceFiles());

    // Schedule a copy task for each new log file
    for (final Path srcLogFile : newLogFiles) {
        String destLogFileName = this.logFileNamePrefix.isPresent()
                ? this.logFileNamePrefix.get() + "." + srcLogFile.getName()
                : srcLogFile.getName();
        final Path destLogFile = new Path(this.destLogDir, destLogFileName);

        this.scheduler.schedule(new LogCopyTask(srcLogFile, destLogFile), this.copyInterval, this.timeUnit);
    }

    // Cancel the copy task for each deleted log file
    for (Path deletedLogFile : deletedLogFiles) {
        Optional<LogCopyTask> logCopyTask = this.scheduler.getScheduledTask(deletedLogFile);
        if (logCopyTask.isPresent()) {
            this.scheduler.cancel(logCopyTask.get());
        }
    }
}

From source file:hitune.analysis.mapreduce.processor.AnalysisProcessor.java

License:Apache License

/**
 * Move the temporary output folder to the final (user-defined) location.
 * If a job's output folder contains multiple files, merge them into a single file,
 * then rename the folder to its final name.
 * @param job
 * @param output
 * @param result
 */
protected void moveResults(JobConf job, String output, String result) {
    try {
        FileSystem fs = FileSystem.get(job);
        log.debug("move results: " + result);
        Path src = new Path(result + "/" + "*.csv*");
        Path dst = new Path(output);
        if (!fs.exists(dst)) {
            fs.mkdirs(dst);
        }
        FileStatus[] matches = fs.globStatus(src, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // accept every path matched by the glob
                return true;
            }
        });
        if (matches != null && matches.length != 0) {
            if (matches.length > 1) {
                //multiple output files
                String[] args = new String[2];
                args[0] = result;
                args[1] = "_" + result;
                fs.delete(new Path("_" + result));
                //merge multiple output files into one file
                ToolRunner.run(new MergeOutput(this.conf), args);
                fs.delete(new Path(result));
                fs.rename(new Path("_" + result), new Path(result));
            }

            matches = fs.globStatus(src, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    // accept every path matched by the glob
                    return true;
                }
            });

            for (FileStatus file : matches) {
                String filename = file.getPath().getName();
                filename = filename.substring(0, filename.indexOf("-"));
                log.debug("move file:" + filename);
                Path toFile = new Path(output + "/" + filename);
                if (fs.exists(toFile)) {
                    fs.delete(toFile);
                }
                fs.rename(file.getPath(), toFile);
                fs.delete(file.getPath().getParent(), true);
                FileStatus[] tmpDirs = fs.listStatus(file.getPath().getParent().getParent());
                if (tmpDirs == null || tmpDirs.length == 0) {
                    fs.delete(file.getPath().getParent().getParent(), true);
                }
                break;
            }
            MOVE_DONE = true;
        } else {
            MOVE_DONE = false;
        }
    } catch (IOException e) {
        e.printStackTrace();
        MOVE_DONE = false;
    } catch (Exception e) {
        e.printStackTrace();
        MOVE_DONE = false;
    }
}

From source file:hitune.analysis.mapreduce.processor.FileFilter.ChukwaFileFilter.java

License:Apache License

protected boolean inputValidation(Configuration job, String dir, PathFilter filter) {
    boolean result = false;
    if (filter == null) {
        filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // no filter supplied, so accept every path
                return true;
            }

        };
    }
    Path[] p = StringUtils.stringToPath(new String[] { dir });
    try {
        FileSystem fs = p[0].getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p[0], filter);
        if (matches != null && matches.length != 0) {
            result = true;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}

From source file:io.druid.storage.hdfs.HdfsFileTimestampVersionFinder.java

License:Apache License

private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
    final PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return pattern == null || pattern.matcher(path.getName()).matches();
        }
    };
    long modifiedTime = Long.MIN_VALUE;
    URI mostRecentURI = null;
    final FileSystem fs = dir.getFileSystem(config);
    for (FileStatus status : fs.listStatus(dir, filter)) {
        if (status.isFile()) {
            final long thisModifiedTime = status.getModificationTime();
            if (thisModifiedTime >= modifiedTime) {
                modifiedTime = thisModifiedTime;
                mostRecentURI = status.getPath().toUri();
            }
        }
    }

    return mostRecentURI;
}

From source file:io.spring.batch.workflow.configuration.MainFlowConfiguration.java

License:Apache License

@Bean
public Partitioner partitioner(FileSystem fileSystem) {
    return new Partitioner() {
        @Override
        public Map<String, ExecutionContext> partition(int gridSize) {
            Map<String, ExecutionContext> contexts = new HashMap<>();

            try {
                FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/probes"), new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        try {
                            return fileSystem.isDirectory(path);
                        } catch (IOException e) {
                            return false;
                        }
                    }
                });

                int count = 0;
                for (FileStatus fileStatus : fileStatuses) {
                    ExecutionContext executionContext = new ExecutionContext();

                    executionContext.put("curInputDir", fileStatus.getPath().toString());

                    contexts.put("dir" + count, executionContext);

                    count++;
                }

            } catch (IOException e) {
                e.printStackTrace();
            }

            return contexts;
        }
    };
}

From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java

License:Open Source License

public static void main(String[] args) throws IOException, InterruptedException {

    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);

    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat = new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat = new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(
            ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter = outputFormat.getRecordWriter(conf, output,
            CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {

        System.out.println("Processing split: " + split.getPath().toString() + "(" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader = inputFormat.createRecordReader(split,
                ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {

            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();

            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());

            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }

        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }

}

From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java

License:Apache License

/**
 * List all the input files. Better to follow FileInputFormat#listStatus.
 */
public static FileStatus[] listStatus(Configuration conf) throws IOException {
    String newPath = expandInputFolder(conf);
    // Get all files except pig or hadoop meta
    FileStatus[] fileStatus = FileSystem.get(conf).globStatus(new Path(newPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !isPigOrHadoopMetaFile(path);
        }
    });
    return fileStatus;
}

From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java

License:Apache License

private void updateAvgScores(SourceType source, String postTrainOutputPath) throws IOException {
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(postTrainOutputPath, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });

        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                String value = keyValues[1];
                ColumnConfig config = this.columnConfigList.get(Integer.parseInt(key));
                List<Integer> binAvgScores = new ArrayList<Integer>();
                String[] avgScores = value.split(",");
                for (int i = 0; i < avgScores.length; i++) {
                    binAvgScores.add(Integer.parseInt(avgScores[i]));
                }
                config.setBinAvgScore(binAvgScores);
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
}