Example usage for org.apache.hadoop.fs.PathFilter

Introduction

On this page you can find a list of usage examples for org.apache.hadoop.fs.PathFilter, collected from open source projects.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
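
The interface has a single callback, accept(Path), and an implementation is typically passed to FileSystem.listStatus or FileSystem.globStatus to restrict which paths are returned. Before the real-world examples below, here is a minimal, self-contained sketch of that pattern; the directory /tmp/data and the .avro suffix are illustrative assumptions, not taken from any of the source files listed under Usage.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/data"); // illustrative directory, not from the examples below
        FileSystem fs = dir.getFileSystem(conf);

        // Keep only files whose names end in ".avro"; everything else is filtered out.
        FileStatus[] matches = fs.listStatus(dir, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".avro");
            }
        });

        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}

Because accept receives only a Path, any check that needs file attributes (for example, whether the path is a directory) requires an extra getFileStatus call inside the filter, as the getPassDirectoriesFilter example below does.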

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java

License: Apache License

private FileStatus[] getExpiredHoplogs() throws IOException {
    FileStatus files[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All expired hoplogs end with the expired-hoplog extension and must match the valid file regex
            String fileName = file.getName();
            if (!fileName.endsWith(EXPIRED_HOPLOG_EXTENSION)) {
                return false;
            }
            return true;
        }
    });
    return files;
}

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java

License: Apache License

/**
 * Locks the sorted oplogs collection, removes oplogs, and renames them for deletion later.
 * @throws IOException
 */
private void markHoplogsForDeletion() throws IOException {

    ArrayList<IOException> errors = new ArrayList<IOException>();
    FileStatus validHoplogs[] = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex
            Matcher matcher = HOPLOG_PATTERN.matcher(file.getName());
            return matcher.matches();
        }
    });

    FileStatus[] expired = getExpiredHoplogs();
    validHoplogs = filterValidHoplogs(validHoplogs, expired);

    if (validHoplogs == null || validHoplogs.length == 0) {
        return;
    }
    for (FileStatus fileStatus : validHoplogs) {
        try {
            addExpiryMarkerForAFile(getHoplog(fileStatus.getPath()));
        } catch (IOException e) {
            // even if there is an IO error continue removing other hoplogs and
            // notify at the end
            errors.add(e);
        }
    }

    if (!errors.isEmpty()) {
        for (IOException e : errors) {
            logger.warn(LocalizedStrings.HOPLOG_HOPLOG_REMOVE_FAILED, e);
        }
    }
}

From source file: com.grantingersoll.intell.clustering.KMeansClusteringEngine.java

License: Apache License

private static Map<Integer, List<String>> readPoints(Path pointsPathDir, Configuration conf)
        throws IOException {
    Map<Integer, List<String>> result = new TreeMap<Integer, List<String>>();

    FileSystem fs = pointsPathDir.getFileSystem(conf);
    FileStatus[] children = fs.listStatus(pointsPathDir, new PathFilter() {
        public boolean accept(Path path) {
            String name = path.getName();
            return !(name.endsWith(".crc") || name.startsWith("_"));
        }
    });

    for (FileStatus file : children) {
        Path path = file.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            IntWritable key = reader.getKeyClass().asSubclass(IntWritable.class).newInstance();
            WeightedVectorWritable value = reader.getValueClass().asSubclass(WeightedVectorWritable.class)
                    .newInstance();
            while (reader.next(key, value)) {
                //key is the clusterId, value is a list of points
                //String clusterId = value.toString();
                List<String> pointList = result.get(key.get());
                if (pointList == null) {
                    pointList = new ArrayList<String>();
                    result.put(key.get(), pointList);
                }
                //We know we are dealing with named vectors, b/c we generated from the id field
                String name = ((NamedVector) value.getVector()).getName();
                pointList.add(name);
                //value = reader.getValueClass().asSubclass(WeightedVectorWritable.class).newInstance();
            }
        } catch (InstantiationException e) {
            log.error("Exception", e);
        } catch (IllegalAccessException e) {
            log.error("Exception", e);
        }
    }

    return result;
}

From source file: com.hdfs.concat.crush.Crush.java

License: Apache License

/**
 * Returns the output from {@link CrushReducer}. Each reducer writes out a mapping of source files to crush output file.
 */
private List<FileStatus> getOutputMappings() throws IOException {
    FileStatus[] files = fs.listStatus(outDir, new PathFilter() {
        Matcher matcher = Pattern.compile("part-\\d+").matcher("dummy");

        @Override
        public boolean accept(Path path) {
            matcher.reset(path.getName());

            return matcher.matches();
        }
    });

    return asList(files);
}

From source file: com.hdfs.concat.crush.Crush.java

License: Apache License

void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null)
                            return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }

                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());

                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
                     * number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);

                                    assert changed : f;

                                    pathMatcher.reset(f);

                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();

    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();

            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);

                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}

From source file: com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java

License: Open Source License

private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter filter = new PathFilter() {
        public boolean accept(Path file) {
            return file.getName().startsWith("part-");
        }
    };
    FileStatus[] list = fs.listStatus(path, filter);
    for (FileStatus stat : list) {
        fs.delete(stat.getPath(), false);
    }
}

From source file: com.ibm.jaql.io.hadoop.FileOutputConfigurator.java

License: Apache License

public void setSequential(JobConf conf) throws Exception {
    registerSerializers(conf);

    // For an expression, the location is the final file name
    Path outPath = new Path(location);
    FileSystem fs = outPath.getFileSystem(conf);
    outPath = outPath.makeQualified(fs);
    if (fs.exists(outPath)) {
        // TODO: Jaql currently has overwrite semantics; add flag to control this
        if (fs.isFile(outPath)) {
            fs.delete(outPath, false);
        } else {
            // Look for a map-reduce output directory
            FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() {
                boolean onlyOne = true;

                public boolean accept(Path path) {
                    String name = path.getName();
                    if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) {
                        return false;
                    }
                    if (onlyOne) {
                        onlyOne = false;
                        return true;
                    }
                    return false;
                }
            });
            if (nonMR.length > 0) {
                throw new IOException(
                        "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath());
            }
            fs.delete(outPath, true);
        }
    }

    // In sequential mode, we will write directly to the output file
    // and bypass the _temporary directory and rename of the standard 
    // FileOutputCommitter by using our own DirectFileOutputCommitter.
    FileOutputFormat.setOutputPath(conf, outPath.getParent());
    conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class);
}

From source file: com.iflytek.spider.util.HadoopFSUtil.java

License: Apache License

/**
 * Returns a PathFilter that passes all paths through.
 */
public static PathFilter getPassAllFilter() {
    return new PathFilter() {
        public boolean accept(Path arg0) {
            return true;
        }
    };
}

From source file: com.iflytek.spider.util.HadoopFSUtil.java

License: Apache License

/**
 * Returns a PathFilter that passes only directories through.
 */
public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
    return new PathFilter() {
        public boolean accept(final Path path) {
            try {
                return fs.getFileStatus(path).isDir();
            } catch (IOException ioe) {
                return false;
            }
        }

    };
}

From source file: com.inmobi.conduit.AbstractService.java

License: Apache License

private List<Path> listPartFiles(Path path, FileSystem fs) {
    List<Path> matches = new LinkedList<Path>();
    try {
        FileStatus[] statuses = fs.listStatus(path, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus status : statuses) {
            matches.add(status.getPath());
        }
    } catch (IOException e) {
        LOG.error(e.getMessage(), e);
    }
    return matches;
}