Example usage for org.apache.hadoop.fs.PathFilter

Introduction

On this page you can find example usages of org.apache.hadoop.fs.PathFilter.

Prototype

PathFilter

Source Link

Usage

From source file:org.apache.pig.tez.TestTezAutoParallelism.java

License:Apache License

/**
 * Stores the result of a group-by (parallel 3) followed by an order-by and
 * checks that the output directory contains exactly four {@code part*} files.
 */
@Test
public void testOrderbyIncreaseParallelism() throws IOException {
    // order by parallelism is 3 originally, increase to 4
    pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true");
    pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000");
    pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
            "1000");
    pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);");
    pigServer.registerQuery("B = group A by name parallel 3;");
    pigServer.registerQuery("C = foreach B generate group as name, AVG(A.age) as age;");
    pigServer.registerQuery("D = order C by age;");
    pigServer.store("D", "output3");
    FileSystem fs = cluster.getFileSystem();
    // Only count the part files; anything else in the output directory is ignored.
    FileStatus[] files = fs.listStatus(new Path("output3"), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith("part");
        }
    });
    // JUnit convention is (expected, actual); the reverse produces a misleading failure message.
    assertEquals(4, files.length);
}

From source file:org.apache.pig.tez.TestTezAutoParallelism.java

License:Apache License

/**
 * Stores the result of a skewed join using the default bytes-per-reducer
 * setting and checks that the output directory contains exactly one
 * {@code part*} file.
 */
@Test
public void testSkewedJoinDecreaseParallelism() throws IOException {
    // skewed join parallelism is 4 originally, reduce to 1
    pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true");
    pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000");
    pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
            Long.toString(InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
    pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);");
    pigServer.registerQuery("B = load '" + INPUT_FILE2 + "' as (name:chararray, gender:chararray);");
    pigServer.registerQuery("C = join A by name, B by name using 'skewed';");
    pigServer.store("C", "output4");
    FileSystem fs = cluster.getFileSystem();
    // Only count the part files; anything else in the output directory is ignored.
    FileStatus[] files = fs.listStatus(new Path("output4"), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith("part");
        }
    });
    // JUnit convention is (expected, actual); the reverse produces a misleading failure message.
    assertEquals(1, files.length);
}

From source file:org.apache.pig.tez.TestTezAutoParallelism.java

License:Apache License

/**
 * Stores the result of a skewed join with an 80000 bytes-per-reducer setting
 * and checks that the output directory contains exactly five {@code part*}
 * files.
 */
@Test
public void testSkewedJoinIncreaseParallelism() throws IOException {
    // skewed join parallelism is 3 originally, increase to 5
    pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true");
    pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000");
    pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
            "80000");
    pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);");
    pigServer.registerQuery("B = load '" + INPUT_FILE2 + "' as (name:chararray, gender:chararray);");
    pigServer.registerQuery("C = join A by name, B by name using 'skewed';");
    pigServer.store("C", "output5");
    FileSystem fs = cluster.getFileSystem();
    // Only count the part files; anything else in the output directory is ignored.
    FileStatus[] files = fs.listStatus(new Path("output5"), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith("part");
        }
    });
    // JUnit convention is (expected, actual); the reverse produces a misleading failure message.
    assertEquals(5, files.length);
}

From source file:org.apache.ranger.plugin.store.file.BaseFileStore.java

License:Apache License

/**
 * Loads one object per matching file found directly under {@code dirPath}.
 *
 * <p>A file matches when its name starts with {@code filePrefix} and ends with
 * {@code FILE_SUFFIX_JSON}. Each match is handed to {@code loadFromFile}; a
 * {@code null} result is skipped. If {@code dirPath} does not exist or is not a
 * directory an error is logged and an empty list is returned. An
 * {@link IOException} raised while listing or loading is logged as a warning
 * and whatever was loaded up to that point is returned.
 *
 * @param dirPath    directory to scan
 * @param filePrefix required file-name prefix
 * @param cls        type passed through to {@code loadFromFile}
 * @return the loaded objects; never {@code null}
 * @throws Exception any non-IO exception propagated from the helpers used here
 */
protected <T> List<T> loadFromDir(Path dirPath, final String filePrefix, Class<T> cls) throws Exception {
    if (LOG.isDebugEnabled()) {
        LOG.debug("==> BaseFileStore.loadFromDir()");
    }

    List<T> ret = new ArrayList<T>();

    try {
        FileSystem fileSystem = getFileSystem(dirPath);

        if (fileSystem.exists(dirPath) && fileSystem.isDirectory(dirPath)) {
            PathFilter filter = new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    String name = path.getName();

                    return name.startsWith(filePrefix) && name.endsWith(FILE_SUFFIX_JSON);
                }
            };

            FileStatus[] sdFiles = fileSystem.listStatus(dirPath, filter);

            if (sdFiles != null) {
                for (FileStatus sdFile : sdFiles) {
                    T obj = loadFromFile(sdFile.getPath(), cls);

                    if (obj != null) {
                        ret.add(obj);
                    }
                }
            }
        } else {
            LOG.error(dirPath + ": does not exists or not a directory");
        }
    } catch (IOException excp) {
        // Best effort: keep whatever was loaded before the failure.
        LOG.warn("error loading service-def in directory " + dirPath, excp);
    }

    if (LOG.isDebugEnabled()) {
        // ret is always assigned above, so no null check is needed here.
        LOG.debug("<== BaseFileStore.loadFromDir(): count=" + ret.size());
    }

    return ret;
}

From source file:org.apache.ranger.plugin.store.file.FileStoreUtil.java

License:Apache License

/**
 * Loads one object per matching file found directly under {@code dirPath}.
 *
 * <p>A file matches when its name starts with {@code filePrefix} and ends with
 * {@code FILE_SUFFIX_JSON}. Each match is handed to {@code loadFromFile}; a
 * {@code null} result is skipped. If {@code dirPath} does not exist or is not a
 * directory an error is logged and an empty list is returned. An
 * {@link IOException} raised while listing or loading is logged as a warning
 * and whatever was loaded up to that point is returned.
 *
 * @param dirPath    directory to scan
 * @param filePrefix required file-name prefix
 * @param cls        type passed through to {@code loadFromFile}
 * @return the loaded objects; never {@code null}
 * @throws Exception any non-IO exception propagated from the helpers used here
 */
public <T> List<T> loadFromDir(Path dirPath, final String filePrefix, Class<T> cls) throws Exception {
    if (LOG.isDebugEnabled()) {
        LOG.debug("==> FileStoreUtil.loadFromDir()");
    }

    List<T> ret = new ArrayList<T>();

    try {
        FileSystem fileSystem = getFileSystem(dirPath);

        if (fileSystem.exists(dirPath) && fileSystem.isDirectory(dirPath)) {
            PathFilter filter = new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    String name = path.getName();

                    return name.startsWith(filePrefix) && name.endsWith(FILE_SUFFIX_JSON);
                }
            };

            FileStatus[] sdFiles = fileSystem.listStatus(dirPath, filter);

            if (sdFiles != null) {
                for (FileStatus sdFile : sdFiles) {
                    T obj = loadFromFile(sdFile.getPath(), cls);

                    if (obj != null) {
                        ret.add(obj);
                    }
                }
            }
        } else {
            LOG.error(dirPath + ": does not exists or not a directory");
        }
    } catch (IOException excp) {
        // Best effort: keep whatever was loaded before the failure.
        LOG.warn("error loading service-def in directory " + dirPath, excp);
    }

    if (LOG.isDebugEnabled()) {
        // ret is always assigned above, so no null check is needed here.
        LOG.debug("<== FileStoreUtil.loadFromDir(): count=" + ret.size());
    }

    return ret;
}

From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java

License:Apache License

/**
 * Set up the MapReduce job to use file inputs from previous iterations.
 *
 * <p>For every iteration from 1 up to the current one, the cleaned output
 * directory is preferred and the temporary (intermediate) directory is used as
 * a fallback. Non-empty files found there are registered as inputs: files whose
 * name starts with {@code MRReasoningUtils.INCONSISTENT_OUT} go to
 * {@code incMapper} (and are skipped when it is {@code null}); all others go to
 * {@code fileMapper}. Min/max split sizes are set to one and eight block sizes
 * respectively unless already configured.
 *
 * @param   fileMapper  Mapper class for generated triples
 * @param   incMapper   Mapper class for generated inconsistencies
 * @param   filter      Exclude facts that aren't helpful for inference
 * @throws  IOException if the file system cannot be accessed
 */
protected void configureFileInput(Class<? extends Mapper<Fact, NullWritable, ?, ?>> fileMapper,
        Class<? extends Mapper<Derivation, NullWritable, ?, ?>> incMapper, final boolean filter)
        throws IOException {
    // Set up file input for all iterations up to this one
    Configuration conf = job.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    // Set min/max split, if not already provided.
    // dfs.blocksize may be absent from the configuration or written with a size
    // suffix (e.g. "128m"); Long.parseLong(conf.get(...)) throws in both cases,
    // so read it with getLongBytes and fall back to 128 MB.
    final long fallbackBlockSize = 128L * 1024 * 1024;
    long blocksize = conf.getLongBytes("dfs.blocksize", fallbackBlockSize);
    String minSplitProp = "mapreduce.input.fileinputformat.split.minsize";
    String maxSplitProp = "mapreduce.input.fileinputformat.split.maxsize";
    conf.set(minSplitProp, conf.get(minSplitProp, String.valueOf(blocksize)));
    conf.set(maxSplitProp, conf.get(maxSplitProp, String.valueOf(blocksize * 8)));
    // Never include debug output. If filter is true, select only
    // intermediate and schema data, otherwise include everything.
    PathFilter f = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String s = path.getName();
            if (s.startsWith(MRReasoningUtils.DEBUG_OUT)) {
                return false;
            }
            return !filter || s.startsWith(MRReasoningUtils.INTERMEDIATE_OUT)
                    || s.startsWith(MRReasoningUtils.SCHEMA_OUT);
        }
    };
    for (int i = 1; i <= iteration; i++) {
        // Prefer cleaned output...
        Path inputPath = MRReasoningUtils.getOutputPath(conf, MRReasoningUtils.OUTPUT_BASE + i);
        // But if there isn't any, try intermediate data:
        if (!fs.isDirectory(inputPath)) {
            inputPath = MRReasoningUtils.getOutputPath(conf,
                    MRReasoningUtils.OUTPUT_BASE + i + MRReasoningUtils.TEMP_SUFFIX);
        }
        // And only proceed if we found one or the other.
        if (!fs.isDirectory(inputPath)) {
            continue;
        }
        for (FileStatus status : fs.listStatus(inputPath, f)) {
            // Empty files contribute nothing and are skipped.
            if (status.getLen() <= 0) {
                continue;
            }
            Path p = status.getPath();
            if (p.getName().startsWith(MRReasoningUtils.INCONSISTENT_OUT)) {
                if (incMapper != null) {
                    MultipleInputs.addInputPath(job, p, CombineSequenceFileInputFormat.class, incMapper);
                }
            } else {
                MultipleInputs.addInputPath(job, p, CombineSequenceFileInputFormat.class, fileMapper);
            }
        }
    }
}

From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java

License:Apache License

/**
 * Writes the complete list of input files to {@code fullInputList} (UTF-8, one
 * entry per line) and returns how many entries were written.
 *
 * <p>Entries come from two sources: each existing path in {@code inputFiles}
 * is expanded via {@code addInputFilesRecursively} (names starting with "."
 * are filtered out), and each list in {@code inputLists} is copied line by
 * line. A list named "-" is read from standard input, an absolute path without
 * scheme/authority is read from the local disk, and anything else is opened
 * through its Hadoop file system.
 *
 * @return the number of entries written
 * @throws IOException if reading a list or writing the output fails
 */
private long addInputFiles(List<Path> inputFiles, List<Path> inputLists, Path fullInputList, Configuration conf)
        throws IOException {

    // ignore "hidden" files and dirs
    final PathFilter visibleOnly = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith(".");
        }
    };

    long count = 0;
    FSDataOutputStream out = fullInputList.getFileSystem(conf).create(fullInputList);
    try {
        Writer writer = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));

        // Phase 1: expand explicitly given files and directories.
        for (Path inputFile : inputFiles) {
            FileSystem sourceFs = inputFile.getFileSystem(conf);
            if (!sourceFs.exists(inputFile)) {
                continue;
            }
            count += addInputFilesRecursively(inputFile, writer, sourceFs, visibleOnly);
        }

        // Phase 2: append the contents of every list file verbatim.
        for (Path inputList : inputLists) {
            String listName = inputList.toString();
            InputStream listStream;
            if (listName.equals("-")) {
                listStream = System.in;
            } else if (inputList.isAbsoluteAndSchemeAuthorityNull()) {
                listStream = new BufferedInputStream(new FileInputStream(listName));
            } else {
                listStream = inputList.getFileSystem(conf).open(inputList);
            }
            try {
                BufferedReader reader = new BufferedReader(new InputStreamReader(listStream, "UTF-8"));
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    writer.write(line + "\n");
                    count++;
                }
                reader.close();
            } finally {
                listStream.close();
            }
        }

        writer.close();
    } finally {
        out.close();
    }
    return count;
}

From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java

License:Apache License

/**
 * Lists the entries of {@code outputReduceDir} whose name starts with the
 * job's output name and returns them in alphanumeric name order.
 *
 * @param job             job whose output name (via {@code SolrOutputFormat}) is the required prefix
 * @param outputReduceDir directory to list
 * @param fs              file system used for the listing
 * @return the matching directories, sorted by name
 * @throws IllegalStateException if a matching entry is not a directory
 */
private static FileStatus[] listSortedOutputShardDirs(Job job, Path outputReduceDir, FileSystem fs)
        throws FileNotFoundException, IOException {

    final String dirPrefix = SolrOutputFormat.getOutputName(job);
    FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dirPrefix);
        }
    });
    for (FileStatus dir : dirs) {
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Not a directory: " + dir.getPath());
        }
    }

    // use alphanumeric sort (rather than lexicographical sort) to properly handle more than 99999 shards
    // One comparator instance is shared by all comparisons instead of allocating one per call.
    // NOTE(review): assumes ForkedAlphaNumericComparator keeps no per-comparison state - confirm.
    final ForkedAlphaNumericComparator nameComparator = new ForkedAlphaNumericComparator();
    Arrays.sort(dirs, new Comparator<FileStatus>() {
        @Override
        public int compare(FileStatus f1, FileStatus f2) {
            return nameComparator.compare(f1.getPath().getName(), f2.getPath().getName());
        }
    });

    return dirs;
}

From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java

License:Apache License

/**
 * Renames the shard directories under {@code outputTreeMergeStep} according to
 * the shard number stored inside each of them.
 *
 * <p>Works in two passes over the directories whose name starts with the job's
 * output name. First every directory is renamed to {@code "_" + name}. Then,
 * for each one, the {@code ForkedTreeMergeMapper.SOLR_SHARD_NUMBER} file is
 * read, parsed as an integer and deleted, and the directory is renamed to
 * {@code <prefix>-m-<shard number, at least 5 digits>}.
 * NOTE(review): the initial "_" rename presumably avoids collisions between old
 * and final names - confirm.
 *
 * @return {@code true} if every step succeeded; {@code false} as soon as a
 *         {@code rename} or {@code delete} helper call reports failure
 * @throws IllegalStateException if a matching entry is not a directory
 * @throws IOException           if listing or reading fails
 */
private static boolean renameTreeMergeShardDirs(Path outputTreeMergeStep, Job job, FileSystem fs)
        throws IOException {
    final String dirPrefix = SolrOutputFormat.getOutputName(job);
    FileStatus[] dirs = fs.listStatus(outputTreeMergeStep, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dirPrefix);
        }
    });

    for (FileStatus dir : dirs) {
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Not a directory: " + dir.getPath());
        }
    }

    // Pass 1: move every shard directory to its "_"-prefixed name.
    for (FileStatus dir : dirs) {
        Path path = dir.getPath();
        Path renamedPath = new Path(path.getParent(), "_" + path.getName());
        if (!rename(path, renamedPath, fs)) {
            return false;
        }
    }

    // see FileOutputFormat.NUMBER_FORMAT
    NumberFormat numberFormat = NumberFormat.getInstance();
    numberFormat.setMinimumIntegerDigits(5);
    numberFormat.setGroupingUsed(false);

    // Pass 2: derive the final name from the shard number file and rename again.
    for (FileStatus dir : dirs) {
        Path path = dir.getPath();
        Path renamedPath = new Path(path.getParent(), "_" + path.getName());

        Path solrShardNumberFile = new Path(renamedPath, ForkedTreeMergeMapper.SOLR_SHARD_NUMBER);
        byte[] bytes;
        InputStream in = fs.open(solrShardNumberFile);
        try {
            bytes = ByteStreams.toByteArray(in);
        } finally {
            // Close even when reading fails so the stream is not leaked.
            in.close();
        }
        Preconditions.checkArgument(bytes.length > 0);
        int solrShard = Integer.parseInt(new String(bytes, Charsets.UTF_8));
        if (!delete(solrShardNumberFile, false, fs)) {
            return false;
        }

        Path finalPath = new Path(renamedPath.getParent(), dirPrefix + "-m-" + numberFormat.format(solrShard));

        LOG.info("MTree merge renaming solr shard: " + solrShard + " from dir: " + dir.getPath() + " to dir: "
                + finalPath);
        if (!rename(renamedPath, finalPath, fs)) {
            return false;
        }
    }
    return true;
}

From source file:org.apache.solr.hadoop.MapReduceIndexerTool.java

License:Apache License

/**
 * Lists the entries of {@code outputReduceDir} whose name starts with the
 * job's output name and returns them sorted by name, comparing embedded digit
 * runs by numeric value.
 *
 * <p>A plain lexicographical sort would place a six-digit shard number such as
 * {@code 100000} before {@code 99999}; the digit-aware ordering keeps shard
 * directories in numeric order once the number outgrows its zero padding.
 *
 * @param outputReduceDir directory to list
 * @param fs              file system used for the listing
 * @return the matching directories, sorted by name
 * @throws IllegalStateException if a matching entry is not a directory
 */
private FileStatus[] listSortedOutputShardDirs(Path outputReduceDir, FileSystem fs)
        throws FileNotFoundException, IOException {

    final String dirPrefix = SolrOutputFormat.getOutputName(job);
    FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dirPrefix);
        }
    });
    for (FileStatus dir : dirs) {
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Not a directory: " + dir.getPath());
        }
    }
    // Numeric-aware sort so that more than 99999 shards are ordered correctly.
    Arrays.sort(dirs, new java.util.Comparator<FileStatus>() {
        @Override
        public int compare(FileStatus f1, FileStatus f2) {
            return compareShardDirNames(f1.getPath().getName(), f2.getPath().getName());
        }
    });
    return dirs;
}

/**
 * Compares two names character by character, except that runs of ASCII digits
 * starting at the same position are compared by numeric value (leading zeros
 * ignored; on equal value, fewer leading zeros sorts first). Returns 0 only for
 * identical strings.
 */
private static int compareShardDirNames(String left, String right) {
    final int leftLen = left.length();
    final int rightLen = right.length();
    int i = 0;
    int j = 0;
    while (i < leftLen && j < rightLen) {
        char lc = left.charAt(i);
        char rc = right.charAt(j);
        if (isAsciiDigit(lc) && isAsciiDigit(rc)) {
            // Skip leading zeros, then measure the significant digits of each run.
            int leftStart = i;
            while (leftStart < leftLen && left.charAt(leftStart) == '0') {
                leftStart++;
            }
            int rightStart = j;
            while (rightStart < rightLen && right.charAt(rightStart) == '0') {
                rightStart++;
            }
            int leftEnd = leftStart;
            while (leftEnd < leftLen && isAsciiDigit(left.charAt(leftEnd))) {
                leftEnd++;
            }
            int rightEnd = rightStart;
            while (rightEnd < rightLen && isAsciiDigit(right.charAt(rightEnd))) {
                rightEnd++;
            }
            int leftDigits = leftEnd - leftStart;
            int rightDigits = rightEnd - rightStart;
            // More significant digits means a larger number.
            if (leftDigits != rightDigits) {
                return leftDigits < rightDigits ? -1 : 1;
            }
            for (int k = 0; k < leftDigits; k++) {
                int diff = left.charAt(leftStart + k) - right.charAt(rightStart + k);
                if (diff != 0) {
                    return diff;
                }
            }
            // Same numeric value: break the tie on the amount of zero padding.
            int leftZeros = leftStart - i;
            int rightZeros = rightStart - j;
            if (leftZeros != rightZeros) {
                return leftZeros < rightZeros ? -1 : 1;
            }
            i = leftEnd;
            j = rightEnd;
        } else {
            if (lc != rc) {
                return lc - rc;
            }
            i++;
            j++;
        }
    }
    // One name is a prefix of the other: the shorter one sorts first.
    return (leftLen - i) - (rightLen - j);
}

/** Returns whether {@code c} is one of the characters '0' through '9'. */
private static boolean isAsciiDigit(char c) {
    return c >= '0' && c <= '9';
}