Example usage for org.apache.hadoop.fs.PathFilter

Introduction

On this page you can find example usages of org.apache.hadoop.fs.PathFilter, collected from open-source projects. Each example implements PathFilter, usually as an anonymous class, to select which files a Hadoop FileSystem operation should return.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
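
PathFilter declares a single method, accept(Path), which returns true for every path that should be kept. Below is a minimal, hedged sketch of the pattern used throughout the examples on this page; the directory name "/tmp/data" and the "part" prefix are illustrative assumptions, not taken from any example below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws Exception {
        // Default-configured FileSystem; in the examples below it is usually
        // obtained from a test cluster or from a Path instead.
        FileSystem fs = FileSystem.get(new Configuration());
        FileStatus[] parts = fs.listStatus(new Path("/tmp/data"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // Keep only entries whose name starts with "part".
                return path.getName().startsWith("part");
            }
        });
        for (FileStatus status : parts) {
            System.out.println(status.getPath());
        }
    }
}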

Usage

From source file: org.apache.pig.tez.TestTezAutoParallelism.java

License: Apache License

@Test
public void testOrderbyIncreaseParallelism() throws IOException {
    // order by parallelism is 3 originally, increase to 4
    pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true");
    pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000");
    pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
            "1000");
    pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);");
    pigServer.registerQuery("B = group A by name parallel 3;");
    pigServer.registerQuery("C = foreach B generate group as name, AVG(A.age) as age;");
    pigServer.registerQuery("D = order C by age;");
    pigServer.store("D", "output3");
    FileSystem fs = cluster.getFileSystem();
    FileStatus[] files = fs.listStatus(new Path("output3"), new PathFilter() {
        public boolean accept(Path path) {
            if (path.getName().startsWith("part")) {
                return true;
            }
            return false;
        }
    });
    assertEquals(4, files.length);
}

From source file: org.apache.pig.tez.TestTezAutoParallelism.java

License: Apache License

@Test
public void testSkewedJoinDecreaseParallelism() throws IOException {
    // skewed join parallelism is 4 originally, reduce to 1
    pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true");
    pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000");
    pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
            Long.toString(InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
    pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);");
    pigServer.registerQuery("B = load '" + INPUT_FILE2 + "' as (name:chararray, gender:chararray);");
    pigServer.registerQuery("C = join A by name, B by name using 'skewed';");
    pigServer.store("C", "output4");
    FileSystem fs = cluster.getFileSystem();
    FileStatus[] files = fs.listStatus(new Path("output4"), new PathFilter() {
        public boolean accept(Path path) {
            if (path.getName().startsWith("part")) {
                return true;
            }
            return false;
        }
    });
    assertEquals(1, files.length);
}

From source file: org.apache.pig.tez.TestTezAutoParallelism.java

License: Apache License

@Test
public void testSkewedJoinIncreaseParallelism() throws IOException {
    // skewed join parallelism is 3 originally, increase to 5
    pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true");
    pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000");
    pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
            "80000");
    pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);");
    pigServer.registerQuery("B = load '" + INPUT_FILE2 + "' as (name:chararray, gender:chararray);");
    pigServer.registerQuery("C = join A by name, B by name using 'skewed';");
    pigServer.store("C", "output5");
    FileSystem fs = cluster.getFileSystem();
    FileStatus[] files = fs.listStatus(new Path("output5"), new PathFilter() {
        public boolean accept(Path path) {
            if (path.getName().startsWith("part")) {
                return true;
            }
            return false;
        }
    });
    assertEquals(5, files.length);
}

From source file: org.apache.ranger.plugin.store.file.BaseFileStore.java

License: Apache License

protected <T> List<T> loadFromDir(Path dirPath, final String filePrefix, Class<T> cls) throws Exception {
    if (LOG.isDebugEnabled()) {
        LOG.debug("==> BaseFileStore.loadFromDir()");
    }

    List<T> ret = new ArrayList<T>();

    try {
        FileSystem fileSystem = getFileSystem(dirPath);

        if (fileSystem.exists(dirPath) && fileSystem.isDirectory(dirPath)) {
            PathFilter filter = new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path.getName().startsWith(filePrefix) && path.getName().endsWith(FILE_SUFFIX_JSON);
                }
            };

            FileStatus[] sdFiles = fileSystem.listStatus(dirPath, filter);

            if (sdFiles != null) {
                for (FileStatus sdFile : sdFiles) {
                    T obj = loadFromFile(sdFile.getPath(), cls);

                    if (obj != null) {
                        ret.add(obj);
                    }
                }
            }
        } else {
            LOG.error(dirPath + ": does not exist or is not a directory");
        }
    } catch (IOException excp) {
        LOG.warn("error loading service-def in directory " + dirPath, excp);
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("<== BaseFileStore.loadFromDir(): count=" + (ret == null ? 0 : ret.size()));
    }

    return ret;
}

From source file: org.apache.ranger.plugin.store.file.FileStoreUtil.java

License: Apache License

public <T> List<T> loadFromDir(Path dirPath, final String filePrefix, Class<T> cls) throws Exception {
    if (LOG.isDebugEnabled()) {
        LOG.debug("==> FileStoreUtil.loadFromDir()");
    }

    List<T> ret = new ArrayList<T>();

    try {
        FileSystem fileSystem = getFileSystem(dirPath);

        if (fileSystem.exists(dirPath) && fileSystem.isDirectory(dirPath)) {
            PathFilter filter = new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path.getName().startsWith(filePrefix) && path.getName().endsWith(FILE_SUFFIX_JSON);
                }
            };

            FileStatus[] sdFiles = fileSystem.listStatus(dirPath, filter);

            if (sdFiles != null) {
                for (FileStatus sdFile : sdFiles) {
                    T obj = loadFromFile(sdFile.getPath(), cls);

                    if (obj != null) {
                        ret.add(obj);
                    }
                }
            }
        } else {
            LOG.error(dirPath + ": does not exist or is not a directory");
        }
    } catch (IOException excp) {
        LOG.warn("error loading service-def in directory " + dirPath, excp);
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("<== FileStoreUtil.loadFromDir(): count=" + (ret == null ? 0 : ret.size()));
    }

    return ret;
}

From source file: org.apache.rya.reasoning.mr.AbstractReasoningTool.java

License: Apache License

/**
 * Set up the MapReduce job to use file inputs from previous iterations.
 * @param   fileMapper  Mapper class for generated triples
 * @param   incMapper   Mapper class for generated inconsistencies
 * @param   filter      If true, exclude facts that aren't helpful for inference
 */
protected void configureFileInput(Class<? extends Mapper<Fact, NullWritable, ?, ?>> fileMapper,
        Class<? extends Mapper<Derivation, NullWritable, ?, ?>> incMapper, final boolean filter)
        throws IOException {
    // Set up file input for all iterations up to this one
    Configuration conf = job.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Path inputPath;
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    // Set min/max split, if not already provided:
    long blocksize = Long.parseLong(conf.get("dfs.blocksize"));
    String minSplitProp = "mapreduce.input.fileinputformat.split.minsize";
    String maxSplitProp = "mapreduce.input.fileinputformat.split.maxsize";
    conf.set(minSplitProp, conf.get(minSplitProp, String.valueOf(blocksize)));
    conf.set(maxSplitProp, conf.get(maxSplitProp, String.valueOf(blocksize * 8)));
    for (int i = 1; i <= iteration; i++) {
        // Prefer cleaned output...
        inputPath = MRReasoningUtils.getOutputPath(conf, MRReasoningUtils.OUTPUT_BASE + i);
        // But if there isn't any, try intermediate data:
        if (!fs.isDirectory(inputPath)) {
            inputPath = MRReasoningUtils.getOutputPath(conf,
                    MRReasoningUtils.OUTPUT_BASE + i + MRReasoningUtils.TEMP_SUFFIX);
        }
        // And only proceed if we found one or the other.
        if (fs.isDirectory(inputPath)) {
            // Never include debug output. If filter is true, select only
            // intermediate and schema data, otherwise include everything.
            PathFilter f = new PathFilter() {
                public boolean accept(Path path) {
                    String s = path.getName();
                    if (s.startsWith(MRReasoningUtils.DEBUG_OUT)) {
                        return false;
                    } else {
                        return !filter || s.startsWith(MRReasoningUtils.INTERMEDIATE_OUT)
                                || s.startsWith(MRReasoningUtils.SCHEMA_OUT);
                    }
                }
            };
            for (FileStatus status : fs.listStatus(inputPath, f)) {
                if (status.getLen() > 0) {
                    Path p = status.getPath();
                    String s = p.getName();
                    if (s.startsWith(MRReasoningUtils.INCONSISTENT_OUT)) {
                        if (incMapper != null) {
                            MultipleInputs.addInputPath(job, p, CombineSequenceFileInputFormat.class,
                                    incMapper);
                        }
                    } else {
                        MultipleInputs.addInputPath(job, status.getPath(), CombineSequenceFileInputFormat.class,
                                fileMapper);
                    }
                }
            }
        }
    }
}

From source file: org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java

License: Apache License

private long addInputFiles(List<Path> inputFiles, List<Path> inputLists, Path fullInputList, Configuration conf)
        throws IOException {

    long numFiles = 0;
    FileSystem fs = fullInputList.getFileSystem(conf);
    FSDataOutputStream out = fs.create(fullInputList);
    try {
        Writer writer = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));

        for (Path inputFile : inputFiles) {
            FileSystem inputFileFs = inputFile.getFileSystem(conf);
            if (inputFileFs.exists(inputFile)) {
                PathFilter pathFilter = new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        return !path.getName().startsWith("."); // ignore "hidden" files and dirs
                    }
                };
                numFiles += addInputFilesRecursively(inputFile, writer, inputFileFs, pathFilter);
            }
        }

        for (Path inputList : inputLists) {
            InputStream in;
            if (inputList.toString().equals("-")) {
                in = System.in;
            } else if (inputList.isAbsoluteAndSchemeAuthorityNull()) {
                in = new BufferedInputStream(new FileInputStream(inputList.toString()));
            } else {
                in = inputList.getFileSystem(conf).open(inputList);
            }
            try {
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    writer.write(line + "\n");
                    numFiles++;
                }
                reader.close();
            } finally {
                in.close();
            }
        }

        writer.close();
    } finally {
        out.close();
    }
    return numFiles;
}

From source file: org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java

License: Apache License

private static FileStatus[] listSortedOutputShardDirs(Job job, Path outputReduceDir, FileSystem fs)
        throws FileNotFoundException, IOException {

    final String dirPrefix = SolrOutputFormat.getOutputName(job);
    FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dirPrefix);
        }
    });
    for (FileStatus dir : dirs) {
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Not a directory: " + dir.getPath());
        }
    }

    // use alphanumeric sort (rather than lexicographical sort) to properly handle more than 99999 shards
    Arrays.sort(dirs, new Comparator<FileStatus>() {
        @Override
        public int compare(FileStatus f1, FileStatus f2) {
            return new ForkedAlphaNumericComparator().compare(f1.getPath().getName(), f2.getPath().getName());
        }
    });

    return dirs;
}

From source file: org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java

License: Apache License

private static boolean renameTreeMergeShardDirs(Path outputTreeMergeStep, Job job, FileSystem fs)
        throws IOException {
    final String dirPrefix = SolrOutputFormat.getOutputName(job);
    FileStatus[] dirs = fs.listStatus(outputTreeMergeStep, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dirPrefix);
        }
    });

    for (FileStatus dir : dirs) {
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Not a directory: " + dir.getPath());
        }
    }

    for (FileStatus dir : dirs) {
        Path path = dir.getPath();
        Path renamedPath = new Path(path.getParent(), "_" + path.getName());
        if (!rename(path, renamedPath, fs)) {
            return false;
        }
    }

    for (FileStatus dir : dirs) {
        Path path = dir.getPath();
        Path renamedPath = new Path(path.getParent(), "_" + path.getName());

        Path solrShardNumberFile = new Path(renamedPath, ForkedTreeMergeMapper.SOLR_SHARD_NUMBER);
        InputStream in = fs.open(solrShardNumberFile);
        byte[] bytes = ByteStreams.toByteArray(in);
        in.close();
        Preconditions.checkArgument(bytes.length > 0);
        int solrShard = Integer.parseInt(new String(bytes, Charsets.UTF_8));
        if (!delete(solrShardNumberFile, false, fs)) {
            return false;
        }

        // see FileOutputFormat.NUMBER_FORMAT
        NumberFormat numberFormat = NumberFormat.getInstance();
        numberFormat.setMinimumIntegerDigits(5);
        numberFormat.setGroupingUsed(false);
        Path finalPath = new Path(renamedPath.getParent(), dirPrefix + "-m-" + numberFormat.format(solrShard));

        LOG.info("MTree merge renaming solr shard: " + solrShard + " from dir: " + dir.getPath() + " to dir: "
                + finalPath);
        if (!rename(renamedPath, finalPath, fs)) {
            return false;
        }
    }
    return true;
}

From source file: org.apache.solr.hadoop.MapReduceIndexerTool.java

License: Apache License

private FileStatus[] listSortedOutputShardDirs(Path outputReduceDir, FileSystem fs)
        throws FileNotFoundException, IOException {

    final String dirPrefix = SolrOutputFormat.getOutputName(job);
    FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dirPrefix);
        }
    });
    for (FileStatus dir : dirs) {
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Not a directory: " + dir.getPath());
        }
    }
    Arrays.sort(dirs); // FIXME: handle more than 99999 shards (need numeric sort rather than lexicographical sort)
    return dirs;
}
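
Since PathFilter has a single abstract method, it can serve as a lambda target, so on Java 8 and later the anonymous classes above can equivalently be written as lambdas. A hedged sketch of the same "part" prefix filter in lambda form (the "output" directory name is an illustrative assumption):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LambdaFilterSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Same "part" prefix filter as the examples above, written as a lambda.
        FileStatus[] files = fs.listStatus(new Path("output"),
                path -> path.getName().startsWith("part"));
        for (FileStatus status : files) {
            System.out.println(status.getPath());
        }
    }
}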