List of usage examples for org.apache.hadoop.fs PathFilter PathFilter
PathFilter
From source file:org.apache.pig.tez.TestTezAutoParallelism.java
License:Apache License
@Test public void testOrderbyIncreaseParallelism() throws IOException { // order by parallelism is 3 originally, increase to 4 pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true"); pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000"); pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM, "1000"); pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);"); pigServer.registerQuery("B = group A by name parallel 3;"); pigServer.registerQuery("C = foreach B generate group as name, AVG(A.age) as age;"); pigServer.registerQuery("D = order C by age;"); pigServer.store("D", "output3"); FileSystem fs = cluster.getFileSystem(); FileStatus[] files = fs.listStatus(new Path("output3"), new PathFilter() { public boolean accept(Path path) { if (path.getName().startsWith("part")) { return true; }//from w w w. ja v a 2 s.c o m return false; } }); assertEquals(files.length, 4); }
From source file:org.apache.pig.tez.TestTezAutoParallelism.java
License:Apache License
@Test public void testSkewedJoinDecreaseParallelism() throws IOException { // skewed join parallelism is 4 originally, reduce to 1 pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true"); pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000"); pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM, Long.toString(InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER)); pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);"); pigServer.registerQuery("B = load '" + INPUT_FILE2 + "' as (name:chararray, gender:chararray);"); pigServer.registerQuery("C = join A by name, B by name using 'skewed';"); pigServer.store("C", "output4"); FileSystem fs = cluster.getFileSystem(); FileStatus[] files = fs.listStatus(new Path("output4"), new PathFilter() { public boolean accept(Path path) { if (path.getName().startsWith("part")) { return true; }/*from w ww . j a v a 2 s .c o m*/ return false; } }); assertEquals(files.length, 1); }
From source file:org.apache.pig.tez.TestTezAutoParallelism.java
License:Apache License
@Test public void testSkewedJoinIncreaseParallelism() throws IOException { // skewed join parallelism is 3 originally, increase to 5 pigServer.getPigContext().getProperties().setProperty(PigConfiguration.PIG_NO_SPLIT_COMBINATION, "true"); pigServer.getPigContext().getProperties().setProperty(MRConfiguration.MAX_SPLIT_SIZE, "3000"); pigServer.getPigContext().getProperties().setProperty(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM, "80000"); pigServer.registerQuery("A = load '" + INPUT_FILE1 + "' as (name:chararray, age:int);"); pigServer.registerQuery("B = load '" + INPUT_FILE2 + "' as (name:chararray, gender:chararray);"); pigServer.registerQuery("C = join A by name, B by name using 'skewed';"); pigServer.store("C", "output5"); FileSystem fs = cluster.getFileSystem(); FileStatus[] files = fs.listStatus(new Path("output5"), new PathFilter() { public boolean accept(Path path) { if (path.getName().startsWith("part")) { return true; }//from www . j a v a 2 s .c o m return false; } }); assertEquals(files.length, 5); }
From source file:org.apache.ranger.plugin.store.file.BaseFileStore.java
License:Apache License
/**
 * Loads one object per matching file found directly under {@code dirPath}.
 * <p>
 * A file matches when its name starts with {@code filePrefix} and ends with
 * {@code FILE_SUFFIX_JSON}. Files for which {@code loadFromFile} returns
 * {@code null} are skipped. If {@code dirPath} does not exist or is not a
 * directory an error is logged and an empty list is returned. An
 * {@link IOException} raised while listing or loading is logged as a warning and
 * whatever was loaded up to that point is returned.
 *
 * @param dirPath    directory to scan
 * @param filePrefix required file-name prefix
 * @param cls        type handed to {@code loadFromFile} for each matching file
 * @return the loaded objects; never {@code null}, possibly empty
 * @throws Exception for any non-{@link IOException} failure from the helpers used here
 */
protected <T> List<T> loadFromDir(Path dirPath, final String filePrefix, Class<T> cls) throws Exception {
    if (LOG.isDebugEnabled()) {
        LOG.debug("==> BaseFileStore.loadFromDir()");
    }

    List<T> ret = new ArrayList<T>();

    try {
        FileSystem fileSystem = getFileSystem(dirPath);

        if (fileSystem.exists(dirPath) && fileSystem.isDirectory(dirPath)) {
            PathFilter filter = new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    String name = path.getName();

                    return name.startsWith(filePrefix) && name.endsWith(FILE_SUFFIX_JSON);
                }
            };

            FileStatus[] sdFiles = fileSystem.listStatus(dirPath, filter);

            if (sdFiles != null) {
                for (FileStatus sdFile : sdFiles) {
                    T obj = loadFromFile(sdFile.getPath(), cls);

                    if (obj != null) {
                        ret.add(obj);
                    }
                }
            }
        } else {
            LOG.error(dirPath + ": does not exists or not a directory");
        }
    } catch (IOException excp) {
        // Best effort: report the problem and return what has been loaded so far.
        LOG.warn("error loading service-def in directory " + dirPath, excp);
    }

    if (LOG.isDebugEnabled()) {
        // ret is assigned once above and never null, so no null guard is needed here.
        LOG.debug("<== BaseFileStore.loadFromDir(): count=" + ret.size());
    }

    return ret;
}
From source file:org.apache.ranger.plugin.store.file.FileStoreUtil.java
License:Apache License
public <T> List<T> loadFromDir(Path dirPath, final String filePrefix, Class<T> cls) throws Exception { if (LOG.isDebugEnabled()) { LOG.debug("==> FileStoreUtil.loadFromDir()"); }//from w ww. ja v a 2 s . c om List<T> ret = new ArrayList<T>(); try { FileSystem fileSystem = getFileSystem(dirPath); if (fileSystem.exists(dirPath) && fileSystem.isDirectory(dirPath)) { PathFilter filter = new PathFilter() { @Override public boolean accept(Path path) { return path.getName().startsWith(filePrefix) && path.getName().endsWith(FILE_SUFFIX_JSON); } }; FileStatus[] sdFiles = fileSystem.listStatus(dirPath, filter); if (sdFiles != null) { for (FileStatus sdFile : sdFiles) { T obj = loadFromFile(sdFile.getPath(), cls); if (obj != null) { ret.add(obj); } } } } else { LOG.error(dirPath + ": does not exists or not a directory"); } } catch (IOException excp) { LOG.warn("error loading service-def in directory " + dirPath, excp); } if (LOG.isDebugEnabled()) { LOG.debug("<== FileStoreUtil.loadFromDir(): count=" + (ret == null ? 0 : ret.size())); } return ret; }
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/** * Set up the MapReduce job to use file inputs from previous iterations. * @param fileMapper Mapper class for generated triples * @param incMapper Mapper class for generated inconsistenies * @param filter Exclude facts that aren't helpful for inference */// w w w . j av a 2 s . c o m protected void configureFileInput(Class<? extends Mapper<Fact, NullWritable, ?, ?>> fileMapper, Class<? extends Mapper<Derivation, NullWritable, ?, ?>> incMapper, final boolean filter) throws IOException { // Set up file input for all iterations up to this one Configuration conf = job.getConfiguration(); FileSystem fs = FileSystem.get(conf); Path inputPath; int iteration = MRReasoningUtils.getCurrentIteration(conf); // Set min/max split, if not already provided: long blocksize = Long.parseLong(conf.get("dfs.blocksize")); String minSplitProp = "mapreduce.input.fileinputformat.split.minsize"; String maxSplitProp = "mapreduce.input.fileinputformat.split.maxsize"; conf.set(minSplitProp, conf.get(minSplitProp, String.valueOf(blocksize))); conf.set(maxSplitProp, conf.get(maxSplitProp, String.valueOf(blocksize * 8))); for (int i = 1; i <= iteration; i++) { // Prefer cleaned output... inputPath = MRReasoningUtils.getOutputPath(conf, MRReasoningUtils.OUTPUT_BASE + i); // But if there isn't any, try intermediate data: if (!fs.isDirectory(inputPath)) { inputPath = MRReasoningUtils.getOutputPath(conf, MRReasoningUtils.OUTPUT_BASE + i + MRReasoningUtils.TEMP_SUFFIX); } // And only proceed if we found one or the other. if (fs.isDirectory(inputPath)) { // Never include debug output. If filter is true, select only // intermediate and schema data, otherwise include everything. 
PathFilter f = new PathFilter() { public boolean accept(Path path) { String s = path.getName(); if (s.startsWith(MRReasoningUtils.DEBUG_OUT)) { return false; } else { return !filter || s.startsWith(MRReasoningUtils.INTERMEDIATE_OUT) || s.startsWith(MRReasoningUtils.SCHEMA_OUT); } } }; for (FileStatus status : fs.listStatus(inputPath, f)) { if (status.getLen() > 0) { Path p = status.getPath(); String s = p.getName(); if (s.startsWith(MRReasoningUtils.INCONSISTENT_OUT)) { if (incMapper != null) { MultipleInputs.addInputPath(job, p, CombineSequenceFileInputFormat.class, incMapper); } } else { MultipleInputs.addInputPath(job, status.getPath(), CombineSequenceFileInputFormat.class, fileMapper); } } } } } }
From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java
License:Apache License
private long addInputFiles(List<Path> inputFiles, List<Path> inputLists, Path fullInputList, Configuration conf) throws IOException { long numFiles = 0; FileSystem fs = fullInputList.getFileSystem(conf); FSDataOutputStream out = fs.create(fullInputList); try {/* ww w .j av a 2 s. c om*/ Writer writer = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); for (Path inputFile : inputFiles) { FileSystem inputFileFs = inputFile.getFileSystem(conf); if (inputFileFs.exists(inputFile)) { PathFilter pathFilter = new PathFilter() { @Override public boolean accept(Path path) { return !path.getName().startsWith("."); // ignore "hidden" files and dirs } }; numFiles += addInputFilesRecursively(inputFile, writer, inputFileFs, pathFilter); } } for (Path inputList : inputLists) { InputStream in; if (inputList.toString().equals("-")) { in = System.in; } else if (inputList.isAbsoluteAndSchemeAuthorityNull()) { in = new BufferedInputStream(new FileInputStream(inputList.toString())); } else { in = inputList.getFileSystem(conf).open(inputList); } try { BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8")); String line; while ((line = reader.readLine()) != null) { writer.write(line + "\n"); numFiles++; } reader.close(); } finally { in.close(); } } writer.close(); } finally { out.close(); } return numFiles; }
From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java
License:Apache License
private static FileStatus[] listSortedOutputShardDirs(Job job, Path outputReduceDir, FileSystem fs) throws FileNotFoundException, IOException { final String dirPrefix = SolrOutputFormat.getOutputName(job); FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() { @Override/* w w w .jav a2s.c o m*/ public boolean accept(Path path) { return path.getName().startsWith(dirPrefix); } }); for (FileStatus dir : dirs) { if (!dir.isDirectory()) { throw new IllegalStateException("Not a directory: " + dir.getPath()); } } // use alphanumeric sort (rather than lexicographical sort) to properly handle more than 99999 shards Arrays.sort(dirs, new Comparator<FileStatus>() { @Override public int compare(FileStatus f1, FileStatus f2) { return new ForkedAlphaNumericComparator().compare(f1.getPath().getName(), f2.getPath().getName()); } }); return dirs; }
From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java
License:Apache License
private static boolean renameTreeMergeShardDirs(Path outputTreeMergeStep, Job job, FileSystem fs) throws IOException { final String dirPrefix = SolrOutputFormat.getOutputName(job); FileStatus[] dirs = fs.listStatus(outputTreeMergeStep, new PathFilter() { @Override/*from w ww .jav a 2 s . co m*/ public boolean accept(Path path) { return path.getName().startsWith(dirPrefix); } }); for (FileStatus dir : dirs) { if (!dir.isDirectory()) { throw new IllegalStateException("Not a directory: " + dir.getPath()); } } for (FileStatus dir : dirs) { Path path = dir.getPath(); Path renamedPath = new Path(path.getParent(), "_" + path.getName()); if (!rename(path, renamedPath, fs)) { return false; } } for (FileStatus dir : dirs) { Path path = dir.getPath(); Path renamedPath = new Path(path.getParent(), "_" + path.getName()); Path solrShardNumberFile = new Path(renamedPath, ForkedTreeMergeMapper.SOLR_SHARD_NUMBER); InputStream in = fs.open(solrShardNumberFile); byte[] bytes = ByteStreams.toByteArray(in); in.close(); Preconditions.checkArgument(bytes.length > 0); int solrShard = Integer.parseInt(new String(bytes, Charsets.UTF_8)); if (!delete(solrShardNumberFile, false, fs)) { return false; } // see FileOutputFormat.NUMBER_FORMAT NumberFormat numberFormat = NumberFormat.getInstance(); numberFormat.setMinimumIntegerDigits(5); numberFormat.setGroupingUsed(false); Path finalPath = new Path(renamedPath.getParent(), dirPrefix + "-m-" + numberFormat.format(solrShard)); LOG.info("MTree merge renaming solr shard: " + solrShard + " from dir: " + dir.getPath() + " to dir: " + finalPath); if (!rename(renamedPath, finalPath, fs)) { return false; } } return true; }
From source file:org.apache.solr.hadoop.MapReduceIndexerTool.java
License:Apache License
private FileStatus[] listSortedOutputShardDirs(Path outputReduceDir, FileSystem fs) throws FileNotFoundException, IOException { final String dirPrefix = SolrOutputFormat.getOutputName(job); FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() { @Override/*from w w w. j a va 2 s .c o m*/ public boolean accept(Path path) { return path.getName().startsWith(dirPrefix); } }); for (FileStatus dir : dirs) { if (!dir.isDirectory()) { throw new IllegalStateException("Not a directory: " + dir.getPath()); } } Arrays.sort(dirs); // FIXME: handle more than 99999 shards (need numeric sort rather than lexicographical sort) return dirs; }