List of usage examples for org.apache.hadoop.fs.PathFilter
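PathFilter is a single-method interface: an implementation receives each candidate Path and returns true to keep it, and it is most often passed to FileSystem.listStatus(Path, PathFilter) to prune a directory listing. Before the real-world examples, here is a minimal, self-contained sketch (not drawn from any of the source files below; the directory /tmp/example-input is a placeholder):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // Keep only visible files: skip Hadoop metadata entries such as
        // "_SUCCESS" and "_temporary", and hidden "." files -- the same
        // convention several of the examples below rely on.
        PathFilter visible = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                return !p.getName().startsWith("_") && !p.getName().startsWith(".");
            }
        };
        // /tmp/example-input is a placeholder path for illustration only.
        for (FileStatus status : fs.listStatus(new Path("/tmp/example-input"), visible)) {
            System.out.println(status.getPath());
        }
    }
}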
From source file:edu.nyu.vida.data_polygamy.utils.FrameworkUtils.java
License:BSD License
public static String searchPreProcessing(final String name, Configuration conf, boolean s3) throws IOException {
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            if (arg0.getName().contains(name + "-"))
                return true;
            return false;
        }
    };

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(conf.get("bucket") + preProcessingDir);
        fs = FileSystem.get(path.toUri(), conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + preProcessingDir);
    }

    FileStatus[] status = fs.listStatus(path, filter);
    if (s3)
        fs.close();

    String preProcessingFile = null;
    boolean aggregatesFound = false;
    String fileName = "";
    for (FileStatus fileStatus : status) {
        fileName = fileStatus.getPath().getName();
        if (!fileName.endsWith(".aggregates"))
            preProcessingFile = fileName;
        else if (fileName.endsWith(".aggregates"))
            aggregatesFound = true;
    }

    if (!aggregatesFound)
        return null;
    return preProcessingFile;
}
From source file:edu.nyu.vida.data_polygamy.utils.FrameworkUtils.java
License:BSD License
public static String searchAggregatesHeader(final String name, Configuration conf, boolean s3) throws IOException {
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            if (arg0.getName().contains(name + "-"))
                return true;
            return false;
        }
    };

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(conf.get("bucket") + preProcessingDir);
        fs = FileSystem.get(path.toUri(), conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + preProcessingDir);
    }

    FileStatus[] status = fs.listStatus(path, filter);
    if (s3)
        fs.close();

    String fileName = "";
    for (FileStatus fileStatus : status) {
        fileName = fileStatus.getPath().getName();
        if (fileName.endsWith(".aggregates"))
            return fileName;
    }

    return null;
}
From source file:edu.nyu.vida.data_polygamy.utils.FrameworkUtils.java
License:BSD License
public static String[] searchAggregates(final String name, Configuration conf, boolean s3) throws IOException {
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            if (arg0.getName().contains("_SUCCESS"))
                return false;
            return true;
        }
    };

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(conf.get("bucket") + aggregatesDir + "/" + name);
        fs = FileSystem.get(path.toUri(), conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + aggregatesDir + "/" + name);
    }

    FileStatus[] status;
    try {
        status = fs.listStatus(path, filter);
    } catch (FileNotFoundException e) {
        return new String[0];
    }
    if (s3)
        fs.close();

    String[] names = new String[status.length];
    String fileName = "";
    for (int i = 0; i < status.length; i++) {
        fileName = status[i].getPath().getName();
        names[i] = fileName;
    }

    return names;
}
From source file:edu.nyu.vida.data_polygamy.utils.FrameworkUtils.java
License:BSD License
public static String[] searchIndex(final String name, Configuration conf, boolean s3) throws IOException {
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            if (arg0.getName().contains("_SUCCESS"))
                return false;
            return true;
        }
    };

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(conf.get("bucket") + indexDir + "/" + name);
        fs = FileSystem.get(path.toUri(), conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + indexDir + "/" + name);
    }

    FileStatus[] status;
    try {
        status = fs.listStatus(path, filter);
    } catch (FileNotFoundException e) {
        return new String[0];
    }
    if (s3)
        fs.close();

    String[] names = new String[status.length];
    String fileName = "";
    for (int i = 0; i < status.length; i++) {
        fileName = status[i].getPath().getName();
        names[i] = fileName;
    }

    return names;
}
From source file:edu.nyu.vida.data_polygamy.utils.FrameworkUtils.java
License:BSD License
public static String[] searchDataAttributes(final String name, Configuration conf, boolean s3) throws IOException {
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            if (arg0.getName().contains("_SUCCESS"))
                return false;
            return true;
        }
    };

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(conf.get("bucket") + dataAttributesDir + "/" + name);
        fs = FileSystem.get(path.toUri(), conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + dataAttributesDir + "/" + name);
    }

    FileStatus[] status;
    try {
        status = fs.listStatus(path, filter);
    } catch (FileNotFoundException e) {
        return new String[0];
    }
    if (s3)
        fs.close();

    String[] names = new String[status.length];
    String fileName = "";
    for (int i = 0; i < status.length; i++) {
        fileName = status[i].getPath().getName();
        names[i] = fileName;
    }

    return names;
}
From source file:edu.stolaf.cs.wmrserver.JobServiceHandler.java
License:Apache License
public static FileStatus[] listInputFiles(FileSystem fs, Path path) throws IOException {
    if (!fs.isDirectory(path))
        return new FileStatus[] { fs.getFileStatus(path) };
    else {
        // Get all files in directory that are not directories or hidden files
        final FileSystem fsFinal = fs;
        PathFilter filter = new PathFilter() {
            public boolean accept(Path p) {
                try {
                    return !(fsFinal.isDirectory(p) || p.getName().startsWith(".")
                            || p.getName().startsWith("_"));
                } catch (IOException ex) {
                    throw new RuntimeException("Error filtering files.", ex);
                }
            }
        };
        return fs.listStatus(path, filter);
    }
}
From source file:edu.uci.ics.pregelix.dataflow.HDFSFileWriteOperatorDescriptor.java
License:Apache License
@SuppressWarnings("rawtypes") @Override// w w w. j ava2 s . c om public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, final IRecordDescriptorProvider recordDescProvider, final int partition, int nPartitions) throws HyracksDataException { return new AbstractUnaryInputSinkOperatorNodePushable() { private RecordDescriptor rd0; private FrameDeserializer frameDeserializer; private Configuration conf; private VertexWriter vertexWriter; private TaskAttemptContext context; private String TEMP_DIR = "_temporary"; private ClassLoader ctxCL; private ContextFactory ctxFactory = new ContextFactory(); @Override public void open() throws HyracksDataException { rd0 = inputRdFactory == null ? recordDescProvider.getInputRecordDescriptor(getActivityId(), 0) : inputRdFactory.createRecordDescriptor(); frameDeserializer = new FrameDeserializer(ctx.getFrameSize(), rd0); ctxCL = Thread.currentThread().getContextClassLoader(); Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader()); conf = confFactory.createConfiguration(); VertexOutputFormat outputFormat = BspUtils.createVertexOutputFormat(conf); context = ctxFactory.createContext(conf, partition); try { vertexWriter = outputFormat.createVertexWriter(context); } catch (InterruptedException e) { throw new HyracksDataException(e); } catch (IOException e) { throw new HyracksDataException(e); } } @SuppressWarnings("unchecked") @Override public void nextFrame(ByteBuffer frame) throws HyracksDataException { frameDeserializer.reset(frame); try { while (!frameDeserializer.done()) { Object[] tuple = frameDeserializer.deserializeRecord(); Vertex value = (Vertex) tuple[1]; vertexWriter.writeVertex(value); } } catch (InterruptedException e) { throw new HyracksDataException(e); } catch (IOException e) { throw new HyracksDataException(e); } } @Override public void fail() throws HyracksDataException { Thread.currentThread().setContextClassLoader(ctxCL); } @Override public void close() throws HyracksDataException { try { vertexWriter.close(context); moveFilesToFinalPath(); } catch (InterruptedException e) { throw new HyracksDataException(e); } catch (IOException e) { throw new HyracksDataException(e); } } private void moveFilesToFinalPath() throws HyracksDataException { try { JobContext job = ctxFactory.createJobContext(conf); Path outputPath = FileOutputFormat.getOutputPath(job); FileSystem dfs = FileSystem.get(conf); Path filePath = new Path(outputPath, "part-" + new Integer(partition).toString()); FileStatus[] results = findPartitionPaths(outputPath, dfs); if (results.length >= 1) { /** * for Hadoop-0.20.2 */ renameFile(dfs, filePath, results); } else { /** * for Hadoop-0.23.1 */ int jobId = job.getJobID().getId(); outputPath = new Path( outputPath.toString() + File.separator + TEMP_DIR + File.separator + jobId); results = findPartitionPaths(outputPath, dfs); renameFile(dfs, filePath, results); } } catch (IOException e) { throw new HyracksDataException(e); } finally { Thread.currentThread().setContextClassLoader(ctxCL); } } private FileStatus[] findPartitionPaths(Path outputPath, FileSystem dfs) throws FileNotFoundException, IOException { FileStatus[] tempPaths = dfs.listStatus(outputPath, new PathFilter() { @Override public boolean accept(Path dir) { return dir.getName().endsWith(TEMP_DIR); } }); Path tempDir = tempPaths[0].getPath(); FileStatus[] results = dfs.listStatus(tempDir, new PathFilter() { @Override public boolean accept(Path dir) { return dir.getName().indexOf(context.getTaskAttemptID().toString()) >= 0; } }); return 
results; } private void renameFile(FileSystem dfs, Path filePath, FileStatus[] results) throws IOException, HyracksDataException, FileNotFoundException { Path srcDir = results[0].getPath(); if (!dfs.exists(srcDir)) throw new HyracksDataException("file " + srcDir.toString() + " does not exist!"); FileStatus[] srcFiles = dfs.listStatus(srcDir); Path srcFile = srcFiles[0].getPath(); dfs.delete(filePath, true); dfs.rename(srcFile, filePath); } }; }
From source file:edu.uci.ics.pregelix.dataflow.VertexFileWriteOperatorDescriptor.java
License:Apache License
@SuppressWarnings("rawtypes") @Override//from w ww .j a va2s . c o m public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, final IRecordDescriptorProvider recordDescProvider, final int partition, int nPartitions) throws HyracksDataException { return new AbstractUnaryInputSinkOperatorNodePushable() { private RecordDescriptor rd0; private FrameDeserializer frameDeserializer; private Configuration conf; private VertexWriter vertexWriter; private TaskAttemptContext context; private String TEMP_DIR = "_temporary"; private ClassLoader ctxCL; private ContextFactory ctxFactory = new ContextFactory(); @Override public void open() throws HyracksDataException { rd0 = inputRdFactory == null ? recordDescProvider.getInputRecordDescriptor(getActivityId(), 0) : inputRdFactory.createRecordDescriptor(ctx); frameDeserializer = new FrameDeserializer(rd0); ctxCL = Thread.currentThread().getContextClassLoader(); Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader()); conf = confFactory.createConfiguration(ctx); VertexOutputFormat outputFormat = BspUtils.createVertexOutputFormat(conf); context = ctxFactory.createContext(conf, partition); context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader()); try { if (preHookFactory != null) { preHookFactory.createRuntimeHook().configure(ctx); } vertexWriter = outputFormat.createVertexWriter(context); } catch (InterruptedException e) { throw new HyracksDataException(e); } catch (IOException e) { throw new HyracksDataException(e); } } @SuppressWarnings("unchecked") @Override public void nextFrame(ByteBuffer frame) throws HyracksDataException { frameDeserializer.reset(frame); try { while (!frameDeserializer.done()) { Object[] tuple = frameDeserializer.deserializeRecord(); Vertex value = (Vertex) tuple[1]; vertexWriter.writeVertex(value); } } catch (InterruptedException e) { throw new HyracksDataException(e); } catch (IOException e) { throw new HyracksDataException(e); } } @Override public void fail() throws HyracksDataException { Thread.currentThread().setContextClassLoader(ctxCL); } @Override public void close() throws HyracksDataException { try { vertexWriter.close(context); moveFilesToFinalPath(); } catch (InterruptedException e) { throw new HyracksDataException(e); } catch (IOException e) { throw new HyracksDataException(e); } } private void moveFilesToFinalPath() throws HyracksDataException { try { JobContext job = ctxFactory.createJobContext(conf); Path outputPath = FileOutputFormat.getOutputPath(job); FileSystem dfs = FileSystem.get(conf); Path filePath = new Path(outputPath, "part-" + new Integer(partition).toString()); FileStatus[] results = findPartitionPaths(outputPath, dfs); if (results.length >= 1) { /** * for Hadoop-0.20.2 */ renameFile(dfs, filePath, results); } else { /** * for Hadoop-0.23.1 */ int jobId = job.getJobID().getId(); outputPath = new Path( outputPath.toString() + File.separator + TEMP_DIR + File.separator + jobId); results = findPartitionPaths(outputPath, dfs); renameFile(dfs, filePath, results); } } catch (IOException e) { throw new HyracksDataException(e); } finally { Thread.currentThread().setContextClassLoader(ctxCL); } } private FileStatus[] findPartitionPaths(Path outputPath, FileSystem dfs) throws FileNotFoundException, IOException { FileStatus[] tempPaths = dfs.listStatus(outputPath, new PathFilter() { @Override public boolean accept(Path dir) { return dir.getName().endsWith(TEMP_DIR) && dir.getName().indexOf(".crc") < 0; } }); Path tempDir = tempPaths[0].getPath(); 
FileStatus[] results = dfs.listStatus(tempDir, new PathFilter() { @Override public boolean accept(Path dir) { return dir.getName().indexOf(context.getTaskAttemptID().toString()) >= 0 && dir.getName().indexOf(".crc") < 0; } }); return results; } private void renameFile(FileSystem dfs, Path filePath, FileStatus[] results) throws IOException, HyracksDataException, FileNotFoundException { Path srcDir = results[0].getPath(); if (!dfs.exists(srcDir)) { throw new HyracksDataException("file " + srcDir.toString() + " does not exist!"); } FileStatus[] srcFiles = dfs.listStatus(srcDir); Path srcFile = srcFiles[0].getPath(); dfs.delete(filePath, true); dfs.rename(srcFile, filePath); } }; }
From source file:edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TrecWebDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TrecWebDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = new Job(getConf(), TrecWebDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TrecWebDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Merges a set of indexes into larger indexes
 * @param fs
 * @param srcIndexDir
 * @param dstIndexDir
 * @param srcFormat
 * @param dstFormat
 * @param params
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
        SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
        throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
            : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    // Scan the source indexes and merge each consecutive run belonging to the
    // same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
                .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;

        // Merge all source indexes in the range [i1, i2) into one dest index
        // Copy i1, i2 to other variables as final to be accessible from threads
        final int firstIndex = i1;
        final int lastIndex = i2;
        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes
        /* A regular expression to catch the tile identifier of a MODIS grid cell */
        final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with any following entry
            int j = i + rand.nextInt(tilesInFirstDay.length - i - 1);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }

        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    try {
                        FileStatus tileInFirstDay = tilesInFirstDay[i_file];

                        // Extract tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }
                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                    tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                        + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // Destination file already exists
                            // Check the date of the destination and source files to see
                            // whether it needs to be updated or not
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            else
                                LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                                tmpFile);
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        fs.rename(tmpFile, destIndexFile);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}