List of usage examples for the org.apache.hadoop.fs.PathFilter interface
PathFilter
From source file:org.apache.tez.test.TestTezJobs.java
License:Apache License
/**
 * Runs {@code SortMergeJoinExample} in local mode with split grouping disabled.
 *
 * <p>Input 1 holds term0..term19, input 2 holds only the even-numbered terms. The test
 * expects exactly one visible output file whose lines are exactly the terms written to
 * both inputs.
 */
@Test(timeout = 60000)
public void testSortMergeJoinExampleDisableSplitGrouping() throws Exception {
    SortMergeJoinExample sortMergeJoinExample = new SortMergeJoinExample();
    sortMergeJoinExample.setConf(conf);

    Path stagingDirPath = new Path(TEST_ROOT_DIR + "/tmp/tez-staging-dir");
    Path inPath1 = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/inPath1");
    Path inPath2 = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/inPath2");
    Path outPath = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/outPath");

    // Start from a clean output directory and make sure the inputs exist.
    localFs.delete(outPath, true);
    localFs.mkdirs(inPath1);
    localFs.mkdirs(inPath2);
    localFs.mkdirs(stagingDirPath);

    Set<String> expectedResult = new HashSet<String>();

    FSDataOutputStream out1 = localFs.create(new Path(inPath1, "file"));
    FSDataOutputStream out2 = localFs.create(new Path(inPath2, "file"));
    BufferedWriter writer1 = new BufferedWriter(new OutputStreamWriter(out1));
    BufferedWriter writer2 = new BufferedWriter(new OutputStreamWriter(out2));
    for (int i = 0; i < 20; i++) {
        String term = "term" + i;
        writer1.write(term);
        writer1.newLine();

        boolean alsoInSecondInput = (i % 2 == 0);
        if (!alsoInSecondInput) {
            continue;
        }
        // Even terms appear in both inputs, so the join must emit them.
        writer2.write(term);
        writer2.newLine();
        expectedResult.add(term);
    }
    writer1.close();
    writer2.close();
    out1.close();
    out2.close();

    String stagingDirOption = "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDirPath.toString();
    String[] args = new String[] {
        stagingDirOption,
        "-local",
        "-disableSplitGrouping",
        inPath1.toString(),
        inPath2.toString(),
        "1",
        outPath.toString()
    };
    int exitCode = sortMergeJoinExample.run(args);
    assertEquals(0, exitCode);

    // Ignore bookkeeping entries whose names start with '_' or '.'.
    PathFilter visibleFilesOnly = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            if (name.startsWith("_")) {
                return false;
            }
            return !name.startsWith(".");
        }
    };
    FileStatus[] statuses = localFs.listStatus(outPath, visibleFilesOnly);
    assertEquals(1, statuses.length);

    FSDataInputStream inStream = localFs.open(statuses[0].getPath());
    BufferedReader reader = new BufferedReader(new InputStreamReader(inStream));
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        // Every output line must be an expected term that has not been seen yet.
        assertTrue(expectedResult.remove(line));
    }
    reader.close();
    inStream.close();

    // Nothing expected may be left over.
    int unmatched = expectedResult.size();
    assertEquals(0, unmatched);
}
From source file:org.archive.tnh.nutch.Segments.java
License:Apache License
/**
 * Builds a filter that accepts only paths which {@code fs} reports as directories.
 *
 * @param fs file system used to look up the status of each candidate path
 * @return a filter accepting directories; a path whose status lookup throws
 *         {@link IOException} is rejected
 */
public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
    PathFilter directoriesOnly = new PathFilter() {
        public boolean accept(final Path path) {
            boolean directory;
            try {
                directory = fs.getFileStatus(path).isDir();
            } catch (IOException ioe) {
                // Best effort: an unreadable status is treated as "not a directory".
                directory = false;
            }
            return directory;
        }
    };
    return directoriesOnly;
}
From source file:org.commoncrawl.service.listcrawler.DataTransferAgent.java
License:Open Source License
public static void main(String[] args) { Logger logger = Logger.getLogger("org.commoncrawl"); logger.setLevel(Level.INFO);/*from ww w . j a v a 2s . c o m*/ BasicConfigurator.configure(); Configuration conf = new Configuration(); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); // set a big io buffer size ... conf.setInt("io.file.buffer.size", 4096 * 1024); final File transferLogDir = new File("/home/rana/ccprod/data/proxy_xfr_log"); final Path hdfsCacheDataPath = new Path("crawl/proxy/cache/"); final File shutdownFile = new File("/home/rana/ccprod/data/shutdown_xfr"); // create a deque .. final LinkedBlockingDeque<ProxyTransferItem> itemQueue = new LinkedBlockingDeque<ProxyTransferItem>(); final EventLoop eventLoop = new EventLoop(); eventLoop.start(); try { final DistributedFileSystem fs = (DistributedFileSystem) FileSystem.get(conf); Thread transferThreads[] = new Thread[TRANSFER_THREADS_PER_HOST * mappingsTable.size()]; Semaphore shutdownSemaphore = new Semaphore(0); int threadIndex = 0; for (int i = 0; i < TRANSFER_THREADS_PER_HOST; ++i) { int serverIdx = 0; for (CCBridgeServerMapping mapping : mappingsTable) { transferThreads[(i * mappingsTable.size()) + serverIdx++] = startTransferThread(threadIndex++, mapping, shutdownFile, fs, conf, itemQueue, eventLoop, shutdownSemaphore); } } Thread scannerThread = new Thread(new Runnable() { long _lastScanId = -1; long _lastOutOfOrderDataDirId = -1L; static final int SCAN_INTERVAL_MS = 500; @Override public void run() { while (true) { try { if (shutdownFile.exists()) { LOG.info("Shutdown File Detected in ScanTimer Outer Loop. Exiting Scan Thread"); return; } LOG.info("Scanning For Files based on filter. Last Known Scan Id is:" + _lastScanId); FileStatus fileList[] = fs.listStatus(hdfsCacheDataPath, new PathFilter() { @Override public boolean accept(Path path) { try { if (path.getName().startsWith("cacheData-")) { // extract file id ... 
long currentFileId = Long .parseLong(path.getName().substring("cacheData-".length())); // figure out if we are going to process it ... if (_lastScanId == -1 || currentFileId > _lastScanId) { return true; } } } catch (Exception e) { LOG.error("Caught Exception Processing Path Filter:" + CCStringUtils.stringifyException(e)); } return false; } }); LOG.info("Scan returned:" + fileList.length + " Number of Valid Files"); long latestFileId = 0L; for (FileStatus file : fileList) { // extract file id ... long currentFileId = Long .parseLong(file.getPath().getName().substring("cacheData-".length())); // figure out if we are going to process it ... if (_lastScanId == -1 || currentFileId > _lastScanId) { // cache max latest id .. latestFileId = Math.max(latestFileId, currentFileId); File logFile = hdfsCacheFileToLogFileLocation(transferLogDir, file); if (logFile != null) { if (logFile.exists()) { LOG.info("Skipping:" + file.getPath().getName()); } else { LOG.info("Queueing File:" + file.getPath().getName()); itemQueue.add(new ProxyTransferItem(file.getPath(), logFile, file.getPath().getName())); } } } } // ok update lastest file id _lastScanId = Math.max(_lastScanId, latestFileId); FileStatus outofOrderDataDirs[] = fs .globStatus(new Path("crawl/proxy/dtAgentOutOfOrderTransfers/*")); for (FileStatus outOfOrderDataDir : outofOrderDataDirs) { long dataDirId = Long.parseLong(outOfOrderDataDir.getPath().getName()); if (dataDirId > _lastOutOfOrderDataDirId) { FileStatus candidates[] = fs .globStatus(new Path(outOfOrderDataDir.getPath(), "part-*")); for (FileStatus candidate : candidates) { File logFile = outOfOrderFileToLogFileLocation(transferLogDir, candidate.getPath()); if (logFile != null) { String candidateName = candidate.getPath().getParent().getName() + "-" + candidate.getPath().getName(); if (logFile.exists()) { LOG.info("Skipping OOB FILE:" + candidateName); } else { LOG.info("Queueing OOB FILE:" + candidateName); itemQueue.add(new ProxyTransferItem(candidate.getPath(), 
logFile, candidateName)); } } } _lastOutOfOrderDataDirId = dataDirId; } } LOG.info("Finish Scan. Last Known Scan Id is now:" + _lastScanId); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); } try { Thread.sleep(SCAN_INTERVAL_MS); } catch (InterruptedException e) { } } } }); // start scanner thread ... scannerThread.start(); LOG.info("Waiting on Transfer Threads"); shutdownSemaphore.acquireUninterruptibly(TRANSFER_THREADS_PER_HOST * mappingsTable.size()); LOG.info("ALL Transfer Threads Dead."); // wait for scanner thread to die LOG.info("Waiting for Scanner Thread to Die."); try { scannerThread.join(); } catch (InterruptedException e) { } LOG.info("Killing Event Loop"); eventLoop.stop(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } }
From source file:org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java
License:Open Source License
static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException { FileSystem fs = FileSystem.get(conf); FileSystem localFS = FileSystem.getLocal(conf); final Multimap<Long, Range> rangeMap = TreeMultimap.create(); FileStatus candidateDirs[] = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*")); for (FileStatus candidate : candidateDirs) { String fileName = candidate.getPath().getName(); // get scaled timestamp start long timestampStart = Long.parseLong(fileName) * 1000000000; // ok see if exceeds our cutoff time if (timestampStart < cutOffTimeMillisecond) { FileStatus ranges[] = fs.globStatus(new Path(candidate.getPath(), "*")); for (FileStatus range : ranges) { String rangeName = range.getPath().getName(); long rangeStart = Long.parseLong(rangeName.substring(0, rangeName.indexOf("-"))); long rangeEnd = Long.parseLong(rangeName.substring(rangeName.indexOf("-") + 1)); rangeMap.put(Long.parseLong(fileName), new Range(rangeStart, rangeEnd)); }/*from w w w.j a va 2 s . 
co m*/ } } PathFilter cacheDataFilter = new PathFilter() { @Override public boolean accept(Path path) { if (path.getName().startsWith("cacheData-") || path.getName().startsWith("cacheIndex-")) { long timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1)); long timestampPrefix = timestamp / 1000000000L; //System.out.println("timestamp:" + timestamp + " prefix:" + timestampPrefix); for (Range range : rangeMap.get(timestampPrefix)) { if (timestamp >= range.e0 && timestamp <= range.e1) { return true; } } } return false; } }; PathFilter historyDataFilter = new PathFilter() { @Override public boolean accept(Path path) { if (path.getName().startsWith("historyData-") || path.getName().startsWith("historyBloomFilter-")) { int indexOfDot = path.getName().indexOf("."); long timestamp = -1L; if (indexOfDot != -1) { timestamp = Long .parseLong(path.getName().substring(path.getName().indexOf("-") + 1, indexOfDot)); } else { timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1)); } if (timestamp < cutOffTimeMillisecond) { return true; } } return false; } }; FileStatus purgeCandidates[] = fs.globStatus(new Path("crawl/proxy/cache/*"), cacheDataFilter); for (FileStatus candidate : purgeCandidates) { System.out.println("Purging Candidate:" + candidate.getPath()); fs.delete(candidate.getPath()); } FileStatus localcacheDataPurgeCandidates[] = localFS .globStatus(new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/*"), cacheDataFilter); for (FileStatus candidate : localcacheDataPurgeCandidates) { System.out.println("Purging Candidate:" + candidate.getPath()); localFS.delete(candidate.getPath()); } // now delete bloom filter data FileStatus historyPurgeCandidates[] = fs.globStatus(new Path("crawl/proxy/history/*"), historyDataFilter); for (FileStatus candidate : historyPurgeCandidates) { System.out.println("Purging Candidate:" + candidate.getPath()); fs.delete(candidate.getPath(), true); } // now delete bloom filter data 
FileStatus localHistoryPurgeCandidates[] = localFS.globStatus( new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*"), historyDataFilter); for (FileStatus candidate : historyPurgeCandidates) { System.out.println("Purging Candidate:" + candidate.getPath()); fs.delete(candidate.getPath(), true); } for (FileStatus candidate : localHistoryPurgeCandidates) { System.out.println("Purging Candidate:" + candidate.getPath()); localFS.delete(candidate.getPath(), true); } }
From source file:org.elasticsearch.common.blobstore.hdfs.AbstractHdfsBlobContainer.java
License:Apache License
/**
 * Lists the blobs in this container whose name starts with the given prefix.
 *
 * @param blobNamePrefix prefix every returned blob name starts with
 * @return immutable map from blob name to its metadata (name and length); empty when the
 *         listing is {@code null} or has no entries
 * @throws IOException if the underlying file system listing fails
 */
@Override
public ImmutableMap<String, BlobMetaData> listBlobsByPrefix(final String blobNamePrefix) throws IOException {
    PathFilter prefixFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith(blobNamePrefix);
        }
    };
    FileStatus[] files = blobStore.fileSystem().listStatus(path, prefixFilter);

    boolean nothingListed = (files == null) || (files.length == 0);
    if (nothingListed) {
        return ImmutableMap.of();
    }

    ImmutableMap.Builder<String, BlobMetaData> builder = ImmutableMap.builder();
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String blobName = file.getPath().getName();
        builder.put(blobName, new PlainBlobMetaData(file.getPath().getName(), file.getLen()));
    }
    return builder.build();
}
From source file:org.elasticsearch.hadoop.hdfs.blobstore.AbstractHdfsBlobContainer.java
License:Apache License
/**
 * Lists the blobs in this container whose name starts with the given prefix.
 *
 * @param blobNamePrefix prefix every returned blob name starts with; {@code null} matches
 *        every blob (previously a {@code null} prefix caused a
 *        {@link NullPointerException} inside the filter)
 * @return immutable map from blob name to its metadata (name and length); empty when the
 *         listing is {@code null} or has no entries
 * @throws IOException if the underlying file system listing fails
 */
@Override
public ImmutableMap<String, BlobMetaData> listBlobsByPrefix(final @Nullable String blobNamePrefix)
        throws IOException {
    FileStatus[] files = blobStore.fileSystem().listStatus(path, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            // String.startsWith(null) throws, so a null prefix must be handled first.
            return blobNamePrefix == null || path.getName().startsWith(blobNamePrefix);
        }
    });
    if (files == null || files.length == 0) {
        return ImmutableMap.of();
    }
    ImmutableMap.Builder<String, BlobMetaData> builder = ImmutableMap.builder();
    for (FileStatus file : files) {
        String blobName = file.getPath().getName();
        builder.put(blobName, new PlainBlobMetaData(blobName, file.getLen()));
    }
    return builder.build();
}
From source file:org.elasticsearch.hadoop.hdfs.blobstore.HdfsBlobContainer.java
License:Apache License
/**
 * Lists the blobs in this container whose name starts with the given prefix.
 *
 * @param blobNamePrefix prefix every returned blob name starts with; {@code null} matches
 *        every blob (previously a {@code null} prefix caused a
 *        {@link NullPointerException} inside the filter)
 * @return immutable map from blob name to its metadata (name and length); empty when the
 *         listing is {@code null} or has no entries
 * @throws IOException if obtaining the file system or listing it fails
 */
@Override
public ImmutableMap<String, BlobMetaData> listBlobsByPrefix(final @Nullable String blobNamePrefix)
        throws IOException {
    FileStatus[] files = blobStore.fileSystemFactory().getFileSystem().listStatus(path, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            // String.startsWith(null) throws, so a null prefix must be handled first.
            return blobNamePrefix == null || path.getName().startsWith(blobNamePrefix);
        }
    });
    if (files == null || files.length == 0) {
        return ImmutableMap.of();
    }
    ImmutableMap.Builder<String, BlobMetaData> builder = ImmutableMap.builder();
    for (FileStatus file : files) {
        String blobName = file.getPath().getName();
        builder.put(blobName, new PlainBlobMetaData(blobName, file.getLen()));
    }
    return builder.build();
}
From source file:org.elasticsearch.repositories.hdfs.HdfsBlobContainer.java
License:Apache License
@Override public Map<String, BlobMetaData> listBlobsByPrefix(@Nullable final String prefix) throws IOException { FileStatus[] files = store.execute(new Operation<FileStatus[]>() { @Override//from w ww . j av a 2 s . com public FileStatus[] run(FileContext fileContext) throws IOException { return (fileContext.util().listStatus(path, new PathFilter() { @Override public boolean accept(Path path) { return prefix == null || path.getName().startsWith(prefix); } })); } }); Map<String, BlobMetaData> map = new LinkedHashMap<String, BlobMetaData>(); for (FileStatus file : files) { map.put(file.getPath().getName(), new PlainBlobMetaData(file.getPath().getName(), file.getLen())); } return Collections.unmodifiableMap(map); }
From source file:org.kitesdk.apps.spi.oozie.ShareLibs.java
License:Apache License
/** * Based on ShareLibService.getLatestLibPath, which is not available * in client libraries.//from www .j av a 2 s . c o m */ private static Path getLatestLibPath(FileSystem fs, Path rootDir) throws IOException { Date max = new Date(0L); Path path = null; PathFilter directoryFilter = new PathFilter() { @Override public boolean accept(Path path) { return path.getName().startsWith("lib_"); } }; FileStatus[] files = fs.listStatus(rootDir, directoryFilter); for (FileStatus file : files) { String name = file.getPath().getName().toString(); String time = name.substring("lib_".length()); Date d = null; try { d = new SimpleDateFormat("yyyyMMddHHmmss").parse(time); } catch (ParseException e) { continue; } if (d.compareTo(max) > 0) { path = file.getPath(); max = d; } } //If there are no timestamped directories, fall back to root directory if (path == null) { path = rootDir; } return path; }
From source file:org.mrgeo.format.DirectoryInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { Path parent = getParentDirectory(context); final String[] subdirs = getSubdirNames(context); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); FileSystem fs = parent.getFileSystem(context.getConfiguration()); // Only look at subdirs requested by the caller (if specified) PathFilter pathFilter = null;/* ww w . j av a 2 s. c o m*/ if (subdirs != null && subdirs.length > 0) { pathFilter = new PathFilter() { @Override public boolean accept(Path p) { String name = p.getName(); for (String subdir : subdirs) { if (subdir.equals(name)) { return true; } } return false; } }; } FileStatus[] children = (pathFilter == null) ? fs.listStatus(parent) : fs.listStatus(parent, pathFilter); for (FileStatus child : children) { // Return each subdirectory as a separate split. This means that entire subdirs will // be processed by a single mapper. if (child.isDir()) { // BlockLocation[] blkLocations = fs.getFileBlockLocations(child, 0L, child.getLen()); // splits.add(new DirectorySplit(child.getPath(), (blkLocations.length == 0) ? new String[0] : blkLocations[0].getHosts())); splits.add(new DirectorySplit(child.getPath(), new String[0])); } } return splits; }