Example usage for org.apache.hadoop.fs PathFilter PathFilter

List of usage examples for org.apache.hadoop.fs PathFilter PathFilter

Introduction

On this page you can find example usages of org.apache.hadoop.fs.PathFilter, written as anonymous PathFilter implementations.

Prototype

PathFilter

Source Link

Usage

From source file:org.apache.tez.test.TestTezJobs.java

License:Apache License

/**
 * Runs {@code SortMergeJoinExample} in local mode with split grouping disabled and
 * verifies the join output.
 * <p>
 * The first input holds {@code term0..term19}; the second holds only the even-numbered
 * terms, so exactly those terms are expected in the single output file.
 */
@Test(timeout = 60000)
public void testSortMergeJoinExampleDisableSplitGrouping() throws Exception {
    SortMergeJoinExample joinExample = new SortMergeJoinExample();
    joinExample.setConf(conf);

    // Working directories for this test, all below TEST_ROOT_DIR.
    Path stagingDir = new Path(TEST_ROOT_DIR + "/tmp/tez-staging-dir");
    Path firstInputDir = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/inPath1");
    Path secondInputDir = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/inPath2");
    Path outputDir = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/outPath");
    localFs.delete(outputDir, true);
    localFs.mkdirs(firstInputDir);
    localFs.mkdirs(secondInputDir);
    localFs.mkdirs(stagingDir);

    // Terms present in both inputs; each one must show up in the output exactly once.
    Set<String> pendingTerms = new HashSet<String>();

    FSDataOutputStream firstStream = localFs.create(new Path(firstInputDir, "file"));
    FSDataOutputStream secondStream = localFs.create(new Path(secondInputDir, "file"));
    BufferedWriter firstWriter = new BufferedWriter(new OutputStreamWriter(firstStream));
    BufferedWriter secondWriter = new BufferedWriter(new OutputStreamWriter(secondStream));
    for (int index = 0; index < 20; index++) {
        String term = "term" + index;
        firstWriter.write(term);
        firstWriter.newLine();
        if (index % 2 != 0) {
            // Odd terms go to the first input only.
            continue;
        }
        secondWriter.write(term);
        secondWriter.newLine();
        pendingTerms.add(term);
    }
    firstWriter.close();
    secondWriter.close();
    firstStream.close();
    secondStream.close();

    String stagingDirOption = "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDir.toString();
    String[] args = new String[] { stagingDirOption, "-local", "-disableSplitGrouping",
            firstInputDir.toString(), secondInputDir.toString(), "1", outputDir.toString() };
    int exitCode = joinExample.run(args);
    assertEquals(0, exitCode);

    // Ignore entries whose names start with '_' or '.' when looking for the result file.
    PathFilter visibleFilesOnly = new PathFilter() {
        public boolean accept(Path p) {
            String fileName = p.getName();
            return !(fileName.startsWith("_") || fileName.startsWith("."));
        }
    };
    FileStatus[] outputFiles = localFs.listStatus(outputDir, visibleFilesOnly);
    assertEquals(1, outputFiles.length);

    FSDataInputStream resultStream = localFs.open(outputFiles[0].getPath());
    BufferedReader resultReader = new BufferedReader(new InputStreamReader(resultStream));
    for (String line = resultReader.readLine(); line != null; line = resultReader.readLine()) {
        // Every output line must be an expected term that has not been seen yet.
        assertTrue(pendingTerms.remove(line));
    }
    resultReader.close();
    resultStream.close();
    // Nothing expected may be missing from the output.
    assertEquals(0, pendingTerms.size());
}

From source file:org.archive.tnh.nutch.Segments.java

License:Apache License

/**
 * Builds a filter that accepts only paths which {@code fs} reports as directories.
 *
 * @param fs file system used to look up the status of each candidate path
 * @return a filter that rejects non-directories and any path whose status lookup
 *         fails with an {@link IOException}
 */
public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
    final PathFilter directoriesOnly = new PathFilter() {
        public boolean accept(final Path path) {
            boolean accepted;
            try {
                accepted = fs.getFileStatus(path).isDir();
            } catch (IOException ioe) {
                // The status could not be read: treat the path as not acceptable.
                accepted = false;
            }
            return accepted;
        }
    };
    return directoriesOnly;
}

From source file:org.commoncrawl.service.listcrawler.DataTransferAgent.java

License:Open Source License

/**
 * Entry point: starts the transfer worker threads and a scanner thread that polls the
 * file system for new files and queues them for transfer, then blocks until all of them
 * have finished.
 * <p>
 * Shutdown is signalled through the existence of {@code shutdownFile}; the scanner thread
 * checks it at the top of every scan iteration. {@code LOG}, {@code mappingsTable},
 * {@code TRANSFER_THREADS_PER_HOST}, {@code startTransferThread} and the
 * {@code *ToLogFileLocation} helpers are defined outside this snippet.
 *
 * @param args command line arguments (not read by this method)
 */
public static void main(String[] args) {

    Logger logger = Logger.getLogger("org.commoncrawl");
    logger.setLevel(Level.INFO);
    BasicConfigurator.configure();

    Configuration conf = new Configuration();

    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");

    // set a big io buffer size (4 MiB)
    conf.setInt("io.file.buffer.size", 4096 * 1024);

    // Hard-coded locations: local transfer-log directory, cache directory to scan
    // (relative path), and the local marker file whose existence requests shutdown.
    final File transferLogDir = new File("/home/rana/ccprod/data/proxy_xfr_log");
    final Path hdfsCacheDataPath = new Path("crawl/proxy/cache/");
    final File shutdownFile = new File("/home/rana/ccprod/data/shutdown_xfr");

    // queue shared between the scanner thread (producer) and the transfer threads
    final LinkedBlockingDeque<ProxyTransferItem> itemQueue = new LinkedBlockingDeque<ProxyTransferItem>();

    final EventLoop eventLoop = new EventLoop();
    eventLoop.start();

    try {

        // NOTE: the cast fails with ClassCastException if the configured default
        // file system is not a DistributedFileSystem.
        final DistributedFileSystem fs = (DistributedFileSystem) FileSystem.get(conf);
        // One slot per (thread-per-host index, server mapping) pair.
        Thread transferThreads[] = new Thread[TRANSFER_THREADS_PER_HOST * mappingsTable.size()];
        // Acquired below with one permit per transfer thread; presumably each transfer
        // thread releases one permit when it exits (TODO confirm in startTransferThread).
        Semaphore shutdownSemaphore = new Semaphore(0);
        int threadIndex = 0;
        for (int i = 0; i < TRANSFER_THREADS_PER_HOST; ++i) {
            int serverIdx = 0;
            for (CCBridgeServerMapping mapping : mappingsTable) {
                transferThreads[(i * mappingsTable.size()) + serverIdx++] = startTransferThread(threadIndex++,
                        mapping, shutdownFile, fs, conf, itemQueue, eventLoop, shutdownSemaphore);
            }
        }

        Thread scannerThread = new Thread(new Runnable() {

            // Highest cacheData file id seen so far; -1 means no scan has completed yet.
            long _lastScanId = -1;
            // Id of the last out-of-order data directory that was processed.
            long _lastOutOfOrderDataDirId = -1L;

            // Pause between two scans, in milliseconds.
            static final int SCAN_INTERVAL_MS = 500;

            @Override
            public void run() {

                while (true) {

                    try {
                        if (shutdownFile.exists()) {
                            LOG.info("Shutdown File Detected in ScanTimer Outer Loop. Exiting Scan Thread");
                            return;
                        }

                        LOG.info("Scanning For Files based on filter. Last Known Scan Id is:" + _lastScanId);
                        // List only "cacheData-<id>" entries whose id is newer than the last scan.
                        FileStatus fileList[] = fs.listStatus(hdfsCacheDataPath, new PathFilter() {

                            @Override
                            public boolean accept(Path path) {
                                try {
                                    if (path.getName().startsWith("cacheData-")) {
                                        // extract file id from the name suffix
                                        long currentFileId = Long
                                                .parseLong(path.getName().substring("cacheData-".length()));
                                        // accept on the first scan, or when the id is newer than the last one seen
                                        if (_lastScanId == -1 || currentFileId > _lastScanId) {
                                            return true;
                                        }
                                    }
                                } catch (Exception e) {
                                    // e.g. a non-numeric suffix: log and reject the path
                                    LOG.error("Caught Exception Processing Path Filter:"
                                            + CCStringUtils.stringifyException(e));
                                }
                                return false;
                            }
                        });
                        LOG.info("Scan returned:" + fileList.length + " Number of Valid Files");

                        long latestFileId = 0L;
                        for (FileStatus file : fileList) {
                            // extract file id (same parsing as in the filter above)
                            long currentFileId = Long
                                    .parseLong(file.getPath().getName().substring("cacheData-".length()));
                            // same condition the filter already applied
                            if (_lastScanId == -1 || currentFileId > _lastScanId) {
                                // track the highest id seen in this scan
                                latestFileId = Math.max(latestFileId, currentFileId);
                                File logFile = hdfsCacheFileToLogFileLocation(transferLogDir, file);
                                if (logFile != null) {
                                    // An existing log file causes the item to be skipped
                                    // (presumably it marks a completed transfer — TODO confirm).
                                    if (logFile.exists()) {
                                        LOG.info("Skipping:" + file.getPath().getName());
                                    } else {
                                        LOG.info("Queueing File:" + file.getPath().getName());
                                        itemQueue.add(new ProxyTransferItem(file.getPath(), logFile,
                                                file.getPath().getName()));
                                    }
                                }
                            }
                        }
                        // update the latest file id; never moves backwards
                        _lastScanId = Math.max(_lastScanId, latestFileId);

                        // Second source: numerically named directories holding "part-*" files.
                        FileStatus outofOrderDataDirs[] = fs
                                .globStatus(new Path("crawl/proxy/dtAgentOutOfOrderTransfers/*"));

                        for (FileStatus outOfOrderDataDir : outofOrderDataDirs) {
                            // A non-numeric directory name throws here and aborts this scan
                            // iteration via the outer catch below.
                            long dataDirId = Long.parseLong(outOfOrderDataDir.getPath().getName());
                            if (dataDirId > _lastOutOfOrderDataDirId) {
                                FileStatus candidates[] = fs
                                        .globStatus(new Path(outOfOrderDataDir.getPath(), "part-*"));

                                for (FileStatus candidate : candidates) {
                                    File logFile = outOfOrderFileToLogFileLocation(transferLogDir,
                                            candidate.getPath());
                                    if (logFile != null) {
                                        // Item name is "<parent dir name>-<file name>".
                                        String candidateName = candidate.getPath().getParent().getName() + "-"
                                                + candidate.getPath().getName();

                                        if (logFile.exists()) {
                                            LOG.info("Skipping OOB FILE:" + candidateName);

                                        } else {
                                            LOG.info("Queueing OOB FILE:" + candidateName);
                                            itemQueue.add(new ProxyTransferItem(candidate.getPath(), logFile,
                                                    candidateName));
                                        }
                                    }
                                }
                                // NOTE(review): updated inside the loop, so directories with a lower
                                // id that appear later in the listing are skipped — this assumes
                                // globStatus returns them in ascending id order; confirm.
                                _lastOutOfOrderDataDirId = dataDirId;
                            }
                        }

                        LOG.info("Finish Scan. Last Known Scan Id is now:" + _lastScanId);

                    } catch (Exception e) {
                        // Any failure during a scan is logged; the loop retries after the sleep.
                        LOG.error(CCStringUtils.stringifyException(e));
                    }

                    try {
                        Thread.sleep(SCAN_INTERVAL_MS);
                    } catch (InterruptedException e) {
                        // NOTE(review): the interrupt is swallowed; the thread only exits once
                        // the shutdown file exists.
                    }
                }
            }
        });

        // start scanner thread
        scannerThread.start();

        LOG.info("Waiting on Transfer Threads");
        // Blocks until one permit per transfer thread is available.
        shutdownSemaphore.acquireUninterruptibly(TRANSFER_THREADS_PER_HOST * mappingsTable.size());
        LOG.info("ALL Transfer Threads Dead.");
        // wait for scanner thread to die
        LOG.info("Waiting for Scanner Thread to Die.");
        try {
            scannerThread.join();
        } catch (InterruptedException e) {
            // NOTE(review): interrupt ignored; shutdown proceeds without waiting further.
        }
        LOG.info("Killing Event Loop");
        eventLoop.stop();

    } catch (IOException e) {
        // NOTE(review): on this path the event loop started above is not stopped here.
        LOG.error(CCStringUtils.stringifyException(e));
    }

}

From source file:org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java

License:Open Source License

/**
 * Deletes proxy cache and history files that are older than the given cutoff.
 * <p>
 * Despite its name this method does not just list candidates: every match is printed and
 * then deleted, both on the default file system and on the local file system.
 * <ol>
 * <li>Each directory under {@code crawl/proxy/cacheExport/processed/} is named with a
 * numeric prefix; when {@code prefix * 1000000000} is below the cutoff, its children
 * (named {@code <start>-<end>}) are recorded as ranges under that prefix.</li>
 * <li>{@code cacheData-<ts>} / {@code cacheIndex-<ts>} files are purged when {@code ts}
 * lies inside one of the ranges recorded under {@code ts / 1000000000}.</li>
 * <li>{@code historyData-<ts>[.ext]} / {@code historyBloomFilter-<ts>[.ext]} entries are
 * purged when {@code ts} is below the cutoff.</li>
 * </ol>
 *
 * @param conf                  configuration used to obtain the default and local file systems
 * @param cutOffTimeMillisecond entries with timestamps strictly below this value are purged
 * @throws IOException           if listing or deleting fails
 * @throws NumberFormatException if a matching name does not carry a parsable numeric id
 */
static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileSystem localFS = FileSystem.getLocal(conf);

    // timestamp prefix -> ranges already exported/processed under that prefix
    final Multimap<Long, Range> rangeMap = TreeMultimap.create();
    FileStatus candidateDirs[] = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*"));

    for (FileStatus candidate : candidateDirs) {
        String fileName = candidate.getPath().getName();
        long timestampPrefix = Long.parseLong(fileName);
        // get scaled timestamp start
        long timestampStart = timestampPrefix * 1000000000;
        // only directories that start before the cutoff contribute ranges
        if (timestampStart < cutOffTimeMillisecond) {
            FileStatus ranges[] = fs.globStatus(new Path(candidate.getPath(), "*"));
            for (FileStatus range : ranges) {
                String rangeName = range.getPath().getName();
                int separator = rangeName.indexOf("-");
                long rangeStart = Long.parseLong(rangeName.substring(0, separator));
                long rangeEnd = Long.parseLong(rangeName.substring(separator + 1));

                rangeMap.put(timestampPrefix, new Range(rangeStart, rangeEnd));
            }
        }
    }

    // Accepts cacheData-/cacheIndex- files whose timestamp falls inside a recorded range.
    PathFilter cacheDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            if (name.startsWith("cacheData-") || name.startsWith("cacheIndex-")) {
                long timestamp = Long.parseLong(name.substring(name.indexOf("-") + 1));
                long timestampPrefix = timestamp / 1000000000L;
                for (Range range : rangeMap.get(timestampPrefix)) {
                    if (timestamp >= range.e0 && timestamp <= range.e1) {
                        return true;
                    }
                }
            }
            return false;
        }
    };

    // Accepts historyData-/historyBloomFilter- entries older than the cutoff. The timestamp
    // is the text after the first '-' and before the first '.', when there is one.
    PathFilter historyDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            if (name.startsWith("historyData-") || name.startsWith("historyBloomFilter-")) {
                int indexOfDot = name.indexOf(".");
                int idStart = name.indexOf("-") + 1;
                long timestamp;
                if (indexOfDot != -1) {
                    timestamp = Long.parseLong(name.substring(idStart, indexOfDot));
                } else {
                    timestamp = Long.parseLong(name.substring(idStart));
                }

                if (timestamp < cutOffTimeMillisecond) {
                    return true;
                }
            }
            return false;
        }
    };

    FileStatus purgeCandidates[] = fs.globStatus(new Path("crawl/proxy/cache/*"), cacheDataFilter);

    for (FileStatus candidate : purgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath());
    }

    FileStatus localcacheDataPurgeCandidates[] = localFS
            .globStatus(new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/*"), cacheDataFilter);

    for (FileStatus candidate : localcacheDataPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath());
    }

    // now delete history / bloom filter data on the default file system
    FileStatus historyPurgeCandidates[] = fs.globStatus(new Path("crawl/proxy/history/*"), historyDataFilter);

    for (FileStatus candidate : historyPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath(), true);
    }

    // ... and the local copies. The former second pass over historyPurgeCandidates was
    // removed: it re-deleted the paths already removed by the loop above.
    FileStatus localHistoryPurgeCandidates[] = localFS.globStatus(
            new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*"), historyDataFilter);

    for (FileStatus candidate : localHistoryPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath(), true);
    }

}

From source file:org.elasticsearch.common.blobstore.hdfs.AbstractHdfsBlobContainer.java

License:Apache License

/**
 * Lists the blobs in this container whose names start with {@code blobNamePrefix}.
 *
 * @param blobNamePrefix name prefix to match; {@code null} matches every blob
 * @return an immutable map from blob name to its metadata (name and length); empty when
 *         the listing is {@code null} or has no entries
 * @throws IOException if the underlying file system listing fails
 */
@Override
public ImmutableMap<String, BlobMetaData> listBlobsByPrefix(final String blobNamePrefix) throws IOException {
    FileStatus[] files = blobStore.fileSystem().listStatus(path, new PathFilter() {
        @Override
        public boolean accept(Path candidate) {
            // String.startsWith(null) throws NullPointerException, so a null prefix is
            // treated as "no restriction" instead of failing inside the filter.
            return blobNamePrefix == null || candidate.getName().startsWith(blobNamePrefix);
        }
    });
    if (files == null || files.length == 0) {
        return ImmutableMap.of();
    }
    ImmutableMap.Builder<String, BlobMetaData> builder = ImmutableMap.builder();
    for (FileStatus file : files) {
        String blobName = file.getPath().getName();
        builder.put(blobName, new PlainBlobMetaData(blobName, file.getLen()));
    }
    return builder.build();
}

From source file:org.elasticsearch.hadoop.hdfs.blobstore.AbstractHdfsBlobContainer.java

License:Apache License

/**
 * Lists the blobs in this container whose names start with {@code blobNamePrefix}.
 *
 * @param blobNamePrefix name prefix to match; {@code null} matches every blob
 * @return an immutable map from blob name to its metadata (name and length); empty when
 *         the listing is {@code null} or has no entries
 * @throws IOException if the underlying file system listing fails
 */
@Override
public ImmutableMap<String, BlobMetaData> listBlobsByPrefix(final @Nullable String blobNamePrefix)
        throws IOException {
    FileStatus[] files = blobStore.fileSystem().listStatus(path, new PathFilter() {
        @Override
        public boolean accept(Path candidate) {
            // The parameter is declared @Nullable, but String.startsWith(null) throws
            // NullPointerException; a null prefix therefore accepts every name.
            return blobNamePrefix == null || candidate.getName().startsWith(blobNamePrefix);
        }
    });
    if (files == null || files.length == 0) {
        return ImmutableMap.of();
    }
    ImmutableMap.Builder<String, BlobMetaData> builder = ImmutableMap.builder();
    for (FileStatus file : files) {
        String blobName = file.getPath().getName();
        builder.put(blobName, new PlainBlobMetaData(blobName, file.getLen()));
    }
    return builder.build();
}

From source file:org.elasticsearch.hadoop.hdfs.blobstore.HdfsBlobContainer.java

License:Apache License

/**
 * Lists the blobs in this container whose names start with {@code blobNamePrefix}, using
 * the file system obtained from the blob store's file system factory.
 *
 * @param blobNamePrefix name prefix to match; {@code null} matches every blob
 * @return an immutable map from blob name to its metadata (name and length); empty when
 *         the listing is {@code null} or has no entries
 * @throws IOException if the underlying file system listing fails
 */
@Override
public ImmutableMap<String, BlobMetaData> listBlobsByPrefix(final @Nullable String blobNamePrefix)
        throws IOException {
    FileStatus[] files = blobStore.fileSystemFactory().getFileSystem().listStatus(path, new PathFilter() {
        @Override
        public boolean accept(Path candidate) {
            // The parameter is declared @Nullable, but String.startsWith(null) throws
            // NullPointerException; a null prefix therefore accepts every name.
            return blobNamePrefix == null || candidate.getName().startsWith(blobNamePrefix);
        }
    });
    if (files == null || files.length == 0) {
        return ImmutableMap.of();
    }
    ImmutableMap.Builder<String, BlobMetaData> builder = ImmutableMap.builder();
    for (FileStatus file : files) {
        String blobName = file.getPath().getName();
        builder.put(blobName, new PlainBlobMetaData(blobName, file.getLen()));
    }
    return builder.build();
}

From source file:org.elasticsearch.repositories.hdfs.HdfsBlobContainer.java

License:Apache License

/**
 * Lists the blobs in this container whose names start with {@code prefix}.
 *
 * @param prefix name prefix to match; {@code null} matches every blob
 * @return an unmodifiable map from blob name to its metadata (name and length), in the
 *         order the entries were returned by the listing
 * @throws IOException if the listing operation fails
 */
@Override
public Map<String, BlobMetaData> listBlobsByPrefix(@Nullable final String prefix) throws IOException {
    FileStatus[] listing = store.execute(new Operation<FileStatus[]>() {
        @Override
        public FileStatus[] run(FileContext fileContext) throws IOException {
            PathFilter prefixFilter = new PathFilter() {
                @Override
                public boolean accept(Path candidate) {
                    if (prefix == null) {
                        // No prefix given: every entry qualifies.
                        return true;
                    }
                    return candidate.getName().startsWith(prefix);
                }
            };
            return fileContext.util().listStatus(path, prefixFilter);
        }
    });

    Map<String, BlobMetaData> blobsByName = new LinkedHashMap<String, BlobMetaData>();
    for (FileStatus entry : listing) {
        blobsByName.put(entry.getPath().getName(),
                new PlainBlobMetaData(entry.getPath().getName(), entry.getLen()));
    }
    return Collections.unmodifiableMap(blobsByName);
}

From source file:org.kitesdk.apps.spi.oozie.ShareLibs.java

License:Apache License

/**
 * Finds the most recent timestamped library directory directly under {@code rootDir}.
 * <p>
 * Based on ShareLibService.getLatestLibPath, which is not available
 * in client libraries.
 * <p>
 * Candidates are entries named {@code lib_<yyyyMMddHHmmss>}; entries whose suffix cannot
 * be parsed with that pattern are ignored.
 *
 * @param fs      file system used to list {@code rootDir}
 * @param rootDir directory to search
 * @return the path of the entry with the latest timestamp, or {@code rootDir} itself when
 *         no entry carries a parsable timestamp later than the epoch
 * @throws IOException if listing {@code rootDir} fails
 */
private static Path getLatestLibPath(FileSystem fs, Path rootDir) throws IOException {
    // Created once per call: the pattern is loop-invariant, and the instance never leaves
    // this method, so SimpleDateFormat's lack of thread-safety is not a concern here.
    final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss");
    Date max = new Date(0L);
    Path path = null;
    PathFilter directoryFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith("lib_");
        }
    };

    FileStatus[] files = fs.listStatus(rootDir, directoryFilter);
    for (FileStatus file : files) {
        String name = file.getPath().getName();
        String time = name.substring("lib_".length());
        Date d;
        try {
            d = timestampFormat.parse(time);
        } catch (ParseException e) {
            // Not a timestamped lib directory: skip it.
            continue;
        }
        if (d.compareTo(max) > 0) {
            path = file.getPath();
            max = d;
        }
    }
    //If there are no timestamped directories, fall back to root directory
    if (path == null) {
        path = rootDir;
    }
    return path;
}

From source file:org.mrgeo.format.DirectoryInputFormat.java

License:Apache License

/**
 * Produces one split per subdirectory of the configured parent directory, so that each
 * subdirectory is processed as a whole by a single mapper.
 * <p>
 * When subdirectory names are configured, only children with exactly those names are
 * considered; otherwise every child of the parent is. Non-directory children are ignored.
 *
 * @param context job context providing the configuration, parent directory and optional
 *                subdirectory names
 * @return the list of directory splits, possibly empty
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    final Path parentDir = getParentDirectory(context);
    final String[] subdirs = getSubdirNames(context);
    final List<InputSplit> result = new ArrayList<InputSplit>();
    final FileSystem fileSystem = parentDir.getFileSystem(context.getConfiguration());

    // Restrict the listing to the requested subdirectories, if any were specified.
    final boolean restrictToSubdirs = subdirs != null && subdirs.length > 0;
    FileStatus[] children;
    if (restrictToSubdirs) {
        PathFilter requestedOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                final String candidate = p.getName();
                for (int i = 0; i < subdirs.length; i++) {
                    if (subdirs[i].equals(candidate)) {
                        return true;
                    }
                }
                return false;
            }
        };
        children = fileSystem.listStatus(parentDir, requestedOnly);
    } else {
        children = fileSystem.listStatus(parentDir);
    }

    for (FileStatus child : children) {
        if (!child.isDir()) {
            continue;
        }
        // Each split is created with an empty String array as its second argument
        // (presumably the preferred host locations — TODO confirm against DirectorySplit).
        result.add(new DirectorySplit(child.getPath(), new String[0]));
    }
    return result;
}