List of usage examples for org.apache.hadoop.fs.FileSystem.globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
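All of the examples below follow the same basic pattern: build a Path containing a glob, call globStatus, and iterate the returned FileStatus array. A minimal, self-contained sketch of that pattern (the "/data/logs/part-*" pattern is a hypothetical placeholder, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // hypothetical glob pattern, purely for illustration
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/part-*"));
        // globStatus returns an empty array when a glob matches nothing, and
        // null when a non-glob path does not exist, so guard before iterating
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " len:" + status.getLen());
            }
        }
        fs.close();
    }
}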
From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java
License:Open Source License
public long getLatestDatabaseTimestamp() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    LOG.info("Scanning for Database Candidates in:" + getTaskIdentityBasePath());
    FileStatus candidates[] = fs.globStatus(new Path(getTaskIdentityBasePath(), "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                candidateTimestamp = timestamp;
            }
        } catch (Exception e) {
            LOG.error("Skipping Path:" + candidate.getPath());
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);
        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.commoncrawl.service.crawler.CrawlerEngine.java
License:Open Source License
/** internal helper routine to load crawl segment metadata given list id **/
private List<CrawlSegment> populateCrawlSegmentsFromHDFS(int listId) throws IOException {
    ArrayList<CrawlSegment> crawlSegments = new ArrayList<CrawlSegment>();

    LOG.info("Populating CrawlSegment(s) from HDFS for List:" + listId);
    // get root path for crawl segment data for the specified list id
    Path hdfsSearchPath = CrawlSegmentLog.buildHDFSCrawlSegmentSearchPathForListId(listId,
            _server.getHostName());
    // scan hdfs for relevant path information for crawl segments
    FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

    LOG.info("Searching for crawl segments with hdfs search path:" + hdfsSearchPath);
    // scan hdfs for matching files ...
    FileStatus fileStatusArray[] = hdfs.globStatus(hdfsSearchPath);
    LOG.info("Found:" + fileStatusArray.length + " segments at path:" + hdfsSearchPath);

    // now walk matched set
    for (FileStatus fileStatus : fileStatusArray) {
        // segment id is the parent path name of the matched file
        String segmentName = fileStatus.getPath().getParent().getName();
        int segmentId = Integer.parseInt(segmentName);

        // now populate crawl segment information
        CrawlSegment crawlSegment = new CrawlSegment();

        crawlSegment.setListId(listId);
        crawlSegment.setSegmentId(segmentId);

        LOG.info("adding crawl segment:" + crawlSegment.getSegmentId() + " for List:" + listId);
        crawlSegments.add(crawlSegment);
    }
    return crawlSegments;
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private List<Path> reloadActiveHistory() throws IOException {
    ArrayList<Path> paths = new ArrayList<Path>();

    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // create scan pattern
    Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
            + _state.getCurrentCrawlNumber() + "/*/"
            + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

    // scan hdfs for log files
    FileStatus candidates[];

    LOG.info("Scanning For Candidates in:" + hdfsScanPath);
    candidates = fs.globStatus(hdfsScanPath);

    // iterate candidates
    for (FileStatus candidate : candidates) {
        // ok found a candidate we can work on
        LOG.info("Found Candidate:" + candidate.getPath());
        final URLFPV2 placeHolderFP = new URLFPV2();
        CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                new CrawlSegmentLog.LogFileItemCallback() {

                    @Override
                    public void processItem(long domainHash, long urlFingerprint) {
                        placeHolderFP.setDomainHash(domainHash);
                        placeHolderFP.setUrlHash(urlFingerprint);
                        // add item for bloom filter
                        _bloomFilter.add(placeHolderFP);
                    }
                });
        LOG.info("Finished Processing Candidate:" + candidate.getPath());

        paths.add(candidate.getPath());
    }
    return paths;
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void reloadLaggingHistory(int previousCrawlNumber) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // create scan pattern
    Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + previousCrawlNumber
            + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

    // scan hdfs for log files
    FileStatus candidates[];

    LOG.info("Scanning For Candidates in:" + hdfsScanPath);
    candidates = fs.globStatus(hdfsScanPath);

    // iterate candidates
    for (FileStatus candidate : candidates) {
        // ok found a candidate we can work on
        LOG.info("Found Candidate:" + candidate.getPath());
        final URLFPV2 placeHolderFP = new URLFPV2();
        CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                new CrawlSegmentLog.LogFileItemCallback() {

                    @Override
                    public void processItem(long domainHash, long urlFingerprint) {
                        placeHolderFP.setDomainHash(domainHash);
                        placeHolderFP.setUrlHash(urlFingerprint);
                        // add item for bloom filter
                        _bloomFilter.add(placeHolderFP);
                    }
                });
        LOG.info("Finished Processing Candidate:" + candidate.getPath());
    }
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void moveToTransitioningState(final AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext) {

    // start a thread and wait for checkpoint thread to purge all log files ...
    new Thread(new Runnable() {

        @Override
        public void run() {
            // create scan pattern
            Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                    + _state.getCurrentCrawlNumber() + "/*/"
                    + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

            LOG.info("Scanning Log Directory at Path:" + hdfsScanPath + " for Log Files");

            // scan hdfs for log files
            while (true) {
                try {
                    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
                    if (fs.globStatus(hdfsScanPath).length != 0) {
                        LOG.info("Waiting for CheckpointThread to Purge All Existing Log Files for Crawl Number:"
                                + _state.getCurrentCrawlNumber());
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                        }
                    } else {
                        break;
                    }
                } catch (IOException e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                }
            }

            LOG.info("Acquiring Checkpoint Thread Semaphore");
            _checkpointThreadSemaphore.acquireUninterruptibly();
            LOG.info("Acquired Checkpoint Thread Semaphore - Scheduling Async Callback");

            // ok now we can safely reset state, shift back to async thread ...
            getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() {

                @Override
                public void timerFired(Timer timer) {
                    try {
                        LOG.info("Updating State to Transitioning");
                        // set server to appropriate state
                        _state.setCurrentCheckpointState(CrawlHistoryStatus.CheckpointState.TRANSITIONING);
                        LOG.info("Serializing Database State");
                        updateState();
                        rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber());
                        rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState());
                    } catch (IOException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                        rpcContext.setStatus(Status.Error_RequestFailed);
                        rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
                    }
                    // complete the request ...
                    try {
                        rpcContext.completeRequest();
                    } catch (RPCException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                    } finally {
                        _checkpointThreadSemaphore.release();
                    }
                }
            }));
        }
    }).start();
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void startCheckpointThread(final FileSystem fs) {

    _checkpointThread = new Thread(new Runnable() {

        @Override
        public void run() {

            // ok checkpoint thread runs in perpetuity
            while (!_shutdownFlag) {

                if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1
                        || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL
                        || (System.currentTimeMillis() - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) {

                    // grab checkpoint thread semaphore
                    _checkpointThreadSemaphore.acquireUninterruptibly();

                    try {
                        // create scan pattern
                        Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                                + _state.getCurrentCrawlNumber() + "/*/"
                                + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

                        // scan hdfs for log files
                        FileStatus candidates[];

                        try {
                            LOG.info("Checkpoint Thread Scanning For Candidates in:" + hdfsScanPath);
                            candidates = fs.globStatus(hdfsScanPath);

                            // iterate candidates
                            for (FileStatus candidate : candidates) {
                                // check candidate against processed path list ...
                                if (!_processedPaths.contains(candidate.getPath())) {
                                    int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
                                    // ok found a candidate we can work on
                                    LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
                                    final URLFPV2 placeHolderFP = new URLFPV2();
                                    CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                                            new CrawlSegmentLog.LogFileItemCallback() {

                                                @Override
                                                public void processItem(long domainHash, long urlFingerprint) {
                                                    placeHolderFP.setDomainHash(domainHash);
                                                    placeHolderFP.setUrlHash(urlFingerprint);
                                                    // add item for bloom filter
                                                    _bloomFilter.add(placeHolderFP);
                                                    // increment urls processed count ...
                                                    _urlsProcessedSinceCheckpoint.addAndGet(1);
                                                }
                                            });
                                    _processedPaths.add(candidate.getPath());
                                    LOG.info("Finished Processing Candidate:" + candidate.getPath());
                                }
                            }

                            // update scan time ...
                            _lastCheckpointScanTime = System.currentTimeMillis();

                            // see if we can do a full checkpoint ...
                            if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis()
                                    - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) {

                                int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
                                // ok at this point we are ready to initialize a checkpoint
                                if (approximateItemsToFlush != 0) {

                                    Path checkpointMutexPath = getCheckpointMutexPath();

                                    if (fs.createNewFile(checkpointMutexPath)) {
                                        try {
                                            LOG.info("Checkpoint Thread Starting Checkpoint");

                                            // get the checkpoint path ...
                                            Path checkpointPath = getDataFileCheckpointPath();
                                            Path finalPath = getDataFileFinalPath();

                                            LOG.info("Checkpoint Thread Writing BloomFilter Data");
                                            // serialize the filter ...
                                            serializeBloomFilter(checkpointPath);

                                            LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
                                            // ok now everything seems to have gone fine ... delete existing data file
                                            fs.delete(finalPath);

                                            LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
                                            // rename checkpoint to final ...
                                            fs.rename(checkpointPath, finalPath);

                                            if (_state.getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
                                                LOG.info("Checkpoint Thread Deleting Processed Files");
                                                // ok safely delete all processed files
                                                for (Path processedFilePath : _processedPaths) {
                                                    fs.delete(processedFilePath);
                                                }
                                                _processedPaths.clear();
                                            } else {
                                                LOG.info("Skipping Processed Files Purge because we are in Transitioning State");
                                            }

                                            _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
                                        } finally {
                                            LOG.info("Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
                                            fs.delete(checkpointMutexPath, false);
                                        }
                                    } else {
                                        int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY);
                                        LOG.info("Checkpoint thread failed to acquire Mutex:" + checkpointMutexPath
                                                + " Waiting " + delay + "(MS) before retry");
                                        try {
                                            Thread.sleep(delay);
                                        } catch (InterruptedException e) {
                                        }
                                    }
                                }
                                // update last checkpoint flush time no matter what ...
                                _lastCheckpointFlushTime = System.currentTimeMillis();
                            }
                        } catch (IOException e) {
                            LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            try {
                                Thread.sleep(60000);
                            } catch (InterruptedException e1) {
                            }
                        }
                    } finally {
                        LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
                        _checkpointThreadSemaphore.release();
                    }
                } else {
                    try {
                        // LOG.info("Checkpoint Thread IDLE");
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                    }
                }
            }
        }
    });
    _checkpointThread.start();
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void loadExistingLists() throws IOException {
    // scan data directory for list id pattern
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    FileStatus loadTargets[] = localFileSystem
            .globStatus(new Path(_localLogFileDir.getAbsolutePath(), CrawlList.LIST_URL_DATA_PREFIX + "*"));

    // sort list so that we load newer lists first ...
    Arrays.sort(loadTargets, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus o1, FileStatus o2) {
            return ((Long) o2.getModificationTime()).compareTo(o1.getModificationTime());
        }
    });

    for (FileStatus loadTarget : loadTargets) {
        // extract timestamp ...
        long listId = Long
                .parseLong(loadTarget.getPath().getName().substring(CrawlList.LIST_URL_DATA_PREFIX.length()));
        LOG.info("Found List Data for List:" + listId);
        // validate
        if (CrawlList.allFilesPresent(_localLogFileDir, listId)) {
            LOG.info("List looks valid. Loading");

            try {
                CrawlList list = new CrawlList(this, listId);

                synchronized (_crawlLists) {
                    CrawlList oldList = _crawlLists.get(listId);
                    if (oldList != null) {
                        list.setEventListener(oldList.getEventListener());
                    }
                    _crawlLists.put(listId, list);
                }
                LOG.info("Loaded List:" + listId + " Scheduling for Queueing");
                _queueLoaderQueue.add(new QueueItem<CrawlList>(list));
            } catch (IOException e) {
                LOG.error("Failed to load list:" + listId + " Exception:"
                        + CCStringUtils.stringifyException(e));
                synchronized (_crawlLists) {
                    _crawlLists.put(listId, CrawlList.createListWithLoadErrorState(this, listId, e));
                }
            }
        }
    }
}
From source file:org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java
License:Open Source License
static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException {

    FileSystem fs = FileSystem.get(conf);
    FileSystem localFS = FileSystem.getLocal(conf);

    final Multimap<Long, Range> rangeMap = TreeMultimap.create();

    FileStatus candidateDirs[] = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*"));

    for (FileStatus candidate : candidateDirs) {
        String fileName = candidate.getPath().getName();
        // get scaled timestamp start
        long timestampStart = Long.parseLong(fileName) * 1000000000;
        // ok see if it exceeds our cutoff time
        if (timestampStart < cutOffTimeMillisecond) {
            FileStatus ranges[] = fs.globStatus(new Path(candidate.getPath(), "*"));
            for (FileStatus range : ranges) {
                String rangeName = range.getPath().getName();
                long rangeStart = Long.parseLong(rangeName.substring(0, rangeName.indexOf("-")));
                long rangeEnd = Long.parseLong(rangeName.substring(rangeName.indexOf("-") + 1));
                rangeMap.put(Long.parseLong(fileName), new Range(rangeStart, rangeEnd));
            }
        }
    }

    PathFilter cacheDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("cacheData-") || path.getName().startsWith("cacheIndex-")) {
                long timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                long timestampPrefix = timestamp / 1000000000L;
                // System.out.println("timestamp:" + timestamp + " prefix:" + timestampPrefix);
                for (Range range : rangeMap.get(timestampPrefix)) {
                    if (timestamp >= range.e0 && timestamp <= range.e1) {
                        return true;
                    }
                }
            }
            return false;
        }
    };

    PathFilter historyDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("historyData-") || path.getName().startsWith("historyBloomFilter-")) {
                int indexOfDot = path.getName().indexOf(".");
                long timestamp = -1L;
                if (indexOfDot != -1) {
                    timestamp = Long
                            .parseLong(path.getName().substring(path.getName().indexOf("-") + 1, indexOfDot));
                } else {
                    timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                }
                if (timestamp < cutOffTimeMillisecond) {
                    return true;
                }
            }
            return false;
        }
    };

    FileStatus purgeCandidates[] = fs.globStatus(new Path("crawl/proxy/cache/*"), cacheDataFilter);

    for (FileStatus candidate : purgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath());
    }

    FileStatus localcacheDataPurgeCandidates[] = localFS
            .globStatus(new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/*"), cacheDataFilter);

    for (FileStatus candidate : localcacheDataPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath());
    }

    // now delete bloom filter data
    FileStatus historyPurgeCandidates[] = fs.globStatus(new Path("crawl/proxy/history/*"), historyDataFilter);

    for (FileStatus candidate : historyPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath(), true);
    }

    // now delete local bloom filter data
    FileStatus localHistoryPurgeCandidates[] = localFS.globStatus(
            new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*"), historyDataFilter);

    for (FileStatus candidate : localHistoryPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath(), true);
    }
}
From source file:org.commoncrawl.service.pagerank.master.PageRankMaster.java
License:Open Source License
private void clearAllCheckpointAndDistributionFiles(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate
    Path checkpointSearchPattern = new Path(jobDataPath, "*-CheckpointComplete-*");
    Path distroSearchPattern = new Path(jobDataPath, "OutlinkPR-*");

    FileStatus checkpointCandidates[] = fs.globStatus(checkpointSearchPattern);

    for (FileStatus candidate : checkpointCandidates) {
        LOG.info("Deleting:" + candidate.getPath());
        fs.delete(candidate.getPath(), false);
    }

    FileStatus distroCandidates[] = fs.globStatus(distroSearchPattern);

    for (FileStatus candidate : distroCandidates) {
        LOG.info("Deleting:" + candidate.getPath());
        fs.delete(candidate.getPath(), false);
    }
}