Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match pathPattern and are not checksum files.
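
Below is a minimal, self-contained sketch of a globStatus(Path) call. The Configuration, the pattern /data/logs/2023-* and the null/empty handling are illustrative assumptions, not taken from the examples on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        // Illustrative setup: a default Configuration and whatever filesystem it resolves to.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Expand a glob pattern into the matching FileStatus entries; the pattern is a placeholder.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/2023-*"));

        // globStatus may return null (for example when the pattern's parent directory
        // does not exist), so guard before iterating.
        if (matches == null || matches.length == 0) {
            System.out.println("No matches for pattern");
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}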

Usage

From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java

License:Open Source License

public long getLatestDatabaseTimestamp() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    LOG.info("Scanning for Database Candidates in:" + getTaskIdentityBasePath());

    FileStatus candidates[] = fs.globStatus(new Path(getTaskIdentityBasePath(), "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                candidateTimestamp = timestamp;
            }
        } catch (Exception e) {
            LOG.error("Skipping Path:" + candidate.getPath());
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}

From source file:org.commoncrawl.mapred.segmenter.Segmenter.java

License:Open Source License

public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {

        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ... 
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator:  crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger 
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running  Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}

From source file:org.commoncrawl.service.crawler.CrawlerEngine.java

License:Open Source License

/** internal helper routine to load crawl segment metadata given list id **/
private List<CrawlSegment> populateCrawlSegmentsFromHDFS(int listId) throws IOException {

    ArrayList<CrawlSegment> crawlSegments = new ArrayList<CrawlSegment>();

    LOG.info("Populating CrawlSegment(s) from HDFS for List:" + listId);
    // get root path for crawl segment data for the specified list id 
    Path hdfsSearchPath = CrawlSegmentLog.buildHDFSCrawlSegmentSearchPathForListId(listId,
            _server.getHostName());
    // scan hdfs for relevant path information for crawl segments
    FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();
    LOG.info("Searching for crawl segments with hdfs search path:" + hdfsSearchPath);
    // scan hdfs for matching files ... 
    FileStatus fileStatusArray[] = hdfs.globStatus(hdfsSearchPath);
    LOG.info("Found:" + fileStatusArray.length + " segments at path:" + hdfsSearchPath);

    // now walk matched set 
    for (FileStatus fileStatus : fileStatusArray) {
        // segment id is the parent path name of the matched file
        String segmentName = fileStatus.getPath().getParent().getName();
        int segmentId = Integer.parseInt(segmentName);
        //now populate crawl segment information 
        CrawlSegment crawlSegment = new CrawlSegment();

        crawlSegment.setListId(listId);
        crawlSegment.setSegmentId(segmentId);

        LOG.info("adding crawl segment:" + crawlSegment.getSegmentId() + " for List:" + listId);

        crawlSegments.add(crawlSegment);
    }
    return crawlSegments;
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private List<Path> reloadActiveHistory() throws IOException {
    ArrayList<Path> paths = new ArrayList<Path>();
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // create scan pattern 
    Path hdfsScanPath = new Path(
            CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + _state.getCurrentCrawlNumber() + "/*/"
                    + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

    // scan hdfs for log files
    FileStatus candidates[];

    LOG.info("Scanning For Cadnidates in:" + hdfsScanPath);
    candidates = fs.globStatus(hdfsScanPath);

    // iterate candidates 
    for (FileStatus candidate : candidates) {

        // ok found a candidate we can work on 
        LOG.info("Found Candidate:" + candidate.getPath());
        final URLFPV2 placeHolderFP = new URLFPV2();
        CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                new CrawlSegmentLog.LogFileItemCallback() {

                    @Override
                    public void processItem(long domainHash, long urlFingerprint) {
                        placeHolderFP.setDomainHash(domainHash);
                        placeHolderFP.setUrlHash(urlFingerprint);
                        // add item for bloom filter 
                        _bloomFilter.add(placeHolderFP);
                    }
                });
        LOG.info("Finished Processing Candidate:" + candidate.getPath());

        paths.add(candidate.getPath());
    }

    return paths;
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private void reloadLaggingHistory(int previousCrawlNumber) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // create scan pattern 
    Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + previousCrawlNumber
            + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

    // scan hdfs for log files
    FileStatus candidates[];

    LOG.info("Scanning For Cadnidates in:" + hdfsScanPath);
    candidates = fs.globStatus(hdfsScanPath);

    // iterate candidates 
    for (FileStatus candidate : candidates) {

        // ok found a candidate we can work on 
        LOG.info("Found Candidate:" + candidate.getPath());
        final URLFPV2 placeHolderFP = new URLFPV2();
        CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                new CrawlSegmentLog.LogFileItemCallback() {

                    @Override
                    public void processItem(long domainHash, long urlFingerprint) {
                        placeHolderFP.setDomainHash(domainHash);
                        placeHolderFP.setUrlHash(urlFingerprint);
                        // add item for bloom filter 
                        _bloomFilter.add(placeHolderFP);
                    }
                });
        LOG.info("Finished Processing Candidate:" + candidate.getPath());
    }
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private void moveToTransitioningState(final AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext) {

    // start a thread and wait for the checkpoint thread to purge all log files ... 
    new Thread(new Runnable() {

        @Override
        public void run() {
            // create scan pattern 
            Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                    + _state.getCurrentCrawlNumber() + "/*/"
                    + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

            LOG.info("Scanning Log Directory at Path:" + hdfsScanPath + " for Log Files");
            // scan hdfs for log files
            while (true) {
                try {

                    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

                    if (fs.globStatus(hdfsScanPath).length != 0) {
                        LOG.info(
                                "Waiting for CheckpointThread to Purge All Existing Log Files for Crawl Number:"
                                        + _state.getCurrentCrawlNumber());
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {

                        }
                    } else {
                        break;
                    }
                } catch (IOException e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                }
            }

            LOG.info("Acquiring Checkpoint Thread Semaphore");
            _checkpointThreadSemaphore.acquireUninterruptibly();
            LOG.info("Acquired Checkpoint Thread Semaphore - Scheduling Async Callback");
            // ok now we can safely reset state, shift back to async thread ... 
            getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() {

                @Override
                public void timerFired(Timer timer) {
                    try {
                        LOG.info("Updating State to Transitioning");
                        // set server to appropriate state 
                        _state.setCurrentCheckpointState(CrawlHistoryStatus.CheckpointState.TRANSITIONING);
                        LOG.info("Serializing Database State");
                        updateState();
                        rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber());
                        rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState());

                    } catch (IOException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                        rpcContext.setStatus(Status.Error_RequestFailed);
                        rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
                    }

                    // complete the request ... 
                    try {
                        rpcContext.completeRequest();
                    } catch (RPCException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                    } finally {
                        _checkpointThreadSemaphore.release();
                    }
                }
            }));

        }

    }).start();

}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private void startCheckpointThread(final FileSystem fs) {

    _checkpointThread = new Thread(new Runnable() {

        @Override
        public void run() {

            // ok, the checkpoint thread runs in perpetuity
            while (!_shutdownFlag) {

                if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1
                        || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL
                        || (System.currentTimeMillis()
                                - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) {

                    //LOG.info("Checkpoint Thread Grabbing Semaphore");
                    // grab checkpoint thread semaphore 
                    _checkpointThreadSemaphore.acquireUninterruptibly();
                    //LOG.info("Checkpoint Thread Grabbed Semaphore");

                    try {
                        // create scan pattern 
                        Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                                + _state.getCurrentCrawlNumber() + "/*/"
                                + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

                        // scan hdfs for log files
                        FileStatus candidates[];
                        try {
                            LOG.info("Checkpoint Thread Scanning For Cadnidates in:" + hdfsScanPath);
                            candidates = fs.globStatus(hdfsScanPath);

                            // iterate candidates 
                            for (FileStatus candidate : candidates) {

                                // check candidate against processed path list ... 
                                if (!_processedPaths.contains(candidate.getPath())) {
                                    int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
                                    // ok found a candidate we can work on 
                                    LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
                                    final URLFPV2 placeHolderFP = new URLFPV2();
                                    CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                                            new CrawlSegmentLog.LogFileItemCallback() {

                                                @Override
                                                public void processItem(long domainHash, long urlFingerprint) {
                                                    placeHolderFP.setDomainHash(domainHash);
                                                    placeHolderFP.setUrlHash(urlFingerprint);
                                                    // add item for bloom filter 
                                                    _bloomFilter.add(placeHolderFP);
                                                    // increment urls processed count ...
                                                    _urlsProcessedSinceCheckpoint.addAndGet(1);
                                                }
                                            });
                                    _processedPaths.add(candidate.getPath());
                                    LOG.info("Finished Processing Candidate:" + candidate.getPath());
                                }
                            }

                            // update scan time ... 
                            _lastCheckpointScanTime = System.currentTimeMillis();

                            // see if can do a full checkpoint ... 
                            if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis()
                                    - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) {

                                int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
                                // ok at this point we are ready to initialize a checkpoint
                                if (approximateItemsToFlush != 0) {

                                    Path checkpointMutexPath = getCheckpointMutexPath();

                                    if (fs.createNewFile(checkpointMutexPath)) {
                                        try {
                                            LOG.info("Checkpoint Thread Starting Checkpoint");

                                            // get the checkpoint path ... 
                                            Path checkpointPath = getDataFileCheckpointPath();
                                            Path finalPath = getDataFileFinalPath();

                                            LOG.info("Checkpoint Thread Writing BloomFilter Data");
                                            // serialize the filter ... 
                                            serializeBloomFilter(checkpointPath);

                                            LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
                                            // ok now everything seems to have gone fine ... delete existing data file 
                                            fs.delete(finalPath);
                                            LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
                                            // rename checkpoint to final ... 
                                            fs.rename(checkpointPath, finalPath);

                                            if (_state
                                                    .getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
                                                LOG.info("Checkpoint Thread Deleting Processed Files");
                                                // ok safely delete all processed files
                                                for (Path processedFilePath : _processedPaths) {
                                                    fs.delete(processedFilePath);
                                                }
                                                _processedPaths.clear();
                                            } else {
                                                LOG.info(
                                                        "Skipping Processed Files Purge because we are in Transitioning State");
                                            }
                                            _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
                                        } finally {
                                            LOG.info(
                                                    "Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
                                            fs.delete(checkpointMutexPath, false);
                                        }
                                    } else {
                                        int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY);
                                        LOG.info("Checkpoint thread failed to acquire Mutex:"
                                                + checkpointMutexPath + " Waiting " + delay
                                                + "(MS) before retry");
                                        try {
                                            Thread.sleep(delay);
                                        } catch (InterruptedException e) {
                                        }
                                    }
                                }
                                // update last checkpoint flush time no matter what ...
                                _lastCheckpointFlushTime = System.currentTimeMillis();
                            }

                        } catch (IOException e) {
                            LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            try {
                                Thread.sleep(60000);
                            } catch (InterruptedException e1) {
                            }
                        }
                    } finally {
                        LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
                        _checkpointThreadSemaphore.release();
                    }
                } else {
                    try {
                        //LOG.info("Checkpoint Thread IDLE");
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                    }
                }
            }

        }

    });
    _checkpointThread.start();
}

From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License:Open Source License

private void loadExistingLists() throws IOException {
    // scan data directory for list id pattern
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    FileStatus loadTargets[] = localFileSystem
            .globStatus(new Path(_localLogFileDir.getAbsolutePath(), CrawlList.LIST_URL_DATA_PREFIX + "*"));

    // sort list so that we load newer lists first ...
    Arrays.sort(loadTargets, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus o1, FileStatus o2) {
            return ((Long) o2.getModificationTime()).compareTo(o1.getModificationTime());
        }

    });

    for (FileStatus loadTarget : loadTargets) {
        // extract timestamp ...
        long listId = Long
                .parseLong(loadTarget.getPath().getName().substring(CrawlList.LIST_URL_DATA_PREFIX.length()));
        LOG.info("Found List Data for List:" + listId);
        // validate
        if (CrawlList.allFilesPresent(_localLogFileDir, listId)) {
            LOG.info("List looks valid. Loading");
            try {
                CrawlList list = new CrawlList(this, listId);
                synchronized (_crawlLists) {
                    CrawlList oldList = _crawlLists.get(listId);
                    if (oldList != null) {
                        list.setEventListener(oldList.getEventListener());
                    }
                    _crawlLists.put(listId, list);
                }
                LOG.info("Loaded List:" + listId + " Scheduling for Queueing");
                _queueLoaderQueue.add(new QueueItem<CrawlList>(list));
            } catch (IOException e) {
                LOG.error(
                        "Failed to load list:" + listId + " Exception:" + CCStringUtils.stringifyException(e));
                synchronized (_crawlLists) {
                    _crawlLists.put(listId, CrawlList.createListWithLoadErrorState(this, listId, e));
                }
            }
        }
    }
}
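
The example above runs globStatus against the local filesystem obtained via FileSystem.getLocal rather than HDFS. A minimal sketch of that variant follows; the directory /tmp/lists and the listData- prefix are illustrative placeholders.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LocalGlobExample {
    public static void main(String[] args) throws IOException {
        // globStatus works the same way against the local filesystem.
        FileSystem localFs = FileSystem.getLocal(new Configuration());

        // Match files under an illustrative directory whose names share a common prefix.
        FileStatus[] targets = localFs.globStatus(new Path("/tmp/lists", "listData-*"));
        if (targets != null) {
            for (FileStatus target : targets) {
                System.out.println(target.getPath() + " modified at " + target.getModificationTime());
            }
        }
    }
}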

From source file:org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java

License:Open Source License

static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileSystem localFS = FileSystem.getLocal(conf);

    final Multimap<Long, Range> rangeMap = TreeMultimap.create();
    FileStatus candidateDirs[] = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*"));

    for (FileStatus candidate : candidateDirs) {
        String fileName = candidate.getPath().getName();
        // get scaled timestamp start 
        long timestampStart = Long.parseLong(fileName) * 1000000000;
        // ok see if exceeds our cutoff time 
        if (timestampStart < cutOffTimeMillisecond) {
            FileStatus ranges[] = fs.globStatus(new Path(candidate.getPath(), "*"));
            for (FileStatus range : ranges) {
                String rangeName = range.getPath().getName();
                long rangeStart = Long.parseLong(rangeName.substring(0, rangeName.indexOf("-")));
                long rangeEnd = Long.parseLong(rangeName.substring(rangeName.indexOf("-") + 1));

                rangeMap.put(Long.parseLong(fileName), new Range(rangeStart, rangeEnd));
            }
        }
    }

    PathFilter cacheDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("cacheData-") || path.getName().startsWith("cacheIndex-")) {
                long timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                long timestampPrefix = timestamp / 1000000000L;
                //System.out.println("timestamp:" + timestamp + " prefix:" + timestampPrefix);
                for (Range range : rangeMap.get(timestampPrefix)) {
                    if (timestamp >= range.e0 && timestamp <= range.e1) {
                        return true;
                    }
                }
            }
            return false;
        }
    };

    PathFilter historyDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("historyData-") || path.getName().startsWith("historyBloomFilter-")) {
                int indexOfDot = path.getName().indexOf(".");
                long timestamp = -1L;
                if (indexOfDot != -1) {
                    timestamp = Long
                            .parseLong(path.getName().substring(path.getName().indexOf("-") + 1, indexOfDot));
                } else {
                    timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                }

                if (timestamp < cutOffTimeMillisecond) {
                    return true;
                }
            }
            return false;
        }
    };

    FileStatus purgeCandidates[] = fs.globStatus(new Path("crawl/proxy/cache/*"), cacheDataFilter);

    for (FileStatus candidate : purgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath());
    }

    FileStatus localcacheDataPurgeCandidates[] = localFS
            .globStatus(new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/*"), cacheDataFilter);

    for (FileStatus candidate : localcacheDataPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath());
    }

    // now delete bloom filter data
    FileStatus historyPurgeCandidates[] = fs.globStatus(new Path("crawl/proxy/history/*"), historyDataFilter);

    for (FileStatus candidate : historyPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath(), true);
    }

    // now delete bloom filter data
    FileStatus localHistoryPurgeCandidates[] = localFS.globStatus(
            new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*"), historyDataFilter);

    for (FileStatus candidate : localHistoryPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath(), true);
    }

}
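
The ProxyPurgeUtils example above also uses the two-argument overload, globStatus(Path pathPattern, PathFilter filter), which applies the filter to the expanded matches before returning them. Below is a minimal sketch of that overload; the crawl/proxy/cache/* pattern mirrors the example, while the cacheData- prefix check and the surrounding setup are illustrative assumptions.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobWithFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Keep only entries whose names start with "cacheData-"; the prefix is illustrative.
        PathFilter dataFilesOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("cacheData-");
            }
        };

        // The two-argument overload applies the filter to the glob matches.
        FileStatus[] filtered = fs.globStatus(new Path("crawl/proxy/cache/*"), dataFilesOnly);
        if (filtered != null) {
            for (FileStatus status : filtered) {
                System.out.println("Matched:" + status.getPath());
            }
        }
    }
}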

From source file:org.commoncrawl.service.pagerank.master.PageRankMaster.java

License:Open Source License

private void clearAllCheckpointAndDistributionFiles(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate 
    Path checkpointSearchPattern = new Path(jobDataPath, "*-CheckpointComplete-*");
    Path distroSearchPattern = new Path(jobDataPath, "OutlinkPR-*");

    FileStatus checkpointCandidates[] = fs.globStatus(checkpointSearchPattern);
    for (FileStatus candidate : checkpointCandidates) {
        LOG.info("Deleting:" + candidate.getPath());
        fs.delete(candidate.getPath(), false);
    }
    FileStatus distroCandidates[] = fs.globStatus(distroSearchPattern);
    for (FileStatus candidate : distroCandidates) {
        LOG.info("Deleting:" + candidate.getPath());
        fs.delete(candidate.getPath(), false);
    }

}