List of usage examples for org.apache.hadoop.fs.FileSystem.globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
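All of the examples below follow the same basic pattern: build a Path containing a glob, call globStatus, and iterate the returned FileStatus array. A minimal, self-contained sketch of that pattern (the "/data/logs/part-*" pattern is a hypothetical placeholder, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // hypothetical glob pattern, purely for illustration
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/part-*"));
        // globStatus returns an empty array when a glob matches nothing, and
        // null when a non-glob path does not exist, so guard before iterating
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " len:" + status.getLen());
            }
        }
        fs.close();
    }
}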
From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java
License:Open Source License
public long getLatestDatabaseTimestamp() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    LOG.info("Scanning for Database Candidates in:" + getTaskIdentityBasePath());
    FileStatus candidates[] = fs.globStatus(new Path(getTaskIdentityBasePath(), "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                candidateTimestamp = timestamp;
            }
        } catch (Exception e) {
            LOG.error("Skipping Path:" + candidate.getPath());
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);
        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.commoncrawl.service.crawler.CrawlerEngine.java
License:Open Source License
/** internal helper routine to load crawl segment metadata given list id **/
private List<CrawlSegment> populateCrawlSegmentsFromHDFS(int listId) throws IOException {
    ArrayList<CrawlSegment> crawlSegments = new ArrayList<CrawlSegment>();

    LOG.info("Populating CrawlSegment(s) from HDFS for List:" + listId);
    // get root path for crawl segment data for the specified list id
    Path hdfsSearchPath = CrawlSegmentLog.buildHDFSCrawlSegmentSearchPathForListId(listId,
            _server.getHostName());
    // scan hdfs for relevant path information for crawl segments
    FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

    LOG.info("Searching for crawl segments with hdfs search path:" + hdfsSearchPath);
    // scan hdfs for matching files ...
    FileStatus fileStatusArray[] = hdfs.globStatus(hdfsSearchPath);
    LOG.info("Found:" + fileStatusArray.length + " segments at path:" + hdfsSearchPath);

    // now walk matched set
    for (FileStatus fileStatus : fileStatusArray) {
        // segment id is the parent path name of the matched file
        String segmentName = fileStatus.getPath().getParent().getName();
        int segmentId = Integer.parseInt(segmentName);

        // now populate crawl segment information
        CrawlSegment crawlSegment = new CrawlSegment();

        crawlSegment.setListId(listId);
        crawlSegment.setSegmentId(segmentId);

        LOG.info("adding crawl segment:" + crawlSegment.getSegmentId() + " for List:" + listId);
        crawlSegments.add(crawlSegment);
    }
    return crawlSegments;
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private List<Path> reloadActiveHistory() throws IOException {
    ArrayList<Path> paths = new ArrayList<Path>();

    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // create scan pattern
    Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
            + _state.getCurrentCrawlNumber() + "/*/"
            + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

    // scan hdfs for log files
    FileStatus candidates[];

    LOG.info("Scanning For Candidates in:" + hdfsScanPath);
    candidates = fs.globStatus(hdfsScanPath);

    // iterate candidates
    for (FileStatus candidate : candidates) {
        // ok found a candidate we can work on
        LOG.info("Found Candidate:" + candidate.getPath());
        final URLFPV2 placeHolderFP = new URLFPV2();
        CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                new CrawlSegmentLog.LogFileItemCallback() {

                    @Override
                    public void processItem(long domainHash, long urlFingerprint) {
                        placeHolderFP.setDomainHash(domainHash);
                        placeHolderFP.setUrlHash(urlFingerprint);
                        // add item for bloom filter
                        _bloomFilter.add(placeHolderFP);
                    }
                });
        LOG.info("Finished Processing Candidate:" + candidate.getPath());

        paths.add(candidate.getPath());
    }
    return paths;
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void reloadLaggingHistory(int previousCrawlNumber) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // create scan pattern
    Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + previousCrawlNumber
            + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

    // scan hdfs for log files
    FileStatus candidates[];

    LOG.info("Scanning For Candidates in:" + hdfsScanPath);
    candidates = fs.globStatus(hdfsScanPath);

    // iterate candidates
    for (FileStatus candidate : candidates) {
        // ok found a candidate we can work on
        LOG.info("Found Candidate:" + candidate.getPath());
        final URLFPV2 placeHolderFP = new URLFPV2();
        CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                new CrawlSegmentLog.LogFileItemCallback() {

                    @Override
                    public void processItem(long domainHash, long urlFingerprint) {
                        placeHolderFP.setDomainHash(domainHash);
                        placeHolderFP.setUrlHash(urlFingerprint);
                        // add item for bloom filter
                        _bloomFilter.add(placeHolderFP);
                    }
                });
        LOG.info("Finished Processing Candidate:" + candidate.getPath());
    }
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void moveToTransitioningState(final AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext) {

    // start a thread and wait for checkpoint thread to purge all log files ...
    new Thread(new Runnable() {

        @Override
        public void run() {
            // create scan pattern
            Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                    + _state.getCurrentCrawlNumber() + "/*/"
                    + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

            LOG.info("Scanning Log Directory at Path:" + hdfsScanPath + " for Log Files");

            // scan hdfs for log files
            while (true) {
                try {
                    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
                    if (fs.globStatus(hdfsScanPath).length != 0) {
                        LOG.info("Waiting for CheckpointThread to Purge All Existing Log Files for Crawl Number:"
                                + _state.getCurrentCrawlNumber());
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                        }
                    } else {
                        break;
                    }
                } catch (IOException e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                }
            }

            LOG.info("Acquiring Checkpoint Thread Semaphore");
            _checkpointThreadSemaphore.acquireUninterruptibly();
            LOG.info("Acquired Checkpoint Thread Semaphore - Scheduling Async Callback");

            // ok now we can safely reset state, shift back to async thread ...
            getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() {

                @Override
                public void timerFired(Timer timer) {
                    try {
                        LOG.info("Updating State to Transitioning");
                        // set server to appropriate state
                        _state.setCurrentCheckpointState(CrawlHistoryStatus.CheckpointState.TRANSITIONING);
                        LOG.info("Serializing Database State");
                        updateState();
                        rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber());
                        rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState());
                    } catch (IOException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                        rpcContext.setStatus(Status.Error_RequestFailed);
                        rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
                    }
                    // complete the request ...
                    try {
                        rpcContext.completeRequest();
                    } catch (RPCException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                    } finally {
                        _checkpointThreadSemaphore.release();
                    }
                }
            }));
        }
    }).start();
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void startCheckpointThread(final FileSystem fs) {

    _checkpointThread = new Thread(new Runnable() {

        @Override
        public void run() {

            // ok checkpoint thread runs in perpetuity
            while (!_shutdownFlag) {

                if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1
                        || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL
                        || (System.currentTimeMillis() - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) {

                    // grab checkpoint thread semaphore
                    _checkpointThreadSemaphore.acquireUninterruptibly();

                    try {
                        // create scan pattern
                        Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                                + _state.getCurrentCrawlNumber() + "/*/"
                                + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

                        // scan hdfs for log files
                        FileStatus candidates[];

                        try {
                            LOG.info("Checkpoint Thread Scanning For Candidates in:" + hdfsScanPath);
                            candidates = fs.globStatus(hdfsScanPath);

                            // iterate candidates
                            for (FileStatus candidate : candidates) {
                                // check candidate against processed path list ...
                                if (!_processedPaths.contains(candidate.getPath())) {
                                    int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
                                    // ok found a candidate we can work on
                                    LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
                                    final URLFPV2 placeHolderFP = new URLFPV2();
                                    CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                                            new CrawlSegmentLog.LogFileItemCallback() {

                                                @Override
                                                public void processItem(long domainHash, long urlFingerprint) {
                                                    placeHolderFP.setDomainHash(domainHash);
                                                    placeHolderFP.setUrlHash(urlFingerprint);
                                                    // add item for bloom filter
                                                    _bloomFilter.add(placeHolderFP);
                                                    // increment urls processed count ...
                                                    _urlsProcessedSinceCheckpoint.addAndGet(1);
                                                }
                                            });
                                    _processedPaths.add(candidate.getPath());
                                    LOG.info("Finished Processing Candidate:" + candidate.getPath());
                                }
                            }

                            // update scan time ...
                            _lastCheckpointScanTime = System.currentTimeMillis();

                            // see if we can do a full checkpoint ...
                            if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis()
                                    - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) {

                                int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
                                // ok at this point we are ready to initialize a checkpoint
                                if (approximateItemsToFlush != 0) {

                                    Path checkpointMutexPath = getCheckpointMutexPath();

                                    if (fs.createNewFile(checkpointMutexPath)) {
                                        try {
                                            LOG.info("Checkpoint Thread Starting Checkpoint");

                                            // get the checkpoint path ...
                                            Path checkpointPath = getDataFileCheckpointPath();
                                            Path finalPath = getDataFileFinalPath();

                                            LOG.info("Checkpoint Thread Writing BloomFilter Data");
                                            // serialize the filter ...
                                            serializeBloomFilter(checkpointPath);

                                            LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
                                            // ok now everything seems to have gone fine ... delete existing data file
                                            fs.delete(finalPath);

                                            LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
                                            // rename checkpoint to final ...
                                            fs.rename(checkpointPath, finalPath);

                                            if (_state.getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
                                                LOG.info("Checkpoint Thread Deleting Processed Files");
                                                // ok safely delete all processed files
                                                for (Path processedFilePath : _processedPaths) {
                                                    fs.delete(processedFilePath);
                                                }
                                                _processedPaths.clear();
                                            } else {
                                                LOG.info("Skipping Processed Files Purge because we are in Transitioning State");
                                            }

                                            _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
                                        } finally {
                                            LOG.info("Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
                                            fs.delete(checkpointMutexPath, false);
                                        }
                                    } else {
                                        int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY);
                                        LOG.info("Checkpoint thread failed to acquire Mutex:" + checkpointMutexPath
                                                + " Waiting " + delay + "(MS) before retry");
                                        try {
                                            Thread.sleep(delay);
                                        } catch (InterruptedException e) {
                                        }
                                    }
                                }
                                // update last checkpoint flush time no matter what ...
                                _lastCheckpointFlushTime = System.currentTimeMillis();
                            }
                        } catch (IOException e) {
                            LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            try {
                                Thread.sleep(60000);
                            } catch (InterruptedException e1) {
                            }
                        }
                    } finally {
                        LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
                        _checkpointThreadSemaphore.release();
                    }
                } else {
                    try {
                        // LOG.info("Checkpoint Thread IDLE");
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                    }
                }
            }
        }
    });
    _checkpointThread.start();
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void loadExistingLists() throws IOException {
    // scan data directory for list id pattern
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    FileStatus loadTargets[] = localFileSystem
            .globStatus(new Path(_localLogFileDir.getAbsolutePath(), CrawlList.LIST_URL_DATA_PREFIX + "*"));

    // sort list so that we load newer lists first ...
    Arrays.sort(loadTargets, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus o1, FileStatus o2) {
            return ((Long) o2.getModificationTime()).compareTo(o1.getModificationTime());
        }
    });

    for (FileStatus loadTarget : loadTargets) {
        // extract timestamp ...
        long listId = Long
                .parseLong(loadTarget.getPath().getName().substring(CrawlList.LIST_URL_DATA_PREFIX.length()));
        LOG.info("Found List Data for List:" + listId);
        // validate
        if (CrawlList.allFilesPresent(_localLogFileDir, listId)) {
            LOG.info("List looks valid. Loading");

            try {
                CrawlList list = new CrawlList(this, listId);

                synchronized (_crawlLists) {
                    CrawlList oldList = _crawlLists.get(listId);
                    if (oldList != null) {
                        list.setEventListener(oldList.getEventListener());
                    }
                    _crawlLists.put(listId, list);
                }
                LOG.info("Loaded List:" + listId + " Scheduling for Queueing");
                _queueLoaderQueue.add(new QueueItem<CrawlList>(list));
            } catch (IOException e) {
                LOG.error("Failed to load list:" + listId + " Exception:"
                        + CCStringUtils.stringifyException(e));
                synchronized (_crawlLists) {
                    _crawlLists.put(listId, CrawlList.createListWithLoadErrorState(this, listId, e));
                }
            }
        }
    }
}
From source file:org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java
License:Open Source License
static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException {

    FileSystem fs = FileSystem.get(conf);
    FileSystem localFS = FileSystem.getLocal(conf);

    final Multimap<Long, Range> rangeMap = TreeMultimap.create();

    FileStatus candidateDirs[] = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*"));

    for (FileStatus candidate : candidateDirs) {
        String fileName = candidate.getPath().getName();
        // get scaled timestamp start
        long timestampStart = Long.parseLong(fileName) * 1000000000;
        // ok see if it exceeds our cutoff time
        if (timestampStart < cutOffTimeMillisecond) {
            FileStatus ranges[] = fs.globStatus(new Path(candidate.getPath(), "*"));
            for (FileStatus range : ranges) {
                String rangeName = range.getPath().getName();
                long rangeStart = Long.parseLong(rangeName.substring(0, rangeName.indexOf("-")));
                long rangeEnd = Long.parseLong(rangeName.substring(rangeName.indexOf("-") + 1));
                rangeMap.put(Long.parseLong(fileName), new Range(rangeStart, rangeEnd));
            }
        }
    }

    PathFilter cacheDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("cacheData-") || path.getName().startsWith("cacheIndex-")) {
                long timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                long timestampPrefix = timestamp / 1000000000L;
                // System.out.println("timestamp:" + timestamp + " prefix:" + timestampPrefix);
                for (Range range : rangeMap.get(timestampPrefix)) {
                    if (timestamp >= range.e0 && timestamp <= range.e1) {
                        return true;
                    }
                }
            }
            return false;
        }
    };

    PathFilter historyDataFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("historyData-") || path.getName().startsWith("historyBloomFilter-")) {
                int indexOfDot = path.getName().indexOf(".");
                long timestamp = -1L;
                if (indexOfDot != -1) {
                    timestamp = Long
                            .parseLong(path.getName().substring(path.getName().indexOf("-") + 1, indexOfDot));
                } else {
                    timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                }
                if (timestamp < cutOffTimeMillisecond) {
                    return true;
                }
            }
            return false;
        }
    };

    FileStatus purgeCandidates[] = fs.globStatus(new Path("crawl/proxy/cache/*"), cacheDataFilter);

    for (FileStatus candidate : purgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath());
    }

    FileStatus localcacheDataPurgeCandidates[] = localFS
            .globStatus(new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/*"), cacheDataFilter);

    for (FileStatus candidate : localcacheDataPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath());
    }

    // now delete bloom filter data
    FileStatus historyPurgeCandidates[] = fs.globStatus(new Path("crawl/proxy/history/*"), historyDataFilter);

    for (FileStatus candidate : historyPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        fs.delete(candidate.getPath(), true);
    }

    // now delete local bloom filter data
    FileStatus localHistoryPurgeCandidates[] = localFS.globStatus(
            new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*"), historyDataFilter);

    for (FileStatus candidate : localHistoryPurgeCandidates) {
        System.out.println("Purging Candidate:" + candidate.getPath());
        localFS.delete(candidate.getPath(), true);
    }
}
From source file:org.commoncrawl.service.pagerank.master.PageRankMaster.java
License:Open Source License
private void clearAllCheckpointAndDistributionFiles(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate
    Path checkpointSearchPattern = new Path(jobDataPath, "*-CheckpointComplete-*");
    Path distroSearchPattern = new Path(jobDataPath, "OutlinkPR-*");

    FileStatus checkpointCandidates[] = fs.globStatus(checkpointSearchPattern);

    for (FileStatus candidate : checkpointCandidates) {
        LOG.info("Deleting:" + candidate.getPath());
        fs.delete(candidate.getPath(), false);
    }

    FileStatus distroCandidates[] = fs.globStatus(distroSearchPattern);

    for (FileStatus candidate : distroCandidates) {
        LOG.info("Deleting:" + candidate.getPath());
        fs.delete(candidate.getPath(), false);
    }
}