List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
private static SortedSet<Long> scanForMergedSegments(FileSystem fs) throws IOException { SortedSet<Long> completeSegmentIds = Sets.newTreeSet(); for (FileStatus fileStatus : fs .globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*"))) { // ok look for the SUCCESS file Path successPath = new Path(fileStatus.getPath(), JOB_SUCCESS_FILE); if (fs.exists(successPath)) { completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName())); }/*w ww .ja va 2s. c o m*/ } return completeSegmentIds; }
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
public static void main(String[] args) throws Exception { if (args.length != 0 && args[0].equalsIgnoreCase("--runOnEC2")) { Configuration conf = new Configuration(); conf.addResource(new Path("/home/hadoop/conf/core-site.xml")); conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml")); LinkGraphDataEmitterJob task = new LinkGraphDataEmitterJob(conf); task.run();/*from w w w.j a v a 2s. co m*/ } else { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); FileStatus segments[] = fs.globStatus(new Path(internalEC2SegmentPath, "[0-9]*")); // first find an affinity segment Path affinityTarget = null; for (FileStatus segment : segments) { long segmentId = Long.parseLong(segment.getPath().getName()); Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId)); if (fs.exists(stage1Path)) { LOG.info("Found existing segment to build affinity against"); affinityTarget = stage1Path; } } for (FileStatus segment : segments) { long segmentId = Long.parseLong(segment.getPath().getName()); LOG.info("Segment Id:" + segmentId); Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId)); if (!fs.exists(stage1Path)) { LOG.info("Need to run stage 1 for Segment:" + segmentId); try { mergeSegmentInternal(fs, conf, segmentId, affinityTarget); if (affinityTarget == null) { LOG.info("Adopting Successfully create merge output as affinity segment"); affinityTarget = stage1Path; } } catch (IOException e) { LOG.error("stage 1 for Segment:" + segmentId + " Failed with Exception:" + CCStringUtils.stringifyException(e)); } } } } }
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java
License:Open Source License
public static void mergeSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException { Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId)); LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath); if (s3fs.exists(outputPath)) { LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!"); s3fs.delete(outputPath, true);/*from ww w.j av a2 s . c om*/ } // ok collect merge files ArrayList<Path> pathList = new ArrayList<Path>(); for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) { pathList.add(metadataFile.getPath().makeQualified(s3fs)); } LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList); JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).inputs(pathList) .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class) .mapper(LinkDataResharder.class).maxMapAttempts(7).maxReduceAttempts(7).maxMapTaskFailures(100) .reducer(LinkDataResharder.class, true).partition(LinkKeyPartitioner.class) .sort(LinkKeyComparator.class).numReducers(5000).speculativeExecution(true) .output(outputPath).outputFormat(SequenceFileOutputFormat.class) .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class) .delayReducersUntil(1.0f) .build(); JobClient.runJob(jobConf); }
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java
License:Open Source License
/**
 * Lists the segment ids found at {@code VALID_SEGMENTS_PATH}.
 *
 * <p>Globs for entries named {@code [0-9]*} and parses each entry name as a long.
 *
 * @param fs file system used for the glob
 * @return sorted set of segment ids; empty when nothing matches
 * @throws IOException if the glob fails
 * @throws NumberFormatException if a matched entry name is not a valid long; the glob
 *     {@code [0-9]*} only constrains the first character
 */
private static SortedSet<Long> scanForValidSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    FileStatus[] matches = fs.globStatus(new Path(VALID_SEGMENTS_PATH + "[0-9]*"));
    if (matches == null) {
        // globStatus can return null rather than an empty array (e.g. when the parent
        // directory does not exist yet); treat that as "no valid segments".
        return completeSegmentIds;
    }

    for (FileStatus match : matches) {
        completeSegmentIds.add(Long.parseLong(match.getPath().getName()));
    }
    return completeSegmentIds;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java
License:Open Source License
private static SortedSet<Long> scanForMergedSegments(FileSystem fs) throws IOException { SortedSet<Long> completeSegmentIds = Sets.newTreeSet(); for (FileStatus fileStatus : fs.globStatus(new Path(MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*"))) { // ok look for the SUCCESS file Path successPath = new Path(fileStatus.getPath(), JOB_SUCCESS_FILE); if (fs.exists(successPath)) { completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName())); }// w w w . j a v a2s . com } return completeSegmentIds; }
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java
License:Open Source License
public static void main(String[] args) throws Exception { if (args.length != 0 && args[0].equalsIgnoreCase("--runOnEC2")) { Configuration conf = new Configuration(); conf.addResource(new Path("/home/hadoop/conf/core-site.xml")); conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml")); LinkCollectorJob task = new LinkCollectorJob(conf); task.run();/*from ww w .j a v a 2 s . co m*/ } else { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); FileStatus segments[] = fs.globStatus(new Path(internalEC2SegmentPath, "[0-9]*")); // first find an affinity segment Path affinityTarget = null; for (FileStatus segment : segments) { long segmentId = Long.parseLong(segment.getPath().getName()); Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId)); if (fs.exists(stage1Path)) { LOG.info("Found existing segment to build affinity against"); affinityTarget = stage1Path; } } for (FileStatus segment : segments) { long segmentId = Long.parseLong(segment.getPath().getName()); LOG.info("Segment Id:" + segmentId); Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId)); if (!fs.exists(stage1Path)) { LOG.info("Need to run stage 1 for Segment:" + segmentId); try { mergeSegmentInternal(fs, conf, segmentId, affinityTarget); if (affinityTarget == null) { LOG.info("Adopting Successfully create merge output as affinity segment"); affinityTarget = stage1Path; } } catch (IOException e) { LOG.error("stage 1 for Segment:" + segmentId + " Failed with Exception:" + CCStringUtils.stringifyException(e)); } } } } }
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkGraphDataEmitterJob.java
License:Open Source License
public static void processSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException { Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId)); LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath); if (s3fs.exists(outputPath)) { LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!"); s3fs.delete(outputPath, true);//from w w w. ja v a2 s. c o m } // ok collect merge files ArrayList<Path> pathList = new ArrayList<Path>(); for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) { pathList.add(metadataFile.getPath().makeQualified(s3fs)); } LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList); JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).inputs(pathList) .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class) .mapper(LinkGraphDataEmitter.class).maxMapAttempts(7).maxReduceAttempts(7).maxMapTaskFailures(1000) .reuseJVM(1000).reducer(LinkGraphDataEmitter.class, false).numReducers(500) .speculativeExecution(true).output(outputPath).outputFormat(SequenceFileOutputFormat.class) .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class).build(); JobClient.runJob(jobConf); }
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
/**
 * Finds the largest numeric entry name under {@code internalMergedDBPath}.
 *
 * <p>Globs for entries named {@code [0-9]*} and parses each entry name as a long.
 *
 * @param fs file system used for the glob
 * @param conf not used by this method
 * @return the maximum parsed entry name, or {@code -1} when nothing matches
 * @throws IOException if the glob fails
 * @throws NumberFormatException if a matched entry name is not a valid long; the glob
 *     {@code [0-9]*} only constrains the first character
 */
static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    FileStatus[] files = fs.globStatus(new Path(internalMergedDBPath, "[0-9]*"));
    if (files == null) {
        // globStatus can return null rather than an empty array (e.g. when the merged-DB
        // directory does not exist yet); report "none found" via the -1 sentinel.
        return timestampOut;
    }

    for (FileStatus candidate : files) {
        long timestamp = Long.parseLong(candidate.getPath().getName());
        timestampOut = Math.max(timestamp, timestampOut);
    }
    return timestampOut;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
static List<Path> filterMergeCandidtes(FileSystem fs, Configuration conf, long latestMergeDBTimestamp) throws IOException { ArrayList<Path> list = new ArrayList<Path>(); FileStatus candidates[] = fs.globStatus(new Path(internalMergedSegmentPath, "[0-9]*")); for (FileStatus candidate : candidates) { long candidateTimestamp = Long.parseLong(candidate.getPath().getName()); if (candidateTimestamp > latestMergeDBTimestamp) { list.add(candidate.getPath()); }//from www. ja v a 2 s . co m } return list; }
From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java
License:Open Source License
protected void finalStepComplete(CrawlPipelineStep finalStep, Path finalStepOutputDir) throws IOException { if (promoteFinalStepOutput()) { FileSystem fs = getFileSystem(); Path taskAsStepOutputDir = getOutputDir(); fs.mkdirs(taskAsStepOutputDir);/*w w w. j a v a2 s .c om*/ getLogger().info("finalStepComplete callback triggered - promoting output from:" + finalStep.getDescription() + " to output dir:" + taskAsStepOutputDir); // copy everything from final step into task output ... FileStatus files[] = fs.globStatus(new Path(finalStepOutputDir, "*")); if (files.length != 0) { fs.delete(taskAsStepOutputDir, true); fs.mkdirs(taskAsStepOutputDir); } for (FileStatus file : files) { fs.rename(file.getPath(), new Path(taskAsStepOutputDir, file.getPath().getName())); } } }