Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

In this page you can find the example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException

Source Link

Document

Returns all the files that match pathPattern and are not checksum files.

Usage

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License:Open Source License

/**
 * Collects the ids of segments whose intermediate merge output is complete.
 *
 * <p>Candidates are the entries matching {@code [0-9]*} under
 * {@code S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH}. A candidate is reported
 * only when it contains the {@code JOB_SUCCESS_FILE} marker.
 *
 * @param fs file system used to glob for candidates and to probe for the marker file
 * @return the ids of all complete segments, in ascending order (possibly empty)
 * @throws IOException if globbing or the existence check fails
 */
private static SortedSet<Long> scanForMergedSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    Path segmentGlob = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*");
    for (FileStatus fileStatus : fs.globStatus(segmentGlob)) {
        Path segmentPath = fileStatus.getPath();

        long segmentId;
        try {
            segmentId = Long.parseLong(segmentPath.getName());
        } catch (NumberFormatException ignored) {
            // "[0-9]*" is a glob, not a regex: it matches any name that merely starts
            // with a digit (e.g. "123_tmp"). Such entries are not segments, so skip
            // them instead of letting one stray name abort the whole scan.
            continue;
        }

        // ok look for the SUCCESS file
        if (fs.exists(new Path(segmentPath, JOB_SUCCESS_FILE))) {
            completeSegmentIds.add(segmentId);
        }
    }
    return completeSegmentIds;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License:Open Source License

/**
 * Command-line entry point.
 *
 * <p>With {@code --runOnEC2} as the first argument, loads the Hadoop site configuration
 * from {@code /home/hadoop/conf} and runs a {@code LinkGraphDataEmitterJob}. Otherwise,
 * globs {@code internalEC2SegmentPath} for segment directories and calls
 * {@code mergeSegmentInternal} for every segment that has no output yet under
 * {@code internalMergedSegmentPath}.
 *
 * <p>Entries matched by the glob whose names are not valid {@code long} values are
 * skipped. An {@link IOException} from one segment is logged and does not stop the
 * remaining segments from being processed.
 *
 * @param args command-line arguments; only {@code args[0]} is inspected
 * @throws Exception if configuration, file system access or the EC2 task fails
 */
public static void main(String[] args) throws Exception {

    if (args.length != 0 && args[0].equalsIgnoreCase("--runOnEC2")) {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/home/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml"));

        LinkGraphDataEmitterJob task = new LinkGraphDataEmitterJob(conf);
        task.run();
    } else {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        FileStatus segments[] = fs.globStatus(new Path(internalEC2SegmentPath, "[0-9]*"));

        // first find an affinity segment (the last already-merged segment wins)
        Path affinityTarget = null;
        for (FileStatus segment : segments) {
            long segmentId;
            try {
                segmentId = Long.parseLong(segment.getPath().getName());
            } catch (NumberFormatException ignored) {
                // "[0-9]*" is a glob and also matches names such as "123_tmp";
                // the entry is reported once, in the processing loop below.
                continue;
            }
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));
            if (fs.exists(stage1Path)) {
                LOG.info("Found existing segment to build affinity against");
                affinityTarget = stage1Path;
            }
        }

        for (FileStatus segment : segments) {
            long segmentId;
            try {
                segmentId = Long.parseLong(segment.getPath().getName());
            } catch (NumberFormatException e) {
                // Not a segment directory; skip it rather than abort the whole run.
                LOG.warn("Skipping non-numeric entry matched by segment glob:" + segment.getPath());
                continue;
            }
            LOG.info("Segment Id:" + segmentId);
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));

            if (!fs.exists(stage1Path)) {
                LOG.info("Need to run stage 1 for Segment:" + segmentId);
                try {
                    mergeSegmentInternal(fs, conf, segmentId, affinityTarget);
                    if (affinityTarget == null) {
                        // No pre-existing merge output was found, so the first
                        // successful merge becomes the affinity target for the rest.
                        LOG.info("Adopting Successfully create merge output as affinity segment");
                        affinityTarget = stage1Path;
                    }
                } catch (IOException e) {
                    LOG.error("stage 1 for Segment:" + segmentId + " Failed with Exception:"
                            + CCStringUtils.stringifyException(e));
                }
            }
        }
    }
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

/**
 * Runs the intermediate merge job for a single segment.
 *
 * <p>Any existing output directory for the segment is deleted recursively first. The
 * job's inputs are all files matching {@code metadata-*} in the segment's directory
 * under {@code SEGMENTS_PATH}; its output goes to
 * {@code S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + segmentId}.
 *
 * @param s3fs      file system used to clear old output and to locate the input files
 * @param conf      base configuration handed to the job builder
 * @param segmentId id of the segment to merge
 * @throws IOException if a file system operation or the job itself fails
 */
public static void mergeSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    final Path mergeOutput = new Path(
            S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + mergeOutput);

    // start from a clean slate: remove output left behind by an earlier attempt
    final boolean staleOutputPresent = s3fs.exists(mergeOutput);
    if (staleOutputPresent) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(mergeOutput, true);
    }

    // gather the fully-qualified metadata files that feed the merge
    final ArrayList<Path> inputPaths = new ArrayList<Path>();
    final FileStatus[] metadataFiles = s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"));
    for (int i = 0; i < metadataFiles.length; i++) {
        inputPaths.add(metadataFiles[i].getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + inputPaths);

    // builder calls are kept in their original order
    final JobConf mergeJob = new JobBuilder("Intermediate merge for:" + segmentId, conf)
            .inputs(inputPaths)
            .inputFormat(SequenceFileInputFormat.class)
            .keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkDataResharder.class)
            .maxMapAttempts(7)
            .maxReduceAttempts(7)
            .maxMapTaskFailures(100)
            .reducer(LinkDataResharder.class, true)
            .partition(LinkKeyPartitioner.class)
            .sort(LinkKeyComparator.class)
            .numReducers(5000)
            .speculativeExecution(true)
            .output(mergeOutput)
            .outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true)
            .compressor(CompressionType.BLOCK, SnappyCodec.class)
            .delayReducersUntil(1.0f)
            .build();

    JobClient.runJob(mergeJob);
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

/**
 * Collects the ids of all segments listed under {@code VALID_SEGMENTS_PATH}.
 *
 * <p>Candidates are the entries matching {@code VALID_SEGMENTS_PATH + "[0-9]*"}; the
 * segment id is the candidate's final path component.
 *
 * @param fs file system used to glob for candidates
 * @return the ids of all matching segments, in ascending order (possibly empty)
 * @throws IOException if globbing fails
 */
private static SortedSet<Long> scanForValidSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(VALID_SEGMENTS_PATH + "[0-9]*"))) {
        try {
            completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
        } catch (NumberFormatException ignored) {
            // "[0-9]*" is a glob, not a regex: it matches any name that merely starts
            // with a digit (e.g. "123_tmp"). Such entries are not segments, so skip
            // them instead of letting one stray name abort the whole scan.
        }
    }
    return completeSegmentIds;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

/**
 * Collects the ids of segments whose intermediate merge output is complete.
 *
 * <p>Candidates are the entries matching {@code MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*"}.
 * A candidate is reported only when it contains the {@code JOB_SUCCESS_FILE} marker.
 *
 * @param fs file system used to glob for candidates and to probe for the marker file
 * @return the ids of all complete segments, in ascending order (possibly empty)
 * @throws IOException if globbing or the existence check fails
 */
private static SortedSet<Long> scanForMergedSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*"))) {
        Path segmentPath = fileStatus.getPath();

        long segmentId;
        try {
            segmentId = Long.parseLong(segmentPath.getName());
        } catch (NumberFormatException ignored) {
            // "[0-9]*" is a glob, not a regex: it matches any name that merely starts
            // with a digit (e.g. "123_tmp"). Such entries are not segments, so skip
            // them instead of letting one stray name abort the whole scan.
            continue;
        }

        // ok look for the SUCCESS file
        if (fs.exists(new Path(segmentPath, JOB_SUCCESS_FILE))) {
            completeSegmentIds.add(segmentId);
        }
    }
    return completeSegmentIds;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

/**
 * Command-line entry point.
 *
 * <p>With {@code --runOnEC2} as the first argument, loads the Hadoop site configuration
 * from {@code /home/hadoop/conf} and runs a {@code LinkCollectorJob}. Otherwise, globs
 * {@code internalEC2SegmentPath} for segment directories and calls
 * {@code mergeSegmentInternal} for every segment that has no output yet under
 * {@code internalMergedSegmentPath}.
 *
 * <p>Entries matched by the glob whose names are not valid {@code long} values are
 * skipped. An {@link IOException} from one segment is logged and does not stop the
 * remaining segments from being processed.
 *
 * @param args command-line arguments; only {@code args[0]} is inspected
 * @throws Exception if configuration, file system access or the EC2 task fails
 */
public static void main(String[] args) throws Exception {

    if (args.length != 0 && args[0].equalsIgnoreCase("--runOnEC2")) {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/home/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml"));

        LinkCollectorJob task = new LinkCollectorJob(conf);
        task.run();
    } else {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        FileStatus segments[] = fs.globStatus(new Path(internalEC2SegmentPath, "[0-9]*"));

        // first find an affinity segment (the last already-merged segment wins)
        Path affinityTarget = null;
        for (FileStatus segment : segments) {
            long segmentId;
            try {
                segmentId = Long.parseLong(segment.getPath().getName());
            } catch (NumberFormatException ignored) {
                // "[0-9]*" is a glob and also matches names such as "123_tmp";
                // the entry is reported once, in the processing loop below.
                continue;
            }
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));
            if (fs.exists(stage1Path)) {
                LOG.info("Found existing segment to build affinity against");
                affinityTarget = stage1Path;
            }
        }

        for (FileStatus segment : segments) {
            long segmentId;
            try {
                segmentId = Long.parseLong(segment.getPath().getName());
            } catch (NumberFormatException e) {
                // Not a segment directory; skip it rather than abort the whole run.
                LOG.warn("Skipping non-numeric entry matched by segment glob:" + segment.getPath());
                continue;
            }
            LOG.info("Segment Id:" + segmentId);
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));

            if (!fs.exists(stage1Path)) {
                LOG.info("Need to run stage 1 for Segment:" + segmentId);
                try {
                    mergeSegmentInternal(fs, conf, segmentId, affinityTarget);
                    if (affinityTarget == null) {
                        // No pre-existing merge output was found, so the first
                        // successful merge becomes the affinity target for the rest.
                        LOG.info("Adopting Successfully create merge output as affinity segment");
                        affinityTarget = stage1Path;
                    }
                } catch (IOException e) {
                    LOG.error("stage 1 for Segment:" + segmentId + " Failed with Exception:"
                            + CCStringUtils.stringifyException(e));
                }
            }
        }
    }
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkGraphDataEmitterJob.java

License:Open Source License

/**
 * Runs the link graph data emitter job for a single segment.
 *
 * <p>Any existing output directory for the segment is deleted recursively first. The
 * job's inputs are all files matching {@code metadata-*} in the segment's directory
 * under {@code SEGMENTS_PATH}; its output goes to
 * {@code S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + segmentId}.
 *
 * @param s3fs      file system used to clear old output and to locate the input files
 * @param conf      base configuration handed to the job builder
 * @param segmentId id of the segment to process
 * @throws IOException if a file system operation or the job itself fails
 */
public static void processSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    final Path segmentOutput = new Path(
            S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + segmentOutput);

    // start from a clean slate: remove output left behind by an earlier attempt
    final boolean staleOutputPresent = s3fs.exists(segmentOutput);
    if (staleOutputPresent) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(segmentOutput, true);
    }

    // gather the fully-qualified metadata files that feed the job
    final ArrayList<Path> inputPaths = new ArrayList<Path>();
    final FileStatus[] metadataFiles = s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"));
    for (int i = 0; i < metadataFiles.length; i++) {
        inputPaths.add(metadataFiles[i].getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + inputPaths);

    // builder calls are kept in their original order
    final JobConf emitterJob = new JobBuilder("Intermediate merge for:" + segmentId, conf)
            .inputs(inputPaths)
            .inputFormat(SequenceFileInputFormat.class)
            .keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class)
            .maxMapAttempts(7)
            .maxReduceAttempts(7)
            .maxMapTaskFailures(1000)
            .reuseJVM(1000)
            .reducer(LinkGraphDataEmitter.class, false)
            .numReducers(500)
            .speculativeExecution(true)
            .output(segmentOutput)
            .outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true)
            .compressor(CompressionType.BLOCK, SnappyCodec.class)
            .build();

    JobClient.runJob(emitterJob);
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

/**
 * Finds the largest timestamp among the entries under {@code internalMergedDBPath}.
 *
 * <p>Candidates are the entries matching {@code [0-9]*}; each candidate's final path
 * component is read as its timestamp.
 *
 * @param fs   file system used to glob for candidates
 * @param conf not used by this method
 * @return the largest timestamp found, or {@code -1} if there is no usable candidate
 * @throws IOException if globbing fails
 */
static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(internalMergedDBPath, "[0-9]*"));

    for (FileStatus candidate : files) {
        long timestamp;
        try {
            timestamp = Long.parseLong(candidate.getPath().getName());
        } catch (NumberFormatException ignored) {
            // "[0-9]*" is a glob, not a regex: it matches any name that merely starts
            // with a digit (e.g. "123_tmp"). Such entries carry no timestamp, so skip
            // them instead of failing the whole lookup.
            continue;
        }
        timestampOut = Math.max(timestamp, timestampOut);
    }
    return timestampOut;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

/**
 * Lists the merged segments that are newer than the given merge DB timestamp.
 *
 * <p>Candidates are the entries matching {@code [0-9]*} under
 * {@code internalMergedSegmentPath}; each candidate's final path component is read as
 * its timestamp.
 *
 * @param fs                     file system used to glob for candidates
 * @param conf                   not used by this method
 * @param latestMergeDBTimestamp exclusive lower bound for a candidate's timestamp
 * @return paths of all candidates with a timestamp strictly greater than
 *         {@code latestMergeDBTimestamp}, in glob order (possibly empty)
 * @throws IOException if globbing fails
 */
static List<Path> filterMergeCandidtes(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
        throws IOException {
    ArrayList<Path> list = new ArrayList<Path>();
    FileStatus candidates[] = fs.globStatus(new Path(internalMergedSegmentPath, "[0-9]*"));

    for (FileStatus candidate : candidates) {
        long candidateTimestamp;
        try {
            candidateTimestamp = Long.parseLong(candidate.getPath().getName());
        } catch (NumberFormatException ignored) {
            // "[0-9]*" is a glob, not a regex: it matches any name that merely starts
            // with a digit (e.g. "123_tmp"). Such entries are not merge candidates, so
            // skip them instead of failing the whole scan.
            continue;
        }
        if (candidateTimestamp > latestMergeDBTimestamp) {
            list.add(candidate.getPath());
        }
    }
    return list;
}

From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java

License:Open Source License

/**
 * Callback invoked with the final step and its output directory.
 *
 * <p>When {@code promoteFinalStepOutput()} returns {@code true}, every entry directly
 * under {@code finalStepOutputDir} is moved into this task's own output directory. If
 * there is at least one entry to move, the task output directory is deleted recursively
 * and recreated first.
 *
 * @param finalStep          the final step; only its description is used, for logging
 * @param finalStepOutputDir directory whose entries are promoted
 * @throws IOException if a file system operation fails, or if an entry cannot be
 *                     renamed into the task output directory
 */
protected void finalStepComplete(CrawlPipelineStep finalStep, Path finalStepOutputDir) throws IOException {

    if (promoteFinalStepOutput()) {
        FileSystem fs = getFileSystem();
        Path taskAsStepOutputDir = getOutputDir();
        fs.mkdirs(taskAsStepOutputDir);
        getLogger().info("finalStepComplete callback triggered - promoting output from:"
                + finalStep.getDescription() + " to output dir:" + taskAsStepOutputDir);

        // copy everything from final step into task output ...
        FileStatus files[] = fs.globStatus(new Path(finalStepOutputDir, "*"));

        // only wipe the existing task output when there is something to replace it with
        if (files.length != 0) {
            fs.delete(taskAsStepOutputDir, true);
            fs.mkdirs(taskAsStepOutputDir);
        }

        for (FileStatus file : files) {
            Path source = file.getPath();
            Path target = new Path(taskAsStepOutputDir, source.getName());
            // rename() can signal failure through its return value instead of throwing.
            // Ignoring it would leave the task output silently incomplete after the old
            // output was already deleted above.
            if (!fs.rename(source, target)) {
                throw new IOException("Failed to promote final step output from:" + source + " to:" + target);
            }
        }
    }
}