Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 
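
An overload, public FileStatus[] globStatus(Path pathPattern, PathFilter filter), additionally filters the matched paths through the supplied PathFilter.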

Document

Return all the files that match pathPattern and are not checksum files.
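
Before the project-specific examples below, here is a minimal, self-contained sketch of a typical call (the /data/logs/part-* pattern is hypothetical). In common Hadoop versions, globStatus returns null when the pattern contains no glob characters and the path does not exist, and an empty array when a glob matches nothing, so callers should guard against null:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        // Obtain the default FileSystem from the Hadoop configuration.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical pattern: match all part files under /data/logs.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/part-*"));

        // Guard against null (returned when the path has no glob and does not exist).
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}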

Usage

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License:Open Source License

private static SortedSet<Long> scanForMergedSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs
            .globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*"))) {
        // ok look for the SUCCESS file
        Path successPath = new Path(fileStatus.getPath(), JOB_SUCCESS_FILE);
        if (fs.exists(successPath)) {
            completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
        }
    }
    return completeSegmentIds;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License:Open Source License

public static void main(String[] args) throws Exception {

    if (args.length != 0 && args[0].equalsIgnoreCase("--runOnEC2")) {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/home/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml"));

        LinkGraphDataEmitterJob task = new LinkGraphDataEmitterJob(conf);
        task.run();
    } else {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // glob for numerically named segment directories
        FileStatus[] segments = fs.globStatus(new Path(internalEC2SegmentPath, "[0-9]*"));

        // first find an affinity segment 
        Path affinityTarget = null;
        for (FileStatus segment : segments) {
            long segmentId = Long.parseLong(segment.getPath().getName());
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));
            if (fs.exists(stage1Path)) {
                LOG.info("Found existing segment to build affinity against");
                affinityTarget = stage1Path;
            }
        }

        for (FileStatus segment : segments) {
            long segmentId = Long.parseLong(segment.getPath().getName());
            LOG.info("Segment Id:" + segmentId);
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));

            if (!fs.exists(stage1Path)) {
                LOG.info("Need to run stage 1 for Segment:" + segmentId);
                try {
                    mergeSegmentInternal(fs, conf, segmentId, affinityTarget);
                    if (affinityTarget == null) {
                        LOG.info("Adopting Successfully create merge output as affinity segment");
                        affinityTarget = stage1Path;
                    }
                } catch (IOException e) {
                    LOG.error("stage 1 for Segment:" + segmentId + " Failed with Exception:"
                            + CCStringUtils.stringifyException(e));
                }
            }
        }
    }
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

public static void mergeSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath);

    if (s3fs.exists(outputPath)) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(outputPath, true);/*from  ww w.j  av a2 s . c  om*/
    }

    // ok collect merge files
    ArrayList<Path> pathList = new ArrayList<Path>();
    for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) {
        pathList.add(metadataFile.getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).inputs(pathList)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkDataResharder.class).maxMapAttempts(7).maxReduceAttempts(7).maxMapTaskFailures(100)
            .reducer(LinkDataResharder.class, true).partition(LinkKeyPartitioner.class)
            .sort(LinkKeyComparator.class).numReducers(5000).speculativeExecution(true)
            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)
            .delayReducersUntil(1.0f)
            .build();

    JobClient.runJob(jobConf);
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

private static SortedSet<Long> scanForValidSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(VALID_SEGMENTS_PATH + "[0-9]*"))) {
        completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
    }
    return completeSegmentIds;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

private static SortedSet<Long> scanForMergedSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(MERGE_INTERMEDIATE_OUTPUT_PATH + "[0-9]*"))) {
        // ok look for the SUCCESS file
        Path successPath = new Path(fileStatus.getPath(), JOB_SUCCESS_FILE);
        if (fs.exists(successPath)) {
            completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
        }
    }
    return completeSegmentIds;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

public static void main(String[] args) throws Exception {

    if (args.length != 0 && args[0].equalsIgnoreCase("--runOnEC2")) {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/home/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml"));

        LinkCollectorJob task = new LinkCollectorJob(conf);
        task.run();
    } else {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // glob for numerically named segment directories
        FileStatus[] segments = fs.globStatus(new Path(internalEC2SegmentPath, "[0-9]*"));

        // first find an affinity segment 
        Path affinityTarget = null;
        for (FileStatus segment : segments) {
            long segmentId = Long.parseLong(segment.getPath().getName());
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));
            if (fs.exists(stage1Path)) {
                LOG.info("Found existing segment to build affinity against");
                affinityTarget = stage1Path;
            }
        }

        for (FileStatus segment : segments) {
            long segmentId = Long.parseLong(segment.getPath().getName());
            LOG.info("Segment Id:" + segmentId);
            Path stage1Path = new Path(internalMergedSegmentPath, Long.toString(segmentId));

            if (!fs.exists(stage1Path)) {
                LOG.info("Need to run stage 1 for Segment:" + segmentId);
                try {
                    mergeSegmentInternal(fs, conf, segmentId, affinityTarget);
                    if (affinityTarget == null) {
                        LOG.info("Adopting Successfully create merge output as affinity segment");
                        affinityTarget = stage1Path;
                    }
                } catch (IOException e) {
                    LOG.error("stage 1 for Segment:" + segmentId + " Failed with Exception:"
                            + CCStringUtils.stringifyException(e));
                }
            }
        }
    }
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkGraphDataEmitterJob.java

License:Open Source License

public static void processSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath);

    if (s3fs.exists(outputPath)) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(outputPath, true);//from   w w  w.  ja v a2  s.  c  o m
    }

    // ok collect merge files
    ArrayList<Path> pathList = new ArrayList<Path>();
    for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) {
        pathList.add(metadataFile.getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).inputs(pathList)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class).maxMapAttempts(7).maxReduceAttempts(7).maxMapTaskFailures(1000)
            .reuseJVM(1000).reducer(LinkGraphDataEmitter.class, false).numReducers(500)
            .speculativeExecution(true).output(outputPath).outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class).build();

    JobClient.runJob(jobConf);
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    // glob for numerically named (timestamped) merge database directories
    FileStatus[] files = fs.globStatus(new Path(internalMergedDBPath, "[0-9]*"));

    for (FileStatus candidate : files) {
        long timestamp = Long.parseLong(candidate.getPath().getName());
        timestampOut = Math.max(timestamp, timestampOut);
    }
    return timestampOut;
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

static List<Path> filterMergeCandidates(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
        throws IOException {
    ArrayList<Path> list = new ArrayList<Path>();
    // glob for numerically named merged segment directories
    FileStatus[] candidates = fs.globStatus(new Path(internalMergedSegmentPath, "[0-9]*"));

    for (FileStatus candidate : candidates) {
        long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp > latestMergeDBTimestamp) {
            list.add(candidate.getPath());
        }
    }
    return list;
}

From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java

License:Open Source License

protected void finalStepComplete(CrawlPipelineStep finalStep, Path finalStepOutputDir) throws IOException {

    if (promoteFinalStepOutput()) {
        FileSystem fs = getFileSystem();
        Path taskAsStepOutputDir = getOutputDir();
        fs.mkdirs(taskAsStepOutputDir);
        getLogger().info("finalStepComplete callback triggered - promoting output from:"
                + finalStep.getDescription() + " to output dir:" + taskAsStepOutputDir);

        // copy everything from final step into task output ...
        FileStatus[] files = fs.globStatus(new Path(finalStepOutputDir, "*"));

        if (files.length != 0) {
            fs.delete(taskAsStepOutputDir, true);
            fs.mkdirs(taskAsStepOutputDir);
        }

        for (FileStatus file : files) {
            fs.rename(file.getPath(), new Path(taskAsStepOutputDir, file.getPath().getName()));
        }
    }
}