Example usage for org.apache.hadoop.fs FileSystem rename

List of usage examples for org.apache.hadoop.fs FileSystem rename

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.rename.

Prototype

public abstract boolean rename(Path src, Path dst) throws IOException;

Document

Renames Path src to Path dst.
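
The method reports most failures by returning false rather than throwing an exception, so callers should normally check the result. Below is a minimal, self-contained sketch, assuming a default Hadoop configuration is on the classpath; the class name and paths are hypothetical:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RenameExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // hypothetical paths: promote a finished temp directory to its final location
        Path src = new Path("/tmp/job-output");
        Path dst = new Path("/data/job-output");

        // rename() signals most failures by returning false rather than throwing,
        // so the result should be checked explicitly
        if (!fs.rename(src, dst)) {
            throw new IOException("Failed to rename " + src + " to " + dst);
        }
    }
}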

Usage

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License:Open Source License

public static void mergeSegmentInternal(FileSystem fs, Configuration conf, long segmentId, Path affinityPath)
        throws IOException {
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Temp Output path is:" + outputPath);
    fs.delete(outputPath, true);// w w  w.j a v  a  2 s  .c o m

    Path inputPath = new Path(internalEC2SegmentPath, Long.toString(segmentId));
    LOG.info("Input Path for Segment:" + segmentId + " is:" + inputPath);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).input(inputPath)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class).maxMapTaskFailures(100)
            .reducer(LinkGraphDataEmitter.class, true).partition(CrawlDBKeyPartitioner.class)
            .sort(LinkKeyComparator.class).numReducers(CrawlEnvironment.NUM_DB_SHARDS)
            .setAffinity(affinityPath, ImmutableSet.of("ccd001.commoncrawl.org")).speculativeMapExecution()

            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)

            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)

            .build();

    JobClient.runJob(jobConf);

    Path finalOutputPath = new Path(internalMergedSegmentPath, Long.toString(segmentId));
    LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
    fs.rename(outputPath, finalOutputPath);
}
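
The example above follows a common pattern around rename: the job writes into a temporary directory, and the finished output is then promoted to its final location with a single rename call. A condensed sketch of that promotion step is shown below; the class and method names are illustrative, the paths are hypothetical, and the boolean result is checked (the example above ignores it):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class TempThenRename {

    // run a job that writes to tempOutput, then move the result to finalOutput
    public static void runAndPromote(FileSystem fs, JobConf jobConf, Path tempOutput, Path finalOutput)
            throws IOException {
        // clear any stale temp output left behind by an earlier failed attempt
        fs.delete(tempOutput, true);

        // jobConf is assumed to be configured to write its output to tempOutput
        JobClient.runJob(jobConf);

        // promote the finished output, failing loudly if the move is refused
        if (!fs.rename(tempOutput, finalOutput)) {
            throw new IOException("Failed to promote " + tempOutput + " to " + finalOutput);
        }
    }
}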

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

public static void mergeSegmentInternal(FileSystem fs, Configuration conf, long segmentId, Path affinityPath)
        throws IOException {
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Temp Output path is:" + outputPath);
    fs.delete(outputPath, true);//from   w ww .ja  va  2  s.c  o m

    Path inputPath = new Path(internalEC2SegmentPath, Long.toString(segmentId));
    LOG.info("Input Path for Segment:" + segmentId + " is:" + inputPath);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).input(inputPath)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkDataResharder.class).maxMapTaskFailures(100).reducer(LinkDataResharder.class, true)
            .partition(LinkKeyPartitioner.class).sort(LinkKeyComparator.class)
            .numReducers(CrawlEnvironment.NUM_DB_SHARDS)
            .setAffinity(affinityPath, ImmutableSet.of("ccd001.commoncrawl.org")).speculativeMapExecution()

            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)

            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)

            .build();

    JobClient.runJob(jobConf);

    Path finalOutputPath = new Path(internalMergedSegmentPath, Long.toString(segmentId));
    LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
    fs.rename(outputPath, finalOutputPath);
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // establish merge timestamp 
    long mergeTimestamp = System.currentTimeMillis();
    // get a temp directory ... 
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(mergeTimestamp));

    // find latest merge timestamp ... 
    long latestMergeDBTimestamp = findLatestMergeDBTimestamp(fs, conf);
    LOG.info("Latest MergeDB Timestmap is:" + latestMergeDBTimestamp);
    // find list of merge candidates ... 
    List<Path> candidateList = filterMergeCandidtes(fs, conf, latestMergeDBTimestamp);
    LOG.info("Merge Candidate List is:" + candidateList);
    if (candidateList.size() != 0) {
        ArrayList<Path> inputPaths = new ArrayList<Path>();

        // add all input paths to list 
        inputPaths.addAll(candidateList);
        // establish an affinity path ... 
        Path affinityPath = candidateList.get(0);
        // add merge db path if it exists 
        if (latestMergeDBTimestamp != -1L) {
            affinityPath = new Path(internalMergedDBPath, Long.toString(latestMergeDBTimestamp));
            inputPaths.add(affinityPath);
        }

        JobConf jobConf = new JobBuilder("Final Merge Job", conf).inputs(inputPaths)
                .inputFormat(MultiFileMergeInputFormat.class).mapperKeyValue(IntWritable.class, Text.class)
                .outputKeyValue(TextBytes.class, TextBytes.class).outputFormat(SequenceFileOutputFormat.class)
                .reducer(LinkMergerJob.class, false).partition(MultiFileMergePartitioner.class)
                .numReducers(CrawlEnvironment.NUM_DB_SHARDS).speculativeExecution(false).output(outputPath)
                .setAffinityNoBalancing(affinityPath,
                        ImmutableSet.of("ccd001.commoncrawl.org", "ccd006.commoncrawl.org"))

                .compressMapOutput(false).compressor(CompressionType.BLOCK, SnappyCodec.class)

                .build();

        JsonArray hack = new JsonArray();

        hack.add(new JsonPrimitive(11));
        hack.add(new JsonPrimitive(21));
        hack.add(new JsonPrimitive(82));
        hack.add(new JsonPrimitive(83));
        hack.add(new JsonPrimitive(90));

        jobConf.set("hack", hack.toString());

        LOG.info("Starting JOB");
        JobClient.runJob(jobConf);

        Path finalOutputPath = new Path(internalMergedDBPath, Long.toString(mergeTimestamp));
        LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
        fs.rename(outputPath, finalOutputPath);
    }

}

From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java

License:Open Source License

protected void finalStepComplete(CrawlPipelineStep finalStep, Path finalStepOutputDir) throws IOException {

    if (promoteFinalStepOutput()) {
        FileSystem fs = getFileSystem();
        Path taskAsStepOutputDir = getOutputDir();
        fs.mkdirs(taskAsStepOutputDir);
        getLogger().info("finalStepComplete callback triggered - promoting output from:"
                + finalStep.getDescription() + " to output dir:" + taskAsStepOutputDir);

        // copy everything from final step into task output ...
        FileStatus files[] = fs.globStatus(new Path(finalStepOutputDir, "*"));

        if (files.length != 0) {
            fs.delete(taskAsStepOutputDir, true);
            fs.mkdirs(taskAsStepOutputDir);
        }

        for (FileStatus file : files) {
            fs.rename(file.getPath(), new Path(taskAsStepOutputDir, file.getPath().getName()));
        }
    }
}

From source file:org.commoncrawl.mapred.segmenter.Segmenter.java

License:Open Source License

public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {

        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ... 
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator:  crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger 
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running  Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}

From source file:org.commoncrawl.service.crawler.CrawlLog.java

License:Open Source License

/** perform the actual checkpoint work here ... **/
private void doCheckpoint() {
    // at this point, we should be in the async thread, and all flusher
    // activities are blocked ...
    LOG.info("CrawlLog Checkpoint - Starting ");
    // collect all necessary information from thread-unsafe data structure now
    // (in async thread context)
    final Set<Long> activeSegments = new HashSet<Long>();

    try {
        // add all active segment ids to our key set ...
        activeSegments.addAll(_loggers.keySet());
        LOG.info("CrawlLog Checkpoint - Preparing CrawlLog Files");
        // checkpoint crawl log ...
        checkpointLocalCrawlLog();
        LOG.info("CrawlLog Checkpoint - Preparing Segment Log Files");
        // next checkpoint all active segment logs ...
        for (CrawlSegmentLog segmentLog : _loggers.values()) {
            segmentLog.checkpointLocalLog();
        }
        LOG.info("CrawlLog Checkpoint - Ready for HDFS Transfer");
    } catch (IOException e) {
        LOG.error("Checkpoint failed with Exception:" + CCStringUtils.stringifyException(e));
    }

    // spawn a thread to do most of the blocking io ...
    _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop,

            new Callable<Boolean>() {

                public Boolean call() throws Exception {

                    // we need to track these in case of failure ...
                    Vector<Path> segmentLogStagingPaths = new Vector<Path>();
                    Vector<Path> segmentLogFinalPaths = new Vector<Path>();

                    // get the file system
                    final FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

                    try {

                        LOG.info("CrawlLog Checkpoint - Transferring CrawlLog to HDFS");

                        // construct a target path (where we are going to store the
                        // checkpointed crawl log )
                        Path stagingDirectory = new Path(CrawlEnvironment.getCheckpointStagingDirectory());

                        SequenceFileCrawlURLWriter hdfsWriter = new SequenceFileCrawlURLWriter(
                                CrawlEnvironment.getHadoopConfig(), hdfs, stagingDirectory, getNodeName(),
                                _checkpointId);

                        try {
                            // write out crawl log to hdfs ...
                            transferLocalCheckpointLog(getCheckpointPath(_rootDirectory), hdfsWriter,
                                    _checkpointId);
                        } catch (Exception e) {
                            LOG.error("HDFS Write of CrawlLog failed. Deleting tempFiles:"
                                    + hdfsWriter.getFilenames() + " Exception:"
                                    + CCStringUtils.stringifyException(e));

                            // close writer
                            hdfsWriter.close();
                            // delete any hdfs output ...
                            for (Path path : hdfsWriter.getFilenames()) {
                                LOG.info("Deleting temp (HDFS) checkpoint file:" + path);
                                hdfs.delete(path, false);
                            }
                            throw e;
                        } finally {
                            hdfsWriter.close();
                        }

                        LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Logs");
                        // and next for every segment
                        for (long packedLogId : activeSegments) {

                            File segmentLogPath = CrawlSegmentLog.buildCheckpointPath(_rootDirectory,
                                    getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId));

                            // LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Log for Segment:"
                            // + segmentId);
                            // copy the segment log ...
                            Path remoteLogFilePath = transferLocalSegmentLog(hdfs, segmentLogPath,
                                    _checkpointId, getListIdFromLogId(packedLogId),
                                    getSegmentIdFromLogId(packedLogId));
                            // if path is not null (data was copied) ...
                            if (remoteLogFilePath != null) {
                                // add it to vector ...
                                segmentLogStagingPaths.add(remoteLogFilePath);
                                // and add final path to vector while we are at it ...
                                segmentLogFinalPaths.add(getFinalSegmentLogPath(hdfs, _checkpointId,
                                        getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)));
                            }
                        }
                        LOG.info("CrawlLog Checkpoint - Finished Transferring CrawlSegment Logs");

                        // now if we got here ... all hdfs transfers succeeded ...
                        // go ahead and move checkpoint log from staging to final data
                        // directory ...
                        Path checkpointDirectory = new Path(CrawlEnvironment.getCheckpointDataDirectory());

                        // if no checkpoint data directory ... create one ...
                        if (!hdfs.exists(checkpointDirectory))
                            hdfs.mkdirs(checkpointDirectory);

                        for (Path checkpointTempFilePath : hdfsWriter.getFilenames()) {
                            Path checkpointFinalPath = new Path(checkpointDirectory,
                                    checkpointTempFilePath.getName());
                            LOG.info("Promoting Checking File From:" + checkpointTempFilePath + " to:"
                                    + checkpointFinalPath);
                            // and essentially move the crawl log file from staging to data
                            // directory ..
                            boolean success = hdfs.rename(checkpointTempFilePath, checkpointFinalPath);
                            if (!success) {
                                throw new IOException("Failed to Rename Checkpoint Temp:"
                                        + checkpointTempFilePath + " to:" + checkpointFinalPath);
                            }
                        }
                        // and now do the same thing for each segment log files
                        for (int i = 0; i < segmentLogStagingPaths.size(); ++i) {
                            hdfs.rename(segmentLogStagingPaths.get(i), segmentLogFinalPaths.get(i));
                        }
                        // if we got here checkpoint was successful...
                        return true;
                    } catch (Exception e) {
                        LOG.error("Checkpoint:" + _checkpointId + " FAILED with exception:"
                                + CCStringUtils.stringifyException(e));
                        for (Path segmentPath : segmentLogStagingPaths) {
                            hdfs.delete(segmentPath);
                        }
                        for (Path segmentPath : segmentLogFinalPaths) {
                            hdfs.delete(segmentPath);
                        }
                        throw e;
                    }
                }
            },

            new CompletionCallback<Boolean>() {

                public void taskComplete(Boolean updateResult) {

                    Vector<Long> completedSegmentList = new Vector<Long>();

                    LOG.info("CrawlLog Checkpoint - Finalizing CrawlLog Checkpoint");
                    // delete the local checkpoint log ...
                    finalizeCheckpoint();

                    LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLogs");
                    for (CrawlSegmentLog segmentLog : _loggers.values()) {
                        // LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLog for Segment:"
                        // + segmentLog.getSegmentId());
                        // finalize the checkpoint on the segment log ...
                        segmentLog.finalizeCheckpoint();
                        // and check to see if the segment has been completed ...
                        if (segmentLog.isSegmentComplete()) {
                            // if so, add it our completed segments list ...
                            completedSegmentList
                                    .add(makeSegmentLogId(segmentLog.getListId(), segmentLog.getSegmentId()));
                        }
                    }

                    // now for all completed segments ... purge hdfs logs ...
                    for (long packedSegmentId : completedSegmentList) {
                        try {
                            LOG.info(
                                    "CrawlLog Checkpoint - Purging HDFS CrawlSegmentLogs from Completed Segment. List:"
                                            + getListIdFromLogId(packedSegmentId) + " Segment:"
                                            + getSegmentIdFromLogId(packedSegmentId));
                            // purge hdfs files (and create a completion log file)
                            purgeHDFSSegmentLogs(CrawlEnvironment.getDefaultFileSystem(),
                                    getListIdFromLogId(packedSegmentId),
                                    getSegmentIdFromLogId(packedSegmentId));
                            LOG.info(
                                    "CrawlLog Checkpoint - Purging Local CrawlSegmentLogs from Completed Segment. List:"
                                            + getListIdFromLogId(packedSegmentId) + " Segment:"
                                            + getSegmentIdFromLogId(packedSegmentId));
                            // and purge local files as well ...
                            _loggers.get(packedSegmentId).purgeLocalFiles();
                        } catch (IOException e) {
                            LOG.error("Purge SegmentLog for Segment List:" + getListIdFromLogId(packedSegmentId)
                                    + " Segment:" + getSegmentIdFromLogId(packedSegmentId)
                                    + " threw IOException:" + CCStringUtils.stringifyException(e));
                        }
                        LOG.info("CrawlLog Checkpoint - DeRegistering Segment List:"
                                + getListIdFromLogId(packedSegmentId) + " Segment:"
                                + getSegmentIdFromLogId(packedSegmentId) + " From CrawlLog");
                        // no matter what ... unload the segment ...
                        _loggers.remove(packedSegmentId);
                    }

                    CheckpointCompletionCallback callback = _checkpointCompletionCallback;
                    long checkpointId = _checkpointId;

                    // otherwise transition to a checkpoint in progress state
                    _checkpointCompletionCallback = null;
                    _checkpointId = -1;

                    LOG.info("CrawlLog Checkpoint - Checkpoint Complete - Initiating Callback");

                    // and complete transaction ...
                    callback.checkpointComplete(checkpointId, completedSegmentList);

                }

                public void taskFailed(Exception e) {

                    // all failures are critical in this particular task ...
                    LOG.error("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));

                    // revert checkpoint logs ...
                    abortCheckpoint();

                    for (CrawlSegmentLog segmentLog : _loggers.values()) {
                        segmentLog.abortCheckpoint();
                    }

                    CheckpointCompletionCallback callback = _checkpointCompletionCallback;
                    long checkpointId = _checkpointId;

                    // otherwise transition to a checkpoint in progress state
                    _checkpointCompletionCallback = null;
                    _checkpointId = -1;

                    // now check to see if this was corrupt crawl log exception
                    if (e.getCause() instanceof CorruptCrawlLogException) {
                        // ACK!!!
                        LOG.fatal("Corrupt CrawlLog detected with Exception:"
                                + CCStringUtils.stringifyException(e));

                        try {
                            // this is a serious error ... time to purge the crawl log directory
                            // altogether ...
                            purgeActiveLog();

                            // and all active segment logs as well...
                            for (CrawlSegmentLog segmentLog : _loggers.values()) {
                                segmentLog.purgeActiveLog();
                            }
                        } catch (IOException e2) {
                            LOG.error("IOException during Segment Log PURGE:"
                                    + CCStringUtils.stringifyException(e2));
                        }

                        // time to die hard ...
                        throw new RuntimeException(e);
                    }

                    // and complete transaction ...
                    callback.checkpointFailed(checkpointId, e);
                }
            }));
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private boolean validateOnDiskVersion() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    Path dataFilePath = getDataFileFinalPath();
    LOG.info("Loading BloomFilter From Disk at Path:" + dataFilePath);
    if (fs.exists(dataFilePath)) {
        FSDataInputStream stream = null;
        try {
            stream = fs.open(dataFilePath);
            DataInputStream dataInput = new DataInputStream(stream);
            // skip version
            dataInput.readInt();
            // read crawl version ... 
            int serializedCrawlVersion = dataInput.readInt();
            LOG.info("BloomFilter From On Disk has CrawlVersion:" + serializedCrawlVersion);
            if (serializedCrawlVersion < _state.getCurrentCrawlNumber()) {
                LOG.error("skipping load because serial crawl number is less than current crawl");
                stream.close();
                stream = null;
                fs.rename(dataFilePath, new Path(dataFilePath.getParent(),
                        dataFilePath.getName() + "-V-" + serializedCrawlVersion));
                return false;
            }
            return true;
        } finally {
            if (stream != null)
                stream.close();
        }
    }
    return false;
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private void startCheckpointThread(final FileSystem fs) {

    _checkpointThread = new Thread(new Runnable() {

        @Override
        public void run() {

            // ok, the checkpoint thread runs in perpetuity
            while (!_shutdownFlag) {

                if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1
                        || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL
                        || (System.currentTimeMillis()
                                - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) {

                    //LOG.info("Checkpoint Thread Grabbing Semaphore");
                    // grab checkpoint thread semaphore 
                    _checkpointThreadSemaphore.acquireUninterruptibly();
                    //LOG.info("Checkpoint Thread Grabbed Semaphore");

                    try {
                        // create scan pattern 
                        Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                                + _state.getCurrentCrawlNumber() + "/*/"
                                + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

                        // scan hdfs for log files
                        FileStatus candidates[];
                        try {
                            LOG.info("Checkpoint Thread Scanning For Cadnidates in:" + hdfsScanPath);
                            candidates = fs.globStatus(hdfsScanPath);

                            // iterate candidates 
                            for (FileStatus candidate : candidates) {

                                // check candidate against processed path list ... 
                                if (!_processedPaths.contains(candidate.getPath())) {
                                    int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
                                    // ok found a candidate we can work on 
                                    LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
                                    final URLFPV2 placeHolderFP = new URLFPV2();
                                    CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                                            new CrawlSegmentLog.LogFileItemCallback() {

                                                @Override
                                                public void processItem(long domainHash, long urlFingerprint) {
                                                    placeHolderFP.setDomainHash(domainHash);
                                                    placeHolderFP.setUrlHash(urlFingerprint);
                                                    // add item for bloom filter 
                                                    _bloomFilter.add(placeHolderFP);
                                                    // increment urls processed count ...
                                                    _urlsProcessedSinceCheckpoint.addAndGet(1);
                                                }
                                            });
                                    _processedPaths.add(candidate.getPath());
                                    LOG.info("Finished Processing Candidate:" + candidate.getPath());
                                }
                            }

                            // update scan time ... 
                            _lastCheckpointScanTime = System.currentTimeMillis();

                            // see if can do a full checkpoint ... 
                            if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis()
                                    - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) {

                                int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
                                // ok at this point we are ready to initialize a checkpoint
                                if (approximateItemsToFlush != 0) {

                                    Path checkpointMutexPath = getCheckpointMutexPath();

                                    if (fs.createNewFile(checkpointMutexPath)) {
                                        try {
                                            LOG.info("Checkpoint Thread Starting Checkpoint");

                                            // get the checkpoint path ... 
                                            Path checkpointPath = getDataFileCheckpointPath();
                                            Path finalPath = getDataFileFinalPath();

                                            LOG.info("Checkpoint Thread Writing BloomFilter Data");
                                            // serialize the filter ... 
                                            serializeBloomFilter(checkpointPath);

                                            LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
                                            // ok now everything seems to have gone fine ... delete existing data file 
                                            fs.delete(finalPath);
                                            LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
                                            // rename checkpoint to final ... 
                                            fs.rename(checkpointPath, finalPath);

                                            if (_state
                                                    .getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
                                                LOG.info("Checkpoint Thread Deleting Processed Files");
                                                // ok safely delete all processed files
                                                for (Path processedFilePath : _processedPaths) {
                                                    fs.delete(processedFilePath);
                                                }
                                                _processedPaths.clear();
                                            } else {
                                                LOG.info(
                                                        "Skipping Processed Files Purge because we are in Transitioning State");
                                            }
                                            _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
                                        } finally {
                                            LOG.info(
                                                    "Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
                                            fs.delete(checkpointMutexPath, false);
                                        }
                                    } else {
                                        int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY);
                                        LOG.info("Checkpoint thread failed to acquire Mutex:"
                                                + checkpointMutexPath + " Waiting " + delay
                                                + "(MS) before retry");
                                        try {
                                            Thread.sleep(delay);
                                        } catch (InterruptedException e) {
                                        }
                                    }
                                }
                                // update last checkpoint time no matter what ...
                                _lastCheckpointFlushTime = System.currentTimeMillis();
                            }

                        } catch (IOException e) {
                            LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            try {
                                Thread.sleep(60000);
                            } catch (InterruptedException e1) {
                            }
                        }
                    } finally {
                        LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
                        _checkpointThreadSemaphore.release();
                    }
                } else {
                    try {
                        //LOG.info("Checkpoint Thread IDLE");
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                    }
                }
            }

        }

    });
    _checkpointThread.start();
}

From source file:org.commoncrawl.util.CrawlLogSplitter.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file:org.deeplearning4j.hadoop.modelsaving.HdfsModelSaver.java

License:Apache License

@Override
public void save(Serializable ser) {
    FileSystem system;
    try {
        system = FileSystem.get(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    if (reWrite) {
        try {
            system.rename(path, new Path(path.getParent(), path.getName() + System.currentTimeMillis()));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else {
        try {
            OutputStream os = system.create(path);
            ObjectOutputStream bos = new ObjectOutputStream(os);
            bos.writeObject(ser);
            bos.flush();
            bos.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}