Example usage for org.apache.hadoop.fs FileSystem rename

List of usage examples for org.apache.hadoop.fs FileSystem rename

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.rename.

Prototype

public abstract boolean rename(Path src, Path dst) throws IOException;

Document

Renames Path src to Path dst.
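
The method reports most failures by returning false rather than throwing an exception, so callers should normally check the result. Below is a minimal, self-contained sketch, assuming a default Hadoop configuration is on the classpath; the class name and paths are hypothetical:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RenameExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // hypothetical paths: promote a finished temp directory to its final location
        Path src = new Path("/tmp/job-output");
        Path dst = new Path("/data/job-output");

        // rename() signals most failures by returning false rather than throwing,
        // so the result should be checked explicitly
        if (!fs.rename(src, dst)) {
            throw new IOException("Failed to rename " + src + " to " + dst);
        }
    }
}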

Usage

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License:Open Source License

public static void mergeSegmentInternal(FileSystem fs, Configuration conf, long segmentId, Path affinityPath)
        throws IOException {
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Temp Output path is:" + outputPath);
    fs.delete(outputPath, true);// w w  w.j a v  a  2 s  .c o m

    Path inputPath = new Path(internalEC2SegmentPath, Long.toString(segmentId));
    LOG.info("Input Path for Segment:" + segmentId + " is:" + inputPath);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).input(inputPath)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class).maxMapTaskFailures(100)
            .reducer(LinkGraphDataEmitter.class, true).partition(CrawlDBKeyPartitioner.class)
            .sort(LinkKeyComparator.class).numReducers(CrawlEnvironment.NUM_DB_SHARDS)
            .setAffinity(affinityPath, ImmutableSet.of("ccd001.commoncrawl.org")).speculativeMapExecution()

            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)

            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)

            .build();

    JobClient.runJob(jobConf);

    Path finalOutputPath = new Path(internalMergedSegmentPath, Long.toString(segmentId));
    LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
    fs.rename(outputPath, finalOutputPath);
}
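
The example above follows a common pattern around rename: the job writes into a temporary directory, and the finished output is then promoted to its final location with a single rename call. A condensed sketch of that promotion step is shown below; the class and method names are illustrative, the paths are hypothetical, and the boolean result is checked (the example above ignores it):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class TempThenRename {

    // run a job that writes to tempOutput, then move the result to finalOutput
    public static void runAndPromote(FileSystem fs, JobConf jobConf, Path tempOutput, Path finalOutput)
            throws IOException {
        // clear any stale temp output left behind by an earlier failed attempt
        fs.delete(tempOutput, true);

        // jobConf is assumed to be configured to write its output to tempOutput
        JobClient.runJob(jobConf);

        // promote the finished output, failing loudly if the move is refused
        if (!fs.rename(tempOutput, finalOutput)) {
            throw new IOException("Failed to promote " + tempOutput + " to " + finalOutput);
        }
    }
}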

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java

License:Open Source License

public static void mergeSegmentInternal(FileSystem fs, Configuration conf, long segmentId, Path affinityPath)
        throws IOException {
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Temp Output path is:" + outputPath);
    fs.delete(outputPath, true);//from   w ww .ja  va  2  s.c  o m

    Path inputPath = new Path(internalEC2SegmentPath, Long.toString(segmentId));
    LOG.info("Input Path for Segment:" + segmentId + " is:" + inputPath);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).input(inputPath)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkDataResharder.class).maxMapTaskFailures(100).reducer(LinkDataResharder.class, true)
            .partition(LinkKeyPartitioner.class).sort(LinkKeyComparator.class)
            .numReducers(CrawlEnvironment.NUM_DB_SHARDS)
            .setAffinity(affinityPath, ImmutableSet.of("ccd001.commoncrawl.org")).speculativeMapExecution()

            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)

            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)

            .build();

    JobClient.runJob(jobConf);

    Path finalOutputPath = new Path(internalMergedSegmentPath, Long.toString(segmentId));
    LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
    fs.rename(outputPath, finalOutputPath);
}

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // establish merge timestamp 
    long mergeTimestamp = System.currentTimeMillis();
    // get a temp directory ... 
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(mergeTimestamp));

    // find latest merge timestamp ... 
    long latestMergeDBTimestamp = findLatestMergeDBTimestamp(fs, conf);
    LOG.info("Latest MergeDB Timestmap is:" + latestMergeDBTimestamp);
    // find list of merge candidates ... 
    List<Path> candidateList = filterMergeCandidtes(fs, conf, latestMergeDBTimestamp);
    LOG.info("Merge Candidate List is:" + candidateList);
    if (candidateList.size() != 0) {
        ArrayList<Path> inputPaths = new ArrayList<Path>();

        // add all input paths to list 
        inputPaths.addAll(candidateList);
        // establish an affinity path ... 
        Path affinityPath = candidateList.get(0);
        // add merge db path if it exists 
        if (latestMergeDBTimestamp != -1L) {
            affinityPath = new Path(internalMergedDBPath, Long.toString(latestMergeDBTimestamp));
            inputPaths.add(affinityPath);
        }

        JobConf jobConf = new JobBuilder("Final Merge Job", conf).inputs(inputPaths)
                .inputFormat(MultiFileMergeInputFormat.class).mapperKeyValue(IntWritable.class, Text.class)
                .outputKeyValue(TextBytes.class, TextBytes.class).outputFormat(SequenceFileOutputFormat.class)
                .reducer(LinkMergerJob.class, false).partition(MultiFileMergePartitioner.class)
                .numReducers(CrawlEnvironment.NUM_DB_SHARDS).speculativeExecution(false).output(outputPath)
                .setAffinityNoBalancing(affinityPath,
                        ImmutableSet.of("ccd001.commoncrawl.org", "ccd006.commoncrawl.org"))

                .compressMapOutput(false).compressor(CompressionType.BLOCK, SnappyCodec.class)

                .build();

        JsonArray hack = new JsonArray();

        hack.add(new JsonPrimitive(11));
        hack.add(new JsonPrimitive(21));
        hack.add(new JsonPrimitive(82));
        hack.add(new JsonPrimitive(83));
        hack.add(new JsonPrimitive(90));

        jobConf.set("hack", hack.toString());

        LOG.info("Starting JOB");
        JobClient.runJob(jobConf);

        Path finalOutputPath = new Path(internalMergedDBPath, Long.toString(mergeTimestamp));
        LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
        fs.rename(outputPath, finalOutputPath);
    }

}

From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java

License:Open Source License

protected void finalStepComplete(CrawlPipelineStep finalStep, Path finalStepOutputDir) throws IOException {

    if (promoteFinalStepOutput()) {
        FileSystem fs = getFileSystem();
        Path taskAsStepOutputDir = getOutputDir();
        fs.mkdirs(taskAsStepOutputDir);
        getLogger().info("finalStepComplete callback triggered - promoting output from:"
                + finalStep.getDescription() + " to output dir:" + taskAsStepOutputDir);

        // copy everything from final step into task output ...
        FileStatus files[] = fs.globStatus(new Path(finalStepOutputDir, "*"));

        if (files.length != 0) {
            fs.delete(taskAsStepOutputDir, true);
            fs.mkdirs(taskAsStepOutputDir);
        }

        for (FileStatus file : files) {
            fs.rename(file.getPath(), new Path(taskAsStepOutputDir, file.getPath().getName()));
        }
    }
}

From source file:org.commoncrawl.mapred.segmenter.Segmenter.java

License:Open Source License

public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {

        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ... 
        String crawlers = new String();

        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator:  crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger 
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running  Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}

From source file:org.commoncrawl.service.crawler.CrawlLog.java

License:Open Source License

/** perform the actual checkpoint work here ... **/
private void doCheckpoint() {
    // at this point, we should be in the async thread, and all flusher
    // activities are blocked ...
    LOG.info("CrawlLog Checkpoint - Starting ");
    // collect all necessary information from thread-unsafe data structure now
    // (in async thread context)
    final Set<Long> activeSegments = new HashSet<Long>();

    try {
        // add all active segment ids to our key set ...
        activeSegments.addAll(_loggers.keySet());
        LOG.info("CrawlLog Checkpoint - Preparing CrawlLog Files");
        // checkpoint crawl log ...
        checkpointLocalCrawlLog();
        LOG.info("CrawlLog Checkpoint - Preparing Segment Log Files");
        // next checkpoint all active segment logs ...
        for (CrawlSegmentLog segmentLog : _loggers.values()) {
            segmentLog.checkpointLocalLog();
        }
        LOG.info("CrawlLog Checkpoint - Ready for HDFS Transfer");
    } catch (IOException e) {
        LOG.error("Checkpoint failed with Exception:" + CCStringUtils.stringifyException(e));
    }

    // spawn a thread to do most of the blocking io ...
    _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop,

            new Callable<Boolean>() {

                public Boolean call() throws Exception {

                    // we need to track these in case of failure ...
                    Vector<Path> segmentLogStagingPaths = new Vector<Path>();
                    Vector<Path> segmentLogFinalPaths = new Vector<Path>();

                    // get the file system
                    final FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

                    try {

                        LOG.info("CrawlLog Checkpoint - Transferring CrawlLog to HDFS");

                        // construct a target path (where we are going to store the
                        // checkpointed crawl log )
                        Path stagingDirectory = new Path(CrawlEnvironment.getCheckpointStagingDirectory());

                        SequenceFileCrawlURLWriter hdfsWriter = new SequenceFileCrawlURLWriter(
                                CrawlEnvironment.getHadoopConfig(), hdfs, stagingDirectory, getNodeName(),
                                _checkpointId);

                        try {
                            // write out crawl log to hdfs ...
                            transferLocalCheckpointLog(getCheckpointPath(_rootDirectory), hdfsWriter,
                                    _checkpointId);
                        } catch (Exception e) {
                            LOG.error("HDFS Write of CrawlLog failed. Deleting tempFiles:"
                                    + hdfsWriter.getFilenames() + " Exception:"
                                    + CCStringUtils.stringifyException(e));

                            // close writer
                            hdfsWriter.close();
                            // delete any hdfs output ...
                            for (Path path : hdfsWriter.getFilenames()) {
                                LOG.info("Deleting temp (HDFS) checkpoint file:" + path);
                                hdfs.delete(path, false);
                            }
                            throw e;
                        } finally {
                            hdfsWriter.close();
                        }

                        LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Logs");
                        // and next for every segment
                        for (long packedLogId : activeSegments) {

                            File segmentLogPath = CrawlSegmentLog.buildCheckpointPath(_rootDirectory,
                                    getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId));

                            // LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Log for Segment:"
                            // + segmentId);
                            // copy the segment log ...
                            Path remoteLogFilePath = transferLocalSegmentLog(hdfs, segmentLogPath,
                                    _checkpointId, getListIdFromLogId(packedLogId),
                                    getSegmentIdFromLogId(packedLogId));
                            // if path is not null (data was copied) ...
                            if (remoteLogFilePath != null) {
                                // add it to vector ...
                                segmentLogStagingPaths.add(remoteLogFilePath);
                                // and add final path to vector while we are at it ...
                                segmentLogFinalPaths.add(getFinalSegmentLogPath(hdfs, _checkpointId,
                                        getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)));
                            }
                        }
                        LOG.info("CrawlLog Checkpoint - Finished Transferring CrawlSegment Logs");

                        // now if we got here ... all hdfs transfers succeeded ...
                        // go ahead and move checkpoint log from staging to final data
                        // directory ...
                        Path checkpointDirectory = new Path(CrawlEnvironment.getCheckpointDataDirectory());

                        // if no checkpoint data directory ... create one ...
                        if (!hdfs.exists(checkpointDirectory))
                            hdfs.mkdirs(checkpointDirectory);

                        for (Path checkpointTempFilePath : hdfsWriter.getFilenames()) {
                            Path checkpointFinalPath = new Path(checkpointDirectory,
                                    checkpointTempFilePath.getName());
                            LOG.info("Promoting Checking File From:" + checkpointTempFilePath + " to:"
                                    + checkpointFinalPath);
                            // and essentially move the crawl log file from staging to data
                            // directory ..
                            boolean success = hdfs.rename(checkpointTempFilePath, checkpointFinalPath);
                            if (!success) {
                                throw new IOException("Failed to Rename Checkpoint Temp:"
                                        + checkpointTempFilePath + " to:" + checkpointFinalPath);
                            }
                        }
                        // and now do the same thing for each segment log files
                        for (int i = 0; i < segmentLogStagingPaths.size(); ++i) {
                            hdfs.rename(segmentLogStagingPaths.get(i), segmentLogFinalPaths.get(i));
                        }
                        // if we got here checkpoint was successful...
                        return true;
                    } catch (Exception e) {
                        LOG.error("Checkpoint:" + _checkpointId + " FAILED with exception:"
                                + CCStringUtils.stringifyException(e));
                        for (Path segmentPath : segmentLogStagingPaths) {
                            hdfs.delete(segmentPath);
                        }
                        for (Path segmentPath : segmentLogFinalPaths) {
                            hdfs.delete(segmentPath);
                        }
                        throw e;
                    }
                }
            },

            new CompletionCallback<Boolean>() {

                public void taskComplete(Boolean updateResult) {

                    Vector<Long> completedSegmentList = new Vector<Long>();

                    LOG.info("CrawlLog Checkpoint - Finalizing CrawlLog Checkpoint");
                    // delete the local checkpoint log ...
                    finalizeCheckpoint();

                    LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLogs");
                    for (CrawlSegmentLog segmentLog : _loggers.values()) {
                        // LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLog for Segment:"
                        // + segmentLog.getSegmentId());
                        // finalize the checkpoint on the segment log ...
                        segmentLog.finalizeCheckpoint();
                        // and check to see if the segment has been completed ...
                        if (segmentLog.isSegmentComplete()) {
                            // if so, add it our completed segments list ...
                            completedSegmentList
                                    .add(makeSegmentLogId(segmentLog.getListId(), segmentLog.getSegmentId()));
                        }
                    }

                    // now for all completed segments ... purge hdfs logs ...
                    for (long packedSegmentId : completedSegmentList) {
                        try {
                            LOG.info(
                                    "CrawlLog Checkpoint - Purging HDFS CrawlSegmentLogs from Completed Segment. List:"
                                            + getListIdFromLogId(packedSegmentId) + " Segment:"
                                            + getSegmentIdFromLogId(packedSegmentId));
                            // purge hdfs files (and create a completion log file)
                            purgeHDFSSegmentLogs(CrawlEnvironment.getDefaultFileSystem(),
                                    getListIdFromLogId(packedSegmentId),
                                    getSegmentIdFromLogId(packedSegmentId));
                            LOG.info(
                                    "CrawlLog Checkpoint - Purging Local CrawlSegmentLogs from Completed Segment. List:"
                                            + getListIdFromLogId(packedSegmentId) + " Segment:"
                                            + getSegmentIdFromLogId(packedSegmentId));
                            // and purge local files as well ...
                            _loggers.get(packedSegmentId).purgeLocalFiles();
                        } catch (IOException e) {
                            LOG.error("Purge SegmentLog for Segment List:" + getListIdFromLogId(packedSegmentId)
                                    + " Segment:" + getSegmentIdFromLogId(packedSegmentId)
                                    + " threw IOException:" + CCStringUtils.stringifyException(e));
                        }
                        LOG.info("CrawlLog Checkpoint - DeRegistering Segment List:"
                                + getListIdFromLogId(packedSegmentId) + " Segment:"
                                + getSegmentIdFromLogId(packedSegmentId) + " From CrawlLog");
                        // no matter what ... unload the segment ...
                        _loggers.remove(packedSegmentId);
                    }

                    CheckpointCompletionCallback callback = _checkpointCompletionCallback;
                    long checkpointId = _checkpointId;

                    // otherwise transition to a checkpoint in progress state
                    _checkpointCompletionCallback = null;
                    _checkpointId = -1;

                    LOG.info("CrawlLog Checkpoint - Checkpoint Complete - Initiating Callback");

                    // and complete transaction ...
                    callback.checkpointComplete(checkpointId, completedSegmentList);

                }

                public void taskFailed(Exception e) {

                    // all failures are critical in this particular task ...
                    LOG.error("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));

                    // revert checkpoint logs ...
                    abortCheckpoint();

                    for (CrawlSegmentLog segmentLog : _loggers.values()) {
                        segmentLog.abortCheckpoint();
                    }

                    CheckpointCompletionCallback callback = _checkpointCompletionCallback;
                    long checkpointId = _checkpointId;

                    // otherwise transition to a checkpoint in progress state
                    _checkpointCompletionCallback = null;
                    _checkpointId = -1;

                    // now check to see if this was corrupt crawl log exception
                    if (e.getCause() instanceof CorruptCrawlLogException) {
                        // ACK!!!
                        LOG.fatal("Corrupt CrawlLog detected with Exception:"
                                + CCStringUtils.stringifyException(e));

                        try {
                            // this is a serious error ... time to purge the crawl log directory
                            // altogether ...
                            purgeActiveLog();

                            // and all active segment logs as well...
                            for (CrawlSegmentLog segmentLog : _loggers.values()) {
                                segmentLog.purgeActiveLog();
                            }
                        } catch (IOException e2) {
                            LOG.error("IOException during Segment Log PURGE:"
                                    + CCStringUtils.stringifyException(e2));
                        }

                        // time to die hard ...
                        throw new RuntimeException(e);
                    }

                    // and complete transaction ...
                    callback.checkpointFailed(checkpointId, e);
                }
            }));
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private boolean validateOnDiskVersion() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    Path dataFilePath = getDataFileFinalPath();
    LOG.info("Loading BloomFilter From Disk at Path:" + dataFilePath);
    if (fs.exists(dataFilePath)) {
        FSDataInputStream stream = null;
        try {
            stream = fs.open(dataFilePath);
            DataInputStream dataInput = new DataInputStream(stream);
            // skip version
            dataInput.readInt();
            // read crawl version ... 
            int serializedCrawlVersion = dataInput.readInt();
            LOG.info("BloomFilter From On Disk has CrawlVersion:" + serializedCrawlVersion);
            if (serializedCrawlVersion < _state.getCurrentCrawlNumber()) {
                LOG.error("skipping load because serial crawl number is less than current crawl");
                stream.close();
                stream = null;
                fs.rename(dataFilePath, new Path(dataFilePath.getParent(),
                        dataFilePath.getName() + "-V-" + serializedCrawlVersion));
                return false;
            }
            return true;
        } finally {
            if (stream != null)
                stream.close();
        }
    }
    return false;
}

From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java

License:Open Source License

private void startCheckpointThread(final FileSystem fs) {

    _checkpointThread = new Thread(new Runnable() {

        @Override
        public void run() {

            // ok, the checkpoint thread runs in perpetuity
            while (!_shutdownFlag) {

                if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1
                        || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL
                        || (System.currentTimeMillis()
                                - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) {

                    //LOG.info("Checkpoint Thread Grabbing Semaphore");
                    // grab checkpoint thread semaphore 
                    _checkpointThreadSemaphore.acquireUninterruptibly();
                    //LOG.info("Checkpoint Thread Grabbed Semaphore");

                    try {
                        // create scan pattern 
                        Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                                + _state.getCurrentCrawlNumber() + "/*/"
                                + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

                        // scan hdfs for log files
                        FileStatus candidates[];
                        try {
                            LOG.info("Checkpoint Thread Scanning For Cadnidates in:" + hdfsScanPath);
                            candidates = fs.globStatus(hdfsScanPath);

                            // iterate candidates 
                            for (FileStatus candidate : candidates) {

                                // check candidate against processed path list ... 
                                if (!_processedPaths.contains(candidate.getPath())) {
                                    int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
                                    // ok found a candidate we can work on 
                                    LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
                                    final URLFPV2 placeHolderFP = new URLFPV2();
                                    CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                                            new CrawlSegmentLog.LogFileItemCallback() {

                                                @Override
                                                public void processItem(long domainHash, long urlFingerprint) {
                                                    placeHolderFP.setDomainHash(domainHash);
                                                    placeHolderFP.setUrlHash(urlFingerprint);
                                                    // add item for bloom filter 
                                                    _bloomFilter.add(placeHolderFP);
                                                    // increment urls processed count ...
                                                    _urlsProcessedSinceCheckpoint.addAndGet(1);
                                                }
                                            });
                                    _processedPaths.add(candidate.getPath());
                                    LOG.info("Finished Processing Candidate:" + candidate.getPath());
                                }
                            }

                            // update scan time ... 
                            _lastCheckpointScanTime = System.currentTimeMillis();

                            // see if can do a full checkpoint ... 
                            if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis()
                                    - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) {

                                int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
                                // ok at this point we are ready to initialize a checkpoint
                                if (approximateItemsToFlush != 0) {

                                    Path checkpointMutexPath = getCheckpointMutexPath();

                                    if (fs.createNewFile(checkpointMutexPath)) {
                                        try {
                                            LOG.info("Checkpoint Thread Starting Checkpoint");

                                            // get the checkpoint path ... 
                                            Path checkpointPath = getDataFileCheckpointPath();
                                            Path finalPath = getDataFileFinalPath();

                                            LOG.info("Checkpoint Thread Writing BloomFilter Data");
                                            // serialize the filter ... 
                                            serializeBloomFilter(checkpointPath);

                                            LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
                                            // ok now everything seems to have gone fine ... delete existing data file 
                                            fs.delete(finalPath);
                                            LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
                                            // rename checkpoint to final ... 
                                            fs.rename(checkpointPath, finalPath);

                                            if (_state
                                                    .getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
                                                LOG.info("Checkpoint Thread Deleting Processed Files");
                                                // ok safely delete all processed files
                                                for (Path processedFilePath : _processedPaths) {
                                                    fs.delete(processedFilePath);
                                                }
                                                _processedPaths.clear();
                                            } else {
                                                LOG.info(
                                                        "Skipping Processed Files Purge because we are in Transitioning State");
                                            }
                                            _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
                                        } finally {
                                            LOG.info(
                                                    "Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
                                            fs.delete(checkpointMutexPath, false);
                                        }
                                    } else {
                                        int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY);
                                        LOG.info("Checkpoint thread failed to acquire Mutex:"
                                                + checkpointMutexPath + " Waiting " + delay
                                                + "(MS) before retry");
                                        try {
                                            Thread.sleep(delay);
                                        } catch (InterruptedException e) {
                                        }
                                    }
                                }
                                // update last checkpoint time no matter what ...
                                _lastCheckpointFlushTime = System.currentTimeMillis();
                            }

                        } catch (IOException e) {
                            LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            try {
                                Thread.sleep(60000);
                            } catch (InterruptedException e1) {
                            }
                        }
                    } finally {
                        LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
                        _checkpointThreadSemaphore.release();
                    }
                } else {
                    try {
                        //LOG.info("Checkpoint Thread IDLE");
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                    }
                }
            }

        }

    });
    _checkpointThread.start();
}

From source file:org.commoncrawl.util.CrawlLogSplitter.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file:org.deeplearning4j.hadoop.modelsaving.HdfsModelSaver.java

License:Apache License

@Override
public void save(Serializable ser) {
    FileSystem system;
    try {
        system = FileSystem.get(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    if (reWrite) {
        try {
            system.rename(path, new Path(path.getParent(), path.getName() + System.currentTimeMillis()));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else {
        try {
            OutputStream os = system.create(path);
            ObjectOutputStream bos = new ObjectOutputStream(os);
            bos.writeObject(ser);
            bos.flush();
            bos.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}