List of usage examples for org.apache.hadoop.fs FileSystem rename
public abstract boolean rename(Path src, Path dst) throws IOException;
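Most of the examples below follow the same pattern: write job output to a temporary directory, then call rename to promote it to its final location. Note that rename reports many failures by returning false rather than throwing, which several of the examples silently ignore. A minimal sketch of the pattern with the return value checked (the class name and paths here are placeholders, not taken from any example):

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class RenamePromotionSketch {
        public static void promote(Configuration conf) throws IOException {
            FileSystem fs = FileSystem.get(conf);
            Path tempOutput = new Path("/tmp/job-output");   // placeholder temp directory
            Path finalOutput = new Path("/data/job-output"); // placeholder final directory

            // rename returns false for many failure cases (e.g. missing source or
            // existing destination, depending on the FileSystem implementation).
            if (!fs.rename(tempOutput, finalOutput)) {
                throw new IOException("Failed to rename " + tempOutput + " to " + finalOutput);
            }
        }
    }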
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
public static void mergeSegmentInternal(FileSystem fs, Configuration conf, long segmentId, Path affinityPath)
        throws IOException {
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Temp Output path is:" + outputPath);
    fs.delete(outputPath, true);

    Path inputPath = new Path(internalEC2SegmentPath, Long.toString(segmentId));
    LOG.info("Input Path for Segment:" + segmentId + " is:" + inputPath);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).input(inputPath)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class).maxMapTaskFailures(100)
            .reducer(LinkGraphDataEmitter.class, true).partition(CrawlDBKeyPartitioner.class)
            .sort(LinkKeyComparator.class).numReducers(CrawlEnvironment.NUM_DB_SHARDS)
            .setAffinity(affinityPath, ImmutableSet.of("ccd001.commoncrawl.org")).speculativeMapExecution()
            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)
            .build();

    JobClient.runJob(jobConf);

    Path finalOutputPath = new Path(internalMergedSegmentPath, Long.toString(segmentId));
    LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
    fs.rename(outputPath, finalOutputPath);
}
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkCollectorJob.java
License:Open Source License
public static void mergeSegmentInternal(FileSystem fs, Configuration conf, long segmentId, Path affinityPath)
        throws IOException {
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Temp Output path is:" + outputPath);
    fs.delete(outputPath, true);

    Path inputPath = new Path(internalEC2SegmentPath, Long.toString(segmentId));
    LOG.info("Input Path for Segment:" + segmentId + " is:" + inputPath);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).input(inputPath)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkDataResharder.class).maxMapTaskFailures(100).reducer(LinkDataResharder.class, true)
            .partition(LinkKeyPartitioner.class).sort(LinkKeyComparator.class)
            .numReducers(CrawlEnvironment.NUM_DB_SHARDS)
            .setAffinity(affinityPath, ImmutableSet.of("ccd001.commoncrawl.org")).speculativeMapExecution()
            .output(outputPath).outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true).compressor(CompressionType.BLOCK, SnappyCodec.class)
            .build();

    JobClient.runJob(jobConf);

    Path finalOutputPath = new Path(internalMergedSegmentPath, Long.toString(segmentId));
    LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
    fs.rename(outputPath, finalOutputPath);
}
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // establish merge timestamp
    long mergeTimesmap = System.currentTimeMillis();

    // get a temp directory ...
    Path outputPath = JobBuilder.tempDir(conf, Long.toString(mergeTimesmap));

    // find latest merge timestamp ...
    long latestMergeDBTimestamp = findLatestMergeDBTimestamp(fs, conf);
    LOG.info("Latest MergeDB Timestmap is:" + latestMergeDBTimestamp);

    // find list of merge candidates ...
    List<Path> candidateList = filterMergeCandidtes(fs, conf, latestMergeDBTimestamp);
    LOG.info("Merge Candidate List is:" + candidateList);

    if (candidateList.size() != 0) {
        ArrayList<Path> inputPaths = new ArrayList<Path>();

        // add all input paths to list
        inputPaths.addAll(candidateList);

        // establish an affinity path ...
        Path affinityPath = candidateList.get(0);

        // add merge db path if it exists
        if (latestMergeDBTimestamp != -1L) {
            affinityPath = new Path(internalMergedDBPath, Long.toString(latestMergeDBTimestamp));
            inputPaths.add(affinityPath);
        }

        JobConf jobConf = new JobBuilder("Final Merge Job", conf).inputs(inputPaths)
                .inputFormat(MultiFileMergeInputFormat.class).mapperKeyValue(IntWritable.class, Text.class)
                .outputKeyValue(TextBytes.class, TextBytes.class).outputFormat(SequenceFileOutputFormat.class)
                .reducer(LinkMergerJob.class, false).partition(MultiFileMergePartitioner.class)
                .numReducers(CrawlEnvironment.NUM_DB_SHARDS).speculativeExecution(false).output(outputPath)
                .setAffinityNoBalancing(affinityPath,
                        ImmutableSet.of("ccd001.commoncrawl.org", "ccd006.commoncrawl.org"))
                .compressMapOutput(false).compressor(CompressionType.BLOCK, SnappyCodec.class)
                .build();

        JsonArray hack = new JsonArray();
        hack.add(new JsonPrimitive(11));
        hack.add(new JsonPrimitive(21));
        hack.add(new JsonPrimitive(82));
        hack.add(new JsonPrimitive(83));
        hack.add(new JsonPrimitive(90));
        jobConf.set("hack", hack.toString());

        LOG.info("Starting JOB");
        JobClient.runJob(jobConf);

        Path finalOutputPath = new Path(internalMergedDBPath, Long.toString(mergeTimesmap));
        LOG.info("Renaming tempoutput:" + outputPath + " to:" + finalOutputPath);
        fs.rename(outputPath, finalOutputPath);
    }
}
From source file:org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask.java
License:Open Source License
protected void finalStepComplete(CrawlPipelineStep finalStep, Path finalStepOutputDir) throws IOException {
    if (promoteFinalStepOutput()) {
        FileSystem fs = getFileSystem();
        Path taskAsStepOutputDir = getOutputDir();
        fs.mkdirs(taskAsStepOutputDir);

        getLogger().info("finalStepComplete callback triggered - promoting output from:"
                + finalStep.getDescription() + " to output dir:" + taskAsStepOutputDir);

        // copy everything from final step into task output ...
        FileStatus files[] = fs.globStatus(new Path(finalStepOutputDir, "*"));
        if (files.length != 0) {
            fs.delete(taskAsStepOutputDir, true);
            fs.mkdirs(taskAsStepOutputDir);
        }
        for (FileStatus file : files) {
            fs.rename(file.getPath(), new Path(taskAsStepOutputDir, file.getPath().getName()));
        }
    }
}
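The example above promotes a step's output by renaming each file into the task output directory one at a time. A condensed sketch of that per-file promotion pattern (the class name, directory arguments, and the stricter error handling are illustrative, not taken from the CommonCrawl code):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class PerFilePromotion {
        // Move every file matching stepOutputDir/* into taskOutputDir, keeping file names.
        static void promoteAll(FileSystem fs, Path stepOutputDir, Path taskOutputDir) throws IOException {
            fs.mkdirs(taskOutputDir);
            FileStatus[] files = fs.globStatus(new Path(stepOutputDir, "*"));
            if (files == null) {
                return; // nothing to promote
            }
            for (FileStatus file : files) {
                Path dest = new Path(taskOutputDir, file.getPath().getName());
                // Unlike the example above, fail loudly if an individual rename returns false.
                if (!fs.rename(file.getPath(), dest)) {
                    throw new IOException("Failed to rename " + file.getPath() + " to " + dest);
                }
            }
        }
    }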
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = new String();
        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);
        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:" + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file:org.commoncrawl.service.crawler.CrawlLog.java
License:Open Source License
/** perform the actual checkpoint work here ... **/
private void doCheckpoint() {
    // at this point, we should be in the async thread, and all flusher
    // activities are blocked ...
    LOG.info("CrawlLog Checkpoint - Starting ");

    // collect all necessary information from thread-unsafe data structure now
    // (in async thread context)
    final Set<Long> activeSegments = new HashSet<Long>();

    try {
        // add all active segment ids to our key set ...
        activeSegments.addAll(_loggers.keySet());

        LOG.info("CrawlLog Checkpoint - Preparing CrawlLog Files");
        // checkpoint crawl log ...
        checkpointLocalCrawlLog();

        LOG.info("CrawlLog Checkpoint - Preparing Segment Log Files");
        // next checkpoint all active segment logs ...
        for (CrawlSegmentLog segmentLog : _loggers.values()) {
            segmentLog.checkpointLocalLog();
        }
        LOG.info("CrawlLog Checkpoint - Ready for HDFS Transfer");
    } catch (IOException e) {
        LOG.error("Checkpoint failed with Exception:" + CCStringUtils.stringifyException(e));
    }

    // spawn a thread to do most of the blocking io ...
    _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() {

        public Boolean call() throws Exception {
            // we need to track these in case of failure ...
            Vector<Path> segmentLogStagingPaths = new Vector<Path>();
            Vector<Path> segmentLogFinalPaths = new Vector<Path>();

            // get the file system
            final FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

            try {
                LOG.info("CrawlLog Checkpoint - Transferring CrawlLog to HDFS");
                // construct a target path (where we are going to store the
                // checkpointed crawl log )
                Path stagingDirectory = new Path(CrawlEnvironment.getCheckpointStagingDirectory());

                SequenceFileCrawlURLWriter hdfsWriter = new SequenceFileCrawlURLWriter(
                        CrawlEnvironment.getHadoopConfig(), hdfs, stagingDirectory, getNodeName(), _checkpointId);

                try {
                    // write out crawl log to hdfs ...
                    transferLocalCheckpointLog(getCheckpointPath(_rootDirectory), hdfsWriter, _checkpointId);
                } catch (Exception e) {
                    LOG.error("HDFS Write of CrawlLog failed. Deleting tempFiles:" + hdfsWriter.getFilenames()
                            + " Exception:" + CCStringUtils.stringifyException(e));
                    // close writer
                    hdfsWriter.close();
                    // delete any hdfs output ...
                    for (Path path : hdfsWriter.getFilenames()) {
                        LOG.info("Deleting temp (HDFS) checkpoint file:" + path);
                        hdfs.delete(path, false);
                    }
                    throw e;
                } finally {
                    hdfsWriter.close();
                }

                LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Logs");

                // and next for every segment
                for (long packedLogId : activeSegments) {
                    File segmentLogPath = CrawlSegmentLog.buildCheckpointPath(_rootDirectory,
                            getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId));

                    // LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Log for Segment:"
                    // + segmentId);

                    // copy the segment log ...
                    Path remoteLogFilePath = transferLocalSegmentLog(hdfs, segmentLogPath, _checkpointId,
                            getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId));

                    // if path is not null (data was copied) ...
                    if (remoteLogFilePath != null) {
                        // add it to vector ...
                        segmentLogStagingPaths.add(remoteLogFilePath);
                        // and add final path to vector while we are at it ...
                        segmentLogFinalPaths.add(getFinalSegmentLogPath(hdfs, _checkpointId,
                                getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)));
                    }
                }
                LOG.info("CrawlLog Checkpoint - Finished Transferring CrawlSegment Logs");

                // now if we got here ... all hdfs transfers succeeded ...
                // go ahead and move checkpoint log from staging to final data
                // directory ...
                Path checkpointDirectory = new Path(CrawlEnvironment.getCheckpointDataDirectory());

                // if no checkpoint data directory ... create one ...
                if (!hdfs.exists(checkpointDirectory))
                    hdfs.mkdirs(checkpointDirectory);

                for (Path checkpointTempFilePath : hdfsWriter.getFilenames()) {
                    Path checkpointFinalPath = new Path(checkpointDirectory, checkpointTempFilePath.getName());
                    LOG.info("Promoting Checking File From:" + checkpointTempFilePath + " to:"
                            + checkpointFinalPath);
                    // and essentially move the crawl log file from staging to data
                    // directory ..
                    boolean success = hdfs.rename(checkpointTempFilePath, checkpointFinalPath);
                    if (!success) {
                        throw new IOException("Failed to Rename Checkpoint Temp:" + checkpointTempFilePath
                                + " to:" + checkpointFinalPath);
                    }
                }

                // and now do the same thing for each segment log files
                for (int i = 0; i < segmentLogStagingPaths.size(); ++i) {
                    hdfs.rename(segmentLogStagingPaths.get(i), segmentLogFinalPaths.get(i));
                }

                // if we got here checkpoint was successfull...
                return true;
            } catch (Exception e) {
                LOG.error("Checkpoint:" + _checkpointId + " FAILED with exception:"
                        + CCStringUtils.stringifyException(e));

                for (Path segmentPath : segmentLogStagingPaths) {
                    hdfs.delete(segmentPath);
                }
                for (Path segmentPath : segmentLogFinalPaths) {
                    hdfs.delete(segmentPath);
                }
                throw e;
            }
        }
    }, new CompletionCallback<Boolean>() {

        public void taskComplete(Boolean updateResult) {
            Vector<Long> completedSegmentList = new Vector<Long>();

            LOG.info("CrawlLog Checkpoint - Finalizing CrawlLog Checkpoint");
            // delete the local checkpoint log ...
            finalizeCheckpoint();

            LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLogs");
            for (CrawlSegmentLog segmentLog : _loggers.values()) {
                // LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLog for Segment:"
                // + segmentLog.getSegmentId());

                // finalize the checkpoint on the segment log ...
                segmentLog.finalizeCheckpoint();
                // and check to see if the segment has been completed ...
                if (segmentLog.isSegmentComplete()) {
                    // if so, add it our completed segments list ...
                    completedSegmentList.add(makeSegmentLogId(segmentLog.getListId(), segmentLog.getSegmentId()));
                }
            }

            // now for all completed segments ... purge hdfs logs ...
            for (long packedSegmentId : completedSegmentList) {
                try {
                    LOG.info("CrawlLog Checkpoint - Purging HDFS CrawlSegmentLogs from Completed Segment. List:"
                            + getListIdFromLogId(packedSegmentId) + " Segment:"
                            + getSegmentIdFromLogId(packedSegmentId));

                    // purge hdfs files (and create a completion log file)
                    purgeHDFSSegmentLogs(CrawlEnvironment.getDefaultFileSystem(),
                            getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));

                    LOG.info("CrawlLog Checkpoint - Purging Local CrawlSegmentLogs from Completed Segment. List:"
                            + getListIdFromLogId(packedSegmentId) + " Segment:"
                            + getSegmentIdFromLogId(packedSegmentId));

                    // and purge local files as well ...
                    _loggers.get(packedSegmentId).purgeLocalFiles();
                } catch (IOException e) {
                    LOG.error("Purge SegmentLog for Segment List:" + getListIdFromLogId(packedSegmentId)
                            + " Segment:" + getSegmentIdFromLogId(packedSegmentId) + " threw IOException:"
                            + CCStringUtils.stringifyException(e));
                }

                LOG.info("CrawlLog Checkpoint - DeRegistering Segment List:" + getListIdFromLogId(packedSegmentId)
                        + " Segment:" + getSegmentIdFromLogId(packedSegmentId) + " From CrawlLog");

                // no matter what ... unload the segment ...
                _loggers.remove(packedSegmentId);
            }

            CheckpointCompletionCallback callback = _checkpointCompletionCallback;
            long checkpointId = _checkpointId;

            // otherwise transition to a checkpoint in progress state
            _checkpointCompletionCallback = null;
            _checkpointId = -1;

            LOG.info("CrawlLog Checkpoint - Checkpoint Complete - Initiating Callback");

            // and complete transaction ...
            callback.checkpointComplete(checkpointId, completedSegmentList);
        }

        public void taskFailed(Exception e) {
            // all failures are critical in this particular task ...
            LOG.error("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));

            // revert checkpoint logs ...
            abortCheckpoint();
            for (CrawlSegmentLog segmentLog : _loggers.values()) {
                segmentLog.abortCheckpoint();
            }

            CheckpointCompletionCallback callback = _checkpointCompletionCallback;
            long checkpointId = _checkpointId;

            // otherwise transition to a checkpoint in progress state
            _checkpointCompletionCallback = null;
            _checkpointId = -1;

            // now check to see if this was corrupt crawl log exception
            if (e.getCause() instanceof CorruptCrawlLogException) {
                // ACK!!!
                LOG.fatal("Corrupt CrawlLog detected with Exception:" + CCStringUtils.stringifyException(e));
                try {
                    // this is a serious error ... time to purge the crawl log directory
                    // altogether ...
                    purgeActiveLog();
                    // and all active segment logs as well...
                    for (CrawlSegmentLog segmentLog : _loggers.values()) {
                        segmentLog.purgeActiveLog();
                    }
                } catch (IOException e2) {
                    LOG.error("IOException during Segment Log PURGE:" + CCStringUtils.stringifyException(e2));
                }
                // time to die hard ...
                throw new RuntimeException(e);
            }

            // and complete transaction ...
            callback.checkpointFailed(checkpointId, e);
        }
    }));
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private boolean validateOnDiskVersion() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    Path dataFilePath = getDataFileFinalPath();
    LOG.info("Loading BloomFilter From Disk at Path:" + dataFilePath);
    if (fs.exists(dataFilePath)) {
        FSDataInputStream stream = null;

        try {
            stream = fs.open(dataFilePath);
            DataInputStream dataInput = new DataInputStream(stream);
            // skip version
            dataInput.readInt();
            // read crawl version ...
            int serializedCrawlVersion = dataInput.readInt();
            LOG.info("BloomFilter From On Disk has CrawlVersion:" + serializedCrawlVersion);
            if (serializedCrawlVersion < _state.getCurrentCrawlNumber()) {
                LOG.error("skipping load because serial crawl number is less than current crawl");
                stream.close();
                stream = null;
                fs.rename(dataFilePath, new Path(dataFilePath.getParent(),
                        dataFilePath.getName() + "-V-" + serializedCrawlVersion));
                return false;
            }
            return true;
        } finally {
            if (stream != null)
                stream.close();
        }
    }
    return false;
}
From source file:org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License:Open Source License
private void startCheckpointThread(final FileSystem fs) {
    _checkpointThread = new Thread(new Runnable() {

        @Override
        public void run() {
            // ok check point thread run in perpetuty
            while (!_shutdownFlag) {
                if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1
                        || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL
                        || (System.currentTimeMillis() - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) {

                    // LOG.info("Checkpoint Thread Grabbing Semaphore");
                    // grab checkpoint thread semaphore
                    _checkpointThreadSemaphore.acquireUninterruptibly();
                    // LOG.info("Checkpoint Thread Grabbed Semaphore");

                    try {
                        // create scan pattern
                        Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                                + _state.getCurrentCrawlNumber() + "/*/"
                                + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName()));

                        // scan hdfs for log files
                        FileStatus candidates[];
                        try {
                            LOG.info("Checkpoint Thread Scanning For Cadnidates in:" + hdfsScanPath);
                            candidates = fs.globStatus(hdfsScanPath);

                            // iterate candidates
                            for (FileStatus candidate : candidates) {
                                // check candidate against processed path list ...
                                if (!_processedPaths.contains(candidate.getPath())) {
                                    int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
                                    // ok found a candidate we can work on
                                    LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
                                    final URLFPV2 placeHolderFP = new URLFPV2();
                                    CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(),
                                            new CrawlSegmentLog.LogFileItemCallback() {

                                                @Override
                                                public void processItem(long domainHash, long urlFingerprint) {
                                                    placeHolderFP.setDomainHash(domainHash);
                                                    placeHolderFP.setUrlHash(urlFingerprint);
                                                    // add item for bloom filter
                                                    _bloomFilter.add(placeHolderFP);
                                                    // inrement urls processed count ...
                                                    _urlsProcessedSinceCheckpoint.addAndGet(1);
                                                }
                                            });
                                    _processedPaths.add(candidate.getPath());
                                    LOG.info("Finished Processing Candidate:" + candidate.getPath());
                                }
                            }

                            // update scan time ...
                            _lastCheckpointScanTime = System.currentTimeMillis();

                            // see if can do a full checkpoint ...
                            if (_lastCheckpointFlushTime == -1
                                    || System.currentTimeMillis() - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) {

                                int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
                                // ok at this point we are read to initialize a checkpoint
                                if (approximateItemsToFlush != 0) {
                                    Path checkpointMutexPath = getCheckpointMutexPath();

                                    if (fs.createNewFile(checkpointMutexPath)) {
                                        try {
                                            LOG.info("Checkpoint Thread Starting Checkpoint");

                                            // get the checkpoint path ...
                                            Path checkpointPath = getDataFileCheckpointPath();
                                            Path finalPath = getDataFileFinalPath();

                                            LOG.info("Checkpoint Thread Writing BloomFilter Data");
                                            // serialize the filter ...
                                            serializeBloomFilter(checkpointPath);

                                            LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
                                            // ok now everything seems to have gone fine ... delete existing data file
                                            fs.delete(finalPath);

                                            LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
                                            // rename checkpoint to final ...
                                            fs.rename(checkpointPath, finalPath);

                                            if (_state.getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
                                                LOG.info("Checkpoint Thread Deleting Processed Files");
                                                // ok safely delete all processed files
                                                for (Path processedFilePath : _processedPaths) {
                                                    fs.delete(processedFilePath);
                                                }
                                                _processedPaths.clear();
                                            } else {
                                                LOG.info("Skipping Processed Files Purge because we are in Transitioning State");
                                            }
                                            _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
                                        } finally {
                                            LOG.info("Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
                                            fs.delete(checkpointMutexPath, false);
                                        }
                                    } else {
                                        int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY);
                                        LOG.info("Checkpoint thread failed to acquire Mutex:" + checkpointMutexPath
                                                + " Waiting " + delay + "(MS) before retry");
                                        try {
                                            Thread.sleep(delay);
                                        } catch (InterruptedException e) {
                                        }
                                    }
                                }
                                // update last checkpoint no time no matter what ...
                                _lastCheckpointFlushTime = System.currentTimeMillis();
                            }
                        } catch (IOException e) {
                            LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            try {
                                Thread.sleep(60000);
                            } catch (InterruptedException e1) {
                            }
                        }
                    } finally {
                        LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
                        _checkpointThreadSemaphore.release();
                    }
                } else {
                    try {
                        // LOG.info("Checkpoint Thread IDLE");
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                    }
                }
            }
        }
    });
    _checkpointThread.start();
}
From source file:org.commoncrawl.util.CrawlLogSplitter.java
License:Open Source License
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        // get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();
        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);
        LOG.info("Initial Output Path is:" + outputPart);
        fs.delete(outputPart, false);

        // create reader
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }

        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");

        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }

        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}
From source file:org.deeplearning4j.hadoop.modelsaving.HdfsModelSaver.java
License:Apache License
@Override
public void save(Serializable ser) {
    FileSystem system;
    try {
        system = FileSystem.get(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    if (reWrite) {
        try {
            system.rename(path, new Path(path.getParent(), path.getName() + System.currentTimeMillis()));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else {
        try {
            OutputStream os = system.create(path);
            ObjectOutputStream bos = new ObjectOutputStream(os);
            bos.writeObject(ser);
            bos.flush();
            bos.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}