List of usage examples for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer
public DataOutputBuffer()
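DataOutputBuffer is a reusable, in-memory DataOutput: the backing byte array is exposed through getData() and getLength(), and reset() rewinds the write position so the buffer can be reused without reallocation. Before the real-world examples below, here is a minimal illustrative sketch of the typical write/read round trip (the class and variable names are ours, not taken from any of the source files listed):

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class DataOutputBufferExample {
  public static void main(String[] args) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    // write some primitives into the in-memory buffer
    out.writeInt(42);
    out.writeUTF("hello");

    // getData() returns the backing array; only the first getLength() bytes are valid
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    System.out.println(in.readInt()); // 42
    System.out.println(in.readUTF()); // hello

    // reset() rewinds the write position so the same buffer can be reused
    out.reset();
  }
}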
From source file:org.apache.tez.runtime.library.common.TestValuesIterator.java
License:Apache License
/**
 * create inmemory segments
 *
 * @return
 * @throws IOException
 */
public List<TezMerger.Segment> createInMemStreams() throws IOException {
  int numberOfStreams = Math.max(2, rnd.nextInt(10));
  LOG.info("No of streams : " + numberOfStreams);

  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(keyClass);
  Serializer valueSerializer = serializationFactory.getSerializer(valClass);

  LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
  InputContext context = createTezInputContext();
  MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null, null, null,
      null, 1024 * 1024 * 10, null, false, -1);

  DataOutputBuffer keyBuf = new DataOutputBuffer();
  DataOutputBuffer valBuf = new DataOutputBuffer();
  DataInputBuffer keyIn = new DataInputBuffer();
  DataInputBuffer valIn = new DataInputBuffer();

  keySerializer.open(keyBuf);
  valueSerializer.open(valBuf);

  List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
  for (int i = 0; i < numberOfStreams; i++) {
    BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
    InMemoryWriter writer = new InMemoryWriter(bout);
    Map<Writable, Writable> data = createData();
    // write data
    for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
      keySerializer.serialize(entry.getKey());
      valueSerializer.serialize(entry.getValue());
      keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
      valIn.reset(valBuf.getData(), 0, valBuf.getLength());
      writer.append(keyIn, valIn);
      originalData.put(entry.getKey(), entry.getValue());
      keyBuf.reset();
      valBuf.reset();
      keyIn.reset();
      valIn.reset();
    }
    IFile.Reader reader = new InMemoryReader(mergeManager, null, bout.getBuffer(), 0, bout.getBuffer().length);
    segments.add(new TezMerger.Segment(reader, true));
    data.clear();
    writer.close();
  }
  return segments;
}
From source file:org.apache.tez.runtime.library.output.TestOnFileSortedOutput.java
License:Apache License
private OutputContext createTezOutputContext() throws IOException {
  String[] workingDirs = { workingDir.toString() };
  UserPayload payLoad = TezUtils.createUserPayloadFromConf(conf);
  DataOutputBuffer serviceProviderMetaData = new DataOutputBuffer();
  serviceProviderMetaData.writeInt(PORT);

  TezCounters counters = new TezCounters();

  OutputContext context = mock(OutputContext.class);
  doReturn(counters).when(context).getCounters();
  doReturn(workingDirs).when(context).getWorkDirs();
  doReturn(payLoad).when(context).getUserPayload();
  doReturn(5 * 1024 * 1024L).when(context).getTotalMemoryAvailableToTask();
  doReturn(UniqueID).when(context).getUniqueIdentifier();
  doReturn("v1").when(context).getDestinationVertexName();
  doReturn(ByteBuffer.wrap(serviceProviderMetaData.getData())).when(context)
      .getServiceProviderMetaData(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID);
  doAnswer(new Answer() {
    @Override
    public Object answer(InvocationOnMock invocation) throws Throwable {
      long requestedSize = (Long) invocation.getArguments()[0];
      MemoryUpdateCallbackHandler callback = (MemoryUpdateCallbackHandler) invocation.getArguments()[1];
      callback.memoryAssigned(requestedSize);
      return null;
    }
  }).when(context).requestInitialMemory(anyLong(), any(MemoryUpdateCallback.class));

  ExecutionContext executionContext = mock(ExecutionContext.class);
  doReturn(HOST).when(executionContext).getHostName();
  doReturn(executionContext).when(context).getExecutionContext();

  return context;
}
From source file:org.apache.twill.internal.yarn.YarnUtils.java
License:Apache License
/**
 * Encodes the given {@link Credentials} as bytes.
 */
public static ByteBuffer encodeCredentials(Credentials credentials) {
  try {
    DataOutputBuffer out = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(out);
    return ByteBuffer.wrap(out.getData(), 0, out.getLength());
  } catch (IOException e) {
    // Shouldn't throw
    LOG.error("Failed to encode Credentials.", e);
    throw Throwables.propagate(e);
  }
}
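For reference, the reverse operation can be done with org.apache.hadoop.io.DataInputByteBuffer and Credentials.readTokenStorageStream(). A minimal sketch, assuming the ByteBuffer was produced by encodeCredentials() above (decodeCredentials is an illustrative name, not a method of the class shown):

public static Credentials decodeCredentials(ByteBuffer buffer) throws IOException {
  // wrap the encoded bytes in a DataInputStream view; duplicate() leaves the caller's position untouched
  DataInputByteBuffer in = new DataInputByteBuffer();
  in.reset(buffer.duplicate());
  Credentials credentials = new Credentials();
  credentials.readTokenStorageStream(in);
  return credentials;
}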
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java
License:Open Source License
private static void compareKeys(RawComparator<TextBytes> comparator, TextBytes key1, TextBytes key2,
    int expectedResult) {
  long nanoStart = System.nanoTime();
  Assert.assertEquals(comparator.compare(key1, key2), expectedResult);
  long nanoEnd = System.nanoTime();
  System.out.println("Object Comparison Took:" + (nanoEnd - nanoStart));

  DataOutputBuffer outputBuffer1 = new DataOutputBuffer();
  DataOutputBuffer outputBuffer2 = new DataOutputBuffer();

  try {
    key1.write(outputBuffer1);
    key2.write(outputBuffer2);

    nanoStart = System.nanoTime();
    Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(),
        outputBuffer2.getData(), 0, outputBuffer2.getLength()), expectedResult);
    nanoEnd = System.nanoTime();
    System.out.println("Raw Comparison Took:" + (nanoEnd - nanoStart));

    int offset1 = outputBuffer1.getLength();
    int offset2 = outputBuffer2.getLength();

    key1.write(outputBuffer1);
    key2.write(outputBuffer2);

    Assert.assertEquals(
        comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1,
            outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),
        expectedResult);

    if (comparator instanceof LinkKeyComparator) {
      DataInputBuffer inputStream1 = new DataInputBuffer();
      DataInputBuffer inputStream2 = new DataInputBuffer();

      inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength());
      inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength());

      CrawlDBKey cdbkey1 = new CrawlDBKey();
      CrawlDBKey cdbkey2 = new CrawlDBKey();

      cdbkey1.readFields(inputStream1);
      cdbkey2.readFields(inputStream2);

      CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator();

      System.out.println("*Comparing Using CrawlDBKey Comparator");
      nanoStart = System.nanoTime();
      Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2), expectedResult);
      nanoEnd = System.nanoTime();
      System.out.println("Typed Comparison Took:" + (nanoEnd - nanoStart));
    }
  } catch (IOException e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
}
From source file:org.commoncrawl.service.crawler.CrawlerEngine.java
License:Open Source License
FlexBuffer getActiveHostListAsBuffer() throws IOException {
  if (_crawlActive && _httpCrawlQueue != null) {
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    Set<Integer> ipAddressSet = _httpCrawlQueue.getActiveHostIPs();
    WritableUtils.writeVInt(outputBuffer, ipAddressSet.size());
    for (int hostIP : ipAddressSet) {
      WritableUtils.writeVInt(outputBuffer, hostIP);
    }
    return new FlexBuffer(outputBuffer.getData(), 0, outputBuffer.getLength());
  }
  return null;
}
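The VInt-encoded host list written above can be read back with the matching WritableUtils.readVInt() calls. A minimal sketch, assuming the caller has the raw bytes, offset, and length out of the FlexBuffer (readActiveHostIPs is an illustrative name, not part of CrawlerEngine):

static Set<Integer> readActiveHostIPs(byte[] data, int offset, int length) throws IOException {
  DataInputBuffer in = new DataInputBuffer();
  in.reset(data, offset, length);
  // first VInt is the element count, followed by one VInt per host IP
  int count = WritableUtils.readVInt(in);
  Set<Integer> hostIPs = new HashSet<Integer>();
  for (int i = 0; i < count; i++) {
    hostIPs.add(WritableUtils.readVInt(in));
  }
  return hostIPs;
}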
From source file:org.commoncrawl.service.crawler.CrawlLog.java
License:Open Source License
private void flushLog(final FlushCompletionCallback completionCallback) {
  if (Environment.detailLogEnabled())
    LOG.info("LOG_FLUSH:Collecting Entries....");

  // set flush in progress indicator ...
  setFlushInProgress(true);

  // and collect buffers in async thread context (thus not requiring synchronization)
  final LinkedList<CrawlSegmentLog.LogItemBuffer> collector = new LinkedList<CrawlSegmentLog.LogItemBuffer>();

  // flush robots log
  _robotsSegment.flushLog(collector);

  // walk segments collecting log items ....
  for (CrawlSegmentLog logger : _loggers.values()) {
    // flush any log items into the collector
    logger.flushLog(collector);
  }

  if (Environment.detailLogEnabled())
    LOG.info("LOG_FLUSH:Collection Returned " + collector.size() + " Buffers");

  // walk collector list identifying the list of unique segment ids
  final Set<Long> packedSegmentIdSet = new HashSet<Long>();

  int urlItemCount = 0;

  for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
    if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
      packedSegmentIdSet.add(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
    }
    urlItemCount += buffer.getItemCount();
  }

  if (Environment.detailLogEnabled())
    LOG.info("LOG_FLUSH:There are " + urlItemCount + " Items in Flush Buffer Associated With "
        + packedSegmentIdSet.size() + " Segments");

  final File crawlLogFile = getActivePath(_rootDirectory);

  // now check to see if there is anything to do ...
  if (collector.size() != 0) {

    if (Environment.detailLogEnabled())
      LOG.info("LOG_FLUSH: Collector Size is NOT Zero... Starting Log Flusher Thread");

    // ok ... time to spawn a thread to do the blocking flush io
    _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() {

      public Boolean call() throws Exception {

        if (Environment.detailLogEnabled())
          LOG.info("LOG_FLUSH: Log Flusher Thread Started");

        long startTime = System.currentTimeMillis();

        Map<Long, DataOutputStream> streamsMapByPackedId = new HashMap<Long, DataOutputStream>();
        Map<Long, Integer> recordCountsByPackedId = new HashMap<Long, Integer>();

        long crawlLogRecordCount = 0;

        // open the actual crawler log file ...
        final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(crawlLogFile, true));

        try {
          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Log Flusher Thread Opening Streams for Segments in Buffer");

          // now open a set of file descriptors related to the identified segments
          for (long packedSegmentId : packedSegmentIdSet) {
            // construct the unique filename for the given log file...
            File activeSegmentLog = CrawlSegmentLog.buildActivePath(_rootDirectory,
                getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));
            // initialize the segment log ...
            CrawlSegmentLog.initializeLogFile(activeSegmentLog);
            // initialize record counts per stream ...
            recordCountsByPackedId.put(packedSegmentId, CrawlSegmentLog.readerHeader(activeSegmentLog));
            // and open an output stream for the specified log file ...
            streamsMapByPackedId.put(packedSegmentId,
                new DataOutputStream(new FileOutputStream(activeSegmentLog, true)));
          }

          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Log Flusher Thread Walking Items in Buffer");

          // initialize a total item count variable
          int totalItemCount = 0;

          // crawl history stream
          DataOutputBuffer historyStream = new DataOutputBuffer();

          // and now walk log buffers ...
          for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Log Flusher Thread Writing " + buffer.getItemCount()
                  + " Entries for Segment:" + buffer.getSegmentId());

            // output stream
            DataOutputStream segmentLogStream = null;

            if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
              // update segment count first ...
              recordCountsByPackedId.put(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()),
                  recordCountsByPackedId.get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()))
                      + buffer.getItemCount());
              // get output stream associated with segment id
              segmentLogStream = streamsMapByPackedId
                  .get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
            }

            // and our local record counter ...
            crawlLogRecordCount += buffer.getItemCount();

            // and next do the actual disk flush ...
            totalItemCount += buffer.flushToDisk(totalItemCount,
                new CrawlSegmentLog.LogItemBuffer.CrawlURLWriter() {

                  SyncedCrawlURLLogWriter syncedLogWriter = new SyncedCrawlURLLogWriter();

                  public void writeItem(CrawlURL url) throws IOException {
                    // log it
                    logCrawlLogWrite(url, url.getContentSize());
                    // write it
                    syncedLogWriter.writeItem(crawlLogStream, url);
                  }

                  public void writeItemCount(int entryCount) throws IOException {
                  }

                }, segmentLogStream, historyStream);
          }

          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Log Flusher Finished Writing Entries To Disk");

          collector.clear();
        } catch (IOException e) {
          LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
          throw e;
        } finally {
          if (crawlLogStream != null) {
            crawlLogStream.flush();
            crawlLogStream.close();
          }
          for (DataOutputStream stream : streamsMapByPackedId.values()) {
            if (stream != null) {
              stream.flush();
              stream.close();
            }
          }
        }

        // at this point... update the crawl log header ...
        try {
          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Updating Log File Headers");

          // update the log file header
          updateLogFileHeader(crawlLogFile, _header, crawlLogRecordCount);

          // and update each completion log header ...
          for (long packedSegmentId : recordCountsByPackedId.keySet()) {
            File activeSegmentLogPath = CrawlSegmentLog.buildActivePath(_rootDirectory,
                getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));
            CrawlSegmentLog.writeHeader(activeSegmentLogPath, recordCountsByPackedId.get(packedSegmentId));
          }
        } catch (IOException e) {
          LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
          throw e;
        } finally {
        }

        long endTime = System.currentTimeMillis();

        _flushTimeAVG.addSample((double) endTime - startTime);
        _flushTimeSmoothed.addSample((double) endTime - startTime);
        _lastFlushTime = endTime - startTime;

        LOG.info("LOG_FLUSH: Log Flusher Flushed Successfully");

        return true;
      }

    }, new CompletionCallback<Boolean>() {

      public void taskComplete(Boolean updateResult) {
        setFlushInProgress(false);
        if (completionCallback != null) {
          completionCallback.flushComplete();
        }
      }

      public void taskFailed(Exception e) {
        setFlushInProgress(false);
        if (completionCallback != null) {
          completionCallback.flushFailed(e);
        }
        // all failures are critical in this particular task ...
        LOG.fatal("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));
        // no matter ... it is time to CORE the server ...
        throw new RuntimeException("CRITICAL FAILURE: Crawl Log FLUSH Threw Exception:"
            + CCStringUtils.stringifyException(e));
      }
    }));
  } else {
    setFlushInProgress(false);
    if (completionCallback != null) {
      completionCallback.flushComplete();
    }
  }
}
From source file:org.commoncrawl.service.crawler.SegmentLoader.java
License:Open Source License
@SuppressWarnings("unchecked") public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId, int segmentId, String crawlerName, CancelOperationCallback cancelCallback) throws IOException { CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap(); WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost"); // construct hdfs path to segment ... Path hdfsPath;/* w ww . jav a 2 s .c om*/ if (segmentId != -1) hdfsPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/"); else hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"); Path workUnitDetailPath = new Path(hdfsPath, crawlerName); SequenceFile.Reader reader = null; try { FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem(); reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig()); LongWritable hostFP = new LongWritable(); CrawlSegmentHost segmentHost = new CrawlSegmentHost(); DataOutputBuffer outputBuffer = new DataOutputBuffer(); int segmentUrlCount = 0; while (reader.next(hostFP, segmentHost) && cancelCallback.cancelOperation() == false) { // and update url count ... segmentUrlCount += segmentHost.getUrlTargets().size(); // set the url vector to the appropriate size ... for (CrawlSegmentURL url : segmentHost.getUrlTargets()) { WritableUtils.writeVLong(outputBuffer, segmentHost.getHostFP()); WritableUtils.writeVLong(outputBuffer, url.getUrlFP()); } } outputBuffer.flush(); // ok set the urlfp stream fpMap.setURLFPBuffer(segmentUrlCount, outputBuffer.getData(), outputBuffer.getLength()); // now initialize the if (cancelCallback.cancelOperation()) { return null; } else { return fpMap; } } finally { if (reader != null) reader.close(); } }
From source file:org.commoncrawl.service.listcrawler.CacheWriterThread.java
License:Open Source License
@Override
public void run() {
  boolean shutdown = false;

  while (!shutdown) {
    try {
      final CacheWriteRequest request = _writeRequestQueue.take();

      switch (request._requestType) {

      case ExitThreadRequest: {
        // shutdown condition ...
        CacheManager.LOG.info("Disk Writer Thread Received Shutdown. Exiting!");
        shutdown = true;
      }
        break;

      case WriteRequest: {
        long timeStart = System.currentTimeMillis();

        try {
          // reset crc calculator (single thread so no worries on synchronization)
          _crc32Out.reset();

          // figure out if we need to compress the item ...
          if ((request._item.getFlags() & CacheItem.Flags.Flag_IsCompressed) == 0
              && request._item.getContent().getCount() != 0) {
            LOG.info("Incoming Cache Request Content for:" + request._item.getUrl()
                + " is not compressed. Compressing...");
            ByteStream compressedBytesOut = new ByteStream(request._item.getContent().getCount());
            ThriftyGZIPOutputStream gzipOutputStream = new ThriftyGZIPOutputStream(compressedBytesOut);
            gzipOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
                request._item.getContent().getCount());
            gzipOutputStream.finish();
            LOG.info("Finished Compressing Incoming Content for:" + request._item.getUrl() + " BytesIn:"
                + request._item.getContent().getCount() + " BytesOut:" + compressedBytesOut.size());
            // replace buffer
            request._item
                .setContent(new FlexBuffer(compressedBytesOut.getBuffer(), 0, compressedBytesOut.size()));
            request._item.setFlags((request._item.getFlags() | CacheItem.Flags.Flag_IsCompressed));
          }

          // create streams ...
          ByteStream bufferOutputStream = new ByteStream(8192);
          CheckedOutputStream checkedStream = new CheckedOutputStream(bufferOutputStream, _crc32Out);
          DataOutputStream dataOutputStream = new DataOutputStream(checkedStream);

          // remember if this item has content ...
          boolean hasContent = request._item.isFieldDirty(CacheItem.Field_CONTENT);
          // now mark the content field as clean, so that it will not be serialized in our current
          // serialization attempt ...
          request._item.setFieldClean(CacheItem.Field_CONTENT);
          // and go ahead and write out the data to the intermediate buffer while also computing partial checksum
          request._item.write(dataOutputStream);
          request._item.setFieldDirty(CacheItem.Field_CONTENT);

          // ok, now ... write out file header ...
          CacheItemHeader itemHeader = new CacheItemHeader(_manager.getLocalLogSyncBytes());
          itemHeader._status = CacheItemHeader.STATUS_ALIVE;
          itemHeader._lastAccessTime = System.currentTimeMillis();
          itemHeader._fingerprint = request._itemFingerprint;
          // compute total length ...
          // first the header bytes in the cacheItem
          itemHeader._dataLength = bufferOutputStream.size();
          // next the content length (encoded - as in size + bytes) ...
          itemHeader._dataLength += 4 + request._item.getContent().getCount();
          // lastly the crc value itself ...
          itemHeader._dataLength += 8;

          // open the log file ...
          DataOutputBuffer logStream = new DataOutputBuffer();

          // ok, go ahead and write the header
          itemHeader.writeHeader(logStream);
          // ok now write out the item data minus content...
          logStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());

          // now create a checked stream for the content ...
          CheckedOutputStream checkedStream2 = new CheckedOutputStream(logStream, checkedStream.getChecksum());

          dataOutputStream = new DataOutputStream(checkedStream2);

          // content size
          dataOutputStream.writeInt(request._item.getContent().getCount());
          // now write out the content (via checked stream so that we can calc checksum on content)
          dataOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
              request._item.getContent().getCount());
          // ok ... lastly write out the checksum bytes ...
          dataOutputStream.writeLong(checkedStream2.getChecksum().getValue());
          // and FINALLY, write out the total item bytes (so that we can seek in reverse to read the last
          // request from the log)
          logStream.writeInt(CacheItemHeader.SIZE + itemHeader._dataLength);

          // ok flush everything to the memory stream
          dataOutputStream.flush();

          // ok - time to acquire the log semaphore
          // LOG.info("Acquiring Local Log Semaphore");
          _manager.getLocalLogAccessSemaphore().acquireUninterruptibly();

          try {
            // now time to acquire the write semaphore ...
            _manager.getLocalLogWriteAccessSemaphore().acquireUninterruptibly();

            // get the current file position
            long recordOffset = _manager.getLocalLogFilePos();

            try {
              long ioTimeStart = System.currentTimeMillis();

              RandomAccessFile logFile = new RandomAccessFile(_manager.getActiveLogFilePath(), "rw");

              try {
                // seek to our known record offset
                logFile.seek(recordOffset);
                // write out the data
                logFile.write(logStream.getData(), 0, logStream.getLength());
              } finally {
                logFile.close();
              }

              // now we need to update the file header
              _manager.updateLogFileHeader(_manager.getActiveLogFilePath(), 1,
                  CacheItemHeader.SIZE + itemHeader._dataLength + 4 /* trailing bytes */);

              CacheManager.LOG.info("#### Wrote Cache Item in:" + (System.currentTimeMillis() - timeStart)
                  + " iotime:" + (System.currentTimeMillis() - ioTimeStart) + " QueueSize:"
                  + _writeRequestQueue.size());
            } finally {
              // release write semaphore quickly
              _manager.getLocalLogWriteAccessSemaphore().release();
            }

            // now inform the manager of the completed request ...
            _manager.writeRequestComplete(request, recordOffset);
          } finally {
            // LOG.info("Releasing Local Log Semaphore");
            _manager.getLocalLogAccessSemaphore().release();
          }
        } catch (IOException e) {
          CacheManager.LOG.error(
              "### FUC# BATMAN! - GONNA LOSE THIS REQUEST!!!!:" + CCStringUtils.stringifyException(e));
          _manager.writeRequestFailed(request, e);
        }
      }
        break;
      }
    } catch (InterruptedException e) {
    }
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {
  SequenceFile.Reader reader = null;

  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");

  File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

  SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
      CrawlEnvironment.getHadoopConfig());

  ValueBytes valueBytes = indexReader.createValueBytes();
  DataOutputBuffer keyBytes = new DataOutputBuffer();
  DataInputBuffer keyBuffer = new DataInputBuffer();
  DataOutputBuffer finalOutputStream = new DataOutputBuffer();
  DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
  URLFP fp = new URLFP();

  try {
    while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
      keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
      // read fingerprint ...
      fp.readFields(keyBuffer);
      // write hash only
      finalOutputStream.writeLong(fp.getUrlHash());
      uncompressedValueBytes.reset();
      // write value bytes to intermediate buffer ...
      valueBytes.writeUncompressedBytes(uncompressedValueBytes);
      // write out uncompressed length
      WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
      // write out bytes
      finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
    }
    // delete existing ...
    cacheFilePath.delete();

    // compute crc ...
    CRC32 crc = new CRC32();
    crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());

    // open final output stream
    DataOutputStream fileOutputStream = new DataOutputStream(
        new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

    try {
      fileOutputStream.writeLong(crc.getValue());
      fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
      fileOutputStream.flush();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      fileOutputStream.close();
      fileOutputStream = null;
      cacheFilePath.delete();
      throw e;
    } finally {
      if (fileOutputStream != null) {
        fileOutputStream.close();
      }
    }
  } finally {
    if (indexReader != null) {
      indexReader.close();
    }
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
    ItemUpdater targetList) throws IOException {

  // ok copy stuff locally if possible ...
  File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
  File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
  File localBloomFilterPath = new File(getLocalDataDir(),
      CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

  SequenceFile.Reader reader = null;

  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

  // ok copy local first
  if (!localIndexPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
        + localIndexPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localIndexPath.delete();
      throw e;
    }
  }
  if (!localDataPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
        + localDataPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localDataPath.delete();
      throw e;
    }
  }
  if (!localBloomFilterPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
        + localBloomFilterPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localBloomFilterPath.delete();
      throw e;
    }
  }

  // ok open local
  FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

  SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
      new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

  try {
    URLFP firstIndexKey = null;
    URLFP lastIndexKey = new URLFP();
    LongWritable position = new LongWritable();
    while (indexReader.next(lastIndexKey, position)) {
      if (firstIndexKey == null) {
        try {
          firstIndexKey = (URLFP) lastIndexKey.clone();
        } catch (CloneNotSupportedException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
        + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
        + lastIndexKey.getUrlHash());

    URLFP criteriaFirstKey = criteria.first();
    URLFP criteriaLastKey = criteria.last();

    if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
      LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
      LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
          + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:" + criteriaLastKey.getDomainHash()
          + " URLHash:" + criteriaLastKey.getUrlHash());
      return;
    }
  } finally {
    indexReader.close();
  }

  LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");

  // load bloom filter
  FSDataInputStream bloomFilterStream = localFileSystem.open(new Path(localBloomFilterPath.getAbsolutePath()));

  int hitCount = 0;

  try {
    URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

    URLFP fpOut = new URLFP();
    ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
    DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
    ValueBytes valueBytes = null;
    DataInputBuffer valueReader = new DataInputBuffer();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();

    URLFP lastFP = null;

    outerLoop:
    // now iterate each item in the criteria
    for (URLFP targetFP : criteria) {
      // if fingerprint is present in filter ...
      if (filter.isPresent(targetFP)) {
        // check to see if reader is initialized ...
        if (reader == null) {
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
              + localDataPath.getAbsolutePath());
          reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
              CrawlEnvironment.getHadoopConfig());
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
              + localDataPath.getAbsolutePath());
          valueBytes = reader.createValueBytes();
        }

        // if last read fingerprint was not null ...
        if (lastFP != null) {
          // does it match the current item
          if (lastFP.compareTo(targetFP) == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + lastFP.getUrlHash() + " File:"
                + dataFilePath);
            // if so, null out last fp
            lastFP = null;
            // and update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            continue;
          }
        }

        // ok at this point .. read the next item in the list ...
        lastFP = null;

        while (reader.nextRaw(keyBytes, valueBytes) != -1) {
          // init reader ...
          keyReader.reset(keyBytes.getData(), keyBytes.getLength());
          // read key
          fpOut.readFields(keyReader);
          // reset output buffer
          keyBytes.reset();

          // LOG.info("LIST:" + listId + " nextRaw Returned DH:" + fpOut.getDomainHash() + " UH:"
          //     + fpOut.getUrlHash() + " TDH:" + targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());

          // compare it to target ...
          int result = fpOut.compareTo(targetFP);
          // ok does it match .. ?
          if (result == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash() + " File:"
                + dataFilePath);
            // update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            // and break to outer loop
            continue outerLoop;
          } else if (result == 1) {
            // LOG.info("LIST:" + listId + " FP Comparison Returned 1. Going to OuterLoop");
            // update last FP
            lastFP = fpOut;
            // continue outer loop
            continue outerLoop;
          } else {
            // otherwise skip
          }
        }

        // ok if we got here .. we are done reading the sequence file and did not find a trailing match
        LOG.warn("LIST:" + listId
            + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positive result (DomainHash:"
            + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
        // break out of outer loop
        break;
      }
    }
  } finally {
    bloomFilterStream.close();

    if (reader != null) {
      reader.close();
    }

    LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
  }
}