List of usage examples for org.apache.hadoop.io.WritableUtils.writeVInt
public static void writeVInt(DataOutput stream, int i) throws IOException
From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java
License:Open Source License
/**
 * Writes the id that opens a field, using the currently active field id encoding mode.
 *
 * @param out       destination the field id is written to
 * @param fieldName name of the field; not used by this implementation
 * @param fieldId   id written either as a short or as a variable-length int
 * @throws IOException if the encoding mode is unknown or the underlying write fails
 */
public void beginField(DataOutput out, String fieldName, int fieldId) throws IOException {
  if (_currentMode == FIELD_ID_ENCODING_MODE_UNKNOWN) {
    throw new IOException("Unknown Field Id Encoding Mode!");
  }

  if (_currentMode != FIELD_ID_ENCODING_MODE_SHORT) {
    // every mode other than SHORT uses the variable-length encoding
    WritableUtils.writeVInt(out, fieldId);
  } else {
    out.writeShort(fieldId);
  }
}
From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java
License:Open Source License
public void endFields(DataOutput out) throws IOException { if (_currentMode == FIELD_ID_ENCODING_MODE_UNKNOWN) { throw new IOException("Unknown Field Id Encoding Mode!"); }/*ww w .j a v a2s . com*/ if (_currentMode == FIELD_ID_ENCODING_MODE_SHORT) out.writeShort(-1); else WritableUtils.writeVInt(out, -1); // ok pop encoding mode popFieldIdEncodingMode(); // reduce nesting level _nestingLevel--; }
From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java
License:Open Source License
/**
 * Writes an int to the given output by delegating to {@code WritableUtils.writeVInt}.
 *
 * @param out destination stream
 * @param i   value to write
 * @throws IOException if the underlying write fails
 */
public void writeVInt(DataOutput out, int i) throws IOException {
  WritableUtils.writeVInt(out, i);
}
From source file:org.commoncrawl.service.crawler.CrawlerEngine.java
License:Open Source License
FlexBuffer getActiveHostListAsBuffer() throws IOException { if (_crawlActive && _httpCrawlQueue != null) { DataOutputBuffer outputBuffer = new DataOutputBuffer(); Set<Integer> ipAddressSet = _httpCrawlQueue.getActiveHostIPs(); WritableUtils.writeVInt(outputBuffer, ipAddressSet.size()); for (int hostIP : ipAddressSet) { WritableUtils.writeVInt(outputBuffer, hostIP); }//from w w w. j a v a 2 s.c o m return new FlexBuffer(outputBuffer.getData(), 0, outputBuffer.getLength()); } return null; }
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException { SequenceFile.Reader reader = null; Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp); Path indexFilePath = new Path(mapFilePath, "index"); Path dataFilePath = new Path(mapFilePath, "data"); File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp); SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath, CrawlEnvironment.getHadoopConfig()); ValueBytes valueBytes = indexReader.createValueBytes(); DataOutputBuffer keyBytes = new DataOutputBuffer(); DataInputBuffer keyBuffer = new DataInputBuffer(); DataOutputBuffer finalOutputStream = new DataOutputBuffer(); DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer(); URLFP fp = new URLFP(); try {//from ww w. j av a 2 s . c o m while (indexReader.nextRaw(keyBytes, valueBytes) != -1) { keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength()); // read fingerprint ... fp.readFields(keyBuffer); // write hash only finalOutputStream.writeLong(fp.getUrlHash()); uncompressedValueBytes.reset(); // write value bytes to intermediate buffer ... valueBytes.writeUncompressedBytes(uncompressedValueBytes); // write out uncompressed length WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength()); // write out bytes finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength()); } // delete existing ... cacheFilePath.delete(); // compute crc ... 
CRC32 crc = new CRC32(); crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength()); // open final output stream DataOutputStream fileOutputStream = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(cacheFilePath))); try { fileOutputStream.writeLong(crc.getValue()); fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength()); fileOutputStream.flush(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); fileOutputStream.close(); fileOutputStream = null; cacheFilePath.delete(); throw e; } finally { if (fileOutputStream != null) { fileOutputStream.close(); } } } finally { if (indexReader != null) { indexReader.close(); } } }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/** * update list state of a recently crawled item * //from w w w . j a v a2 s .co m * @param fingerprint - the fingerprint of the updated item * @param newData - the updated crawl history data for the given item * @throws IOException */ @Override public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem newData) throws IOException { if (_listState == LoadState.LOADED) { // check for membership ... if (_bloomFilter.isPresent(fingerprint)) { //LOG.info("UpdateItemState Called for URL:" + newData.getOriginalURL() + " List:" + getListId()); //LOG.info("UpdateItemState Loading OnDisk Item for URL:" + newData.getOriginalURL() + " List:" + getListId()); // extract existing item from disk OnDiskCrawlHistoryItem originalItem = loadOnDiskItemForURLFP(fingerprint); //if present (null if false cache hit) if (originalItem != null) { // build an on disk item data structure for any potential changes ... OnDiskCrawlHistoryItem newItem = onDiskItemFromHistoryItem(fingerprint, newData); // set inital offset information newItem._fileOffset = originalItem._fileOffset; newItem._stringsOffset = originalItem._stringsOffset; // LOG.info("UpdateItemState Comparing OnDisk Item to New Item for URL:" + newData.getOriginalURL() + " List:" + getListId()); // compare the two items ... if (!newItem.equals(originalItem)) { //LOG.info("UpdateItemState Items Don't Match for URL:" + newData.getOriginalURL() + " List:" + getListId()); // ok items do not match ... figure out if strings are different ... 
if (newItem._stringsCRC != originalItem._stringsCRC) { RandomAccessFile stringsFile = new RandomAccessFile(_variableDataFile, "rw"); try { // seek to end stringsFile.seek(stringsFile.length()); // update offset info newItem._stringsOffset = stringsFile.length(); // write out string data length WritableUtils.writeVInt(stringsFile, _stringBuffer1.getLength()); // write strings to log file stringsFile.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength()); } finally { stringsFile.close(); } } // otherwise take the offset from old item else { newItem._stringsOffset = originalItem._stringsOffset; } //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + newItem._urlFingerprint); // ok, different paths depending on wether this is an in memory update or not ... boolean wroteToMemory = false; synchronized (this) { if (_tempFixedDataBuffer != null) { wroteToMemory = true; // reset output buffer _tempOutputBuffer.reset(); // serizlie to output buffer newItem.serialize(_tempOutputBuffer); // copy to appropriate location System.arraycopy(_tempOutputBuffer.getData(), 0, _tempFixedDataBuffer, (int) originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE); } } if (!wroteToMemory) { // write to disk RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw"); try { while (true) { try { //LOG.info("*** TRYING UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset); FileLock lock = file.getChannel().tryLock(originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false); try { //LOG.info("*** GOT UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset); file.seek(originalItem._fileOffset); newItem.serialize(file); //LOG.info("Updated Data File for OnDiskItem for Fingerprint:" + originalItem._urlFingerprint); break; } finally { //LOG.info("*** RELEASED UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset); lock.release(); } } catch (OverlappingFileLockException e) { LOG.error("###LockConflict(RETRY):" + CCStringUtils.stringifyException(e)); } } } 
finally { file.close(); } } // ok now update metadata ... synchronized (_metadata) { int updateFlags = calculateUpdateFlags(originalItem, newItem); if (updateFlags != 0) { int metadataDirtyFlags = updateMetadata(newItem, _metadata, 0); // only write metadata to disk if temp data buffer is null if (metadataDirtyFlags != 0 && !wroteToMemory) { if ((metadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) { _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() - 1); } writeMetadataToDisk(); } // if not writing to memory then update subdomain metadata if (!wroteToMemory) { synchronized (_subDomainMetadataFile) { CrawlListMetadata subDomainMetadata = getSubDomainMetadataByURL( newData.getOriginalURL()); int subDomainMetadataDirtyFlags = updateMetadata(newItem, subDomainMetadata, processFileOffsets); if (subDomainMetadataDirtyFlags != 0 && !wroteToMemory) { if ((subDomainMetadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) { subDomainMetadata.setQueuedItemCount( subDomainMetadata.getQueuedItemCount() - 1); } writeSubDomainMetadataToDisk(subDomainMetadata); } } } } } synchronized (this) { if (_eventListener != null) { _eventListener.itemUpdated(fingerprint); } } } } } } }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private void writeInitialOnDiskItem(URLFP fp, ProxyCrawlHistoryItem historyItem, DataOutputStream valueStreamOut, RandomAccessFile stringStream) throws IOException { OnDiskCrawlHistoryItem itemOut = onDiskItemFromHistoryItem(fp, historyItem); // update string offset ... itemOut._stringsOffset = stringStream.length(); // write out string data length WritableUtils.writeVInt(stringStream, _stringBuffer1.getLength()); // write strings to log file stringStream.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength()); // update timestamp ... itemOut._updateTimestamp = -1;/* w w w . j a va 2s . co m*/ // and write to disk itemOut.serialize(valueStreamOut); }
From source file:org.commoncrawl.service.pagerank.slave.PageRankUtils.java
License:Open Source License
/**
 * Writes a fingerprint followed by a count: domain hash, root domain hash and url hash as
 * longs, then the count via {@code WritableUtils.writeVInt}.
 *
 * @param stream   destination stream
 * @param key      fingerprint whose three hashes are written
 * @param urlCount count written after the hashes
 * @throws IOException if the underlying write fails
 */
public static final void writeURLFPAndCountToStream(DataOutput stream, URLFPV2 key, int urlCount)
    throws IOException {
  // fixed-width part: the three hashes, in this order
  stream.writeLong(key.getDomainHash());
  stream.writeLong(key.getRootDomainHash());
  stream.writeLong(key.getUrlHash());
  // variable-width part: the count
  WritableUtils.writeVInt(stream, urlCount);
}
From source file:org.commoncrawl.util.FlexBuffer.java
License:Open Source License
@Override public void write(DataOutput out) throws IOException { // write out count WritableUtils.writeVInt(out, getCount()); // and bytes if count is not zero ... if (getCount() != 0) { out.write(get(), getOffset(), getCount()); }/*from ww w.j a va2s . com*/ }
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
/**
 * Serializes this object to {@code out}. The byte count is written first via
 * {@code WritableUtils.writeVInt}; the bytes follow only when the count is not zero.
 *
 * @param out destination stream
 * @throws IOException if the underlying write fails
 * @see Writable#write(DataOutput)
 */
public void write(DataOutput out) throws IOException {
  WritableUtils.writeVInt(out, bytes.getCount());

  if (bytes.getCount() == 0) {
    // nothing beyond the length prefix for empty content
    return;
  }
  out.write(bytes.get(), bytes.getOffset(), bytes.getCount());
}