Example usage for org.apache.hadoop.io WritableUtils writeVInt

Introduction

On this page you can find example usages of org.apache.hadoop.io.WritableUtils.writeVInt.

Prototype

public static void writeVInt(DataOutput stream, int i) throws IOException 

Document

Serializes an integer to a binary stream with zero-compressed encoding.
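
As a quick, self-contained illustration (not taken from any of the sources below), the following sketch writes two values with writeVInt and reads them back with the matching WritableUtils.readVInt call; the class name and the sample values are illustrative only.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.WritableUtils;

public class WriteVIntRoundTrip {
    public static void main(String[] args) throws IOException {
        // Write two ints with zero-compressed (variable-length) encoding.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        WritableUtils.writeVInt(out, 42); // small values occupy a single byte
        WritableUtils.writeVInt(out, 1000000); // larger values occupy more bytes
        out.flush();

        // Read the values back in the same order they were written.
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        System.out.println(WritableUtils.readVInt(in)); // 42
        System.out.println(WritableUtils.readVInt(in)); // 1000000
    }
}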

Usage

From source file: org.commoncrawl.rpc.base.shared.BinaryProtocol.java

License: Open Source License

public void beginField(DataOutput out, String fieldName, int fieldId) throws IOException {
    if (_currentMode == FIELD_ID_ENCODING_MODE_UNKNOWN) {
        throw new IOException("Unknown Field Id Encoding Mode!");
    }
    if (_currentMode == FIELD_ID_ENCODING_MODE_SHORT)
        out.writeShort(fieldId);
    else
        WritableUtils.writeVInt(out, fieldId);
}

From source file: org.commoncrawl.rpc.base.shared.BinaryProtocol.java

License: Open Source License

public void endFields(DataOutput out) throws IOException {
    if (_currentMode == FIELD_ID_ENCODING_MODE_UNKNOWN) {
        throw new IOException("Unknown Field Id Encoding Mode!");
    }
    if (_currentMode == FIELD_ID_ENCODING_MODE_SHORT)
        out.writeShort(-1);
    else
        WritableUtils.writeVInt(out, -1);

    // ok pop encoding mode
    popFieldIdEncodingMode();
    // reduce nesting level
    _nestingLevel--;
}

From source file: org.commoncrawl.rpc.base.shared.BinaryProtocol.java

License: Open Source License

public void writeVInt(DataOutput out, int i) throws IOException {
    WritableUtils.writeVInt(out, i);
}

From source file: org.commoncrawl.service.crawler.CrawlerEngine.java

License: Open Source License

FlexBuffer getActiveHostListAsBuffer() throws IOException {
    if (_crawlActive && _httpCrawlQueue != null) {

        DataOutputBuffer outputBuffer = new DataOutputBuffer();

        Set<Integer> ipAddressSet = _httpCrawlQueue.getActiveHostIPs();

        WritableUtils.writeVInt(outputBuffer, ipAddressSet.size());

        for (int hostIP : ipAddressSet) {
            WritableUtils.writeVInt(outputBuffer, hostIP);
        }

        return new FlexBuffer(outputBuffer.getData(), 0, outputBuffer.getLength());
    }
    return null;
}
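
The buffer above is written as a vint count followed by that many vint-encoded host IPs. As a hedged sketch of the consuming side (not part of the CommonCrawl sources; the class and method names are illustrative), the data could be decoded like this:

import java.io.DataInput;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.WritableUtils;

public class ActiveHostListReader {
    // Decodes a buffer written as writeVInt(count) followed by count vint-encoded host IPs.
    public static Set<Integer> readActiveHostIPs(DataInput in) throws IOException {
        int count = WritableUtils.readVInt(in);
        Set<Integer> ipAddresses = new HashSet<Integer>();
        for (int i = 0; i < count; i++) {
            ipAddresses.add(WritableUtils.readVInt(in));
        }
        return ipAddresses;
    }
}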

From source file: org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License: Open Source License

private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

    SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
            CrawlEnvironment.getHadoopConfig());

    ValueBytes valueBytes = indexReader.createValueBytes();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataOutputBuffer finalOutputStream = new DataOutputBuffer();
    DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
    URLFP fp = new URLFP();

    try {
        while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {

            keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
            // read fingerprint ...
            fp.readFields(keyBuffer);
            // write hash only
            finalOutputStream.writeLong(fp.getUrlHash());
            uncompressedValueBytes.reset();
            // write value bytes to intermediate buffer ...
            valueBytes.writeUncompressedBytes(uncompressedValueBytes);
            // write out uncompressed length
            WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
            // write out bytes
            finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
        }
        // delete existing ...
        cacheFilePath.delete();
        // compute crc ...
        CRC32 crc = new CRC32();
        crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
        // open final output stream
        DataOutputStream fileOutputStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

        try {
            fileOutputStream.writeLong(crc.getValue());
            fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
            fileOutputStream.flush();
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            fileOutputStream.close();
            fileOutputStream = null;
            cacheFilePath.delete();
            throw e;
        } finally {
            if (fileOutputStream != null) {
                fileOutputStream.close();
            }
        }
    } finally {
        if (indexReader != null) {
            indexReader.close();
        }
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

/**
 * Update the list state of a recently crawled item.
 *
 * @param fingerprint - the fingerprint of the updated item
 * @param newData - the updated crawl history data for the given item
 * @throws IOException
 */
@Override
public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem newData) throws IOException {

    if (_listState == LoadState.LOADED) {
        // check for membership ... 
        if (_bloomFilter.isPresent(fingerprint)) {

            //LOG.info("UpdateItemState Called for URL:" + newData.getOriginalURL() + " List:" + getListId());

            //LOG.info("UpdateItemState Loading OnDisk Item for URL:" + newData.getOriginalURL() + " List:" + getListId());
            // extract existing item from disk 
            OnDiskCrawlHistoryItem originalItem = loadOnDiskItemForURLFP(fingerprint);

            //if present (null if false cache hit) 
            if (originalItem != null) {

                // build an on disk item data structure for any potential changes ... 
                OnDiskCrawlHistoryItem newItem = onDiskItemFromHistoryItem(fingerprint, newData);

                // set initial offset information
                newItem._fileOffset = originalItem._fileOffset;
                newItem._stringsOffset = originalItem._stringsOffset;

                // LOG.info("UpdateItemState Comparing OnDisk Item  to New Item for URL:" + newData.getOriginalURL() + " List:" + getListId());
                // compare the two items ... 
                if (!newItem.equals(originalItem)) {
                    //LOG.info("UpdateItemState Items Don't Match for  URL:" + newData.getOriginalURL() + " List:" + getListId());
                    // ok items do not match ... figure out if strings are different ... 
                    if (newItem._stringsCRC != originalItem._stringsCRC) {
                        RandomAccessFile stringsFile = new RandomAccessFile(_variableDataFile, "rw");

                        try {
                            // seek to end 
                            stringsFile.seek(stringsFile.length());
                            // update offset info 
                            newItem._stringsOffset = stringsFile.length();
                            // write out string data length 
                            WritableUtils.writeVInt(stringsFile, _stringBuffer1.getLength());
                            // write strings to log file
                            stringsFile.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength());
                        } finally {
                            stringsFile.close();
                        }
                    }
                    // otherwise take the offset from old item 
                    else {
                        newItem._stringsOffset = originalItem._stringsOffset;
                    }
                    //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + newItem._urlFingerprint);

                    // ok, different paths depending on whether this is an in-memory update or not ...
                    boolean wroteToMemory = false;
                    synchronized (this) {
                        if (_tempFixedDataBuffer != null) {
                            wroteToMemory = true;
                            // reset output buffer 
                            _tempOutputBuffer.reset();
                            // serialize to output buffer
                            newItem.serialize(_tempOutputBuffer);
                            // copy to appropriate location 
                            System.arraycopy(_tempOutputBuffer.getData(), 0, _tempFixedDataBuffer,
                                    (int) originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE);
                        }
                    }

                    if (!wroteToMemory) {
                        // write to disk 
                        RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw");

                        try {

                            while (true) {
                                try {
                                    //LOG.info("*** TRYING UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
                                    FileLock lock = file.getChannel().tryLock(originalItem._fileOffset,
                                            OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);

                                    try {
                                        //LOG.info("*** GOT UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
                                        file.seek(originalItem._fileOffset);
                                        newItem.serialize(file);
                                        //LOG.info("Updated Data File for OnDiskItem for Fingerprint:" + originalItem._urlFingerprint);
                                        break;
                                    } finally {
                                        //LOG.info("*** RELEASED UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
                                        lock.release();
                                    }
                                } catch (OverlappingFileLockException e) {
                                    LOG.error("###LockConflict(RETRY):" + CCStringUtils.stringifyException(e));
                                }
                            }
                        } finally {
                            file.close();
                        }
                    }

                    // ok now update metadata ... 
                    synchronized (_metadata) {

                        int updateFlags = calculateUpdateFlags(originalItem, newItem);

                        if (updateFlags != 0) {

                            int metadataDirtyFlags = updateMetadata(newItem, _metadata, 0);

                            // only write metadata to disk if temp data buffer is null
                            if (metadataDirtyFlags != 0 && !wroteToMemory) {
                                if ((metadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
                                    _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() - 1);
                                }
                                writeMetadataToDisk();
                            }

                            // if not writing to memory then update subdomain metadata 
                            if (!wroteToMemory) {

                                synchronized (_subDomainMetadataFile) {
                                    CrawlListMetadata subDomainMetadata = getSubDomainMetadataByURL(
                                            newData.getOriginalURL());

                                    int subDomainMetadataDirtyFlags = updateMetadata(newItem, subDomainMetadata,
                                            processFileOffsets);

                                    if (subDomainMetadataDirtyFlags != 0 && !wroteToMemory) {
                                        if ((subDomainMetadataDirtyFlags
                                                & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
                                            subDomainMetadata.setQueuedItemCount(
                                                    subDomainMetadata.getQueuedItemCount() - 1);
                                        }
                                        writeSubDomainMetadataToDisk(subDomainMetadata);
                                    }
                                }
                            }
                        }
                    }

                    synchronized (this) {
                        if (_eventListener != null) {
                            _eventListener.itemUpdated(fingerprint);
                        }
                    }
                }
            }
        }
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

private void writeInitialOnDiskItem(URLFP fp, ProxyCrawlHistoryItem historyItem,
        DataOutputStream valueStreamOut, RandomAccessFile stringStream) throws IOException {

    OnDiskCrawlHistoryItem itemOut = onDiskItemFromHistoryItem(fp, historyItem);

    // update string offset ... 
    itemOut._stringsOffset = stringStream.length();
    // write out string data length 
    WritableUtils.writeVInt(stringStream, _stringBuffer1.getLength());
    // write strings to log file
    stringStream.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength());
    // update timestamp ... 
    itemOut._updateTimestamp = -1;
    // and write to disk 
    itemOut.serialize(valueStreamOut);
}

From source file: org.commoncrawl.service.pagerank.slave.PageRankUtils.java

License: Open Source License

public static final void writeURLFPAndCountToStream(DataOutput stream, URLFPV2 key, int urlCount)
        throws IOException {
    stream.writeLong(key.getDomainHash());
    stream.writeLong(key.getRootDomainHash());
    stream.writeLong(key.getUrlHash());
    WritableUtils.writeVInt(stream, urlCount);
}

From source file: org.commoncrawl.util.FlexBuffer.java

License: Open Source License

@Override
public void write(DataOutput out) throws IOException {
    // write out count
    WritableUtils.writeVInt(out, getCount());
    // and bytes if count is not zero ...
    if (getCount() != 0) {
        out.write(get(), getOffset(), getCount());
    }
}

From source file: org.commoncrawl.util.TextBytes.java

License: Open Source License

/**
 * Serialize: write this object to out. The length uses zero-compressed encoding.
 *
 * @see Writable#write(DataOutput)
 */
public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, bytes.getCount());
    if (bytes.getCount() != 0) {
        out.write(bytes.get(), bytes.getOffset(), bytes.getCount());
    }
}
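
Both FlexBuffer.write and TextBytes.write above follow the same pattern: a vint length followed by the raw bytes. For context, here is a minimal sketch of the reverse path (not part of the original sources; the class and method names are illustrative):

import java.io.DataInput;
import java.io.IOException;

import org.apache.hadoop.io.WritableUtils;

public class VIntPrefixedReader {
    // Reads a payload written as writeVInt(length) followed by the raw bytes.
    public static byte[] readVIntPrefixedBytes(DataInput in) throws IOException {
        int length = WritableUtils.readVInt(in); // zero-compressed length
        byte[] payload = new byte[length];
        if (length != 0) {
            in.readFully(payload); // bytes written immediately after the length
        }
        return payload;
    }
}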