Example usage for org.apache.hadoop.io WritableUtils readVInt

Introduction

In this page you can find the example usage for org.apache.hadoop.io WritableUtils readVInt.

Prototype

public static int readVInt(DataInput stream) throws IOException

Source Link

Document

Reads a zero-compressed encoded integer from input stream and returns it.

Usage

From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java

License:Open Source License

@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    FlexBuffer scanArray[] = LinkKey.allocateScanArray();

    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }/*  www  . j ava2s . com*/

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
            RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
            WritableComparable.class);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
            FileSystem.get(_jobConf), incomingPaths, localMergeConfig);

    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    TextBytes valueOut = new TextBytes();
    TextBytes keyOut = new TextBytes();

    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

    // pick up source fp from key ...
    URLFPV2 fpSource = new URLFPV2();

    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        latestLinkDataTime = -1L;
        outlinks.clear();
        discoveredLinks.clear();

        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

        // setup fingerprint ...
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

        for (RawRecordValue rawValue : nextItem.e1) {

            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                try {
                    JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            }
            reporter.progress();
        }
        // ok now see if we have anything to emit ...
        if (discoveredLinks.size() != 0) {
            reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);
            for (String outlink : outlinks) {
                // emit a to tuple
                toJsonObject.addProperty("to", outlink);
                valueBytes.set(toJsonObject.toString());
                output.collect(sourceDomain, valueBytes);
                // now emit a from tuple ...
                fromJsonObject.addProperty("from", sourceDomain.toString());
                keyBytes.set(outlink);
                valueBytes.set(fromJsonObject.toString());
                output.collect(keyBytes, valueBytes);
            }

            bloomKey.setDomainHash(fpSource.getDomainHash());

            for (long destDomainFP : discoveredLinks) {
                // set the bloom filter key ...
                bloomKey.setUrlHash(destDomainFP);
                // add it to the bloom filter
                emittedTuplesFilter.add(bloomKey);
            }
        } else {
            reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
        }
    }
}

From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java

License:Open Source License

public int readVInt(DataInput in) throws IOException {
    return WritableUtils.readVInt(in);
}

From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java

License:Open Source License

public void skipTextBytes(DataInput in) throws IOException {
    int utflen = 0;
    if (_currentMode == FIELD_ID_ENCODING_MODE_SHORT) {
        utflen = in.readUnsignedShort();
    } else {//from  ww  w.  jav  a 2s  .c o m
        utflen = WritableUtils.readVInt(in);
    }
    if (utflen != 0)
        in.skipBytes(utflen);
}

From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java

License:Open Source License

public void skipVInt(DataInput in) throws IOException {
    WritableUtils.readVInt(in);
}

From source file:org.commoncrawl.service.crawler.CrawlerServer.java

License:Open Source License

void refreshMasterCrawlerActiveHostList() {
    // ok if there is a master crawler, and it is online ... 
    if (_masterCrawlerServiceChannel != null && _masterCrawlerServiceChannel.isOpen()) {
        try {// www.j a  v a2 s .  c o m
            _masterCrawlerStub.queryActiveHosts(new Callback<NullMessage, ActiveHostInfo>() {

                @Override
                public void requestComplete(AsyncRequest<NullMessage, ActiveHostInfo> request) {
                    if (request.getStatus() == Status.Success) {
                        // ok update timestamp no matter what 
                        _pauseStateTimestampIncremental = request.getOutput().getPauseStateTimestamp();
                        // and clear set ... 
                        _pausedHostsSet = null;
                        // now see if we have a valid response ... 
                        if (request.getOutput().getActiveHostIds().getCount() != 0) {
                            LOG.info("Received New Active Host Set From Master Crawler At:"
                                    + _masterCrawlerAddress);
                            // ok we have a valid list of hosts ... 
                            // create a reader stream 
                            DataInputBuffer inputStream = new DataInputBuffer();
                            inputStream.reset(request.getOutput().getActiveHostIds().getReadOnlyBytes(), 0,
                                    request.getOutput().getActiveHostIds().getCount());

                            try {
                                // create a set ... 
                                Set<Integer> ipAddressSet = new TreeSet<Integer>();
                                // populate it 
                                int ipAddressCount = WritableUtils.readVInt(inputStream);
                                for (int i = 0; i < ipAddressCount; ++i) {
                                    ipAddressSet.add(WritableUtils.readVInt(inputStream));
                                }

                                LOG.info("Successfully updated Active Host Set");
                                // ok replace set ... 
                                _pausedHostsSet = ipAddressSet;
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            }
                        }
                    }
                }
            });
        } catch (RPCException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
    }

    // ok no matter what... check to see if we need to set up refresh timer ... 
    if (_masterCrawlerHostListRefreshTimer == null) {
        _masterCrawlerHostListRefreshTimer = new Timer(ACTIVE_HOST_LIST_REFRESH_INTERVAL_CLIENT, true,
                new Timer.Callback() {

                    @Override
                    public void timerFired(Timer timer) {
                        // call refresh again ... 
                        refreshMasterCrawlerActiveHostList();
                    }
                });
        _eventLoop.setTimer(_masterCrawlerHostListRefreshTimer);
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

/**
 * Initialize a new CrawlList object from a given input stream of urls 
 * /*from www  . j a  va2  s  . co  m*/
 * @param manager           - reference to the crawl history log manager 
 * @param urlInputStream - the input stream containing the list of urls that we should add to this list ... 
 * @throws IOException      
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
        throws IOException {

    _manager = manager;

    _listState = LoadState.REALLY_LOADING;

    // initialize a new list id 
    _listId = listId;

    LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

    //establish file names 
    initializeListFileNames();

    sourceURLFile.renameTo(_listURLDataFile);

    FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

    try {

        // set we will use to hold all fingerprints generated 
        TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

        // create temp files ...
        File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

        // create mergesortspillwriter 
        SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
                new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null,
                false);

        try {

            MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                    CrawlEnvironment.getHadoopConfig(), spillwriter,
                    FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                    new Path(manager.getLocalDataDir().getAbsolutePath()), null,
                    new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

                        DataInputBuffer _key1Buffer = new DataInputBuffer();
                        DataInputBuffer _key2Buffer = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {

                            _key1Buffer.reset(key1Data, key1Offset, key1Length);
                            _key2Buffer.reset(key2Data, key2Offset, key2Length);

                            _key1Buffer.skip(2); // skip verison, and 1 byte id 
                            _key2Buffer.skip(2); // skip verison, and 1 byte id 

                            int domainHash1 = WritableUtils.readVInt(_key1Buffer);
                            int domainHash2 = WritableUtils.readVInt(_key2Buffer);

                            _key1Buffer.skip(1); // skip 1 byte id 
                            _key2Buffer.skip(1); // skip 1 byte id 

                            long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
                            long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

                            int result = ((Integer) domainHash1).compareTo(domainHash2);

                            if (result == 0) {
                                result = ((Long) fingerprint1).compareTo(fingerprint2);
                            }

                            return result;
                        }

                        @Override
                        public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                                ProxyCrawlHistoryItem value2) {
                            return key1.compareTo(key2);
                        }
                    }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

            try {

                LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

                String line = null;
                int lineNumber = 0;
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
                while ((line = reader.readLine()) != null) {
                    ++lineNumber;
                    if (line.length() != 0 && !line.startsWith("#")) {
                        URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);

                        if (fingerprint != null) {

                            if (!urlSet.contains(fingerprint)) {
                                // and add fingerprint to set 
                                urlSet.add(fingerprint);
                                // initialize item 
                                item.clear();
                                item.setOriginalURL(line);
                                // and spill to merger / sorter .. 
                                merger.spillRecord(fingerprint, item);
                            }
                        } else {
                            LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:"
                                    + lineNumber + " URL" + line);
                        }
                    }
                }
                LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
            } finally {
                merger.close();
            }
        } finally {
            if (spillwriter != null)
                spillwriter.close();
        }
        LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
        // generate bloom filter ...  
        _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);

        for (URLFP fingerprint : urlSet) {
            _bloomFilter.add(fingerprint);
        }
        LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
        // serialize it
        FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
        try {
            _bloomFilter.serialize(bloomFilterStream);
        } finally {
            bloomFilterStream.flush();
            bloomFilterStream.close();
        }

        LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
        // now initialize value map and string maps based on output sequence file ... 
        SequenceFile.Reader reader = new SequenceFile.Reader(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

        LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
                + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        // OK, Allocate room for fixed data file upfront 
        DataOutputBuffer valueStream = new DataOutputBuffer(
                urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

        try {

            //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
            RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");

            try {
                URLFP urlFP = new URLFP();
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();

                // read fingerprints ... 
                while (reader.next(urlFP, item)) {
                    // write out fixed data structure and strings 
                    writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
                }
            } finally {
                //valueStream.flush();
                //valueStream.close();
                stringsStream.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");

        LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
                + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
            throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength()
                    + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        }
        // initialize temp data buffer variables 
        _tempFixedDataBuffer = valueStream.getData();
        _tempFixedDataBufferSize = valueStream.getLength();

        // update metadata 
        _metadata.setRefreshInterval(refreshInterval);
        _metadata.setUrlCount(urlSet.size());

        // setup version 
        _metadata.setVersion(1);

        // and write to disk 
        writeMetadataToDisk();

        // mark state as loaded ... 
        _listState = LoadState.LOADED;

        LOG.info("*** LIST:" + getListId() + " SYNCING");
        // reconcile with history log
        _manager.syncList(this.getListId(), urlSet, this);
        LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

        // write metdata to disk again 
        writeMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");

        // and finally flush fixed data to disk 
        FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);

        try {
            synchronized (this) {
                int blockSize = 1 << 20;
                long bytesCopied = 0;
                for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
                    int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
                    finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
                    bytesCopied += bytesToCopy;
                }
                // validate bytes copied 
                if (bytesCopied != _tempFixedDataBufferSize) {
                    throw new IOException("Buffer Size:" + _tempFixedDataBufferSize
                            + " Does not Match BytesCopied:" + bytesCopied);
                }

                // ok release the buffer 
                _tempFixedDataBuffer = null;
                _tempFixedDataBufferSize = 0;

                LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
            }

        } finally {
            finalDataStream.flush();
            finalDataStream.close();
        }

        // load sub domain metadata from disk ... 
        loadSubDomainMetadataFromDisk();

    } catch (IOException e) {
        LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
                + CCStringUtils.stringifyException(e));

        _fixedDataFile.delete();
        _variableDataFile.delete();
        _bloomFilterData.delete();

        _listState = LoadState.ERROR;

        throw e;
    } finally {
        urlInputStream.close();
    }

}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

/** queue uncrawled urls via the CrawlQueueLoader
 * //  w  ww  .  j  a  v  a  2 s.  c  om
 * @param loader
 */
public void queueUnCrawledItems(CrawlQueueLoader loader) throws IOException {
    _queueState = QueueState.QUEUEING;

    int metadataVersion = getMetadata().getVersion();

    synchronized (_metadata) {
        // reset metadata PERIOD  
        int urlCount = _metadata.getUrlCount();
        _metadata.clear();
        _metadata.setUrlCount(urlCount);
    }

    RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
    RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
    try {

        OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
        URLFP fingerprint = new URLFP();

        while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

            long position = fixedDataReader.getFilePointer();

            //LOG.info("*** TRYING READ LOCK FOR OFFSET:" + position);
            while (true) {
                // get read lock on position ... 
                try {
                    FileLock lock = fixedDataReader.getChannel().tryLock(position,
                            OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);

                    try {
                        //LOG.info("*** GOT READ LOCK FOR OFFSET:" + position);
                        item.deserialize(fixedDataReader);
                        break;
                    } finally {
                        lock.release();
                        //LOG.info("*** RELEASED READ LOCK FOR OFFSET:" + position);
                    }
                } catch (OverlappingFileLockException e) {
                    LOG.error("*** LOCK CONTENTION AT:" + position + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
            }

            // seek to string data 
            stringDataReader.seek(item._stringsOffset);
            // and skip buffer length 
            WritableUtils.readVInt(stringDataReader);
            // and read primary string 
            String url = stringDataReader.readUTF();
            // setup fingerprint 
            fingerprint.setDomainHash(item._domainHash);
            fingerprint.setUrlHash(item._urlFingerprint);

            // first, if it has not been crawled ever, crawl it not matter what ... 
            boolean crawlItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);

            // if it has been crawled ... check list metadata version ... 
            if (!crawlItem && metadataVersion >= 1) {
                // ok this is newer version of the list ... 
                // check refresh time if specified ...
                int refreshIntervalInSeconds = DEFAULT_REFRESH_INTERVAL_IN_SECS;

                if (getMetadata().getRefreshInterval() != 0) {
                    refreshIntervalInSeconds = getMetadata().getRefreshInterval();
                }

                if (item._updateTimestamp > 0) {
                    long timeSinceLastCrawl = item._updateTimestamp;
                    if (System.currentTimeMillis() - timeSinceLastCrawl >= (refreshIntervalInSeconds * 1000)) {
                        crawlItem = true;
                    }
                }
            }

            if (crawlItem) {

                loader.queueURL(fingerprint, url);

                synchronized (_metadata) {
                    // update queued item count 
                    _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() + 1);
                }
            } else {
                updateMetadata(item, _metadata, 0);
            }
            // ok update subdomain stats 
            updateSubDomainMetadataForItemDuringLoad(item, url, fingerprint, crawlItem);
        }

        flushCachedSubDomainMetadata();

        loader.flush();

        _queueState = QueueState.QUEUED;
    } catch (IOException e) {
        LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                + CCStringUtils.stringifyException(e));
        _queueState = QueueState.ERROR;
    } finally {
        fixedDataReader.close();
        stringDataReader.close();
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

/** resubmit failed items 
 * /*  w w  w.  j  ava 2  s . com*/
 * @param loader
 */
public void requeueFailedItems(CrawlQueueLoader loader) throws IOException {
    synchronized (this) {
        _queueState = QueueState.QUEUEING;
    }
    RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
    RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
    try {

        OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
        URLFP fingerprint = new URLFP();

        while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
            item.deserialize(fixedDataReader);
            boolean queueItem = false;
            if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {

                if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
                    queueItem = (item._redirectStatus != 0);

                    if (!queueItem) {
                        if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
                            queueItem = true;
                        }
                    }
                } else {
                    queueItem = (item._crawlStatus != 0);

                    if (!queueItem) {
                        if (item._httpResultCode != 200 && item._httpResultCode != 404) {
                            queueItem = true;
                        }
                    }
                }

                if (queueItem) {
                    // seek to string data 
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length 
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string 
                    String url = stringDataReader.readUTF();
                    // and spill
                    fingerprint.setDomainHash(item._domainHash);
                    fingerprint.setUrlHash(item._urlFingerprint);

                    loader.queueURL(fingerprint, url);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                + CCStringUtils.stringifyException(e));
        _queueState = QueueState.QUEUED;
    } finally {
        fixedDataReader.close();
        stringDataReader.close();
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

private ProxyCrawlHistoryItem getHistoryItemFromOnDiskItem(OnDiskCrawlHistoryItem item) throws IOException {

    ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();

    if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) != 0)
        itemOut.setCrawlStatus(item._crawlStatus);
    if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) != 0)
        itemOut.setHttpResultCode(item._httpResultCode);
    if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) != 0)
        itemOut.setRedirectStatus(item._redirectStatus);
    if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) != 0)
        itemOut.setRedirectHttpResult(item._redirectHttpResult);
    if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME) != 0)
        itemOut.setLastModifiedTime(item._updateTimestamp);
    // now attept to get the string offset 
    RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
    try {// ww w  .  j  a  v  a 2s  . com
        // seek to string data 
        stringDataReader.seek(item._stringsOffset);
        // and skip buffer length 
        WritableUtils.readVInt(stringDataReader);
        // now populate original url ... 
        itemOut.setOriginalURL(stringDataReader.readUTF());
        // now if redirect url is present 
        if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL) != 0) {
            itemOut.setRedirectURL(stringDataReader.readUTF());
        }
    } finally {
        stringDataReader.close();
    }
    return itemOut;
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

void loadSubDomainMetadataFromDisk() throws IOException {
    LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ...  ");
    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {/*from   ww w  .  j av  a2  s. c  o m*/
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
            }

            // write lookup table 
            _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
            for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                _offsetLookupTable.writeLong(entry.getKey());
                _offsetLookupTable.writeInt(entry.getValue());
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
    } else {

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

        try {

            //ok rebuild top level metadata as well 
            _metadata.clear();

            OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

            int processedCount = 0;
            while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                long position = fixedDataReader.getFilePointer();

                // store offset in item 
                item._fileOffset = position;
                // load from disk 
                item.deserialize(fixedDataReader);
                try {
                    // seek to string data 
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length 
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string 
                    String url = stringDataReader.readUTF();

                    // get metadata object for subdomain 
                    CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

                    // increment url count 
                    subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

                    // increment top level metadata count 
                    _metadata.setUrlCount(_metadata.getUrlCount() + 1);

                    // update top level metadata ..
                    updateMetadata(item, _metadata, 0);

                    // update sub-domain metadata object  from item data
                    updateMetadata(item, subDomainMetadata, 0);

                    ++processedCount;
                } catch (IOException e) {
                    LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                    LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                }

                if (processedCount % 10000 == 0) {
                    LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                }
            }

            // ok commit top level metadata to disk as well 
            writeMetadataToDisk();

        } catch (IOException e) {
            LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                    + CCStringUtils.stringifyException(e));
            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                    + stringDataReader.getFilePointer());
            _queueState = QueueState.QUEUED;
        } finally {
            fixedDataReader.close();
            stringDataReader.close();
        }
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");

        // write metadat to disk 
        writeInitialSubDomainMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
    }
}