List of usage examples for org.apache.hadoop.io WritableUtils readVInt
public static int readVInt(DataInput stream) throws IOException
From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java
License:Open Source License
@Override public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException { // collect all incoming paths first Vector<Path> incomingPaths = new Vector<Path>(); FlexBuffer scanArray[] = LinkKey.allocateScanArray(); while (values.hasNext()) { String path = values.next().toString(); LOG.info("Found Incoming Path:" + path); incomingPaths.add(new Path(path)); }/* www . j ava2s . com*/ // set up merge attributes Configuration localMergeConfig = new Configuration(_jobConf); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class, RawComparator.class); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class); // ok now spawn merger MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>( FileSystem.get(_jobConf), incomingPaths, localMergeConfig); TextBytes keyBytes = new TextBytes(); TextBytes valueBytes = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); TextBytes valueOut = new TextBytes(); TextBytes keyOut = new TextBytes(); Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null; // pick up source fp from key ... URLFPV2 fpSource = new URLFPV2(); while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) { outputKeyString = null; outputKeyFromInternalLink = false; outputKeyURLObj = null; latestLinkDataTime = -1L; outlinks.clear(); discoveredLinks.clear(); // scan key components LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray); // setup fingerprint ... 
fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID)); fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID)); fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.URL_HASH_COMPONENT_ID)); for (RawRecordValue rawValue : nextItem.e1) { inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength()); int length = WritableUtils.readVInt(inputBuffer); keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length); inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength()); length = WritableUtils.readVInt(inputBuffer); valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length); long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID); if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) { try { JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject(); if (object != null) { updateCrawlStatsFromJSONObject(object, fpSource, reporter); } } catch (Exception e) { LOG.error("Error Parsing JSON:" + valueBytes.toString()); throw new IOException(e); } } reporter.progress(); } // ok now see if we have anything to emit ... if (discoveredLinks.size() != 0) { reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1); for (String outlink : outlinks) { // emit a to tuple toJsonObject.addProperty("to", outlink); valueBytes.set(toJsonObject.toString()); output.collect(sourceDomain, valueBytes); // now emit a from tuple ... fromJsonObject.addProperty("from", sourceDomain.toString()); keyBytes.set(outlink); valueBytes.set(fromJsonObject.toString()); output.collect(keyBytes, valueBytes); } bloomKey.setDomainHash(fpSource.getDomainHash()); for (long destDomainFP : discoveredLinks) { // set the bloom filter key ... 
bloomKey.setUrlHash(destDomainFP); // add it to the bloom filter emittedTuplesFilter.add(bloomKey); } } else { reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1); } } }
From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java
License:Open Source License
/**
 * Reads one int from {@code in} by delegating to {@code WritableUtils.readVInt}.
 *
 * @param in the input to read from
 * @return the decoded value
 * @throws IOException if the underlying read fails
 */
public int readVInt(DataInput in) throws IOException {
  final int decoded = WritableUtils.readVInt(in);
  return decoded;
}
From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java
License:Open Source License
public void skipTextBytes(DataInput in) throws IOException { int utflen = 0; if (_currentMode == FIELD_ID_ENCODING_MODE_SHORT) { utflen = in.readUnsignedShort(); } else {//from ww w. jav a 2s .c o m utflen = WritableUtils.readVInt(in); } if (utflen != 0) in.skipBytes(utflen); }
From source file:org.commoncrawl.rpc.base.shared.BinaryProtocol.java
License:Open Source License
/**
 * Advances {@code in} past one value readable by {@code WritableUtils.readVInt}, discarding it.
 *
 * @param in the input to advance
 * @throws IOException if the underlying read fails
 */
public void skipVInt(DataInput in) throws IOException {
  // the decoded value is not needed; reading it is what moves the stream forward
  WritableUtils.readVInt(in);
}
From source file:org.commoncrawl.service.crawler.CrawlerServer.java
License:Open Source License
void refreshMasterCrawlerActiveHostList() { // ok if there is a master crawler, and it is online ... if (_masterCrawlerServiceChannel != null && _masterCrawlerServiceChannel.isOpen()) { try {// www.j a v a2 s . c o m _masterCrawlerStub.queryActiveHosts(new Callback<NullMessage, ActiveHostInfo>() { @Override public void requestComplete(AsyncRequest<NullMessage, ActiveHostInfo> request) { if (request.getStatus() == Status.Success) { // ok update timestamp no matter what _pauseStateTimestampIncremental = request.getOutput().getPauseStateTimestamp(); // and clear set ... _pausedHostsSet = null; // now see if we have a valid response ... if (request.getOutput().getActiveHostIds().getCount() != 0) { LOG.info("Received New Active Host Set From Master Crawler At:" + _masterCrawlerAddress); // ok we have a valid list of hosts ... // create a reader stream DataInputBuffer inputStream = new DataInputBuffer(); inputStream.reset(request.getOutput().getActiveHostIds().getReadOnlyBytes(), 0, request.getOutput().getActiveHostIds().getCount()); try { // create a set ... Set<Integer> ipAddressSet = new TreeSet<Integer>(); // populate it int ipAddressCount = WritableUtils.readVInt(inputStream); for (int i = 0; i < ipAddressCount; ++i) { ipAddressSet.add(WritableUtils.readVInt(inputStream)); } LOG.info("Successfully updated Active Host Set"); // ok replace set ... _pausedHostsSet = ipAddressSet; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } } }); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } // ok no matter what... check to see if we need to set up refresh timer ... if (_masterCrawlerHostListRefreshTimer == null) { _masterCrawlerHostListRefreshTimer = new Timer(ACTIVE_HOST_LIST_REFRESH_INTERVAL_CLIENT, true, new Timer.Callback() { @Override public void timerFired(Timer timer) { // call refresh again ... refreshMasterCrawlerActiveHostList(); } }); _eventLoop.setTimer(_masterCrawlerHostListRefreshTimer); } }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/** * Initialize a new CrawlList object from a given input stream of urls * /*from www . j a va2 s . co m*/ * @param manager - reference to the crawl history log manager * @param urlInputStream - the input stream containing the list of urls that we should add to this list ... * @throws IOException */ public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval) throws IOException { _manager = manager; _listState = LoadState.REALLY_LOADING; // initialize a new list id _listId = listId; LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath()); //establish file names initializeListFileNames(); sourceURLFile.renameTo(_listURLDataFile); FileInputStream urlInputStream = new FileInputStream(_listURLDataFile); try { // set we will use to hold all fingerprints generated TreeSet<URLFP> urlSet = new TreeSet<URLFP>(); // create temp files ... File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId)); // create mergesortspillwriter SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>( FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(), new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null, false); try { MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>( CrawlEnvironment.getHadoopConfig(), spillwriter, FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(manager.getLocalDataDir().getAbsolutePath()), null, new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() { DataInputBuffer _key1Buffer = new DataInputBuffer(); DataInputBuffer _key2Buffer = new DataInputBuffer(); @Override public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length, 
byte[] value2Data, int value2Offset, int value2Length) throws IOException { _key1Buffer.reset(key1Data, key1Offset, key1Length); _key2Buffer.reset(key2Data, key2Offset, key2Length); _key1Buffer.skip(2); // skip verison, and 1 byte id _key2Buffer.skip(2); // skip verison, and 1 byte id int domainHash1 = WritableUtils.readVInt(_key1Buffer); int domainHash2 = WritableUtils.readVInt(_key2Buffer); _key1Buffer.skip(1); // skip 1 byte id _key2Buffer.skip(1); // skip 1 byte id long fingerprint1 = WritableUtils.readVLong(_key1Buffer); long fingerprint2 = WritableUtils.readVLong(_key2Buffer); int result = ((Integer) domainHash1).compareTo(domainHash2); if (result == 0) { result = ((Long) fingerprint1).compareTo(fingerprint2); } return result; } @Override public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2, ProxyCrawlHistoryItem value2) { return key1.compareTo(key2); } }, URLFP.class, ProxyCrawlHistoryItem.class, false, null); try { LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List"); BufferedReader reader = new BufferedReader( new InputStreamReader(urlInputStream, Charset.forName("UTF-8"))); String line = null; int lineNumber = 0; ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem(); while ((line = reader.readLine()) != null) { ++lineNumber; if (line.length() != 0 && !line.startsWith("#")) { URLFP fingerprint = URLUtils.getURLFPFromURL(line, true); if (fingerprint != null) { if (!urlSet.contains(fingerprint)) { // and add fingerprint to set urlSet.add(fingerprint); // initialize item item.clear(); item.setOriginalURL(line); // and spill to merger / sorter .. 
merger.spillRecord(fingerprint, item); } } else { LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL" + line); } } } LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS"); } finally { merger.close(); } } finally { if (spillwriter != null) spillwriter.close(); } LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys"); // generate bloom filter ... _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10); for (URLFP fingerprint : urlSet) { _bloomFilter.add(fingerprint); } LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter"); // serialize it FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData); try { _bloomFilter.serialize(bloomFilterStream); } finally { bloomFilterStream.flush(); bloomFilterStream.close(); } LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile); // now initialize value map and string maps based on output sequence file ... SequenceFile.Reader reader = new SequenceFile.Reader( FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig()); LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)); // OK, Allocate room for fixed data file upfront DataOutputBuffer valueStream = new DataOutputBuffer( urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE); LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED"); try { //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile)); RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw"); try { URLFP urlFP = new URLFP(); ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem(); // read fingerprints ... 
while (reader.next(urlFP, item)) { // write out fixed data structure and strings writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream); } } finally { //valueStream.flush(); //valueStream.close(); stringsStream.close(); } } finally { reader.close(); } LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk"); LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength() + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)); if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) { throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)); } // initialize temp data buffer variables _tempFixedDataBuffer = valueStream.getData(); _tempFixedDataBufferSize = valueStream.getLength(); // update metadata _metadata.setRefreshInterval(refreshInterval); _metadata.setUrlCount(urlSet.size()); // setup version _metadata.setVersion(1); // and write to disk writeMetadataToDisk(); // mark state as loaded ... 
_listState = LoadState.LOADED; LOG.info("*** LIST:" + getListId() + " SYNCING"); // reconcile with history log _manager.syncList(this.getListId(), urlSet, this); LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE"); // write metdata to disk again writeMetadataToDisk(); LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA"); // and finally flush fixed data to disk FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile); try { synchronized (this) { int blockSize = 1 << 20; long bytesCopied = 0; for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) { int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset); finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy); bytesCopied += bytesToCopy; } // validate bytes copied if (bytesCopied != _tempFixedDataBufferSize) { throw new IOException("Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:" + bytesCopied); } // ok release the buffer _tempFixedDataBuffer = null; _tempFixedDataBufferSize = 0; LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE"); } } finally { finalDataStream.flush(); finalDataStream.close(); } // load sub domain metadata from disk ... loadSubDomainMetadataFromDisk(); } catch (IOException e) { LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:" + CCStringUtils.stringifyException(e)); _fixedDataFile.delete(); _variableDataFile.delete(); _bloomFilterData.delete(); _listState = LoadState.ERROR; throw e; } finally { urlInputStream.close(); } }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/** queue uncrawled urls via the CrawlQueueLoader * // w ww . j a v a 2 s. c om * @param loader */ public void queueUnCrawledItems(CrawlQueueLoader loader) throws IOException { _queueState = QueueState.QUEUEING; int metadataVersion = getMetadata().getVersion(); synchronized (_metadata) { // reset metadata PERIOD int urlCount = _metadata.getUrlCount(); _metadata.clear(); _metadata.setUrlCount(urlCount); } RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); URLFP fingerprint = new URLFP(); while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { long position = fixedDataReader.getFilePointer(); //LOG.info("*** TRYING READ LOCK FOR OFFSET:" + position); while (true) { // get read lock on position ... try { FileLock lock = fixedDataReader.getChannel().tryLock(position, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false); try { //LOG.info("*** GOT READ LOCK FOR OFFSET:" + position); item.deserialize(fixedDataReader); break; } finally { lock.release(); //LOG.info("*** RELEASED READ LOCK FOR OFFSET:" + position); } } catch (OverlappingFileLockException e) { LOG.error("*** LOCK CONTENTION AT:" + position + " Exception:" + CCStringUtils.stringifyException(e)); } } // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // setup fingerprint fingerprint.setDomainHash(item._domainHash); fingerprint.setUrlHash(item._urlFingerprint); // first, if it has not been crawled ever, crawl it not matter what ... boolean crawlItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS); // if it has been crawled ... check list metadata version ... if (!crawlItem && metadataVersion >= 1) { // ok this is newer version of the list ... 
// check refresh time if specified ... int refreshIntervalInSeconds = DEFAULT_REFRESH_INTERVAL_IN_SECS; if (getMetadata().getRefreshInterval() != 0) { refreshIntervalInSeconds = getMetadata().getRefreshInterval(); } if (item._updateTimestamp > 0) { long timeSinceLastCrawl = item._updateTimestamp; if (System.currentTimeMillis() - timeSinceLastCrawl >= (refreshIntervalInSeconds * 1000)) { crawlItem = true; } } } if (crawlItem) { loader.queueURL(fingerprint, url); synchronized (_metadata) { // update queued item count _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() + 1); } } else { updateMetadata(item, _metadata, 0); } // ok update subdomain stats updateSubDomainMetadataForItemDuringLoad(item, url, fingerprint, crawlItem); } flushCachedSubDomainMetadata(); loader.flush(); _queueState = QueueState.QUEUED; } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e)); _queueState = QueueState.ERROR; } finally { fixedDataReader.close(); stringDataReader.close(); } }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/** resubmit failed items * /* w w w. j ava 2 s . com*/ * @param loader */ public void requeueFailedItems(CrawlQueueLoader loader) throws IOException { synchronized (this) { _queueState = QueueState.QUEUEING; } RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); URLFP fingerprint = new URLFP(); while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { item.deserialize(fixedDataReader); boolean queueItem = false; if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) { if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) { queueItem = (item._redirectStatus != 0); if (!queueItem) { if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) { queueItem = true; } } } else { queueItem = (item._crawlStatus != 0); if (!queueItem) { if (item._httpResultCode != 200 && item._httpResultCode != 404) { queueItem = true; } } } if (queueItem) { // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // and spill fingerprint.setDomainHash(item._domainHash); fingerprint.setUrlHash(item._urlFingerprint); loader.queueURL(fingerprint, url); } } } } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e)); _queueState = QueueState.QUEUED; } finally { fixedDataReader.close(); stringDataReader.close(); } }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private ProxyCrawlHistoryItem getHistoryItemFromOnDiskItem(OnDiskCrawlHistoryItem item) throws IOException { ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem(); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) != 0) itemOut.setCrawlStatus(item._crawlStatus); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) != 0) itemOut.setHttpResultCode(item._httpResultCode); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) != 0) itemOut.setRedirectStatus(item._redirectStatus); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) != 0) itemOut.setRedirectHttpResult(item._redirectHttpResult); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME) != 0) itemOut.setLastModifiedTime(item._updateTimestamp); // now attept to get the string offset RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try {// ww w . j a v a 2s . com // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // now populate original url ... itemOut.setOriginalURL(stringDataReader.readUTF()); // now if redirect url is present if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL) != 0) { itemOut.setRedirectURL(stringDataReader.readUTF()); } } finally { stringDataReader.close(); } return itemOut; }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void loadSubDomainMetadataFromDisk() throws IOException { LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... "); if (_subDomainMetadataFile.exists()) { LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK."); RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); DataInputBuffer inputBuffer = new DataInputBuffer(); byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize]; try {/*from ww w . j av a2 s. c o m*/ // skip version file.read(); // read item count int itemCount = file.readInt(); LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount); CrawlListMetadata newMetadata = new CrawlListMetadata(); TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>(); for (int i = 0; i < itemCount; ++i) { long orignalPos = file.getFilePointer(); file.readFully(fixedDataBlock, 0, fixedDataBlock.length); inputBuffer.reset(fixedDataBlock, fixedDataBlock.length); try { newMetadata.deserialize(inputBuffer, new BinaryProtocol()); } catch (Exception e) { LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e)); } idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos); } // write lookup table _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE); for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) { _offsetLookupTable.writeLong(entry.getKey()); _offsetLookupTable.writeInt(entry.getValue()); } } finally { file.close(); } LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK"); } else { LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! 
LOADING FROM SCRATCH"); RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { //ok rebuild top level metadata as well _metadata.clear(); OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); int processedCount = 0; while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { long position = fixedDataReader.getFilePointer(); // store offset in item item._fileOffset = position; // load from disk item.deserialize(fixedDataReader); try { // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // get metadata object for subdomain CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url); // increment url count subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1); // increment top level metadata count _metadata.setUrlCount(_metadata.getUrlCount() + 1); // update top level metadata .. 
updateMetadata(item, _metadata, 0); // update sub-domain metadata object from item data updateMetadata(item, subDomainMetadata, 0); ++processedCount; } catch (IOException e) { LOG.error("Exception Reading String Data For Item:" + (processedCount + 1)); LOG.error("Exception:" + CCStringUtils.stringifyException(e)); LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer()); } if (processedCount % 10000 == 0) { LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items"); } } // ok commit top level metadata to disk as well writeMetadataToDisk(); } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e)); LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer()); _queueState = QueueState.QUEUED; } finally { fixedDataReader.close(); stringDataReader.close(); } LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK"); // write metadat to disk writeInitialSubDomainMetadataToDisk(); LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE"); } }