List of usage examples for the org.apache.hadoop.io.DataInputBuffer constructor
public DataInputBuffer()
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {
  SequenceFile.Reader reader = null;
  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

  SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
      CrawlEnvironment.getHadoopConfig());

  ValueBytes valueBytes = indexReader.createValueBytes();
  DataOutputBuffer keyBytes = new DataOutputBuffer();
  DataInputBuffer keyBuffer = new DataInputBuffer();
  DataOutputBuffer finalOutputStream = new DataOutputBuffer();
  DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
  URLFP fp = new URLFP();

  try {
    while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
      keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
      // read fingerprint ...
      fp.readFields(keyBuffer);
      // write hash only
      finalOutputStream.writeLong(fp.getUrlHash());
      uncompressedValueBytes.reset();
      // write value bytes to intermediate buffer ...
      valueBytes.writeUncompressedBytes(uncompressedValueBytes);
      // write out uncompressed length
      WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
      // write out bytes
      finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
    }
    // delete existing ...
    cacheFilePath.delete();
    // compute crc ...
    CRC32 crc = new CRC32();
    crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
    // open final output stream
    DataOutputStream fileOutputStream = new DataOutputStream(
        new BufferedOutputStream(new FileOutputStream(cacheFilePath)));
    try {
      fileOutputStream.writeLong(crc.getValue());
      fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
      fileOutputStream.flush();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      fileOutputStream.close();
      fileOutputStream = null;
      cacheFilePath.delete();
      throw e;
    } finally {
      if (fileOutputStream != null) {
        fileOutputStream.close();
      }
    }
  } finally {
    if (indexReader != null) {
      indexReader.close();
    }
  }
}
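The core DataInputBuffer idiom in the example above is the zero-copy round trip: serialize into a DataOutputBuffer, then point a DataInputBuffer at the same backing array with reset(data, 0, length) and call readFields(). A minimal, self-contained sketch of that round trip, using a stock LongWritable instead of the CommonCrawl URLFP type (class name is made up for illustration):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;

public class RoundTripSketch {
  public static void main(String[] args) throws IOException {
    // serialize a Writable into a reusable output buffer
    DataOutputBuffer out = new DataOutputBuffer();
    new LongWritable(42L).write(out);

    // wrap the raw bytes with a DataInputBuffer and deserialize;
    // reset() re-points the buffer at the array without copying
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());

    LongWritable value = new LongWritable();
    value.readFields(in);
    System.out.println(value.get()); // 42
  }
}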
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
    ItemUpdater targetList) throws IOException {

  // ok copy stuff locally if possible ...
  File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
  File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
  File localBloomFilterPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

  SequenceFile.Reader reader = null;
  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

  // ok copy local first
  if (!localIndexPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
        + localIndexPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localIndexPath.delete();
      throw e;
    }
  }
  if (!localDataPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
        + localDataPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localDataPath.delete();
      throw e;
    }
  }
  if (!localBloomFilterPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
        + localBloomFilterPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localBloomFilterPath.delete();
      throw e;
    }
  }

  // ok open local
  FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

  SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
      new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());
  try {
    URLFP firstIndexKey = null;
    URLFP lastIndexKey = new URLFP();
    LongWritable position = new LongWritable();
    while (indexReader.next(lastIndexKey, position)) {
      if (firstIndexKey == null) {
        try {
          firstIndexKey = (URLFP) lastIndexKey.clone();
        } catch (CloneNotSupportedException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
        + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
        + lastIndexKey.getUrlHash());

    URLFP criteriaFirstKey = criteria.first();
    URLFP criteriaLastKey = criteria.last();

    if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
      LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
      LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash() + " URLHash:"
          + criteriaFirstKey.getUrlHash() + " Last Domain:" + criteriaLastKey.getDomainHash() + " URLHash:"
          + criteriaLastKey.getUrlHash());
      return;
    }
  } finally {
    indexReader.close();
  }

  LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");

  // load bloom filter
  FSDataInputStream bloomFilterStream = localFileSystem.open(new Path(localBloomFilterPath.getAbsolutePath()));

  int hitCount = 0;

  try {
    URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

    URLFP fpOut = new URLFP();
    ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
    DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
    ValueBytes valueBytes = null;
    DataInputBuffer valueReader = new DataInputBuffer();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();

    URLFP lastFP = null;

    outerLoop:
    // now iterate each item in the criteria
    for (URLFP targetFP : criteria) {
      // if fingerprint is present in filter ...
      if (filter.isPresent(targetFP)) {
        // check to see if reader is initialzied ...
        if (reader == null) {
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
              + localDataPath.getAbsolutePath());
          reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
              CrawlEnvironment.getHadoopConfig());
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
              + localDataPath.getAbsolutePath());
          valueBytes = reader.createValueBytes();
        }

        // if last read fingerprint was not null ...
        if (lastFP != null) {
          // does it match the current item
          if (lastFP.compareTo(targetFP) == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash() + " File:"
                + dataFilePath);
            // if so, null out last fp
            lastFP = null;
            // and update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            continue;
          }
        }

        // ok at this point .. read the next item in the list ...
        lastFP = null;

        while (reader.nextRaw(keyBytes, valueBytes) != -1) {

          // init reader ...
          keyReader.reset(keyBytes.getData(), keyBytes.getLength());
          // read key
          fpOut.readFields(keyReader);
          // reset output buffer
          keyBytes.reset();

          // LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
          // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
          // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());

          // compare it to target ...
          int result = fpOut.compareTo(targetFP);
          // ok does it match .. ?
          if (result == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash() + " File:"
                + dataFilePath);
            // update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            // and break to outer loop
            continue outerLoop;
          } else if (result == 1) {
            // LOG.info("LIST:" + listId +
            // " FP Comparison Returned 1. Going to OuterLoop");
            // update last FP
            lastFP = fpOut;
            // continue outer loop
            continue outerLoop;
          } else {
            // otherwise skip
          }
        }
        // ok if we got here .. we are done reading the sequence file and did
        // not find a trailing match
        LOG.warn("LIST:" + listId
            + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positivie result (DomainHash:"
            + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
        // break out of outer loop
        break;
      }
    }
  } finally {
    bloomFilterStream.close();

    if (reader != null) {
      reader.close();
    }

    LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
  }
}
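Most of the DataInputBuffer work above happens around SequenceFile.Reader.nextRaw(): the serialized key lands in a DataOutputBuffer, and a reusable DataInputBuffer is reset over those bytes to deserialize it. A stripped-down sketch of that raw scan with stock Hadoop types; the temp path and record values are made up for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.io.Text;

public class RawScanSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path path = new Path("/tmp/raw-scan-demo.seq"); // hypothetical location

    // write a tiny file so the raw scan below has something to read
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class);
    try {
      writer.append(new LongWritable(1), new Text("one"));
      writer.append(new LongWritable(2), new Text("two"));
    } finally {
      writer.close();
    }

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();
    ValueBytes valueBytes = reader.createValueBytes();
    LongWritable key = new LongWritable();
    try {
      while (reader.nextRaw(keyBytes, valueBytes) != -1) {
        // point the DataInputBuffer at the serialized key and deserialize it
        keyReader.reset(keyBytes.getData(), keyBytes.getLength());
        key.readFields(keyReader);
        System.out.println("key=" + key.get());
        keyBytes.reset(); // reuse the output buffer for the next record
      }
    } finally {
      reader.close();
    }
  }
}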
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/**
 * Initialize a new CrawlList object from a given input stream of urls
 *
 * @param manager - reference to the crawl history log manager
 * @param urlInputStream - the input stream containing the list of urls that we should add to this list ...
 * @throws IOException
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
    throws IOException {
  _manager = manager;

  _listState = LoadState.REALLY_LOADING;

  // initialize a new list id
  _listId = listId;

  LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

  // establish file names
  initializeListFileNames();

  sourceURLFile.renameTo(_listURLDataFile);

  FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

  try {
    // set we will use to hold all fingerprints generated
    TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

    // create temp files ...
    File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

    // create mergesortspillwriter
    SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
        FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
        new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null, false);

    try {
      MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
          CrawlEnvironment.getHadoopConfig(), spillwriter,
          FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
          new Path(manager.getLocalDataDir().getAbsolutePath()), null,
          new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

            DataInputBuffer _key1Buffer = new DataInputBuffer();
            DataInputBuffer _key2Buffer = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {

              _key1Buffer.reset(key1Data, key1Offset, key1Length);
              _key2Buffer.reset(key2Data, key2Offset, key2Length);

              _key1Buffer.skip(2); // skip verison, and 1 byte id
              _key2Buffer.skip(2); // skip verison, and 1 byte id

              int domainHash1 = WritableUtils.readVInt(_key1Buffer);
              int domainHash2 = WritableUtils.readVInt(_key2Buffer);

              _key1Buffer.skip(1); // skip 1 byte id
              _key2Buffer.skip(1); // skip 1 byte id

              long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
              long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

              int result = ((Integer) domainHash1).compareTo(domainHash2);

              if (result == 0) {
                result = ((Long) fingerprint1).compareTo(fingerprint2);
              }

              return result;
            }

            @Override
            public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                ProxyCrawlHistoryItem value2) {
              return key1.compareTo(key2);
            }
          }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

      try {
        LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

        String line = null;
        int lineNumber = 0;
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
        while ((line = reader.readLine()) != null) {
          ++lineNumber;
          if (line.length() != 0 && !line.startsWith("#")) {
            URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);

            if (fingerprint != null) {
              if (!urlSet.contains(fingerprint)) {
                // and add fingerprint to set
                urlSet.add(fingerprint);
                // initialize item
                item.clear();
                item.setOriginalURL(line);
                // and spill to merger / sorter ..
                merger.spillRecord(fingerprint, item);
              }
            } else {
              LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL"
                  + line);
            }
          }
        }
        LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
      } finally {
        merger.close();
      }
    } finally {
      if (spillwriter != null)
        spillwriter.close();
    }

    LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
    // generate bloom filter ...
    _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);
    for (URLFP fingerprint : urlSet) {
      _bloomFilter.add(fingerprint);
    }

    LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
    // serialize it
    FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
    try {
      _bloomFilter.serialize(bloomFilterStream);
    } finally {
      bloomFilterStream.flush();
      bloomFilterStream.close();
    }

    LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
    // now initialize value map and string maps based on output sequence file ...
    SequenceFile.Reader reader = new SequenceFile.Reader(
        FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
        new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
        + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    // OK, Allocate room for fixed data file upfront
    DataOutputBuffer valueStream = new DataOutputBuffer(urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
    LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

    try {
      //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
      RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");
      try {
        URLFP urlFP = new URLFP();
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();

        // read fingerprints ...
        while (reader.next(urlFP, item)) {
          // write out fixed data structure and strings
          writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
        }
      } finally {
        //valueStream.flush();
        //valueStream.close();
        stringsStream.close();
      }
    } finally {
      reader.close();
    }

    LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");

    LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
        + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
      throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:"
          + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    }

    // initialize temp data buffer variables
    _tempFixedDataBuffer = valueStream.getData();
    _tempFixedDataBufferSize = valueStream.getLength();

    // update metadata
    _metadata.setRefreshInterval(refreshInterval);
    _metadata.setUrlCount(urlSet.size());

    // setup version
    _metadata.setVersion(1);

    // and write to disk
    writeMetadataToDisk();

    // mark state as loaded ...
    _listState = LoadState.LOADED;

    LOG.info("*** LIST:" + getListId() + " SYNCING");
    // reconcile with history log
    _manager.syncList(this.getListId(), urlSet, this);

    LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

    // write metdata to disk again
    writeMetadataToDisk();

    LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");

    // and finally flush fixed data to disk
    FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);
    try {
      synchronized (this) {
        int blockSize = 1 << 20;
        long bytesCopied = 0;
        for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
          int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
          finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
          bytesCopied += bytesToCopy;
        }
        // validate bytes copied
        if (bytesCopied != _tempFixedDataBufferSize) {
          throw new IOException("Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:"
              + bytesCopied);
        }
        // ok release the buffer
        _tempFixedDataBuffer = null;
        _tempFixedDataBufferSize = 0;

        LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
      }
    } finally {
      finalDataStream.flush();
      finalDataStream.close();
    }

    // load sub domain metadata from disk ...
    loadSubDomainMetadataFromDisk();

  } catch (IOException e) {
    LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
        + CCStringUtils.stringifyException(e));

    _fixedDataFile.delete();
    _variableDataFile.delete();
    _bloomFilterData.delete();

    _listState = LoadState.ERROR;

    throw e;
  } finally {
    urlInputStream.close();
  }
}
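The anonymous RawKeyValueComparator above is the classic reason to keep a couple of DataInputBuffer instances around: it compares records on their serialized bytes without materializing key objects. The same idea, reduced to a stock org.apache.hadoop.io.RawComparator over VLongWritable keys (a deliberate simplification of the URLFP layout above), might look like this sketch:

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.io.WritableUtils;

public class VLongRawComparator implements RawComparator<VLongWritable> {

  private final DataInputBuffer buf1 = new DataInputBuffer();
  private final DataInputBuffer buf2 = new DataInputBuffer();

  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    try {
      // re-point the reusable buffers at each serialized key; no objects are created
      buf1.reset(b1, s1, l1);
      buf2.reset(b2, s2, l2);
      return Long.compare(WritableUtils.readVLong(buf1), WritableUtils.readVLong(buf2));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public int compare(VLongWritable o1, VLongWritable o2) {
    return Long.compare(o1.get(), o2.get());
  }
}

Reusing the two DataInputBuffer fields across calls is what makes the raw path cheap: each comparison only rewinds the buffers instead of allocating streams.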
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException {

  // see if state is cached in memory ...
  boolean loadedFromMemory = false;

  synchronized (this) {
    if (_tempFixedDataBuffer != null) {

      loadedFromMemory = true;

      int low = 0;
      int high = (int) (_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

      OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
      DataInputBuffer inputBuffer = new DataInputBuffer();

      int iterationNumber = 0;

      while (low <= high) {

        ++iterationNumber;

        int mid = low + ((high - low) / 2);

        inputBuffer.reset(_tempFixedDataBuffer, 0, _tempFixedDataBufferSize);
        inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);

        // deserialize
        itemOut.deserialize(inputBuffer);

        // now compare it against desired hash value ...
        int comparisonResult = itemOut.compareFingerprints(fingerprint);

        if (comparisonResult > 0)
          high = mid - 1;
        else if (comparisonResult < 0)
          low = mid + 1;
        else {
          // cache offset
          itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
          // LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
          // and return item
          return itemOut;
        }
      }
      //LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
    }
  }

  if (!loadedFromMemory) {
    //load from disk

    //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash());

    RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw");
    // allocate buffer upfront
    byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE];
    DataInputBuffer inputStream = new DataInputBuffer();

    //LOG.info("Opened Data File. Searching for match");
    try {
      int low = 0;
      int high = (int) (file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

      OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();

      int iterationNumber = 0;

      while (low <= high) {

        ++iterationNumber;

        int mid = low + ((high - low) / 2);

        // seek to proper location
        file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        // read the data structure
        file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length);

        // map location in file
        //MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        //DataInputStream inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer));
        inputStream.reset(onDiskItemBuffer, 0, OnDiskCrawlHistoryItem.ON_DISK_SIZE);

        // deserialize
        itemOut.deserialize(inputStream);

        // memoryBuffer = null;
        //inputStream = null;

        // now compare it against desired hash value ...
        int comparisonResult = itemOut.compareFingerprints(fingerprint);

        if (comparisonResult > 0)
          high = mid - 1;
        else if (comparisonResult < 0)
          low = mid + 1;
        else {
          // cache offset
          itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
          // LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
          // and return item
          return itemOut;
        }
      }
      //LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");

      //DEBUG ONLY !
      // dumpFixedDataFile();
    } finally {
      file.close();
    }
  }

  return null;
}
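loadOnDiskItemForURLFP binary-searches fixed-width records by resetting a DataInputBuffer over the whole buffer and then skip()-ing to the record at the midpoint. A compact sketch of that technique over a hypothetical 8-byte record layout (one long per record):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class FixedRecordSearchSketch {
  static final int RECORD_SIZE = 8; // hypothetical layout: one long per record

  // binary search over fixed-width records serialized into a flat buffer
  static int findRecord(byte[] data, int length, long target) throws IOException {
    DataInputBuffer in = new DataInputBuffer();
    int low = 0, high = (length / RECORD_SIZE) - 1;
    while (low <= high) {
      int mid = low + ((high - low) / 2);
      in.reset(data, 0, length);       // rewind to the start of the buffer
      in.skip(mid * RECORD_SIZE);      // jump to the mid record
      long value = in.readLong();      // deserialize just that record
      if (value > target) high = mid - 1;
      else if (value < target) low = mid + 1;
      else return mid;
    }
    return -1;
  }

  public static void main(String[] args) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    for (long v = 10; v <= 100; v += 10) out.writeLong(v); // sorted records
    System.out.println(findRecord(out.getData(), out.getLength(), 40L)); // 3
  }
}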
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private final int getOffsetForSubDomainData(long domainHash) throws IOException {

  DataInputBuffer inputBuffer = new DataInputBuffer();

  int low = 0;
  int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

  while (low <= high) {

    int mid = low + ((high - low) / 2);

    inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
    inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);

    // deserialize
    long hash = inputBuffer.readLong();

    // now compare it against desired hash value ...
    int comparisonResult = ((Long) hash).compareTo(domainHash);

    if (comparisonResult > 0)
      high = mid - 1;
    else if (comparisonResult < 0)
      low = mid + 1;
    else {
      return inputBuffer.readInt();
    }
  }

  throw new IOException("NOT-FOUND!");
}
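Note the two-argument reset(data, length) used here: it also rewinds the read position to zero on every probe, so each skip() is measured from the start of the table, and once the hash matches the int payload is read from the position immediately after it. A tiny illustration of probing one packed (long hash, int offset) entry; the 12-byte entry size mirrors what the code above implies:

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class OffsetTableProbeSketch {
  static final int ENTRY_SIZE = 12; // 8-byte domain hash + 4-byte offset (assumed)

  public static void main(String[] args) throws IOException {
    // a packed table of two entries
    DataOutputBuffer table = new DataOutputBuffer();
    table.writeLong(100L); table.writeInt(0);
    table.writeLong(250L); table.writeInt(64);

    // probe entry #1: reset rewinds to position zero, skip jumps to the entry,
    // and the int payload is read right after the hash
    DataInputBuffer probe = new DataInputBuffer();
    probe.reset(table.getData(), table.getLength());
    probe.skip(1 * ENTRY_SIZE);
    System.out.println("hash=" + probe.readLong() + " offset=" + probe.readInt()); // hash=250 offset=64
  }
}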
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void resetSubDomainCounts() throws IOException {

  LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

  if (_subDomainMetadataFile.exists()) {

    LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    DataInputBuffer inputBuffer = new DataInputBuffer();
    DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

    try {
      // skip version
      file.read();
      // read item count
      int itemCount = file.readInt();

      LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

      CrawlListMetadata newMetadata = new CrawlListMetadata();

      for (int i = 0; i < itemCount; ++i) {

        long orignalPos = file.getFilePointer();
        file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
        inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
        try {
          newMetadata.deserialize(inputBuffer, new BinaryProtocol());
        } catch (Exception e) {
          LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
              + CCStringUtils.stringifyException(e));
        }

        // ok reset everything except hashes and first/last url pointers
        int urlCount = newMetadata.getUrlCount();
        long firstRecordOffset = newMetadata.getFirstRecordOffset();
        long lastRecordOffset = newMetadata.getLastRecordOffset();
        String domainName = newMetadata.getDomainName();
        long domainHash = newMetadata.getDomainHash();

        // reset
        newMetadata.clear();
        // restore
        newMetadata.setUrlCount(urlCount);
        newMetadata.setFirstRecordOffset(firstRecordOffset);
        newMetadata.setLastRecordOffset(lastRecordOffset);
        newMetadata.setDomainName(domainName);
        newMetadata.setDomainHash(domainHash);

        // serialize it ...
        outputBuffer.reset();
        newMetadata.serialize(outputBuffer, new BinaryProtocol());

        // write it back to disk
        file.seek(orignalPos);
        // and rewrite it ...
        file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
      }
    } finally {
      file.close();
    }

    LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
  }
}
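resetSubDomainCounts shows the read-modify-rewrite pattern: read a fixed-size block from a RandomAccessFile, deserialize it through a DataInputBuffer, re-serialize into a DataOutputBuffer, then seek back and overwrite the record in place. A runnable sketch of the same pattern with a hypothetical 8-byte counter record instead of CrawlListMetadata:

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class InPlaceRewriteSketch {
  static final int RECORD_SIZE = 8; // hypothetical fixed record: a single long counter

  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("records", ".dat");
    RandomAccessFile file = new RandomAccessFile(f, "rw");
    try {
      file.writeLong(7L);
      file.writeLong(9L);
      file.seek(0);

      byte[] block = new byte[RECORD_SIZE];
      DataInputBuffer in = new DataInputBuffer();
      DataOutputBuffer out = new DataOutputBuffer(RECORD_SIZE);

      for (int i = 0; i < 2; i++) {
        long recordPos = file.getFilePointer();
        file.readFully(block, 0, RECORD_SIZE);
        in.reset(block, RECORD_SIZE);   // deserialize the record
        long oldValue = in.readLong();

        out.reset();                    // re-serialize a cleared record
        out.writeLong(0L);

        file.seek(recordPos);           // overwrite it in place
        file.write(out.getData(), 0, RECORD_SIZE);
        System.out.println("record " + i + " was " + oldValue + ", now 0");
      }
    } finally {
      file.close();
    }
  }
}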
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void loadSubDomainMetadataFromDisk() throws IOException {
  LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");
  if (_subDomainMetadataFile.exists()) {

    LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    DataInputBuffer inputBuffer = new DataInputBuffer();
    byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

    try {
      // skip version
      file.read();
      // read item count
      int itemCount = file.readInt();

      LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

      CrawlListMetadata newMetadata = new CrawlListMetadata();

      TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
      for (int i = 0; i < itemCount; ++i) {

        long orignalPos = file.getFilePointer();
        file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
        inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
        try {
          newMetadata.deserialize(inputBuffer, new BinaryProtocol());
        } catch (Exception e) {
          LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
              + CCStringUtils.stringifyException(e));
        }
        idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
      }

      // write lookup table
      _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);

      for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
        _offsetLookupTable.writeLong(entry.getKey());
        _offsetLookupTable.writeInt(entry.getValue());
      }
    } finally {
      file.close();
    }
    LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
  } else {

    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

    RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
    RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

    try {
      //ok rebuild top level metadata as well
      _metadata.clear();

      OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

      int processedCount = 0;
      while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

        long position = fixedDataReader.getFilePointer();

        // store offset in item
        item._fileOffset = position;

        // load from disk
        item.deserialize(fixedDataReader);

        try {
          // seek to string data
          stringDataReader.seek(item._stringsOffset);
          // and skip buffer length
          WritableUtils.readVInt(stringDataReader);
          // and read primary string
          String url = stringDataReader.readUTF();

          // get metadata object for subdomain
          CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

          // increment url count
          subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

          // increment top level metadata count
          _metadata.setUrlCount(_metadata.getUrlCount() + 1);

          // update top level metadata ..
          updateMetadata(item, _metadata, 0);

          // update sub-domain metadata object from item data
          updateMetadata(item, subDomainMetadata, 0);

          ++processedCount;
        } catch (IOException e) {
          LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
          LOG.error("Exception:" + CCStringUtils.stringifyException(e));
          LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
              + stringDataReader.getFilePointer());
        }

        if (processedCount % 10000 == 0) {
          LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
        }
      }

      // ok commit top level metadata to disk as well
      writeMetadataToDisk();

    } catch (IOException e) {
      LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
          + CCStringUtils.stringifyException(e));
      LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
          + stringDataReader.getFilePointer());

      _queueState = QueueState.QUEUED;
    } finally {
      fixedDataReader.close();
      stringDataReader.close();
    }

    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
    // write metdata to disk
    writeInitialSubDomainMetadataToDisk();
    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
  }
}
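The first branch above packs a sorted TreeMap of (domainHash, file offset) pairs into a DataOutputBuffer that later serves as the in-memory lookup table consumed by getOffsetForSubDomainData. The producer side of that table in isolation, with made-up hashes and offsets:

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.io.DataOutputBuffer;

public class OffsetTableBuildSketch {
  static final int ENTRY_SIZE = 12; // 8-byte hash + 4-byte offset (assumed)

  public static void main(String[] args) throws IOException {
    // hash -> record offset, kept sorted so the packed table can be binary searched later
    TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
    idToOffsetMap.put(900L, 128);
    idToOffsetMap.put(100L, 0);
    idToOffsetMap.put(250L, 64);

    DataOutputBuffer lookupTable = new DataOutputBuffer(idToOffsetMap.size() * ENTRY_SIZE);
    for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
      lookupTable.writeLong(entry.getKey());
      lookupTable.writeInt(entry.getValue());
    }
    System.out.println("table bytes=" + lookupTable.getLength()); // 36
  }
}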
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
  synchronized (_metadata) {

    ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

    try {
      synchronized (_subDomainMetadataFile) {
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
          // skip version
          file.read();
          // read item count
          int itemCount = file.readInt();

          int i = offset;
          int end = Math.min(i + count, itemCount);

          LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

          if (i < itemCount) {

            file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (; i < end; ++i) {
              long orignalPos = file.getFilePointer();
              file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
              inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
              newMetadata.deserialize(inputBuffer, new BinaryProtocol());
              itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
            }
          }
        } finally {
          file.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

    return itemsOut;
  }
}
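getSubDomainList pages through the same fixed-size records by seeking to headerSize + recordSize * offset and then deserializing each block through a DataInputBuffer. A small sketch of that paging arithmetic with a hypothetical 5-byte header and 8-byte record:

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.io.DataInputBuffer;

public class PagedRecordReadSketch {
  static final int HEADER_SIZE = 5;  // 1-byte version + 4-byte item count (assumed layout)
  static final int RECORD_SIZE = 8;  // one long per record (assumed layout)

  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("paged", ".dat");
    RandomAccessFile file = new RandomAccessFile(f, "rw");
    try {
      // write header + 4 records
      file.write(1);
      file.writeInt(4);
      for (long v = 100; v < 104; v++) file.writeLong(v);

      // read a page of 2 records starting at record index 1
      int offset = 1, count = 2;
      file.seek(HEADER_SIZE + (long) RECORD_SIZE * offset);
      byte[] block = new byte[RECORD_SIZE];
      DataInputBuffer in = new DataInputBuffer();
      for (int i = 0; i < count; i++) {
        file.readFully(block, 0, block.length);
        in.reset(block, block.length);
        System.out.println(in.readLong()); // 101, then 102
      }
    } finally {
      file.close();
    }
  }
}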
From source file:org.commoncrawl.service.pagerank.slave.PageRankUtils.java
License:Open Source License
public static void distributeRank(final PRValueMap valueMap, final Path outlinksFile,
    final boolean outlinksIsRemote, File localOutputDir, String remoteOutputDir, int thisNodeIdx, int nodeCount,
    int iterationNumber, final ProgressAndCancelCheckCallback progressCallback) throws IOException {

  final Configuration conf = CrawlEnvironment.getHadoopConfig();

  Vector<PRValueOutputStream> outputStreamVector = new Vector<PRValueOutputStream>();

  // allocate a queue ...
  final LinkedBlockingQueue<OutlinkItem> queue = new LinkedBlockingQueue<OutlinkItem>(20000);

  try {
    // start the loader thread ...
    Thread loaderThread = new Thread(new Runnable() {

      final BytesWritable key = new BytesWritable();
      final BytesWritable value = new BytesWritable();

      final DataInputBuffer keyStream = new DataInputBuffer();
      final DataInputBuffer valueStream = new DataInputBuffer();

      @Override
      public void run() {

        LOG.info("Opening Outlinks File at:" + outlinksFile);
        SequenceFile.Reader reader = null;
        try {

          FileSystem fsForOutlinksFile = null;
          if (outlinksIsRemote) {
            fsForOutlinksFile = CrawlEnvironment.getDefaultFileSystem();
          } else {
            fsForOutlinksFile = FileSystem.getLocal(conf);
          }

          long bytesToReadTotal = fsForOutlinksFile.getLength(outlinksFile);

          reader = new SequenceFile.Reader(fsForOutlinksFile, outlinksFile, conf);
          OutlinkItem item = new OutlinkItem();
          int itemCount = 0;
          boolean isCancelled = false;
          while (!isCancelled && reader.next(key, value)) {

            keyStream.reset(key.get(), 0, key.getLength());
            valueStream.reset(value.get(), 0, value.getLength());

            //populate item from data
            readURLFPFromStream(keyStream, item.targetFingerprint);
            item.urlCount = readURLFPAndCountFromStream(valueStream, item.sourceFingerprint);

            try {
              long blockTimeStart = System.currentTimeMillis();
              queue.put(item);
              long blockTimeEnd = System.currentTimeMillis();
            } catch (InterruptedException e) {
            }
            item = new OutlinkItem();

            if (itemCount++ % 10000 == 0 && progressCallback != null) {

              float percentComplete = (float) reader.getPosition() / (float) bytesToReadTotal;
              if (progressCallback.updateProgress(percentComplete)) {
                LOG.info("Cancel check callback returned true.Cancelling outlink item load");
                isCancelled = true;
              }
            }
          }
          item.sourceFingerprint = null;
          item.targetFingerprint = null;

          // add empty item
          try {
            if (!isCancelled) {
              queue.put(item);
            } else {
              queue.put(new OutlinkItem(new IOException("Operation Cancelled")));
            }
          } catch (InterruptedException e) {
          }
        } catch (IOException e) {
          // add error item to queue.
          try {
            queue.put(new OutlinkItem(e));
          } catch (InterruptedException e1) {
          }
        } finally {
          if (reader != null)
            try {
              reader.close();
            } catch (IOException e) {
            }
        }
      }
    });

    loaderThread.start();

    // first things first ... initialize output stream vector
    FileSystem fileSystem = buildDistributionOutputStreamVector(true,
        getOutlinksBaseName(thisNodeIdx, iterationNumber), localOutputDir, remoteOutputDir, thisNodeIdx,
        nodeCount, outputStreamVector);

    try {
      // open outlinks file .
      LOG.info("Iterating Items in Outlinks File and Writing Test Value");

      int itemCount = 0;
      int totalOutlinkCount = 0;
      int iterationOutlinkCount = 0;
      long iterationStart = System.currentTimeMillis();
      long timeStart = iterationStart;

      boolean done = false;

      ArrayList<OutlinkItem> items = new ArrayList<OutlinkItem>();
      // start iterating outlinks
      while (!done) {

        //OutlinkItem item = null;

        //try {
        long waitTimeStart = System.currentTimeMillis();
        queue.drainTo(items);
        long waitTimeEnd = System.currentTimeMillis();
        //} catch (InterruptedException e) {
        //}

        for (OutlinkItem item : items) {
          if (item.error != null) {
            LOG.info("Loader Thread Returned Error:" + CCStringUtils.stringifyException(item.error));
            throw item.error;
          } else if (item.sourceFingerprint == null) {
            LOG.info("Loader Thread Indicated EOF via emtpy item");
            done = true;
          } else {
            ++itemCount;

            /*
            LOG.info("SourceFP-DomainHash:" + item.sourceFingerprint.getDomainHash() + " URLHash:" +
                item.sourceFingerprint.getUrlHash() + " PartitionIdx:" +
                ((item.sourceFingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES) );
            */

            // now get pr value for fingerprint (random seek in memory here!!!)
            float prValue = valueMap.getPRValue(item.sourceFingerprint) / (float) Math.max(item.urlCount, 1);

            // write value out
            int nodeIndex = (item.targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount;
            outputStreamVector.get(nodeIndex).writePRValue(item.targetFingerprint, item.sourceFingerprint,
                prValue);

            if (itemCount % 10000 == 0) {

              long timeEnd = System.currentTimeMillis();
              int milliseconds = (int) (timeEnd - iterationStart);

              LOG.info("Distribute PR for 10000 Items with:" + iterationOutlinkCount + " Outlinks Took:"
                  + milliseconds + " Milliseconds" + " QueueCount:" + queue.size());

              iterationStart = System.currentTimeMillis();
              totalOutlinkCount += iterationOutlinkCount;
              iterationOutlinkCount = 0;
            }
          }
        }
        items.clear();
      }

      totalOutlinkCount += iterationOutlinkCount;

      LOG.info("Distribute Finished for a total of:" + itemCount + " Items with:" + totalOutlinkCount
          + " Outlinks Took:" + (System.currentTimeMillis() - timeStart) + " Milliseconds");

      LOG.info("Waiting for Loader Thread to Die");
      try {
        loaderThread.join();
      } catch (InterruptedException e) {
      }
      LOG.info("Loader Thread Died - Moving on...");
    } finally {
      for (PRValueOutputStream info : outputStreamVector) {
        if (info != null) {
          info.close(false);
        }
      }

      if (fileSystem != null) {
        fileSystem.close();
      }
    }
  } catch (IOException e) {
    LOG.error("Exception caught while distributing outlinks:" + CCStringUtils.stringifyException(e));
    throw e;
  }
}
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
private static void spillLinkDataIntoTempFileIndex(FileSystem remoteFileSystem, FileSystem localFileSystem,
    Configuration conf, DatabaseIndexV2.MasterDatabaseIndex index, File tempFilePath, Path outputFilePath,
    FlexBuffer linkData) throws IOException {

  SequenceFileSpillWriter<TextBytes, TriTextBytesTuple> outputWriter = new SequenceFileSpillWriter<TextBytes, TriTextBytesTuple>(
      localFileSystem, conf, outputFilePath, TextBytes.class, TriTextBytesTuple.class,
      new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
          PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFilePath)),
      true);

  try {
    // ok create merge sort spill writer ...
    MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger = new MergeSortSpillWriter<TextBytes, TriTextBytesTuple>(
        conf, outputWriter, localFileSystem, new Path(tempFilePath.getAbsolutePath()), null,
        new RawKeyValueComparator<TextBytes, TriTextBytesTuple>() {

          DataInputBuffer stream1 = new DataInputBuffer();
          DataInputBuffer stream2 = new DataInputBuffer();

          TriTextBytesTuple tuple1 = new TriTextBytesTuple();
          TriTextBytesTuple tuple2 = new TriTextBytesTuple();

          @Override
          public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
              int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
              byte[] value2Data, int value2Offset, int value2Length) throws IOException {

            stream1.reset(value1Data, value1Offset, value1Length);
            stream2.reset(value2Data, value2Offset, value2Length);

            // ok skip url
            int url1Length = WritableUtils.readVInt(stream1);
            stream1.skip(url1Length);
            int url2Length = WritableUtils.readVInt(stream2);
            stream2.skip(url2Length);

            // ok now read optimized page rank stuffed in second tuple
            WritableUtils.readVInt(stream1);
            WritableUtils.readVInt(stream2);

            // now read page rank
            float pageRank1 = stream1.readFloat();
            float pageRank2 = stream2.readFloat();

            return (pageRank1 == pageRank2) ? 0 : (pageRank1 < pageRank2) ? -1 : 1;
          }

          @Override
          public int compare(TextBytes key1, TriTextBytesTuple value1, TextBytes key2,
              TriTextBytesTuple value2) {
            stream1.reset(value1.getSecondValue().getBytes(), value1.getSecondValue().getLength());
            stream2.reset(value2.getSecondValue().getBytes(), value2.getSecondValue().getLength());

            try {
              float pr1 = stream1.readFloat();
              float pr2 = stream2.readFloat();

              return (pr1 == pr2) ? 0 : pr1 < pr2 ? -1 : 1;
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              throw new RuntimeException();
            }
          }
        }, TextBytes.class, TriTextBytesTuple.class, false, null);

    try {
      long timeStart = System.currentTimeMillis();
      System.out.println(".Running Merger against to resolve tuple set ");
      index.bulkQueryURLAndMetadataGivenInputStream(remoteFileSystem, conf, tempFilePath, linkData, merger);
      long timeEnd = System.currentTimeMillis();

      LOG.info(".Merged Successfully in:" + (timeEnd - timeStart));

    } finally {
      LOG.info("Closing Merger");
      merger.close();
    }
  } finally {
    LOG.info("Closing Writer");
    outputWriter.close();
  }
}
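Both comparators above lean on DataInputBuffer plus WritableUtils to skip a length-prefixed field and read only the trailing value they care about. A minimal sketch of that skip-then-read pattern over a hypothetical (Text url, float score) layout:

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class SkipFieldSketch {
  public static void main(String[] args) throws IOException {
    // hypothetical value layout: a Text (VInt length + bytes) followed by a float score
    DataOutputBuffer out = new DataOutputBuffer();
    new Text("http://example.com/page").write(out); // Text writes a VInt length prefix
    out.writeFloat(0.85f);

    // skip the variable-length field and read only the score
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    int urlLength = WritableUtils.readVInt(in);
    in.skip(urlLength);
    System.out.println("score=" + in.readFloat()); // 0.85
  }
}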