List of usage examples for the org.apache.hadoop.io.DataOutputBuffer constructor
private DataOutputBuffer(Buffer buffer)
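DataOutputBuffer is Hadoop's reusable, in-memory DataOutput backed by a growable byte array; the private constructor above is what the public DataOutputBuffer() and DataOutputBuffer(int size) constructors used throughout these examples delegate to. Two points matter in every example below: getData() returns the backing array, which may be larger than what was actually written, so it must always be paired with getLength(); and reset() rewinds the write position so the buffer can be reused without reallocating. A minimal round-trip sketch (values and buffer size arbitrary):

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

import java.io.IOException;

public class DataOutputBufferRoundTrip {
    public static void main(String[] args) throws IOException {
        // write primitives into the growable in-memory buffer
        DataOutputBuffer out = new DataOutputBuffer(64); // initial capacity is only a hint
        out.writeInt(42);
        out.writeUTF("hello");

        // getData() exposes the backing array; only the first getLength() bytes are valid
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        System.out.println(in.readInt()); // 42
        System.out.println(in.readUTF()); // hello

        // reset() rewinds the write position so the buffer can be reused
        out.reset();
    }
}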
From source file:StreamWikiDumpInputFormat.java
License:Apache License
private static DataOutputBuffer getBuffer(byte[] bytes) throws IOException {
    DataOutputBuffer ret = new DataOutputBuffer(bytes.length);
    ret.write(bytes);
    return ret;
}
From source file:com.dasasian.chok.testutil.AbstractWritableTest.java
License:Apache License
protected DataOutputBuffer writeWritable(Writable writable) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer(1024);
    writable.write(out);
    out.flush();
    return out;
}
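A hypothetical caller of this helper (the Text value and the round-trip assertion are illustrative, not part of the Chok test utilities) would pair the returned buffer with a DataInputBuffer to deserialize the Writable again:

// hypothetical JUnit-style usage (imports assumed: org.apache.hadoop.io.Text, DataInputBuffer)
@Test
public void testWritableRoundTrip() throws IOException {
    Text original = new Text("some value");
    DataOutputBuffer buffer = writeWritable(original);

    // pair getData() with getLength(): the backing array may be over-allocated
    DataInputBuffer in = new DataInputBuffer();
    in.reset(buffer.getData(), buffer.getLength());

    Text copy = new Text();
    copy.readFields(in);
    assertEquals(original, copy); // round trip preserved the value
}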
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Read the index of the lzo file.
 * @param fs The index file is on this file system.
 * @param lzoFile the file whose index we are reading -- NOT the index file itself. That is,
 *        pass in filename.lzo, not filename.lzo.index, for this parameter.
 * @throws IOException
 */
public static LzoIndex readIndex(FileSystem fs, Path lzoFile) throws IOException {
    FSDataInputStream indexIn = null;
    Path indexFile = lzoFile.suffix(LZO_INDEX_SUFFIX);

    try {
        indexIn = fs.open(indexFile);
    } catch (IOException fileNotFound) {
        // return empty index, fall back to the unsplittable mode
        return new LzoIndex();
    }

    int capacity = 16 * 1024 * 8; // size for a 4GB file (with 256KB lzo blocks)
    DataOutputBuffer bytes = new DataOutputBuffer(capacity);

    // copy indexIn and close it
    IOUtils.copyBytes(indexIn, bytes, 4 * 1024, true);

    ByteBuffer bytesIn = ByteBuffer.wrap(bytes.getData(), 0, bytes.getLength());
    int blocks = bytesIn.remaining() / 8;
    LzoIndex index = new LzoIndex(blocks);

    for (int i = 0; i < blocks; i++) {
        index.set(i, bytesIn.getLong());
    }
    return index;
}
From source file:org.apache.orc.mapred.TestOrcList.java
License:Apache License
static void cloneWritable(Writable source, Writable destination) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer(1024);
    source.write(out);
    out.flush();
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    destination.readFields(in);
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java
License:Open Source License
@Override
public void configure(JobConf job) {
    _sourceInputsBuffer = new DataOutputBuffer(EXT_SOURCE_SAMPLE_BUFFER_SIZE);
    _sourceInputsTrackingFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    _conf = job;
    try {
        _fs = FileSystem.get(_conf);
        _partitionId = _conf.getInt("mapred.task.partition", 0);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/**
 * Initialize a new CrawlList object from a given input stream of urls
 *
 * @param manager - reference to the crawl history log manager
 * @param urlInputStream - the input stream containing the list of urls that we should add to this list ...
 * @throws IOException
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
        throws IOException {
    _manager = manager;
    _listState = LoadState.REALLY_LOADING;
    // initialize a new list id
    _listId = listId;

    LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

    // establish file names
    initializeListFileNames();

    sourceURLFile.renameTo(_listURLDataFile);

    FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

    try {
        // set we will use to hold all fingerprints generated
        TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

        // create temp files ...
        File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

        // create mergesortspillwriter
        SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
                new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null,
                false);

        try {
            MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                    CrawlEnvironment.getHadoopConfig(), spillwriter,
                    FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                    new Path(manager.getLocalDataDir().getAbsolutePath()), null,
                    new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

                        DataInputBuffer _key1Buffer = new DataInputBuffer();
                        DataInputBuffer _key2Buffer = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {
                            _key1Buffer.reset(key1Data, key1Offset, key1Length);
                            _key2Buffer.reset(key2Data, key2Offset, key2Length);

                            _key1Buffer.skip(2); // skip version, and 1 byte id
                            _key2Buffer.skip(2); // skip version, and 1 byte id

                            int domainHash1 = WritableUtils.readVInt(_key1Buffer);
                            int domainHash2 = WritableUtils.readVInt(_key2Buffer);

                            _key1Buffer.skip(1); // skip 1 byte id
                            _key2Buffer.skip(1); // skip 1 byte id

                            long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
                            long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

                            int result = ((Integer) domainHash1).compareTo(domainHash2);
                            if (result == 0) {
                                result = ((Long) fingerprint1).compareTo(fingerprint2);
                            }
                            return result;
                        }

                        @Override
                        public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                                ProxyCrawlHistoryItem value2) {
                            return key1.compareTo(key2);
                        }
                    }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

            try {
                LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

                String line = null;
                int lineNumber = 0;
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
                while ((line = reader.readLine()) != null) {
                    ++lineNumber;
                    if (line.length() != 0 && !line.startsWith("#")) {
                        URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);
                        if (fingerprint != null) {
                            if (!urlSet.contains(fingerprint)) {
                                // and add fingerprint to set
                                urlSet.add(fingerprint);
                                // initialize item
                                item.clear();
                                item.setOriginalURL(line);
                                // and spill to merger / sorter ..
                                merger.spillRecord(fingerprint, item);
                            }
                        } else {
                            LOG.error("*** LIST:" + getListId() + " Invalid URL Encountered at Line:"
                                    + lineNumber + " URL" + line);
                        }
                    }
                }
                LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
            } finally {
                merger.close();
            }
        } finally {
            if (spillwriter != null)
                spillwriter.close();
        }

        LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
        // generate bloom filter ...
        _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);
        for (URLFP fingerprint : urlSet) {
            _bloomFilter.add(fingerprint);
        }

        LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
        // serialize it
        FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
        try {
            _bloomFilter.serialize(bloomFilterStream);
        } finally {
            bloomFilterStream.flush();
            bloomFilterStream.close();
        }

        LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
        // now initialize value map and string maps based on output sequence file ...
        SequenceFile.Reader reader = new SequenceFile.Reader(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

        LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
                + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        // OK, Allocate room for fixed data file upfront
        DataOutputBuffer valueStream = new DataOutputBuffer(
                urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

        try {
            //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
            RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");
            try {
                URLFP urlFP = new URLFP();
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
                // read fingerprints ...
                while (reader.next(urlFP, item)) {
                    // write out fixed data structure and strings
                    writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
                }
            } finally {
                //valueStream.flush();
                //valueStream.close();
                stringsStream.close();
            }
        } finally {
            reader.close();
        }

        LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");
        LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
                + " EXPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));

        if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
            throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength()
                    + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        }

        // initialize temp data buffer variables
        _tempFixedDataBuffer = valueStream.getData();
        _tempFixedDataBufferSize = valueStream.getLength();

        // update metadata
        _metadata.setRefreshInterval(refreshInterval);
        _metadata.setUrlCount(urlSet.size());
        // setup version
        _metadata.setVersion(1);
        // and write to disk
        writeMetadataToDisk();

        // mark state as loaded ...
        _listState = LoadState.LOADED;

        LOG.info("*** LIST:" + getListId() + " SYNCING");
        // reconcile with history log
        _manager.syncList(this.getListId(), urlSet, this);
        LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

        // write metadata to disk again
        writeMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");
        // and finally flush fixed data to disk
        FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);
        try {
            synchronized (this) {
                int blockSize = 1 << 20;
                long bytesCopied = 0;
                for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
                    int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
                    finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
                    bytesCopied += bytesToCopy;
                }
                // validate bytes copied
                if (bytesCopied != _tempFixedDataBufferSize) {
                    throw new IOException("Buffer Size:" + _tempFixedDataBufferSize
                            + " Does not Match BytesCopied:" + bytesCopied);
                }
                // ok release the buffer
                _tempFixedDataBuffer = null;
                _tempFixedDataBufferSize = 0;
                LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
            }
        } finally {
            finalDataStream.flush();
            finalDataStream.close();
        }

        // load sub domain metadata from disk ...
        loadSubDomainMetadataFromDisk();
    } catch (IOException e) {
        LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
                + CCStringUtils.stringifyException(e));
        _fixedDataFile.delete();
        _variableDataFile.delete();
        _bloomFilterData.delete();
        _listState = LoadState.ERROR;
        throw e;
    } finally {
        urlInputStream.close();
    }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/**
 * serialize metadata to disk
 * @throws IOException
 */
void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData) throws IOException {
    DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

    subDomainData.serialize(outputBuffer, new BinaryProtocol());

    if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
        LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!");
        outputBuffer.reset();
        subDomainData.setDomainName("<<CORRUPT>>");
        subDomainData.serialize(outputBuffer, new BinaryProtocol());
    }

    synchronized (_subDomainMetadataFile) {
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        try {
            if (subDomainData.getSubDomainDataOffset() == 0) {
                throw new IOException("Data Offset Zero during write!");
            }
            file.seek(subDomainData.getSubDomainDataOffset());
            file.write(outputBuffer.getData(), 0, outputBuffer.getLength());
        } finally {
            file.close();
        }
    }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void writeInitialSubDomainMetadataToDisk() throws IOException {
    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    try {
        file.writeByte(0); // version
        file.writeInt(_transientSubDomainStats.size());

        ArrayList<CrawlListMetadata> sortedMetadata = new ArrayList<CrawlListMetadata>();
        sortedMetadata.addAll(_transientSubDomainStats.values());
        _transientSubDomainStats = null;
        CrawlListMetadata metadataArray[] = sortedMetadata.toArray(new CrawlListMetadata[0]);
        Arrays.sort(metadataArray, new Comparator<CrawlListMetadata>() {
            @Override
            public int compare(CrawlListMetadata o1, CrawlListMetadata o2) {
                int result = ((Integer) o2.getUrlCount()).compareTo(o1.getUrlCount());
                if (result == 0) {
                    result = o1.getDomainName().compareTo(o2.getDomainName());
                }
                return result;
            }
        });

        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

        TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
        for (CrawlListMetadata entry : metadataArray) {
            // reset output buffer
            outputBuffer.reset();
            // write item to disk
            entry.serialize(outputBuffer, new BinaryProtocol());

            if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
                LOG.fatal("Metadata Serialization for List:" + getListId() + " SubDomain:"
                        + entry.getDomainName());
                System.out.println("Metadata Serialization for List:" + getListId() + " SubDomain:"
                        + entry.getDomainName());
            }
            // save offset
            idToOffsetMap.put(entry.getDomainHash(), (int) file.getFilePointer());
            // write out fixed data size
            file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
        }

        // write lookup table
        _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);

        for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
            _offsetLookupTable.writeLong(entry.getKey());
            _offsetLookupTable.writeInt(entry.getValue());
        }
    } finally {
        file.close();
    }
    _transientSubDomainStats = null;
}
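The _offsetLookupTable built here (and rebuilt in loadSubDomainMetadataFromDisk below) is just a flat sequence of (domainHash long, fileOffset int) pairs held in a DataOutputBuffer. A lookup against that table (a minimal sketch; findOffsetForDomain is a hypothetical helper, not a CrawlList method) would wrap the buffer in a DataInputBuffer and scan:

// hypothetical helper; imports assumed: org.apache.hadoop.io.DataInputBuffer
static int findOffsetForDomain(DataOutputBuffer offsetLookupTable, long targetHash) throws IOException {
    DataInputBuffer in = new DataInputBuffer();
    in.reset(offsetLookupTable.getData(), offsetLookupTable.getLength());
    // each entry is a fixed-width (long, int) pair, written in ascending hash order from a TreeMap
    while (in.getPosition() < offsetLookupTable.getLength()) {
        long hash = in.readLong();
        int offset = in.readInt();
        if (hash == targetHash) {
            return offset; // file position of this domain's fixed-size metadata record
        }
    }
    return -1; // domain not present in the table
}

Because the TreeMap iteration wrote the entries in ascending hash order, a binary search over the fixed 12-byte entries would also work; the linear scan just keeps the sketch short.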
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void resetSubDomainCounts() throws IOException {
    LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

    if (_subDomainMetadataFile.exists()) {
        LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

        try {
            // skip version
            file.read();
            // read item count
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (int i = 0; i < itemCount; ++i) {
                long originalPos = file.getFilePointer();
                file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }

                // ok reset everything except hashes and first/last url pointers
                int urlCount = newMetadata.getUrlCount();
                long firstRecordOffset = newMetadata.getFirstRecordOffset();
                long lastRecordOffset = newMetadata.getLastRecordOffset();
                String domainName = newMetadata.getDomainName();
                long domainHash = newMetadata.getDomainHash();

                // reset
                newMetadata.clear();

                // restore
                newMetadata.setUrlCount(urlCount);
                newMetadata.setFirstRecordOffset(firstRecordOffset);
                newMetadata.setLastRecordOffset(lastRecordOffset);
                newMetadata.setDomainName(domainName);
                newMetadata.setDomainHash(domainHash);

                // serialize it ...
                outputBuffer.reset();
                newMetadata.serialize(outputBuffer, new BinaryProtocol());

                // write it back to disk
                file.seek(originalPos);
                // and rewrite it ...
                file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE RESETTING SUBDOMAIN METADATA QUEUE COUNTS");
    }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void loadSubDomainMetadataFromDisk() throws IOException {
    LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");

    if (_subDomainMetadataFile.exists()) {
        LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
            // skip version
            file.read();
            // read item count
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
            for (int i = 0; i < itemCount; ++i) {
                long originalPos = file.getFilePointer();
                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                idToOffsetMap.put(newMetadata.getDomainHash(), (int) originalPos);
            }

            // write lookup table
            _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);

            for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                _offsetLookupTable.writeLong(entry.getKey());
                _offsetLookupTable.writeInt(entry.getValue());
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
    } else {
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

        try {
            // ok rebuild top level metadata as well
            _metadata.clear();

            OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

            int processedCount = 0;
            while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
                long position = fixedDataReader.getFilePointer();

                // store offset in item
                item._fileOffset = position;
                // load from disk
                item.deserialize(fixedDataReader);

                try {
                    // seek to string data
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string
                    String url = stringDataReader.readUTF();

                    // get metadata object for subdomain
                    CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

                    // increment url count
                    subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);
                    // increment top level metadata count
                    _metadata.setUrlCount(_metadata.getUrlCount() + 1);

                    // update top level metadata ..
                    updateMetadata(item, _metadata, 0);
                    // update sub-domain metadata object from item data
                    updateMetadata(item, subDomainMetadata, 0);

                    ++processedCount;
                } catch (IOException e) {
                    LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                    LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                }

                if (processedCount % 10000 == 0) {
                    LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                }
            }

            // ok commit top level metadata to disk as well
            writeMetadataToDisk();
        } catch (IOException e) {
            LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                    + CCStringUtils.stringifyException(e));
            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                    + stringDataReader.getFilePointer());
            _queueState = QueueState.QUEUED;
        } finally {
            fixedDataReader.close();
            stringDataReader.close();
        }

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
        // write metadata to disk
        writeInitialSubDomainMetadataToDisk();
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
    }
}