List of usage examples for org.apache.hadoop.io.DataInputBuffer.reset
public void reset(byte[] input, int start, int length)
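All of the examples below follow the same pattern: bytes are produced somewhere (typically into a DataOutputBuffer or taken as a raw byte[] slice), and reset(input, start, length) repositions a reusable DataInputBuffer over those bytes so Writables can be deserialized from them without copying. The following is a minimal, self-contained sketch of that round trip; it is not taken from any of the listed sources, and the class and variable names are illustrative only.

    import java.io.IOException;
    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    public class DataInputBufferResetExample {
        public static void main(String[] args) throws IOException {
            // serialize two writables into a growable output buffer
            DataOutputBuffer out = new DataOutputBuffer();
            new IntWritable(42).write(out);
            new Text("hello").write(out);

            // point the input buffer at the bytes that were just written
            DataInputBuffer in = new DataInputBuffer();
            in.reset(out.getData(), 0, out.getLength());

            // read the values back from the same backing array
            IntWritable number = new IntWritable();
            Text text = new Text();
            number.readFields(in);
            text.readFields(in);
            System.out.println(number.get() + " " + text);   // prints: 42 hello
        }
    }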
From source file: org.apache.tez.runtime.library.common.sort.impl.TestIFile.java
License: Apache License

    private void populateData(KVPair kvp, DataInputBuffer key, DataInputBuffer value) throws IOException {
        DataOutputBuffer k = new DataOutputBuffer();
        DataOutputBuffer v = new DataOutputBuffer();
        kvp.getKey().write(k);
        kvp.getvalue().write(v);
        key.reset(k.getData(), 0, k.getLength());
        value.reset(v.getData(), 0, v.getLength());
    }
From source file: org.apache.tez.runtime.library.common.TestValuesIterator.java
License: Apache License

    /**
     * Create in-memory segments.
     *
     * @return
     * @throws IOException
     */
    public List<TezMerger.Segment> createInMemStreams() throws IOException {
        int numberOfStreams = Math.max(2, rnd.nextInt(10));
        LOG.info("No of streams : " + numberOfStreams);

        SerializationFactory serializationFactory = new SerializationFactory(conf);
        Serializer keySerializer = serializationFactory.getSerializer(keyClass);
        Serializer valueSerializer = serializationFactory.getSerializer(valClass);

        LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
        InputContext context = createTezInputContext();
        MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null, null, null,
                null, 1024 * 1024 * 10, null, false, -1);

        DataOutputBuffer keyBuf = new DataOutputBuffer();
        DataOutputBuffer valBuf = new DataOutputBuffer();
        DataInputBuffer keyIn = new DataInputBuffer();
        DataInputBuffer valIn = new DataInputBuffer();
        keySerializer.open(keyBuf);
        valueSerializer.open(valBuf);

        List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
        for (int i = 0; i < numberOfStreams; i++) {
            BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
            InMemoryWriter writer = new InMemoryWriter(bout);
            Map<Writable, Writable> data = createData();
            // write data
            for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
                keySerializer.serialize(entry.getKey());
                valueSerializer.serialize(entry.getValue());
                keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
                valIn.reset(valBuf.getData(), 0, valBuf.getLength());
                writer.append(keyIn, valIn);
                originalData.put(entry.getKey(), entry.getValue());
                keyBuf.reset();
                valBuf.reset();
                keyIn.reset();
                valIn.reset();
            }
            IFile.Reader reader = new InMemoryReader(mergeManager, null, bout.getBuffer(), 0,
                    bout.getBuffer().length);
            segments.add(new TezMerger.Segment(reader, true));
            data.clear();
            writer.close();
        }
        return segments;
    }
From source file: org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.java
License: Apache License

    private void writePartition(int pos, WrappedBuffer wrappedBuffer, Writer writer, DataInputBuffer keyBuffer,
            DataInputBuffer valBuffer) throws IOException {
        while (pos != WrappedBuffer.PARTITION_ABSENT_POSITION) {
            int metaIndex = pos / INT_SIZE;
            int keyLength = wrappedBuffer.metaBuffer.get(metaIndex + INDEX_KEYLEN);
            int valLength = wrappedBuffer.metaBuffer.get(metaIndex + INDEX_VALLEN);
            keyBuffer.reset(wrappedBuffer.buffer, pos + META_SIZE, keyLength);
            valBuffer.reset(wrappedBuffer.buffer, pos + META_SIZE + keyLength, valLength);
            writer.append(keyBuffer, valBuffer);
            pos = wrappedBuffer.metaBuffer.get(metaIndex + INDEX_NEXT);
        }
    }
From source file: org.bgi.flexlab.gaea.data.mapreduce.partitioner.WindowsBasedBasicSort.java
License: Open Source License

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        WindowsBasedBasicWritable key1 = new WindowsBasedBasicWritable();
        WindowsBasedBasicWritable key2 = new WindowsBasedBasicWritable();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return key1.compareTo(key2);
    }
From source file: org.bgi.flexlab.gaea.data.mapreduce.partitioner.WindowsBasedComparator.java
License: Open Source License

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        WindowsBasedWritable key1 = new WindowsBasedWritable();
        WindowsBasedWritable key2 = new WindowsBasedWritable();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return compare(key1, key2);
    }
From source file: org.bgi.flexlab.gaea.data.mapreduce.partitioner.WindowsBasedSort.java
License: Open Source License

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        WindowsBasedWritable key1 = new WindowsBasedWritable();
        WindowsBasedWritable key2 = new WindowsBasedWritable();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return key1.compareTo(key2);
    }
From source file: org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License: Open Source License

    @Override
    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {
        if (_skipPartition)
            return;

        // collect all incoming paths first
        Vector<Path> incomingPaths = new Vector<Path>();
        while (values.hasNext()) {
            String path = values.next().toString();
            LOG.info("Found Incoming Path:" + path);
            incomingPaths.add(new Path(path));
        }

        FlexBuffer scanArray[] = LinkKey.allocateScanArray();

        // set up merge attributes
        Configuration localMergeConfig = new Configuration(_conf);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
                RawComparator.class);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
                WritableComparable.class);

        // ok now spawn merger
        MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
                incomingPaths, localMergeConfig);

        TextBytes keyBytes = new TextBytes();
        TextBytes valueBytes = new TextBytes();
        DataInputBuffer inputBuffer = new DataInputBuffer();

        int processedKeysCount = 0;

        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
            summaryRecord = null;
            linkSummaryRecord = null;
            types.clear();
            linkSources = null;
            outputKeyString = null;
            outputKeyFromInternalLink = false;
            outputKeyURLObj = null;

            int statusCount = 0;
            int linkCount = 0;

            // scan key components
            LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

            // pick up source fp from key ...
            URLFPV2 fpSource = new URLFPV2();
            fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
            fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
            fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

            for (RawRecordValue rawValue : nextItem.e1) {
                inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
                int length = WritableUtils.readVInt(inputBuffer);
                keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
                inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
                length = WritableUtils.readVInt(inputBuffer);
                valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

                long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

                if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                    statusCount++;
                    try {
                        JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                        if (object != null) {
                            updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                        }
                    } catch (Exception e) {
                        LOG.error("Error Parsing JSON:" + valueBytes.toString());
                        throw new IOException(e);
                    }
                } else {
                    linkCount++;
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    // ok this is a link ...
                    updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
                }
                reporter.progress();
            }

            if (statusCount > 1) {
                reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
            }

            if (statusCount == 0 && linkCount != 0) {
                reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
            } else {
                if (statusCount >= 1 && linkCount >= 1) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
                } else if (statusCount >= 1 && linkCount == 0) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
                }
            }

            if (summaryRecord != null || linkSummaryRecord != null) {
                JsonObject compositeObject = new JsonObject();
                if (summaryRecord != null) {
                    compositeObject.add("crawl_status", summaryRecord);
                }
                if (linkSummaryRecord != null) {
                    if (types != null && types.size() != 0) {
                        stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                        if (linkSources != null) {
                            stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                        }
                    }
                    compositeObject.add("link_status", linkSummaryRecord);
                }

                if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                    if (outputKeyFromInternalLink) {
                        reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                    } else {
                        reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                    }
                    output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
                } else {
                    reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
                }
            }
        }
    }
From source file: org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java
License: Open Source License

    @Override
    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {
        // collect all incoming paths first
        Vector<Path> incomingPaths = new Vector<Path>();

        FlexBuffer scanArray[] = LinkKey.allocateScanArray();

        while (values.hasNext()) {
            String path = values.next().toString();
            LOG.info("Found Incoming Path:" + path);
            incomingPaths.add(new Path(path));
        }

        // set up merge attributes
        Configuration localMergeConfig = new Configuration(_jobConf);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
                RawComparator.class);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
                WritableComparable.class);

        // ok now spawn merger
        MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
                FileSystem.get(_jobConf), incomingPaths, localMergeConfig);

        TextBytes keyBytes = new TextBytes();
        TextBytes valueBytes = new TextBytes();
        DataInputBuffer inputBuffer = new DataInputBuffer();
        TextBytes valueOut = new TextBytes();
        TextBytes keyOut = new TextBytes();

        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        // pick up source fp from key ...
        URLFPV2 fpSource = new URLFPV2();

        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
            outputKeyString = null;
            outputKeyFromInternalLink = false;
            outputKeyURLObj = null;
            latestLinkDataTime = -1L;
            outlinks.clear();
            discoveredLinks.clear();

            // scan key components
            LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

            // setup fingerprint ...
            fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
            fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
            fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

            for (RawRecordValue rawValue : nextItem.e1) {
                inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
                int length = WritableUtils.readVInt(inputBuffer);
                keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
                inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
                length = WritableUtils.readVInt(inputBuffer);
                valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

                long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

                if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                    try {
                        JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                        if (object != null) {
                            updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                        }
                    } catch (Exception e) {
                        LOG.error("Error Parsing JSON:" + valueBytes.toString());
                        throw new IOException(e);
                    }
                }
                reporter.progress();
            }

            // ok now see if we have anything to emit ...
            if (discoveredLinks.size() != 0) {
                reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);

                for (String outlink : outlinks) {
                    // emit a to tuple
                    toJsonObject.addProperty("to", outlink);
                    valueBytes.set(toJsonObject.toString());
                    output.collect(sourceDomain, valueBytes);
                    // now emit a from tuple ...
                    fromJsonObject.addProperty("from", sourceDomain.toString());
                    keyBytes.set(outlink);
                    valueBytes.set(fromJsonObject.toString());
                    output.collect(keyBytes, valueBytes);
                }

                bloomKey.setDomainHash(fpSource.getDomainHash());
                for (long destDomainFP : discoveredLinks) {
                    // set the bloom filter key ...
                    bloomKey.setUrlHash(destDomainFP);
                    // add it to the bloom filter
                    emittedTuplesFilter.add(bloomKey);
                }
            } else {
                reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
            }
        }
    }
From source file: org.commoncrawl.service.crawler.CrawlerServer.java
License: Open Source License

    void refreshMasterCrawlerActiveHostList() {
        // ok if there is a master crawler, and it is online ...
        if (_masterCrawlerServiceChannel != null && _masterCrawlerServiceChannel.isOpen()) {
            try {
                _masterCrawlerStub.queryActiveHosts(new Callback<NullMessage, ActiveHostInfo>() {

                    @Override
                    public void requestComplete(AsyncRequest<NullMessage, ActiveHostInfo> request) {
                        if (request.getStatus() == Status.Success) {
                            // ok update timestamp no matter what
                            _pauseStateTimestampIncremental = request.getOutput().getPauseStateTimestamp();
                            // and clear set ...
                            _pausedHostsSet = null;

                            // now see if we have a valid response ...
                            if (request.getOutput().getActiveHostIds().getCount() != 0) {
                                LOG.info("Received New Active Host Set From Master Crawler At:"
                                        + _masterCrawlerAddress);
                                // ok we have a valid list of hosts ...
                                // create a reader stream
                                DataInputBuffer inputStream = new DataInputBuffer();
                                inputStream.reset(request.getOutput().getActiveHostIds().getReadOnlyBytes(), 0,
                                        request.getOutput().getActiveHostIds().getCount());

                                try {
                                    // create a set ...
                                    Set<Integer> ipAddressSet = new TreeSet<Integer>();
                                    // populate it
                                    int ipAddressCount = WritableUtils.readVInt(inputStream);
                                    for (int i = 0; i < ipAddressCount; ++i) {
                                        ipAddressSet.add(WritableUtils.readVInt(inputStream));
                                    }
                                    LOG.info("Successfully updated Active Host Set");
                                    // ok replace set ...
                                    _pausedHostsSet = ipAddressSet;
                                } catch (IOException e) {
                                    LOG.error(CCStringUtils.stringifyException(e));
                                }
                            }
                        }
                    }
                });
            } catch (RPCException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }

        // ok no matter what... check to see if we need to set up refresh timer ...
        if (_masterCrawlerHostListRefreshTimer == null) {
            _masterCrawlerHostListRefreshTimer = new Timer(ACTIVE_HOST_LIST_REFRESH_INTERVAL_CLIENT, true,
                    new Timer.Callback() {

                        @Override
                        public void timerFired(Timer timer) {
                            // call refresh again ...
                            refreshMasterCrawlerActiveHostList();
                        }
                    });
            _eventLoop.setTimer(_masterCrawlerHostListRefreshTimer);
        }
    }
From source file: org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License: Open Source License

    private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {
        SequenceFile.Reader reader = null;
        Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
        Path indexFilePath = new Path(mapFilePath, "index");
        Path dataFilePath = new Path(mapFilePath, "data");
        File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

        SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
                CrawlEnvironment.getHadoopConfig());

        ValueBytes valueBytes = indexReader.createValueBytes();
        DataOutputBuffer keyBytes = new DataOutputBuffer();
        DataInputBuffer keyBuffer = new DataInputBuffer();
        DataOutputBuffer finalOutputStream = new DataOutputBuffer();
        DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
        URLFP fp = new URLFP();

        try {
            while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
                keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
                // read fingerprint ...
                fp.readFields(keyBuffer);
                // write hash only
                finalOutputStream.writeLong(fp.getUrlHash());
                uncompressedValueBytes.reset();
                // write value bytes to intermediate buffer ...
                valueBytes.writeUncompressedBytes(uncompressedValueBytes);
                // write out uncompressed length
                WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
                // write out bytes
                finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
            }
            // delete existing ...
            cacheFilePath.delete();
            // compute crc ...
            CRC32 crc = new CRC32();
            crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
            // open final output stream
            DataOutputStream fileOutputStream = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

            try {
                fileOutputStream.writeLong(crc.getValue());
                fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
                fileOutputStream.flush();
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                fileOutputStream.close();
                fileOutputStream = null;
                cacheFilePath.delete();
                throw e;
            } finally {
                if (fileOutputStream != null) {
                    fileOutputStream.close();
                }
            }
        } finally {
            if (indexReader != null) {
                indexReader.close();
            }
        }
    }