List of usage examples for org.apache.hadoop.io DataInputBuffer getPosition
public int getPosition()
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
@Override public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException { if (_skipPartition) return;/*from w w w. j a v a2 s .com*/ // collect all incoming paths first Vector<Path> incomingPaths = new Vector<Path>(); while (values.hasNext()) { String path = values.next().toString(); LOG.info("Found Incoming Path:" + path); incomingPaths.add(new Path(path)); } FlexBuffer scanArray[] = LinkKey.allocateScanArray(); // set up merge attributes Configuration localMergeConfig = new Configuration(_conf); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class, RawComparator.class); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class); // ok now spawn merger MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs, incomingPaths, localMergeConfig); TextBytes keyBytes = new TextBytes(); TextBytes valueBytes = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); int processedKeysCount = 0; Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null; while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) { summaryRecord = null; linkSummaryRecord = null; types.clear(); linkSources = null; outputKeyString = null; outputKeyFromInternalLink = false; outputKeyURLObj = null; int statusCount = 0; int linkCount = 0; // scan key components LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray); // pick up source fp from key ... 
URLFPV2 fpSource = new URLFPV2(); fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID)); fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID)); fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.URL_HASH_COMPONENT_ID)); for (RawRecordValue rawValue : nextItem.e1) { inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength()); int length = WritableUtils.readVInt(inputBuffer); keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length); inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength()); length = WritableUtils.readVInt(inputBuffer); valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length); long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID); if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) { statusCount++; try { JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject(); if (object != null) { updateCrawlStatsFromJSONObject(object, fpSource, reporter); } } catch (Exception e) { LOG.error("Error Parsing JSON:" + valueBytes.toString()); throw new IOException(e); } } else { linkCount++; JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject(); // ok this is a link ... 
updateLinkStatsFromLinkJSONObject(object, fpSource, reporter); } reporter.progress(); } if (statusCount > 1) { reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1); } if (statusCount == 0 && linkCount != 0) { reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1); } else { if (statusCount >= 1 && linkCount >= 1) { reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1); } else if (statusCount >= 1 && linkCount == 0) { reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1); } } if (summaryRecord != null || linkSummaryRecord != null) { JsonObject compositeObject = new JsonObject(); if (summaryRecord != null) { compositeObject.add("crawl_status", summaryRecord); } if (linkSummaryRecord != null) { if (types != null && types.size() != 0) { stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types); if (linkSources != null) { stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values()); } } compositeObject.add("link_status", linkSummaryRecord); } if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) { if (outputKeyFromInternalLink) { reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1); } else { reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1); } output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString())); } else { reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1); } } } }
From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java
License:Open Source License
/**
 * Points {@code textOut} at the payload of a vint-length-prefixed value held in
 * {@code dataBuffer}.
 *
 * @param dataBuffer  buffer whose content starts with a vint length followed by the payload
 * @param inputBuffer scratch input buffer; it is reset onto {@code dataBuffer}'s bytes
 * @param textOut     receives the payload range (array, offset after the length, length)
 * @throws IOException if the length prefix cannot be read
 */
private static void rawValueToTextBytes(DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer,
    TextBytes textOut) throws IOException {
  final int availableBytes = dataBuffer.getLength();
  inputBuffer.reset(dataBuffer.getData(), availableBytes);

  // the vint read advances the position to the first payload byte
  final int payloadLength = WritableUtils.readVInt(inputBuffer);
  final int payloadOffset = inputBuffer.getPosition();

  textOut.set(inputBuffer.getData(), payloadOffset, payloadLength);
}
From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java
License:Open Source License
@Override public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException { // collect all incoming paths first Vector<Path> incomingPaths = new Vector<Path>(); FlexBuffer scanArray[] = LinkKey.allocateScanArray(); while (values.hasNext()) { String path = values.next().toString(); LOG.info("Found Incoming Path:" + path); incomingPaths.add(new Path(path)); }/*from w w w .ja v a 2s . c om*/ // set up merge attributes Configuration localMergeConfig = new Configuration(_jobConf); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class, RawComparator.class); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class); // ok now spawn merger MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>( FileSystem.get(_jobConf), incomingPaths, localMergeConfig); TextBytes keyBytes = new TextBytes(); TextBytes valueBytes = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); TextBytes valueOut = new TextBytes(); TextBytes keyOut = new TextBytes(); Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null; // pick up source fp from key ... URLFPV2 fpSource = new URLFPV2(); while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) { outputKeyString = null; outputKeyFromInternalLink = false; outputKeyURLObj = null; latestLinkDataTime = -1L; outlinks.clear(); discoveredLinks.clear(); // scan key components LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray); // setup fingerprint ... 
fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID)); fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID)); fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.URL_HASH_COMPONENT_ID)); for (RawRecordValue rawValue : nextItem.e1) { inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength()); int length = WritableUtils.readVInt(inputBuffer); keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length); inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength()); length = WritableUtils.readVInt(inputBuffer); valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length); long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID); if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) { try { JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject(); if (object != null) { updateCrawlStatsFromJSONObject(object, fpSource, reporter); } } catch (Exception e) { LOG.error("Error Parsing JSON:" + valueBytes.toString()); throw new IOException(e); } } reporter.progress(); } // ok now see if we have anything to emit ... if (discoveredLinks.size() != 0) { reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1); for (String outlink : outlinks) { // emit a to tuple toJsonObject.addProperty("to", outlink); valueBytes.set(toJsonObject.toString()); output.collect(sourceDomain, valueBytes); // now emit a from tuple ... fromJsonObject.addProperty("from", sourceDomain.toString()); keyBytes.set(outlink); valueBytes.set(fromJsonObject.toString()); output.collect(keyBytes, valueBytes); } bloomKey.setDomainHash(fpSource.getDomainHash()); for (long destDomainFP : discoveredLinks) { // set the bloom filter key ... 
bloomKey.setUrlHash(destDomainFP); // add it to the bloom filter emittedTuplesFilter.add(bloomKey); } } else { reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1); } } }
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
public static void main(String[] args) { // initialize ... Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); LOG.info("URL:" + args[0] + " ShardId:" + args[1]); try {/* w ww.ja v a2 s . com*/ File tempFile = File.createTempFile("inverseLinksReportTest", "seq"); try { FileSystem fs = FileSystem.get(conf); FileSystem localFileSystem = FileSystem.getLocal(conf); URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]); if (fp != null) { collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(), localFileSystem, new Path(tempFile.getAbsolutePath())); SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem, new Path(tempFile.getAbsolutePath()), conf); try { FlexBuffer key = new FlexBuffer(); URLFPV2 src = new URLFPV2(); TextBytes url = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); while (reader.next(key, src)) { inputBuffer.reset(key.get(), key.getOffset(), key.getCount()); long targetFP = inputBuffer.readLong(); float pageRank = inputBuffer.readFloat(); // ok initialize text bytes ... int textLen = WritableUtils.readVInt(inputBuffer); url.set(key.get(), inputBuffer.getPosition(), textLen); LOG.info("PR:" + pageRank + " URL:" + url.toString()); } } finally { reader.close(); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); // tempFile.delete(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } }
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
public static void main(String[] args) { // run some tests on the new code String aTestString = new String("A Test Strnig"); // convert it to bytes byte bytes[] = aTestString.getBytes(); // over allocate an array byte overAllocated[] = new byte[bytes.length * 2]; // copy source System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length); // now allocate a TextBytes TextBytes textBytes = new TextBytes(); // set the overallocated buffer as the backing store textBytes.set(overAllocated, bytes.length, bytes.length); // convert it to string first String toString = textBytes.toString(); // validate equal to original Assert.assertTrue(aTestString.equals(toString)); // ok now write it to output buffer DataOutputBuffer outputBuffer = new DataOutputBuffer(); // write string try {//from w w w . java 2s .c o m textBytes.write(outputBuffer); // read length DataInputBuffer inputBuffer = new DataInputBuffer(); inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size()); int encodedLength = WritableUtils.readVInt(inputBuffer); // validate arrays match ... Assert.assertTrue(encodedLength == bytes.length); Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(), inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0); // ok reset input buffer again ... inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size()); // read in fields textBytes.readFields(inputBuffer); // ok see if we are not using the original backing store ... Assert.assertTrue(textBytes.getBytes() != overAllocated); // validate buffers match to original Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(), textBytes.getOffset(), textBytes.getLength()), 0); } catch (IOException e) { e.printStackTrace(); } }
From source file:org.commoncrawl.util.TimeSeriesDataFile.java
License:Open Source License
private void doCommonRead(ArrayList<KeyValueTuple<Long, ValueType>> valuesOut, RandomAccessFile file, long headerOffset, long endOfPrevRecord, int currentRecordLength, int recordsToRead, long optionalMinKeyValue) throws IOException { Buffer recordBuffer = new Buffer(); DataInputBuffer inputBuffer = new DataInputBuffer(); // ok start walking backwards ... while (recordsToRead != 0) { // setup new previous record pos pointer endOfPrevRecord = endOfPrevRecord - currentRecordLength - 4; // and seek to it endOfLastRecord - 4 file.seek(endOfPrevRecord - 4);//from w ww . j a v a 2 s . c o m recordBuffer.setCapacity(currentRecordLength + 8); // read in proper amount of data ... file.read(recordBuffer.get(), 0, currentRecordLength + 8); // ok initialize input buffer ... inputBuffer.reset(recordBuffer.get(), currentRecordLength + 8); // now read next record length first ... int nextRecordLength = inputBuffer.readInt(); // next read sync bytes ... int syncBytes = inputBuffer.readInt(); // validate if (syncBytes != SyncBytes) { throw new IOException("Corrupt Record Detected!"); } // ok read real record bytes ... int realRecordBytes = inputBuffer.readInt(); // read crc ... long crcValue = inputBuffer.readLong(); // ok validate crc ... crc.reset(); crc.update(inputBuffer.getData(), inputBuffer.getPosition(), realRecordBytes - 8); if (crcValue != crc.getValue()) { throw new IOException("CRC Mismatch!"); } // ok now read key and value try { long key = WritableUtils.readVLong(inputBuffer); if (optionalMinKeyValue != -1 && key < optionalMinKeyValue) { break; } ValueType value = (ValueType) valueClass.newInstance(); value.readFields(inputBuffer); KeyValueTuple tuple = new KeyValueTuple<Long, ValueType>(key, value); tuple.recordPos = endOfPrevRecord; valuesOut.add(0, tuple); } catch (Exception e) { throw new IOException(e); } currentRecordLength = nextRecordLength; recordsToRead--; if (endOfPrevRecord == headerOffset) break; } }