Example usage for org.apache.hadoop.io DataInputBuffer getPosition

Introduction

On this page you can find example usages of org.apache.hadoop.io.DataInputBuffer.getPosition().

Prototype

public int getPosition()

Source Link

Document

Returns the current position in the input.

Usage

From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License:Open Source License

/**
 * Merges the files named by {@code values} and emits one composite JSON record per merged
 * key group.
 *
 * <p>Each incoming {@link Text} value is interpreted as a path string. All paths are handed to a
 * {@link MultiFileInputReader} configured with {@link LinkKeyGroupingComparator}; every item it
 * returns is a key plus the raw records grouped under that key. Each raw record is decoded
 * (VInt length prefix followed by the payload bytes) and dispatched by the key's type component
 * to either the crawl-status or the link handler. Per-group state is kept in instance fields
 * ({@code summaryRecord}, {@code linkSummaryRecord}, {@code types}, {@code linkSources},
 * {@code outputKey*}) that are reset at the top of each group; they are presumably populated by
 * the two {@code update*} helpers, which are not visible in this block.
 *
 * @param key      partition key; not read by this method
 * @param values   path strings of the files to merge
 * @param output   receives (output key string, composite JSON string) pairs
 * @param reporter used for progress reporting and counters
 * @throws IOException if a record's JSON payload cannot be parsed or processed, or if the
 *                     underlying reader fails
 */
@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {

    if (_skipPartition) {
        return;
    }
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }

    FlexBuffer scanArray[] = LinkKey.allocateScanArray();

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_conf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
            RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
            WritableComparable.class);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
            incomingPaths, localMergeConfig);

    // scratch objects reused for every raw record to avoid per-record allocation
    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();

    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

        // reset the per-group accumulator fields
        summaryRecord = null;
        linkSummaryRecord = null;
        types.clear();
        linkSources = null;
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;

        int statusCount = 0;
        int linkCount = 0;

        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

        // pick up source fp from key ...
        URLFPV2 fpSource = new URLFPV2();

        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

        for (RawRecordValue rawValue : nextItem.e1) {

            // raw key and value are each a VInt length followed by that many bytes;
            // getPosition() after readVInt is the offset of the first payload byte
            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                statusCount++;

                try {
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            } else {
                linkCount++;
                // handle failures the same way as the crawl-status branch: log the offending
                // payload and surface the problem as an IOException
                try {
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    // ok this is a link ...
                    updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            }

            reporter.progress();
        }

        if (statusCount > 1) {
            reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
        }

        if (statusCount == 0 && linkCount != 0) {
            reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
        } else {
            if (statusCount >= 1 && linkCount >= 1) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
            } else if (statusCount >= 1 && linkCount == 0) {
                reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
            }
        }

        if (summaryRecord != null || linkSummaryRecord != null) {
            JsonObject compositeObject = new JsonObject();
            if (summaryRecord != null) {
                compositeObject.add("crawl_status", summaryRecord);
            }
            if (linkSummaryRecord != null) {
                // "sources" is only attached when at least one type/rel was collected
                if (types != null && types.size() != 0) {
                    stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                    if (linkSources != null) {
                        stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                    }
                }
                compositeObject.add("link_status", linkSummaryRecord);
            }

            // emit only when a valid output key was established for this group
            if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                if (outputKeyFromInternalLink) {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                } else {
                    reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                }
                output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
            } else {
                reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
            }
        }
    }
}

From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java

License:Open Source License

/**
 * Points {@code textOut} at the payload of a VInt-length-prefixed value held in
 * {@code dataBuffer}.
 *
 * @param dataBuffer  buffer whose valid bytes start with a VInt length followed by the payload
 * @param inputBuffer scratch reader; it is reset onto {@code dataBuffer}'s bytes by this call
 * @param textOut     receives the backing array, payload offset and payload length
 * @throws IOException if the length prefix cannot be read
 */
private static void rawValueToTextBytes(DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer,
        TextBytes textOut) throws IOException {
    final byte[] rawBytes = dataBuffer.getData();
    final int validLength = dataBuffer.getLength();
    inputBuffer.reset(rawBytes, validLength);

    // after the VInt is consumed the read position is the first payload byte
    final int payloadLength = WritableUtils.readVInt(inputBuffer);
    final int payloadOffset = inputBuffer.getPosition();

    textOut.set(inputBuffer.getData(), payloadOffset, payloadLength);
}

From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java

License:Open Source License

/**
 * Merges the files named by {@code values}, feeds every crawl-status record to
 * {@code updateCrawlStatsFromJSONObject}, and emits "to"/"from" link tuples for each merged
 * key group that produced discovered links.
 *
 * <p>Each incoming {@link Text} value is interpreted as a path string. Per-group state lives in
 * instance fields ({@code outlinks}, {@code discoveredLinks}, {@code sourceDomain},
 * {@code outputKey*}, {@code latestLinkDataTime}) that are reset at the top of each group; they
 * are presumably filled in by {@code updateCrawlStatsFromJSONObject}, which is not visible in
 * this block. Records whose key type is not crawl-status are skipped.
 *
 * @param key      partition key; not read by this method
 * @param values   path strings of the files to merge
 * @param output   receives (source domain, {"to": outlink}) and (outlink, {"from": source
 *                 domain}) pairs
 * @param reporter used for progress reporting and counters
 * @throws IOException if a crawl-status payload cannot be parsed or processed, or if the
 *                     underlying reader fails
 */
@Override
public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    FlexBuffer scanArray[] = LinkKey.allocateScanArray();

    while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
    }

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
            RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
            WritableComparable.class);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
            FileSystem.get(_jobConf), incomingPaths, localMergeConfig);

    // scratch objects reused for every raw record (and, further down, for emitted tuples)
    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer inputBuffer = new DataInputBuffer();

    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

    // pick up source fp from key ... (one instance, overwritten for each group)
    URLFPV2 fpSource = new URLFPV2();

    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

        // reset the per-group accumulator fields
        outputKeyString = null;
        outputKeyFromInternalLink = false;
        outputKeyURLObj = null;
        latestLinkDataTime = -1L;
        outlinks.clear();
        discoveredLinks.clear();

        // scan key components
        LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

        // setup fingerprint ...
        fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

        for (RawRecordValue rawValue : nextItem.e1) {

            // raw key and value are each a VInt length followed by that many bytes;
            // getPosition() after readVInt is the offset of the first payload byte
            inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
            int length = WritableUtils.readVInt(inputBuffer);
            keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            length = WritableUtils.readVInt(inputBuffer);
            valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

            long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

            if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                try {
                    JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                    if (object != null) {
                        updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                    }
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON:" + valueBytes.toString());
                    throw new IOException(e);
                }
            }
            reporter.progress();
        }
        // ok now see if we have anything to emit ...
        if (discoveredLinks.size() != 0) {
            reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);
            for (String outlink : outlinks) {
                // emit a to tuple
                toJsonObject.addProperty("to", outlink);
                valueBytes.set(toJsonObject.toString());
                output.collect(sourceDomain, valueBytes);
                // now emit a from tuple ...
                fromJsonObject.addProperty("from", sourceDomain.toString());
                keyBytes.set(outlink);
                valueBytes.set(fromJsonObject.toString());
                output.collect(keyBytes, valueBytes);
            }

            bloomKey.setDomainHash(fpSource.getDomainHash());

            for (long destDomainFP : discoveredLinks) {
                // set the bloom filter key ...
                bloomKey.setUrlHash(destDomainFP);
                // add it to the bloom filter
                emittedTuplesFilter.add(bloomKey);
            }
        } else {
            reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
        }
    }
}

From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java

License:Open Source License

/**
 * Command-line smoke test: collects the top-level-domain records for the root domain of a URL
 * into a temporary local sequence file, then logs the page rank and URL decoded from each
 * record key.
 *
 * <p>Each key is read as: a long, a float (logged as the page rank), a VInt length, and that
 * many bytes of URL text.
 *
 * @param args {@code args[0]} is the URL whose root domain is queried; {@code args[1]} is a
 *             shard id that is only logged
 */
public static void main(String[] args) {
    // fail with a usage message instead of an ArrayIndexOutOfBoundsException
    if (args == null || args.length < 2) {
        LOG.error("Usage: InverseLinksByDomainQuery <url> <shardId>");
        return;
    }

    // initialize ...
    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    LOG.info("URL:" + args[0] + " ShardId:" + args[1]);

    try {
        File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
        try {
            FileSystem fs = FileSystem.get(conf);
            FileSystem localFileSystem = FileSystem.getLocal(conf);

            URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
            if (fp != null) {
                // NOTE(review): 1282844121161L is a hard-coded argument whose meaning is
                // defined by collectAllTopLevelDomainRecordsByDomain (not visible here)
                collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(),
                        localFileSystem, new Path(tempFile.getAbsolutePath()));

                SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,
                        new Path(tempFile.getAbsolutePath()), conf);
                try {
                    FlexBuffer key = new FlexBuffer();
                    URLFPV2 src = new URLFPV2();
                    TextBytes url = new TextBytes();

                    DataInputBuffer inputBuffer = new DataInputBuffer();

                    while (reader.next(key, src)) {
                        inputBuffer.reset(key.get(), key.getOffset(), key.getCount());
                        // skip the leading long; its value is not needed for the log line
                        inputBuffer.readLong();
                        float pageRank = inputBuffer.readFloat();
                        // ok initialize text bytes ...
                        int textLen = WritableUtils.readVInt(inputBuffer);
                        // the position after the VInt is used directly as the offset of the
                        // URL bytes inside key.get()
                        url.set(key.get(), inputBuffer.getPosition(), textLen);
                        LOG.info("PR:" + pageRank + " URL:" + url.toString());
                    }
                } finally {
                    reader.close();
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        } finally {
            // do not leave the scratch file behind; fall back to delete-on-exit if the
            // immediate delete fails
            if (tempFile.exists() && !tempFile.delete()) {
                tempFile.deleteOnExit();
            }
        }
    } catch (IOException e) {
        // only reachable when the temp file itself could not be created
        LOG.error(CCStringUtils.stringifyException(e));
    }
}

From source file:org.commoncrawl.util.TextBytes.java

License:Open Source License

/**
 * Self-test for {@code TextBytes}: checks that a value backed by a slice of a larger array
 * converts back to the original string, serializes as a VInt length followed by the same bytes,
 * and, after {@code readFields}, no longer shares the caller-supplied backing array.
 *
 * @param args unused
 * @throws RuntimeException if serialization or deserialization fails with an I/O error
 */
public static void main(String[] args) {
    // run some tests on the new code
    String aTestString = "A Test Strnig";
    // convert it to bytes
    // NOTE(review): uses the platform default charset; harmless for this ASCII literal, but
    // confirm it matches the encoding TextBytes expects before adding non-ASCII cases
    byte bytes[] = aTestString.getBytes();
    // over allocate an array
    byte overAllocated[] = new byte[bytes.length * 2];
    // copy source into the second half so the TextBytes offset is non-zero
    System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
    // now allocate a TextBytes
    TextBytes textBytes = new TextBytes();
    // set the overallocated buffer as the backing store
    textBytes.set(overAllocated, bytes.length, bytes.length);
    // convert it to string first
    String toString = textBytes.toString();
    // validate equal to original
    Assert.assertTrue(aTestString.equals(toString));
    // ok now write it to output buffer
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    // write string
    try {
        textBytes.write(outputBuffer);
        // read length
        DataInputBuffer inputBuffer = new DataInputBuffer();
        inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
        int encodedLength = WritableUtils.readVInt(inputBuffer);
        // validate arrays match ...
        Assert.assertTrue(encodedLength == bytes.length);
        Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
                inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
        // ok reset input buffer again ...
        inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
        // read in fields
        textBytes.readFields(inputBuffer);
        // ok see if we are not using the original backing store ...
        Assert.assertTrue(textBytes.getBytes() != overAllocated);
        // validate buffers match to original
        Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
                textBytes.getOffset(), textBytes.getLength()), 0);

    } catch (IOException e) {
        // an I/O failure means the self-test did not complete; fail loudly instead of
        // printing a trace and returning as if everything passed
        throw new RuntimeException("TextBytes self-test failed with an I/O error", e);
    }

}

From source file:org.commoncrawl.util.TimeSeriesDataFile.java

License:Open Source License

/**
 * Walks the record file backwards from {@code endOfPrevRecord}, decoding up to
 * {@code recordsToRead} records and inserting each at the front of {@code valuesOut}.
 *
 * <p>For every step the method seeks to {@code endOfPrevRecord - currentRecordLength - 8} and
 * reads {@code currentRecordLength + 8} bytes, which are decoded in this order: an int that is
 * used as the record length for the next (earlier) step, an int sync marker, an int
 * {@code realRecordBytes}, a long CRC, and then the payload (a VLong key followed by the
 * serialized value). The CRC is checked over {@code realRecordBytes - 8} bytes starting right
 * after the CRC field.
 *
 * @param valuesOut           receives decoded tuples; each new tuple is inserted at index 0
 * @param file                file to read from; its file pointer is moved by this call
 * @param headerOffset        position at which the walk stops
 * @param endOfPrevRecord     starting position for the backwards walk
 * @param currentRecordLength length of the first record to read
 * @param recordsToRead       maximum number of records to decode; a negative value only
 *                            terminates via the other stop conditions
 * @param optionalMinKeyValue {@code -1} to disable; otherwise the walk stops at the first
 *                            record whose key is smaller than this value
 * @throws IOException on a short read, a sync-marker or CRC mismatch, or any failure while
 *                     instantiating or deserializing the value
 */
private void doCommonRead(ArrayList<KeyValueTuple<Long, ValueType>> valuesOut, RandomAccessFile file,
        long headerOffset, long endOfPrevRecord, int currentRecordLength, int recordsToRead,
        long optionalMinKeyValue) throws IOException {

    Buffer recordBuffer = new Buffer();
    DataInputBuffer inputBuffer = new DataInputBuffer();

    // ok start walking backwards ...
    while (recordsToRead != 0) {
        // setup new previous record pos pointer
        endOfPrevRecord = endOfPrevRecord - currentRecordLength - 4;
        // and seek to it endOfLastRecord - 4
        file.seek(endOfPrevRecord - 4);

        final int bytesToRead = currentRecordLength + 8;
        recordBuffer.setCapacity(bytesToRead);
        // read() may legally return fewer bytes than requested, which would leave stale
        // buffer contents to be parsed; readFully either fills the range or throws
        file.readFully(recordBuffer.get(), 0, bytesToRead);
        // ok initialize input buffer ...
        inputBuffer.reset(recordBuffer.get(), bytesToRead);
        // now read next record length first ...
        int nextRecordLength = inputBuffer.readInt();
        // next read sync bytes ...
        int syncBytes = inputBuffer.readInt();
        // validate
        if (syncBytes != SyncBytes) {
            throw new IOException("Corrupt Record Detected!");
        }
        // ok read real record bytes ...
        int realRecordBytes = inputBuffer.readInt();
        // read crc ...
        long crcValue = inputBuffer.readLong();
        // ok validate crc ... (realRecordBytes includes the 8-byte crc field just consumed)
        crc.reset();
        crc.update(inputBuffer.getData(), inputBuffer.getPosition(), realRecordBytes - 8);
        if (crcValue != crc.getValue()) {
            throw new IOException("CRC Mismatch!");
        }
        // ok now read key and value
        try {
            long key = WritableUtils.readVLong(inputBuffer);

            // stop as soon as a key falls below the requested minimum
            if (optionalMinKeyValue != -1 && key < optionalMinKeyValue) {
                break;
            }

            ValueType value = (ValueType) valueClass.newInstance();
            value.readFields(inputBuffer);
            KeyValueTuple<Long, ValueType> tuple = new KeyValueTuple<Long, ValueType>(key, value);
            tuple.recordPos = endOfPrevRecord;
            // records are visited newest-first, so inserting at the front keeps file order
            valuesOut.add(0, tuple);

        } catch (Exception e) {
            throw new IOException(e);
        }

        currentRecordLength = nextRecordLength;

        recordsToRead--;

        if (endOfPrevRecord == headerOffset)
            break;
    }
}