List of usage examples for org.apache.hadoop.io.DataInputBuffer.reset
public void reset(byte[] input, int start, int length)
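All of the examples below follow the same pattern: bytes are produced somewhere (typically into a DataOutputBuffer or taken as a raw byte[] slice), and reset(input, start, length) repositions a reusable DataInputBuffer over those bytes so Writables can be deserialized from them without copying. The following is a minimal, self-contained sketch of that round trip; it is not taken from any of the listed sources, and the class and variable names are illustrative only.

    import java.io.IOException;
    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    public class DataInputBufferResetExample {
        public static void main(String[] args) throws IOException {
            // serialize two writables into a growable output buffer
            DataOutputBuffer out = new DataOutputBuffer();
            new IntWritable(42).write(out);
            new Text("hello").write(out);

            // point the input buffer at the bytes that were just written
            DataInputBuffer in = new DataInputBuffer();
            in.reset(out.getData(), 0, out.getLength());

            // read the values back from the same backing array
            IntWritable number = new IntWritable();
            Text text = new Text();
            number.readFields(in);
            text.readFields(in);
            System.out.println(number.get() + " " + text);   // prints: 42 hello
        }
    }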
From source file: org.apache.tez.runtime.library.common.sort.impl.TestIFile.java
License: Apache License

    private void populateData(KVPair kvp, DataInputBuffer key, DataInputBuffer value) throws IOException {
        DataOutputBuffer k = new DataOutputBuffer();
        DataOutputBuffer v = new DataOutputBuffer();
        kvp.getKey().write(k);
        kvp.getvalue().write(v);
        key.reset(k.getData(), 0, k.getLength());
        value.reset(v.getData(), 0, v.getLength());
    }
From source file: org.apache.tez.runtime.library.common.TestValuesIterator.java
License: Apache License

    /**
     * Create in-memory segments.
     *
     * @return
     * @throws IOException
     */
    public List<TezMerger.Segment> createInMemStreams() throws IOException {
        int numberOfStreams = Math.max(2, rnd.nextInt(10));
        LOG.info("No of streams : " + numberOfStreams);

        SerializationFactory serializationFactory = new SerializationFactory(conf);
        Serializer keySerializer = serializationFactory.getSerializer(keyClass);
        Serializer valueSerializer = serializationFactory.getSerializer(valClass);

        LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
        InputContext context = createTezInputContext();
        MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null, null, null,
                null, 1024 * 1024 * 10, null, false, -1);

        DataOutputBuffer keyBuf = new DataOutputBuffer();
        DataOutputBuffer valBuf = new DataOutputBuffer();
        DataInputBuffer keyIn = new DataInputBuffer();
        DataInputBuffer valIn = new DataInputBuffer();
        keySerializer.open(keyBuf);
        valueSerializer.open(valBuf);

        List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
        for (int i = 0; i < numberOfStreams; i++) {
            BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
            InMemoryWriter writer = new InMemoryWriter(bout);
            Map<Writable, Writable> data = createData();
            // write data
            for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
                keySerializer.serialize(entry.getKey());
                valueSerializer.serialize(entry.getValue());
                keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
                valIn.reset(valBuf.getData(), 0, valBuf.getLength());
                writer.append(keyIn, valIn);
                originalData.put(entry.getKey(), entry.getValue());
                keyBuf.reset();
                valBuf.reset();
                keyIn.reset();
                valIn.reset();
            }
            IFile.Reader reader = new InMemoryReader(mergeManager, null, bout.getBuffer(), 0,
                    bout.getBuffer().length);
            segments.add(new TezMerger.Segment(reader, true));
            data.clear();
            writer.close();
        }
        return segments;
    }
From source file: org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.java
License: Apache License

    private void writePartition(int pos, WrappedBuffer wrappedBuffer, Writer writer, DataInputBuffer keyBuffer,
            DataInputBuffer valBuffer) throws IOException {
        while (pos != WrappedBuffer.PARTITION_ABSENT_POSITION) {
            int metaIndex = pos / INT_SIZE;
            int keyLength = wrappedBuffer.metaBuffer.get(metaIndex + INDEX_KEYLEN);
            int valLength = wrappedBuffer.metaBuffer.get(metaIndex + INDEX_VALLEN);
            keyBuffer.reset(wrappedBuffer.buffer, pos + META_SIZE, keyLength);
            valBuffer.reset(wrappedBuffer.buffer, pos + META_SIZE + keyLength, valLength);
            writer.append(keyBuffer, valBuffer);
            pos = wrappedBuffer.metaBuffer.get(metaIndex + INDEX_NEXT);
        }
    }
From source file: org.bgi.flexlab.gaea.data.mapreduce.partitioner.WindowsBasedBasicSort.java
License: Open Source License

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        WindowsBasedBasicWritable key1 = new WindowsBasedBasicWritable();
        WindowsBasedBasicWritable key2 = new WindowsBasedBasicWritable();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return key1.compareTo(key2);
    }
From source file: org.bgi.flexlab.gaea.data.mapreduce.partitioner.WindowsBasedComparator.java
License: Open Source License

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        WindowsBasedWritable key1 = new WindowsBasedWritable();
        WindowsBasedWritable key2 = new WindowsBasedWritable();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return compare(key1, key2);
    }
From source file: org.bgi.flexlab.gaea.data.mapreduce.partitioner.WindowsBasedSort.java
License: Open Source License

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        WindowsBasedWritable key1 = new WindowsBasedWritable();
        WindowsBasedWritable key2 = new WindowsBasedWritable();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return key1.compareTo(key2);
    }
From source file: org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License: Open Source License

    @Override
    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {
        if (_skipPartition)
            return;

        // collect all incoming paths first
        Vector<Path> incomingPaths = new Vector<Path>();
        while (values.hasNext()) {
            String path = values.next().toString();
            LOG.info("Found Incoming Path:" + path);
            incomingPaths.add(new Path(path));
        }

        FlexBuffer scanArray[] = LinkKey.allocateScanArray();

        // set up merge attributes
        Configuration localMergeConfig = new Configuration(_conf);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
                RawComparator.class);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
                WritableComparable.class);

        // ok now spawn merger
        MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
                incomingPaths, localMergeConfig);

        TextBytes keyBytes = new TextBytes();
        TextBytes valueBytes = new TextBytes();
        DataInputBuffer inputBuffer = new DataInputBuffer();

        int processedKeysCount = 0;

        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
            summaryRecord = null;
            linkSummaryRecord = null;
            types.clear();
            linkSources = null;
            outputKeyString = null;
            outputKeyFromInternalLink = false;
            outputKeyURLObj = null;

            int statusCount = 0;
            int linkCount = 0;

            // scan key components
            LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

            // pick up source fp from key ...
            URLFPV2 fpSource = new URLFPV2();
            fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
            fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
            fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

            for (RawRecordValue rawValue : nextItem.e1) {
                inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
                int length = WritableUtils.readVInt(inputBuffer);
                keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
                inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
                length = WritableUtils.readVInt(inputBuffer);
                valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

                long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

                if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                    statusCount++;
                    try {
                        JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                        if (object != null) {
                            updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                        }
                    } catch (Exception e) {
                        LOG.error("Error Parsing JSON:" + valueBytes.toString());
                        throw new IOException(e);
                    }
                } else {
                    linkCount++;
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    // ok this is a link ...
                    updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
                }
                reporter.progress();
            }

            if (statusCount > 1) {
                reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
            }

            if (statusCount == 0 && linkCount != 0) {
                reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
            } else {
                if (statusCount >= 1 && linkCount >= 1) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
                } else if (statusCount >= 1 && linkCount == 0) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
                }
            }

            if (summaryRecord != null || linkSummaryRecord != null) {
                JsonObject compositeObject = new JsonObject();
                if (summaryRecord != null) {
                    compositeObject.add("crawl_status", summaryRecord);
                }
                if (linkSummaryRecord != null) {
                    if (types != null && types.size() != 0) {
                        stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                        if (linkSources != null) {
                            stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                        }
                    }
                    compositeObject.add("link_status", linkSummaryRecord);
                }

                if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                    if (outputKeyFromInternalLink) {
                        reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                    } else {
                        reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                    }
                    output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
                } else {
                    reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
                }
            }
        }
    }
From source file: org.commoncrawl.mapred.pipelineV3.domainmeta.rank.LinkScannerStep.java
License: Open Source License

    @Override
    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {
        // collect all incoming paths first
        Vector<Path> incomingPaths = new Vector<Path>();

        FlexBuffer scanArray[] = LinkKey.allocateScanArray();

        while (values.hasNext()) {
            String path = values.next().toString();
            LOG.info("Found Incoming Path:" + path);
            incomingPaths.add(new Path(path));
        }

        // set up merge attributes
        Configuration localMergeConfig = new Configuration(_jobConf);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
                RawComparator.class);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
                WritableComparable.class);

        // ok now spawn merger
        MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
                FileSystem.get(_jobConf), incomingPaths, localMergeConfig);

        TextBytes keyBytes = new TextBytes();
        TextBytes valueBytes = new TextBytes();
        DataInputBuffer inputBuffer = new DataInputBuffer();
        TextBytes valueOut = new TextBytes();
        TextBytes keyOut = new TextBytes();

        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        // pick up source fp from key ...
        URLFPV2 fpSource = new URLFPV2();

        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
            outputKeyString = null;
            outputKeyFromInternalLink = false;
            outputKeyURLObj = null;
            latestLinkDataTime = -1L;
            outlinks.clear();
            discoveredLinks.clear();

            // scan key components
            LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

            // setup fingerprint ...
            fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
            fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
            fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

            for (RawRecordValue rawValue : nextItem.e1) {
                inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
                int length = WritableUtils.readVInt(inputBuffer);
                keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
                inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
                length = WritableUtils.readVInt(inputBuffer);
                valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

                long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

                if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                    try {
                        JsonObject object = parser.parse(valueBytes.toString()).getAsJsonObject();
                        if (object != null) {
                            updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                        }
                    } catch (Exception e) {
                        LOG.error("Error Parsing JSON:" + valueBytes.toString());
                        throw new IOException(e);
                    }
                }
                reporter.progress();
            }

            // ok now see if we have anything to emit ...
            if (discoveredLinks.size() != 0) {
                reporter.incrCounter(Counters.HAD_OUTLINK_DATA, 1);

                for (String outlink : outlinks) {
                    // emit a to tuple
                    toJsonObject.addProperty("to", outlink);
                    valueBytes.set(toJsonObject.toString());
                    output.collect(sourceDomain, valueBytes);
                    // now emit a from tuple ...
                    fromJsonObject.addProperty("from", sourceDomain.toString());
                    keyBytes.set(outlink);
                    valueBytes.set(fromJsonObject.toString());
                    output.collect(keyBytes, valueBytes);
                }

                bloomKey.setDomainHash(fpSource.getDomainHash());
                for (long destDomainFP : discoveredLinks) {
                    // set the bloom filter key ...
                    bloomKey.setUrlHash(destDomainFP);
                    // add it to the bloom filter
                    emittedTuplesFilter.add(bloomKey);
                }
            } else {
                reporter.incrCounter(Counters.HAD_NO_OUTLINK_DATA, 1);
            }
        }
    }
From source file: org.commoncrawl.service.crawler.CrawlerServer.java
License: Open Source License

    void refreshMasterCrawlerActiveHostList() {
        // ok if there is a master crawler, and it is online ...
        if (_masterCrawlerServiceChannel != null && _masterCrawlerServiceChannel.isOpen()) {
            try {
                _masterCrawlerStub.queryActiveHosts(new Callback<NullMessage, ActiveHostInfo>() {

                    @Override
                    public void requestComplete(AsyncRequest<NullMessage, ActiveHostInfo> request) {
                        if (request.getStatus() == Status.Success) {
                            // ok update timestamp no matter what
                            _pauseStateTimestampIncremental = request.getOutput().getPauseStateTimestamp();
                            // and clear set ...
                            _pausedHostsSet = null;

                            // now see if we have a valid response ...
                            if (request.getOutput().getActiveHostIds().getCount() != 0) {
                                LOG.info("Received New Active Host Set From Master Crawler At:"
                                        + _masterCrawlerAddress);
                                // ok we have a valid list of hosts ...
                                // create a reader stream
                                DataInputBuffer inputStream = new DataInputBuffer();
                                inputStream.reset(request.getOutput().getActiveHostIds().getReadOnlyBytes(), 0,
                                        request.getOutput().getActiveHostIds().getCount());

                                try {
                                    // create a set ...
                                    Set<Integer> ipAddressSet = new TreeSet<Integer>();
                                    // populate it
                                    int ipAddressCount = WritableUtils.readVInt(inputStream);
                                    for (int i = 0; i < ipAddressCount; ++i) {
                                        ipAddressSet.add(WritableUtils.readVInt(inputStream));
                                    }
                                    LOG.info("Successfully updated Active Host Set");
                                    // ok replace set ...
                                    _pausedHostsSet = ipAddressSet;
                                } catch (IOException e) {
                                    LOG.error(CCStringUtils.stringifyException(e));
                                }
                            }
                        }
                    }
                });
            } catch (RPCException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }

        // ok no matter what... check to see if we need to set up refresh timer ...
        if (_masterCrawlerHostListRefreshTimer == null) {
            _masterCrawlerHostListRefreshTimer = new Timer(ACTIVE_HOST_LIST_REFRESH_INTERVAL_CLIENT, true,
                    new Timer.Callback() {

                        @Override
                        public void timerFired(Timer timer) {
                            // call refresh again ...
                            refreshMasterCrawlerActiveHostList();
                        }
                    });
            _eventLoop.setTimer(_masterCrawlerHostListRefreshTimer);
        }
    }
From source file: org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License: Open Source License

    private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {
        SequenceFile.Reader reader = null;
        Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
        Path indexFilePath = new Path(mapFilePath, "index");
        Path dataFilePath = new Path(mapFilePath, "data");
        File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

        SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
                CrawlEnvironment.getHadoopConfig());

        ValueBytes valueBytes = indexReader.createValueBytes();
        DataOutputBuffer keyBytes = new DataOutputBuffer();
        DataInputBuffer keyBuffer = new DataInputBuffer();
        DataOutputBuffer finalOutputStream = new DataOutputBuffer();
        DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
        URLFP fp = new URLFP();

        try {
            while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
                keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
                // read fingerprint ...
                fp.readFields(keyBuffer);
                // write hash only
                finalOutputStream.writeLong(fp.getUrlHash());
                uncompressedValueBytes.reset();
                // write value bytes to intermediate buffer ...
                valueBytes.writeUncompressedBytes(uncompressedValueBytes);
                // write out uncompressed length
                WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
                // write out bytes
                finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
            }
            // delete existing ...
            cacheFilePath.delete();
            // compute crc ...
            CRC32 crc = new CRC32();
            crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
            // open final output stream
            DataOutputStream fileOutputStream = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

            try {
                fileOutputStream.writeLong(crc.getValue());
                fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
                fileOutputStream.flush();
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                fileOutputStream.close();
                fileOutputStream = null;
                cacheFilePath.delete();
                throw e;
            } finally {
                if (fileOutputStream != null) {
                    fileOutputStream.close();
                }
            }
        } finally {
            if (indexReader != null) {
                indexReader.close();
            }
        }
    }