Example usage for org.apache.hadoop.record Buffer get

List of usage examples for org.apache.hadoop.record Buffer get

Introduction

This page lists example usages of the get() method of org.apache.hadoop.record.Buffer.

Prototype

public byte[] get() 

Source Link

Document

Get the data from the Buffer.

Usage

From source file:com.jfolson.hive.serde.RTypedBytesInput.java

License:Apache License

/**
 * Reads the raw serialized form of a <code>Type.VECTOR</code> value.
 *
 * The returned bytes consist of the vector type code, the element count as a
 * 4-byte big-endian integer, and the raw bytes of every element in order.
 *
 * @return the obtained bytes sequence
 * @throws IOException
 */
public byte[] readRawVector() throws IOException {
    int count = readVectorHeader();
    // Re-emit the header: type code followed by the big-endian element count.
    byte[] header = new byte[5];
    header[0] = (byte) RType.VECTOR.code;
    header[1] = (byte) (0xff & (count >> 24));
    header[2] = (byte) (0xff & (count >> 16));
    header[3] = (byte) (0xff & (count >> 8));
    header[4] = (byte) (0xff & count);
    Buffer raw = new Buffer();
    raw.append(header);
    // Append the raw bytes of each element.
    for (int idx = 0; idx < count; idx++) {
        raw.append(readRaw());
    }
    return raw.get();
}

From source file:com.jfolson.hive.serde.RTypedBytesInput.java

License:Apache License

/**
 * Reads the raw serialized form of a <code>Type.LIST</code> value.
 *
 * Collects the list type code, every element's raw bytes until the end of the
 * list is reached, and the terminating marker code.
 *
 * @return the obtained bytes sequence
 * @throws IOException
 */
public byte[] readRawList() throws IOException {
    // Start with the list type code, then append each element's raw bytes
    // until readRaw() signals end-of-list by returning null.
    Buffer raw = new Buffer(new byte[] { (byte) RType.LIST.code });
    for (byte[] element = readRaw(); element != null; element = readRaw()) {
        raw.append(element);
    }
    // Terminate the sequence with the end-of-list marker code.
    raw.append(new byte[] { (byte) RType.MARKER.code });
    return raw.get();
}

From source file:com.jfolson.hive.serde.RTypedBytesInput.java

License:Apache License

/**
 * Reads the raw bytes following a <code>Type.MAP</code> code.
 * /*  www. j  a  va 2  s  .  c  om*/
 * @return the obtained bytes sequence
 * @throws IOException
 */
public byte[] readRawMap() throws IOException {
    Buffer buffer = new Buffer();
    int length = readMapHeader();
    buffer.append(new byte[] { (byte) RType.MAP.code, (byte) (0xff & (length >> 24)),
            (byte) (0xff & (length >> 16)), (byte) (0xff & (length >> 8)), (byte) (0xff & length) });
    for (int i = 0; i < length; i++) {
        buffer.append(readRaw());
        buffer.append(readRaw());
    }
    return buffer.get();
}

From source file:com.jfolson.hive.serde.RTypedBytesRecordOutput.java

License:Apache License

/**
 * Writes the backing bytes of the given {@link Buffer} to the output.
 *
 * @param buf the buffer whose data is written
 * @param tag record tag (not used by this implementation)
 * @throws IOException if the underlying output fails
 */
public void writeBuffer(Buffer buf, String tag) throws IOException {
    byte[] data = buf.get();
    out.writeBytes(data);
}

From source file:org.commoncrawl.service.crawler.CrawlList.java

License:Open Source License

/**
 * Handles completion of a successful (non-IO-error) fetch for the given
 * crawl target: updates per-request bookkeeping and per-host/per-domain HTTP
 * result counters, caches DNS information carried on the target, and — when
 * the target is the robots.txt URL — schedules an asynchronous robots-file
 * parse before transitioning this list's {@code Disposition}. For non-robots
 * targets it recomputes the disposition from the pending queue and notifies
 * the host when the disposition changed.
 *
 * @param target        the crawl target whose fetch completed
 * @param downloadTime  download duration, recorded for stats
 * @param httpHeaders   response headers (result code, Content-Encoding)
 * @param contentBuffer raw response content (consumed by the robots parse)
 */
void fetchSucceeded(final CrawlTarget target, int downloadTime, final NIOHttpHeaders httpHeaders,
        final Buffer contentBuffer) {

    // reset per-request state for this completed fetch
    _lastRequestWasIOException = false;
    _lastRequestDownloadTime = downloadTime;
    _lastRequestRedirectCount = target.getRedirectCount();
    _fetchEndTime = System.currentTimeMillis();

    _activeConnection = null;

    getHost().incrementCounter(CrawlListHost.CounterId.SuccessfullGetCount, 1);

    // reset host's io error count
    _host.resetCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount);

    if (getActiveDomain() != null)
        getActiveDomain()._domainRetryCounter = 0;

    // remember the disposition on entry so we can detect a transition at the end
    Disposition oldDisposition = _disposition;

    final String originalHost = URLUtils.fastGetHostFromURL(target.getOriginalURL());
    final String activeHost = URLUtils.fastGetHostFromURL(target.getActiveURL());

    if (originalHost != null && activeHost != null) {
        // update our server ip information from information contained within crawl target ...
        cacheDNSEntry(activeHost, target.getServerIP(), target.getServerIPTTL());
        // if the target was redirected ... cache the original ip address and ttl as well ... 
        if (target.isRedirected()) {
            if (target.getOriginalRequestData() != null) {
                cacheDNSEntry(originalHost, target.getOriginalRequestData()._serverIP,
                        target.getOriginalRequestData()._serverIPTTL);
            }
        }
    }

    final int resultCode = NIOHttpConnection.getHttpResponseCode(httpHeaders);

    // bucket the result code into success (200), client error (4xx),
    // or server error (5xx) stats
    if (resultCode == 200) {

        getHost().incrementCounter(CrawlListHost.CounterId.Http200Count, 1);

        if (getActiveDomain() != null) {
            getActiveDomain()._HTTP200Count++;
            getActiveDomain()._SequentialHTTPFailuresCount = 0;
        }

        // validate www rewrite rule if not set and target was redirected ... 
        if (target.isRedirected()) {
            /* this is broken for the new list design
            if (!originalHost.equalsIgnoreCase(activeHost)) { 
              // if redirect strips the www then ... 
              if ((originalHost.startsWith("www.") || originalHost.startsWith("WWW.")) && activeHost.equalsIgnoreCase(originalHost.substring(4))) { 
                addWWWReWriteItem(originalHost,WWWRULE_Remove);
              }
              // else if redirect adds the www then ...
              else if ((activeHost.startsWith("www.") || activeHost.startsWith("WWW.")) && originalHost.equalsIgnoreCase(activeHost.substring(4))) {
                addWWWReWriteItem(originalHost,WWWRULE_Add);
              }
            }
            */
        }
    } else if (resultCode >= 400 && resultCode < 500) {
        if (resultCode == 403) {
            // inform host for stats tracking purposes 
            _host.incrementCounter(CrawlListHost.CounterId.Http403Count, 1);
        }
        if (getActiveDomain() != null)
            getActiveDomain()._SequentialHTTPFailuresCount++;
    } else if (resultCode >= 500 && resultCode < 600) {
        if (getActiveDomain() != null) {
            getActiveDomain()._SequentialHTTPFailuresCount++;
        }
    }

    // sanity check: the completed target should be the one we scheduled;
    // if not, log the mismatch and do no further state transitions
    if (_scheduled != target) {
        if (_scheduled == null)
            LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName()
                    + " fetchSucceed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!");
        else
            LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName()
                    + " fetchSucceed Target is:" + target.getOriginalURL() + " " + target.toString()
                    + " ActiveTarget is:" + _scheduled.getOriginalURL() + " " + _scheduled.toString());
    } else {

        // clear active ... 
        _scheduled = null;

        // if this is the robots target ... 
        // NOTE(review): this masks the flags with IsRobotsURL but compares the
        // result to 1 — correct only if IsRobotsURL is bit 0; confirm against
        // CrawlURL.Flags.
        if ((target.getFlags() & CrawlURL.Flags.IsRobotsURL) == 1) {

            final CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats();

            // process the robots data if any ... 
            // check for null queue (in case of unit test);
            if (resultCode == 200) {

                _robotsRetrieved = true;

                synchronized (crawlerStats) {
                    crawlerStats.setRobotsRequestsSucceeded(crawlerStats.getRobotsRequestsSucceeded() + 1);
                    crawlerStats.setRobotsRequestsQueuedForParse(
                            crawlerStats.getRobotsRequestsQueuedForParse() + 1);
                }

                LOG.info("### Scheduling Robots Parse for:" + target.getActiveURL());

                // transition to a waiting on completion disposition ... 
                _disposition = Disposition.WaitingOnCompletion;

                if (getServerSingleton() != null) {
                    // schedule a robots parser parse attempt ... 
                    // the Callable decodes (optionally gunzips) the content and
                    // parses the robots rules off the event loop; the callback
                    // runs afterwards and updates list state
                    getServerSingleton().registerThreadPool("robots", 5)
                            .execute(new ConcurrentTask<RobotRuleResult>(getServerSingleton().getEventLoop(),

                                    new Callable<RobotRuleResult>() {

                                        public RobotRuleResult call() throws Exception {

                                            try {

                                                TextBytes contentData = new TextBytes(contentBuffer.get());

                                                String contentEncoding = httpHeaders
                                                        .findValue("Content-Encoding");

                                                // transparently decompress gzip-encoded robots content
                                                if (contentEncoding != null
                                                        && contentEncoding.equalsIgnoreCase("gzip")) {

                                                    if (Environment.detailLogEnabled())
                                                        LOG.info("GZIP Encoding Detected for Robots File For:"
                                                                + activeHost);

                                                    UnzipResult result = GZIPUtils.unzipBestEffort(
                                                            contentData.getBytes(),
                                                            CrawlEnvironment.CONTENT_SIZE_LIMIT);

                                                    if (result == null) {
                                                        // decode failed — treat content as absent
                                                        contentData = null;
                                                        if (Environment.detailLogEnabled())
                                                            LOG.info(
                                                                    "GZIP Decoder returned NULL for Robots File For:"
                                                                            + activeHost);
                                                    } else {
                                                        contentData.set(result.data.get(),
                                                                result.data.getOffset(),
                                                                result.data.getCount());
                                                    }
                                                }

                                                try {
                                                    if (contentData != null) {
                                                        String robotsTxt = contentData.toString().trim()
                                                                .toLowerCase();
                                                        // reject HTML masquerading as robots.txt
                                                        if (robotsTxt.startsWith("<html")
                                                                || robotsTxt.startsWith("<!doctype html")) {
                                                            contentData = null;

                                                            CrawlerServer.getEngine().logRobots(
                                                                    System.currentTimeMillis(), _robotsHostName,
                                                                    resultCode, null,
                                                                    CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                    CrawlerEngine.RobotsParseFlag_ContentWasHTML);
                                                        } else {
                                                            CrawlerServer.getEngine().logRobots(
                                                                    System.currentTimeMillis(), _robotsHostName,
                                                                    resultCode, robotsTxt,
                                                                    CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                    0);

                                                            synchronized (this) {
                                                                _lastFetchedRobotsData = robotsTxt;
                                                                _lastFetchedRobotsHostName = _robotsHostName;
                                                            }
                                                        }
                                                    } else {
                                                        CrawlerServer.getEngine().logRobots(
                                                                System.currentTimeMillis(), _robotsHostName,
                                                                resultCode, null,
                                                                CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                CrawlerEngine.RobotsParseFlag_ContentDecodeFailed);
                                                    }
                                                } catch (Exception e) {
                                                    LOG.error(CCStringUtils.stringifyException(e));
                                                    CrawlerServer.getEngine().logRobots(
                                                            System.currentTimeMillis(), _robotsHostName,
                                                            resultCode, null,
                                                            CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                            CrawlerEngine.RobotsParseFlag_ContentDecodeFailed);
                                                }

                                                if (Environment.detailLogEnabled())
                                                    LOG.info("Parsing Robots File for Host:" + activeHost);
                                                RobotRuleResult result = new RobotRuleResult();

                                                if (contentData != null) {
                                                    // compute a CRC of the robots content for change detection
                                                    synchronized (_crc32) {
                                                        _crc32.reset();
                                                        _crc32.update(contentData.getBytes(),
                                                                contentData.getOffset(),
                                                                contentData.getLength());
                                                        result.crcValue = _crc32.getValue();
                                                    }
                                                    RobotRulesParser parser = new RobotRulesParser(
                                                            getServerSingleton().getConfig());
                                                    result.ruleSet = parser.parseRules(contentData.getBytes(),
                                                            contentData.getOffset(), contentData.getLength());
                                                } else {
                                                    // no usable content — fall back to empty rules
                                                    result.ruleSet = RobotRulesParser.getEmptyRules();
                                                    result.crcValue = 0;
                                                }
                                                return result;
                                            } catch (Exception e) {
                                                LOG.error(CCStringUtils.stringifyException(e));
                                                throw e;
                                            }
                                        }
                                    },

                                    new ConcurrentTask.CompletionCallback<RobotRuleResult>() {

                                        public void taskComplete(RobotRuleResult loadResult) {

                                            synchronized (crawlerStats) {
                                                crawlerStats.setRobotsRequestsQueuedForParse(
                                                        crawlerStats.getRobotsRequestsQueuedForParse() - 1);
                                            }

                                            if (loadResult != null) {

                                                boolean disallowsAll = !_ruleSet.isAllowed("/");
                                                boolean robotsHadCrawlDelay = _ruleSet.getCrawlDelay() != -1;
                                                boolean explicitMention = _ruleSet.explicitMention;

                                                int logFlags = 0;
                                                if (disallowsAll)
                                                    logFlags |= CrawlerEngine.RobotsParseFlag_ExcludesAll;
                                                if (explicitMention)
                                                    logFlags |= CrawlerEngine.RobotsParseFlag_ExplicitMention;
                                                if (robotsHadCrawlDelay)
                                                    logFlags |= CrawlerEngine.RobotsParseFlag_HasCrawlDelay;

                                                synchronized (crawlerStats) {
                                                    crawlerStats.setRobotsRequestsSuccessfullParse(
                                                            crawlerStats.getRobotsRequestsSuccessfullParse()
                                                                    + 1);
                                                    if (disallowsAll) {
                                                        crawlerStats.setRobotsFileExcludesAllContent(
                                                                crawlerStats.getRobotsFileExcludesAllContent()
                                                                        + 1);
                                                        if (explicitMention)
                                                            crawlerStats.setRobotsFileExplicitlyExcludesAll(
                                                                    crawlerStats
                                                                            .getRobotsFileExplicitlyExcludesAll()
                                                                            + 1);
                                                    }
                                                    if (explicitMention) {
                                                        crawlerStats.setRobotsFileHasExplicitMention(
                                                                crawlerStats.getRobotsFileHasExplicitMention()
                                                                        + 1);
                                                    }
                                                    if (robotsHadCrawlDelay) {
                                                        crawlerStats.setRobotsFileHadCrawlDelay(
                                                                crawlerStats.getRobotsFileHadCrawlDelay() + 1);
                                                    }
                                                }

                                                CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),
                                                        _robotsHostName, 0, null,
                                                        CrawlerEngine.RobotsLogEventType.Parse_Succeeded,
                                                        logFlags);

                                                // adopt the freshly parsed rules and cache them on the host
                                                _ruleSet = loadResult.ruleSet;
                                                _robotsCRC = loadResult.crcValue;

                                                _host.cacheRobotsFile(_ruleSet, _robotsCRC);
                                            } else {

                                                CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),
                                                        _robotsHostName, 0, null,
                                                        CrawlerEngine.RobotsLogEventType.Parse_Failed, 0);

                                                synchronized (crawlerStats) {
                                                    crawlerStats.setRobotsRequestsFailedParse(
                                                            crawlerStats.getRobotsRequestsFailedParse() + 1);
                                                }

                                                // LOG.error("####Robots parsing for host:" + activeHost + " failed.");
                                                _ruleSet = RobotRulesParser.getEmptyRules();
                                                _robotsCRC = 0;
                                            }

                                            //if (Environment.detailLogEnabled())
                                            LOG.info("####Robots RETRIEVED for Host:" + activeHost
                                                    + " CrawlDelay IS:" + getCrawlDelay(false));

                                            if (originalHost != null && activeHost != null) {
                                                updateRobotsCRCForDomain(_robotsCRC, originalHost,
                                                        _robotsReturned400, _robotsReturned403);
                                                if (activeHost.compareToIgnoreCase(originalHost) != 0) {
                                                    updateRobotsCRCForDomain(_robotsCRC, activeHost,
                                                            _robotsReturned400, _robotsReturned403);
                                                }
                                            }

                                            // recompute disposition now that the robots fetch+parse is done
                                            Disposition oldDisposition = _disposition;

                                            if (getNextPending(false) != null) {
                                                _disposition = Disposition.ItemAvailable;
                                            } else {
                                                _disposition = Disposition.WaitingOnTime;
                                            }

                                            if (oldDisposition != _disposition) {
                                                // notify queue 
                                                getHost().listDispositionChanged(CrawlList.this, oldDisposition,
                                                        _disposition);
                                            }
                                        }

                                        public void taskFailed(Exception e) {
                                            if (Environment.detailLogEnabled())
                                                LOG.error("####Robots parsing for host:" + _robotsHostName
                                                        + " failed with exception" + e);
                                            // parse failed — proceed with empty rules
                                            _ruleSet = RobotRulesParser.getEmptyRules();

                                            Disposition oldDisposition = _disposition;

                                            if (getNextPending(false) != null) {
                                                _disposition = Disposition.ItemAvailable;
                                            } else {
                                                _disposition = Disposition.WaitingOnTime;
                                            }

                                            if (oldDisposition != _disposition) {
                                                // notify queue 
                                                getHost().listDispositionChanged(CrawlList.this, oldDisposition,
                                                        _disposition);
                                            }
                                        }
                                    }));
                }
                // explicitly return here (in order to wait for the async completion event)
                return;
            }
            //otherwise ... robots fetch did not return 200
            else {

                synchronized (crawlerStats) {
                    crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1);
                }

                CrawlerServer.getEngine().logRobots(System.currentTimeMillis(), _robotsHostName, resultCode,
                        null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed, 0);

                _robotsCRC = 0;
                if (Environment.detailLogEnabled())
                    LOG.info("####Robots GET for Host:" + activeHost + "FAILED With Result Code:" + resultCode);
                //TODO: MAKE THIS MORE ROBUST ... 
                // clear robots flag ... 
                _robotsRetrieved = true;
                // see if result code was a 403 
                if (resultCode >= 400 && resultCode <= 499) {
                    _robotsReturned400 = true;
                    if (resultCode == 403)
                        _robotsReturned403 = true;

                }
                // for now, assume no robots rules for any error conditions ... 
                _ruleSet = RobotRulesParser.getEmptyRules();

                if (originalHost != null && activeHost != null) {
                    updateRobotsCRCForDomain(_robotsCRC, originalHost, _robotsReturned400, _robotsReturned403);
                    if (activeHost.compareToIgnoreCase(originalHost) != 0) {
                        updateRobotsCRCForDomain(_robotsCRC, activeHost, _robotsReturned400,
                                _robotsReturned403);
                    }
                }

            }
        }

        if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) {
            // update active host stats and check for failure ... 
            checkActiveHostStatsForFailure();
        }

        // if there are no more items in the queue 
        if (getNextPending(false) == null) {
            // if offline count is zero then mark this domain's queue as empty
            if (_offlineTargetCount == 0) {
                _disposition = Disposition.QueueEmpty;
            }
            // otherwise put us in a wait state and potentially queue up a disk load 
            else {
                _disposition = Disposition.WaitingOnTime;
                // potentially queue up a disk load 
                potentiallyQueueDiskLoad();
            }
        } else {
            // if we are ready to fetch the next item ... 
            if (calculateNextWaitTime() < System.currentTimeMillis()) {
                _disposition = Disposition.ItemAvailable;
            } else {
                // transition to a new wait state ... 
                _disposition = Disposition.WaitingOnTime;
            }
        }

        if (oldDisposition != _disposition) {
            // either way ... notify queue 
            getHost().listDispositionChanged(this, oldDisposition, _disposition);
        }
    }
}

From source file:org.commoncrawl.util.TimeSeriesDataFile.java

License:Open Source License

/**
 * Walks fixed-format records backwards from {@code endOfPrevRecord}, reading
 * up to {@code recordsToRead} (key, value) tuples into {@code valuesOut}.
 * Tuples are inserted at index 0, so the output list ends up in forward
 * (oldest-first) order. Stops early when a key falls below
 * {@code optionalMinKeyValue} (when != -1) or when {@code headerOffset} is
 * reached.
 *
 * @param valuesOut           receives the decoded tuples (prepended)
 * @param file                the data file, positioned via absolute seeks
 * @param headerOffset        file offset of the header; walking stops here
 * @param endOfPrevRecord     offset just past the record to read first
 * @param currentRecordLength length in bytes of the record to read first
 * @param recordsToRead       maximum number of records to read
 * @param optionalMinKeyValue minimum key to accept, or -1 for no minimum
 * @throws IOException on short read, sync-byte mismatch, CRC mismatch, or
 *                     value reflection/deserialization failure
 */
private void doCommonRead(ArrayList<KeyValueTuple<Long, ValueType>> valuesOut, RandomAccessFile file,
        long headerOffset, long endOfPrevRecord, int currentRecordLength, int recordsToRead,
        long optionalMinKeyValue) throws IOException {

    Buffer recordBuffer = new Buffer();
    DataInputBuffer inputBuffer = new DataInputBuffer();

    // ok start walking backwards ... 
    while (recordsToRead != 0) {
        // setup new previous record pos pointer  
        endOfPrevRecord = endOfPrevRecord - currentRecordLength - 4;
        // and seek 4 bytes earlier still, to pick up the preceding record's
        // trailing next-record-length field
        file.seek(endOfPrevRecord - 4);

        recordBuffer.setCapacity(currentRecordLength + 8);
        // read in proper amount of data ...
        // BUGFIX: use readFully instead of read — read()'s return value was
        // ignored, so a short read would silently leave the buffer partially
        // filled and corrupt the parse; readFully fills completely or throws.
        file.readFully(recordBuffer.get(), 0, currentRecordLength + 8);
        // ok initialize input buffer ... 
        inputBuffer.reset(recordBuffer.get(), currentRecordLength + 8);
        // now read next record length first ... 
        int nextRecordLength = inputBuffer.readInt();
        // next read sync bytes ... 
        int syncBytes = inputBuffer.readInt();
        // validate 
        if (syncBytes != SyncBytes) {
            throw new IOException("Corrupt Record Detected!");
        }
        // ok read real record bytes ... 
        int realRecordBytes = inputBuffer.readInt();
        // read crc ... 
        long crcValue = inputBuffer.readLong();
        // ok validate crc over the payload (record bytes minus the crc field)
        crc.reset();
        crc.update(inputBuffer.getData(), inputBuffer.getPosition(), realRecordBytes - 8);
        if (crcValue != crc.getValue()) {
            throw new IOException("CRC Mismatch!");
        }
        // ok now read key and value 
        try {
            long key = WritableUtils.readVLong(inputBuffer);

            // stop once we walk past the caller's minimum key (records are
            // key-ordered, so everything earlier is also below the minimum)
            if (optionalMinKeyValue != -1 && key < optionalMinKeyValue) {
                break;
            }

            ValueType value = (ValueType) valueClass.newInstance();
            value.readFields(inputBuffer);
            // parameterized (was a raw KeyValueTuple) to avoid unchecked warnings
            KeyValueTuple<Long, ValueType> tuple = new KeyValueTuple<Long, ValueType>(key, value);
            tuple.recordPos = endOfPrevRecord;
            // prepend so valuesOut stays in forward order
            valuesOut.add(0, tuple);

        } catch (Exception e) {
            throw new IOException(e);
        }

        currentRecordLength = nextRecordLength;

        recordsToRead--;

        // reached the file header — nothing earlier to read
        if (endOfPrevRecord == headerOffset)
            break;
    }
}