Usage examples for `org.apache.hadoop.record.Buffer.get()`.
Method signature: `public byte[] get()`
From source file:com.jfolson.hive.serde.RTypedBytesInput.java
License:Apache License
/**
 * Reads the raw bytes following a <code>Type.VECTOR</code> code.
 *
 * @return the obtained bytes sequence
 * @throws IOException if reading from the underlying stream fails
 */
public byte[] readRawVector() throws IOException {
    int length = readVectorHeader();
    Buffer buffer = new Buffer();
    // Re-emit the vector type code followed by the element count as a
    // big-endian 32-bit integer, mirroring the on-the-wire layout.
    buffer.append(new byte[] {
        (byte) RType.VECTOR.code,
        (byte) (0xff & (length >> 24)),
        (byte) (0xff & (length >> 16)),
        (byte) (0xff & (length >> 8)),
        (byte) (0xff & length)
    });
    // Append each element's raw encoding verbatim.
    for (int element = 0; element < length; element++) {
        buffer.append(readRaw());
    }
    return buffer.get();
}
From source file:com.jfolson.hive.serde.RTypedBytesInput.java
License:Apache License
/**
 * Reads the raw bytes following a <code>Type.LIST</code> code.
 *
 * @return the obtained bytes sequence
 * @throws IOException if reading from the underlying stream fails
 */
public byte[] readRawList() throws IOException {
    // Start with the list type code already in place.
    Buffer buffer = new Buffer(new byte[] { (byte) RType.LIST.code });
    // Consume raw items until readRaw() signals end-of-list by returning null.
    for (byte[] item = readRaw(); item != null; item = readRaw()) {
        buffer.append(item);
    }
    // Terminate the encoding with the end-of-list marker code.
    buffer.append(new byte[] { (byte) RType.MARKER.code });
    return buffer.get();
}
From source file:com.jfolson.hive.serde.RTypedBytesInput.java
License:Apache License
/** * Reads the raw bytes following a <code>Type.MAP</code> code. * /* www. j a va 2 s . c om*/ * @return the obtained bytes sequence * @throws IOException */ public byte[] readRawMap() throws IOException { Buffer buffer = new Buffer(); int length = readMapHeader(); buffer.append(new byte[] { (byte) RType.MAP.code, (byte) (0xff & (length >> 24)), (byte) (0xff & (length >> 16)), (byte) (0xff & (length >> 8)), (byte) (0xff & length) }); for (int i = 0; i < length; i++) { buffer.append(readRaw()); buffer.append(readRaw()); } return buffer.get(); }
From source file:com.jfolson.hive.serde.RTypedBytesRecordOutput.java
License:Apache License
/**
 * Writes the contents of {@code buf} as raw bytes.
 *
 * @param buf buffer whose backing bytes are written
 * @param tag record tag (unused by this implementation)
 * @throws IOException if the underlying output fails
 */
public void writeBuffer(Buffer buf, String tag) throws IOException {
    byte[] raw = buf.get();
    out.writeBytes(raw);
}
From source file:org.commoncrawl.service.crawler.CrawlList.java
License:Open Source License
/*
 * NOTE(review): documentation-only change — the code below is byte-identical to the
 * original (a scrape-artifact comment was removed); the original's collapsed line
 * formatting from the scrape is preserved.
 *
 * Handles a successful fetch of a crawl target: records timing/redirect state,
 * updates host and domain counters, caches DNS entries from the target, classifies
 * the HTTP result code (200 / 4xx / 5xx), and — when the target is the robots.txt
 * URL — schedules an asynchronous robots-file parse whose completion callback
 * installs the parsed rule set, updates robots CRC bookkeeping, and recomputes the
 * list disposition. For non-robots targets it recomputes the disposition
 * (ItemAvailable / WaitingOnTime / QueueEmpty) and notifies the host if it changed.
 */
/** fetch succeeded **/ void fetchSucceeded(final CrawlTarget target, int downloadTime, final NIOHttpHeaders httpHeaders, final Buffer contentBuffer) { _lastRequestWasIOException = false; _lastRequestDownloadTime = downloadTime; _lastRequestRedirectCount = target.getRedirectCount(); _fetchEndTime = System.currentTimeMillis(); _activeConnection = null; getHost().incrementCounter(CrawlListHost.CounterId.SuccessfullGetCount, 1); // reset host's io error count _host.resetCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount); if (getActiveDomain() != null) getActiveDomain()._domainRetryCounter = 0; Disposition oldDisposition = _disposition; final String originalHost = URLUtils.fastGetHostFromURL(target.getOriginalURL()); final String activeHost = URLUtils.fastGetHostFromURL(target.getActiveURL()); if (originalHost != null && activeHost != null) { // update our server ip information from information contained within crawl target ... cacheDNSEntry(activeHost, target.getServerIP(), target.getServerIPTTL()); // if the target was redirected ... cache the original ip address and ttl as well ... if (target.isRedirected()) { if (target.getOriginalRequestData() != null) { cacheDNSEntry(originalHost, target.getOriginalRequestData()._serverIP, target.getOriginalRequestData()._serverIPTTL); } } } final int resultCode = NIOHttpConnection.getHttpResponseCode(httpHeaders); if (resultCode == 200) { getHost().incrementCounter(CrawlListHost.CounterId.Http200Count, 1); if (getActiveDomain() != null) { getActiveDomain()._HTTP200Count++; getActiveDomain()._SequentialHTTPFailuresCount = 0; } // validate www rewrite rule if not set and target was redirected ... if (target.isRedirected()) { /* this is broken for the new list design if (!originalHost.equalsIgnoreCase(activeHost)) { // if redirect strips the www then ... 
if ((originalHost.startsWith("www.") || originalHost.startsWith("WWW.")) && activeHost.equalsIgnoreCase(originalHost.substring(4))) { addWWWReWriteItem(originalHost,WWWRULE_Remove); } // else if redirect adds the www then ... else if ((activeHost.startsWith("www.") || activeHost.startsWith("WWW.")) && originalHost.equalsIgnoreCase(activeHost.substring(4))) { addWWWReWriteItem(originalHost,WWWRULE_Add); } } */ } } else if (resultCode >= 400 && resultCode < 500) { if (resultCode == 403) { // inform host for stats tracking purposes _host.incrementCounter(CrawlListHost.CounterId.Http403Count, 1); } if (getActiveDomain() != null) getActiveDomain()._SequentialHTTPFailuresCount++; } else if (resultCode >= 500 && resultCode < 600) { if (getActiveDomain() != null) { getActiveDomain()._SequentialHTTPFailuresCount++; } } if (_scheduled != target) { if (_scheduled == null) LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchSucceed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!"); else LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchSucceed Target is:" + target.getOriginalURL() + " " + target.toString() + " ActiveTarget is:" + _scheduled.getOriginalURL() + " " + _scheduled.toString()); } else { // clear active ... _scheduled = null; // if this is the robots target ... if ((target.getFlags() & CrawlURL.Flags.IsRobotsURL) == 1) { final CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); // process the robots data if any ... // check for null queue (in case of unit test); if (resultCode == 200) { _robotsRetrieved = true; synchronized (crawlerStats) { crawlerStats.setRobotsRequestsSucceeded(crawlerStats.getRobotsRequestsSucceeded() + 1); crawlerStats.setRobotsRequestsQueuedForParse( crawlerStats.getRobotsRequestsQueuedForParse() + 1); } LOG.info("### Scheduling Robots Parse for:" + target.getActiveURL()); // transition to a waiting on completion disposition ... 
_disposition = Disposition.WaitingOnCompletion; if (getServerSingleton() != null) { // schedule a robots parser parse attempt ... getServerSingleton().registerThreadPool("robots", 5) .execute(new ConcurrentTask<RobotRuleResult>(getServerSingleton().getEventLoop(), new Callable<RobotRuleResult>() { public RobotRuleResult call() throws Exception { try { TextBytes contentData = new TextBytes(contentBuffer.get()); String contentEncoding = httpHeaders .findValue("Content-Encoding"); if (contentEncoding != null && contentEncoding.equalsIgnoreCase("gzip")) { if (Environment.detailLogEnabled()) LOG.info("GZIP Encoding Detected for Robots File For:" + activeHost); UnzipResult result = GZIPUtils.unzipBestEffort( contentData.getBytes(), CrawlEnvironment.CONTENT_SIZE_LIMIT); if (result == null) { contentData = null; if (Environment.detailLogEnabled()) LOG.info( "GZIP Decoder returned NULL for Robots File For:" + activeHost); } else { contentData.set(result.data.get(), result.data.getOffset(), result.data.getCount()); } } try { if (contentData != null) { String robotsTxt = contentData.toString().trim() .toLowerCase(); if (robotsTxt.startsWith("<html") || robotsTxt.startsWith("<!doctype html")) { contentData = null; CrawlerServer.getEngine().logRobots( System.currentTimeMillis(), _robotsHostName, resultCode, null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentWasHTML); } else { CrawlerServer.getEngine().logRobots( System.currentTimeMillis(), _robotsHostName, resultCode, robotsTxt, CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, 0); synchronized (this) { _lastFetchedRobotsData = robotsTxt; _lastFetchedRobotsHostName = _robotsHostName; } } } else { CrawlerServer.getEngine().logRobots( System.currentTimeMillis(), _robotsHostName, resultCode, null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentDecodeFailed); } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); 
// still inside the catch: log the robots event as a decode failure before continuing
CrawlerServer.getEngine().logRobots( System.currentTimeMillis(), _robotsHostName, resultCode, null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentDecodeFailed); } if (Environment.detailLogEnabled()) LOG.info("Parsing Robots File for Host:" + activeHost); RobotRuleResult result = new RobotRuleResult(); if (contentData != null) { synchronized (_crc32) { _crc32.reset(); _crc32.update(contentData.getBytes(), contentData.getOffset(), contentData.getLength()); result.crcValue = _crc32.getValue(); } RobotRulesParser parser = new RobotRulesParser( getServerSingleton().getConfig()); result.ruleSet = parser.parseRules(contentData.getBytes(), contentData.getOffset(), contentData.getLength()); } else { result.ruleSet = RobotRulesParser.getEmptyRules(); result.crcValue = 0; } return result; } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); throw e; } } }, new ConcurrentTask.CompletionCallback<RobotRuleResult>() { public void taskComplete(RobotRuleResult loadResult) { synchronized (crawlerStats) { crawlerStats.setRobotsRequestsQueuedForParse( crawlerStats.getRobotsRequestsQueuedForParse() - 1); } if (loadResult != null) { boolean disallowsAll = !_ruleSet.isAllowed("/"); boolean robotsHadCrawlDelay = _ruleSet.getCrawlDelay() != -1; boolean explicitMention = _ruleSet.explicitMention; int logFlags = 0; if (disallowsAll) logFlags |= CrawlerEngine.RobotsParseFlag_ExcludesAll; if (explicitMention) logFlags |= CrawlerEngine.RobotsParseFlag_ExplicitMention; if (robotsHadCrawlDelay) logFlags |= CrawlerEngine.RobotsParseFlag_HasCrawlDelay; synchronized (crawlerStats) { crawlerStats.setRobotsRequestsSuccessfullParse( crawlerStats.getRobotsRequestsSuccessfullParse() + 1); if (disallowsAll) { crawlerStats.setRobotsFileExcludesAllContent( crawlerStats.getRobotsFileExcludesAllContent() + 1); if (explicitMention) crawlerStats.setRobotsFileExplicitlyExcludesAll( crawlerStats .getRobotsFileExplicitlyExcludesAll() + 1); } if 
(explicitMention) { crawlerStats.setRobotsFileHasExplicitMention( crawlerStats.getRobotsFileHasExplicitMention() + 1); } if (robotsHadCrawlDelay) { crawlerStats.setRobotsFileHadCrawlDelay( crawlerStats.getRobotsFileHadCrawlDelay() + 1); } } CrawlerServer.getEngine().logRobots(System.currentTimeMillis(), _robotsHostName, 0, null, CrawlerEngine.RobotsLogEventType.Parse_Succeeded, logFlags); _ruleSet = loadResult.ruleSet; _robotsCRC = loadResult.crcValue; _host.cacheRobotsFile(_ruleSet, _robotsCRC); } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(), _robotsHostName, 0, null, CrawlerEngine.RobotsLogEventType.Parse_Failed, 0); synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailedParse( crawlerStats.getRobotsRequestsFailedParse() + 1); } // LOG.error("####Robots parsing for host:" + activeHost + " failed."); _ruleSet = RobotRulesParser.getEmptyRules(); _robotsCRC = 0; } //if (Environment.detailLogEnabled()) LOG.info("####Robots RETRIEVED for Host:" + activeHost + " CrawlDelay IS:" + getCrawlDelay(false)); if (originalHost != null && activeHost != null) { updateRobotsCRCForDomain(_robotsCRC, originalHost, _robotsReturned400, _robotsReturned403); if (activeHost.compareToIgnoreCase(originalHost) != 0) { updateRobotsCRCForDomain(_robotsCRC, activeHost, _robotsReturned400, _robotsReturned403); } } Disposition oldDisposition = _disposition; if (getNextPending(false) != null) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.WaitingOnTime; } if (oldDisposition != _disposition) { // notify queue getHost().listDispositionChanged(CrawlList.this, oldDisposition, _disposition); } } public void taskFailed(Exception e) { if (Environment.detailLogEnabled()) LOG.error("####Robots parsing for host:" + _robotsHostName + " failed with exception" + e); _ruleSet = RobotRulesParser.getEmptyRules(); Disposition oldDisposition = _disposition; if (getNextPending(false) != null) { _disposition = Disposition.ItemAvailable; } else { 
// no pending items after a failed parse: fall back to a timed wait
_disposition = Disposition.WaitingOnTime; } if (oldDisposition != _disposition) { // notify queue getHost().listDispositionChanged(CrawlList.this, oldDisposition, _disposition); } } })); } // explitly return here ( inorder to wait for the async completion event) return; } //otherwise ... else { synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1); } CrawlerServer.getEngine().logRobots(System.currentTimeMillis(), _robotsHostName, resultCode, null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed, 0); _robotsCRC = 0; if (Environment.detailLogEnabled()) LOG.info("####Robots GET for Host:" + activeHost + "FAILED With Result Code:" + resultCode); //TODO: MAKE THIS MORE ROBUST ... // clear robots flag ... _robotsRetrieved = true; // see if result code was a 403 if (resultCode >= 400 && resultCode <= 499) { _robotsReturned400 = true; if (resultCode == 403) _robotsReturned403 = true; } // for now, assume no robots rules for any error conditions ... _ruleSet = RobotRulesParser.getEmptyRules(); if (originalHost != null && activeHost != null) { updateRobotsCRCForDomain(_robotsCRC, originalHost, _robotsReturned400, _robotsReturned403); if (activeHost.compareToIgnoreCase(originalHost) != 0) { updateRobotsCRCForDomain(_robotsCRC, activeHost, _robotsReturned400, _robotsReturned403); } } } } if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) { // update active host stats and check for failure ... checkActiveHostStatsForFailure(); } // if there are no more items in the queue if (getNextPending(false) == null) { // if offline count is zero then mark this domain's queue as empty if (_offlineTargetCount == 0) { _disposition = Disposition.QueueEmpty; } // otherwise put us in a wait state and potentially queue up a disk load else { _disposition = Disposition.WaitingOnTime; // potentially queue up a disk load potentiallyQueueDiskLoad(); } } else { // if we are ready to fetch the next item ... 
if (calculateNextWaitTime() < System.currentTimeMillis()) { _disposition = Disposition.ItemAvailable; } else { // transition to a new wait state ... _disposition = Disposition.WaitingOnTime; } } if (oldDisposition != _disposition) { // either way ... notify queue getHost().listDispositionChanged(this, oldDisposition, _disposition); } } }
From source file:org.commoncrawl.util.TimeSeriesDataFile.java
License:Open Source License
private void doCommonRead(ArrayList<KeyValueTuple<Long, ValueType>> valuesOut, RandomAccessFile file, long headerOffset, long endOfPrevRecord, int currentRecordLength, int recordsToRead, long optionalMinKeyValue) throws IOException { Buffer recordBuffer = new Buffer(); DataInputBuffer inputBuffer = new DataInputBuffer(); // ok start walking backwards ... while (recordsToRead != 0) { // setup new previous record pos pointer endOfPrevRecord = endOfPrevRecord - currentRecordLength - 4; // and seek to it endOfLastRecord - 4 file.seek(endOfPrevRecord - 4);// w w w. j ava 2 s .c o m recordBuffer.setCapacity(currentRecordLength + 8); // read in proper amount of data ... file.read(recordBuffer.get(), 0, currentRecordLength + 8); // ok initialize input buffer ... inputBuffer.reset(recordBuffer.get(), currentRecordLength + 8); // now read next record length first ... int nextRecordLength = inputBuffer.readInt(); // next read sync bytes ... int syncBytes = inputBuffer.readInt(); // validate if (syncBytes != SyncBytes) { throw new IOException("Corrupt Record Detected!"); } // ok read real record bytes ... int realRecordBytes = inputBuffer.readInt(); // read crc ... long crcValue = inputBuffer.readLong(); // ok validate crc ... crc.reset(); crc.update(inputBuffer.getData(), inputBuffer.getPosition(), realRecordBytes - 8); if (crcValue != crc.getValue()) { throw new IOException("CRC Mismatch!"); } // ok now read key and value try { long key = WritableUtils.readVLong(inputBuffer); if (optionalMinKeyValue != -1 && key < optionalMinKeyValue) { break; } ValueType value = (ValueType) valueClass.newInstance(); value.readFields(inputBuffer); KeyValueTuple tuple = new KeyValueTuple<Long, ValueType>(key, value); tuple.recordPos = endOfPrevRecord; valuesOut.add(0, tuple); } catch (Exception e) { throw new IOException(e); } currentRecordLength = nextRecordLength; recordsToRead--; if (endOfPrevRecord == headerOffset) break; } }