List of usage examples for the org.apache.hadoop.record.Buffer constructor
public Buffer(byte[] bytes, int offset, int length)
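Every example below passes the same pattern of arguments: the backing byte array of a Hadoop Writable, a zero offset, and the Writable's logical length, so that only the valid region of the (possibly over-allocated) backing array is copied into the Buffer. A minimal, self-contained sketch of that pattern (the BytesWritable value here is illustrative, not taken from the examples below):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.record.Buffer;

public class BufferConstructorExample {
    public static void main(String[] args) {
        // A BytesWritable's backing array can be larger than its logical length,
        // so pass an explicit (bytes, offset, length) range to copy only the valid part.
        BytesWritable writable = new BytesWritable("hello".getBytes());
        Buffer buffer = new Buffer(writable.getBytes(), 0, writable.getLength());

        System.out.println(buffer.getCount()); // 5 -- length of the copied range
        byte[] copy = buffer.get();            // the Buffer's backing byte array
    }
}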
From source file: com.jfolson.hive.serde.RBaseSerDe.java
License: Apache License
protected void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException {
    if (o == null) {
        tbOut.writeNull();
        return;
    }
    switch (oi.getCategory()) {
    case PRIMITIVE: {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
        switch (poi.getPrimitiveCategory()) {
        case VOID: {
            return;
        }
        case BINARY: {
            BinaryObjectInspector boi = (BinaryObjectInspector) poi;
            TypedBytesWritable bytes = reuse == null ? new TypedBytesWritable() : (TypedBytesWritable) reuse;
            BytesWritable bytesWrite = boi.getPrimitiveWritableObject(o);
            if (bytesWrite != null) {
                bytes.set(bytesWrite);
                if (!RType.isValid(bytes)) {
                    LOG.error("Invalid typedbytes detected with type: " + RType.getType(bytes).code);
                    bytes.setValue(new Buffer(bytesWrite.getBytes(), 0, bytesWrite.getLength()));
                }
                tbOut.write(bytes);
            }
            return;
        }
        case BOOLEAN: {
            BooleanObjectInspector boi = (BooleanObjectInspector) poi;
            BooleanWritable r = reuse == null ? new BooleanWritable() : (BooleanWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case BYTE: {
            ByteObjectInspector boi = (ByteObjectInspector) poi;
            ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case SHORT: {
            ShortObjectInspector spoi = (ShortObjectInspector) poi;
            ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse;
            r.set(spoi.get(o));
            tbOut.write(r);
            return;
        }
        case INT: {
            IntObjectInspector ioi = (IntObjectInspector) poi;
            IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
            r.set(ioi.get(o));
            tbOut.write(r);
            return;
        }
        case LONG: {
            LongObjectInspector loi = (LongObjectInspector) poi;
            LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse;
            r.set(loi.get(o));
            tbOut.write(r);
            return;
        }
        case FLOAT: {
            FloatObjectInspector foi = (FloatObjectInspector) poi;
            FloatWritable r = reuse == null ? new FloatWritable() : (FloatWritable) reuse;
            r.set(foi.get(o));
            tbOut.write(r);
            return;
        }
        case DOUBLE: {
            DoubleObjectInspector doi = (DoubleObjectInspector) poi;
            DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse;
            r.set(doi.get(o));
            tbOut.write(r);
            return;
        }
        case STRING: {
            StringObjectInspector soi = (StringObjectInspector) poi;
            Text t = soi.getPrimitiveWritableObject(o);
            tbOut.write(t);
            return;
        }
        default: {
            throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
        }
        }
    }
    case LIST: {
        ListObjectInspector loi = (ListObjectInspector) oi;
        ObjectInspector elemOI = loi.getListElementObjectInspector();
        List l = loi.getList(o);
        // Don't use array (typecode: 144) until everything supports NA values in typedbytes
        if (false) { // (elemOI.getCategory() == ObjectInspector.Category.PRIMITIVE) {
            tbOut.writeArray(l, (PrimitiveObjectInspector) elemOI);
        } else {
            tbOut.writeVector(l, (PrimitiveObjectInspector) elemOI);
        }
        return;
    }
    case MAP:
    case STRUCT: {
        // For a complex object, serialize to JSON format
        String s = SerDeUtils.getJSONString(o, oi);
        Text t = reuse == null ? new Text() : (Text) reuse;
        // convert to Text and write it
        t.set(s);
        tbOut.write(t);
        return;
    }
    default: {
        throw new RuntimeException("Unrecognized type: " + oi.getCategory());
    }
    }
}
From source file: com.jfolson.hive.serde.RTypedBytesOutput.java
License: Apache License
public void writeTypedBytes(TypedBytesWritable tb) throws IOException {
    writeRaw(new Buffer(tb.getBytes(), 0, tb.getLength()).get());
}
From source file: com.jfolson.hive.serde.RTypedBytesSerDe.java
License: Apache License
private void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException {
    if (o == null) {
        tbOut.writeNull();
        return;
    }
    switch (oi.getCategory()) {
    case PRIMITIVE: {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
        switch (poi.getPrimitiveCategory()) {
        case VOID: {
            return;
        }
        case BINARY: {
            BinaryObjectInspector boi = (BinaryObjectInspector) poi;
            TypedBytesWritable bytes = reuse == null ? new TypedBytesWritable() : (TypedBytesWritable) reuse;
            BytesWritable bytesWrite = boi.getPrimitiveWritableObject(o);
            if (bytesWrite != null) {
                bytes.set(bytesWrite);
                if (!RType.isValid(bytes)) {
                    LOG.error("Invalid typedbytes detected with type: " + RType.getType(bytes).code);
                    bytes.setValue(new Buffer(bytesWrite.getBytes(), 0, bytesWrite.getLength()));
                }
                tbOut.write(bytes);
            }
            return;
        }
        case BOOLEAN: {
            BooleanObjectInspector boi = (BooleanObjectInspector) poi;
            BooleanWritable r = reuse == null ? new BooleanWritable() : (BooleanWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case BYTE: {
            ByteObjectInspector boi = (ByteObjectInspector) poi;
            ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case SHORT: {
            ShortObjectInspector spoi = (ShortObjectInspector) poi;
            ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse;
            r.set(spoi.get(o));
            tbOut.write(r);
            return;
        }
        case INT: {
            IntObjectInspector ioi = (IntObjectInspector) poi;
            IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
            r.set(ioi.get(o));
            tbOut.write(r);
            return;
        }
        case LONG: {
            LongObjectInspector loi = (LongObjectInspector) poi;
            LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse;
            r.set(loi.get(o));
            tbOut.write(r);
            return;
        }
        case FLOAT: {
            FloatObjectInspector foi = (FloatObjectInspector) poi;
            FloatWritable r = reuse == null ? new FloatWritable() : (FloatWritable) reuse;
            r.set(foi.get(o));
            tbOut.write(r);
            return;
        }
        case DOUBLE: {
            DoubleObjectInspector doi = (DoubleObjectInspector) poi;
            DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse;
            r.set(doi.get(o));
            tbOut.write(r);
            return;
        }
        case STRING: {
            StringObjectInspector soi = (StringObjectInspector) poi;
            Text t = soi.getPrimitiveWritableObject(o);
            tbOut.write(t);
            return;
        }
        default: {
            throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
        }
        }
    }
    case LIST: {
        ListObjectInspector loi = (ListObjectInspector) oi;
        ObjectInspector elemOI = loi.getListElementObjectInspector();
        List l = loi.getList(o);
        if (false) { // (elemOI.getCategory() == ObjectInspector.Category.PRIMITIVE) {
            tbOut.writeArray(l, (PrimitiveObjectInspector) elemOI);
        } else {
            tbOut.writeVector(l, (PrimitiveObjectInspector) elemOI);
        }
        return;
    }
    case MAP:
    case STRUCT: {
        // For a complex object, serialize to JSON format
        String s = SerDeUtils.getJSONString(o, oi);
        Text t = reuse == null ? new Text() : (Text) reuse;
        // convert to Text and write it
        t.set(s);
        tbOut.write(t);
        return;
    }
    default: {
        throw new RuntimeException("Unrecognized type: " + oi.getCategory());
    }
    }
}
From source file: org.commoncrawl.service.crawler.CrawlSegmentLog.java
License: Open Source License
/** Sync the incoming segment against the local crawl log and then send it up to the history server. **/
public int syncToLog(CrawlSegmentFPMap segmentDetail) throws IOException {
    if (Environment.detailLogEnabled())
        LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Syncing Progress Log");

    int itemsProcessed = 0;

    // construct a path to the local crawl segment directory ...
    File activeLogPath = buildActivePath(_rootDataDir, _listId, _segmentId);
    File checkpointLogPath = buildCheckpointPath(_rootDataDir, _listId, _segmentId);

    // check if it exists ...
    if (checkpointLogPath.exists()) {
        // log it ...
        if (Environment.detailLogEnabled())
            LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Checkpoint Log Found");
        // rename it as the active log ...
        checkpointLogPath.renameTo(activeLogPath);
    }

    if (activeLogPath.exists()) {
        // reconcile against the active log (if it exists) ...
        _localLogItemCount = reconcileLogFile(FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                new Path(activeLogPath.getAbsolutePath()), _listId, _segmentId, segmentDetail, null);
        if (Environment.detailLogEnabled())
            LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId
                    + " Reconciled Local Log File with ProcessedItemCount:" + _localLogItemCount);
        itemsProcessed += _localLogItemCount;
    }

    FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

    // first things first ... check to see if the special completion log file exists in hdfs
    Path hdfsSegmentCompletionLogPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
            + getListId() + "/" + getSegmentId() + "/"
            + CrawlEnvironment.buildCrawlSegmentCompletionLogFileName(getNodeName()));

    if (hdfs.exists(hdfsSegmentCompletionLogPath)) {
        if (Environment.detailLogEnabled())
            LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId
                    + " Completion File Found. Marking Segment Complete");
        // if the file exists then this segment has been crawled and uploaded already ...
        // if the active log file exists ... delete it ...
        if (activeLogPath.exists())
            activeLogPath.delete();
        // reset local log item count ...
        _localLogItemCount = 0;
        itemsProcessed = -1;
        // remove all hosts from segment
        segmentDetail._urlsComplete = segmentDetail._urlCount;
    } else {
        if (segmentDetail != null) {
            if (Environment.detailLogEnabled())
                LOG.info("### SYNC: Building BulkItem History Query for List:" + _listId + " Segment:"
                        + _segmentId);
            BulkItemHistoryQuery query = buildHistoryQueryBufferFromMap(segmentDetail);

            if (query != null) {
                // create blocking semaphore ...
                final Semaphore semaphore = new Semaphore(1);
                semaphore.acquireUninterruptibly();
                if (Environment.detailLogEnabled())
                    LOG.info("### SYNC: Dispatching query to history server");
                // create an outer response object we can pass the async response to ...
                final BulkItemHistoryQueryResponse outerResponse = new BulkItemHistoryQueryResponse();

                CrawlerServer.getServer().getHistoryServiceStub().bulkItemQuery(query,
                        new Callback<BulkItemHistoryQuery, BulkItemHistoryQueryResponse>() {
                            @Override
                            public void requestComplete(
                                    final AsyncRequest<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> request) {
                                // response returns in async thread context ...
                                if (request.getStatus() == Status.Success) {
                                    if (Environment.detailLogEnabled())
                                        LOG.info("###SYNC: bulk Query to history server succeeded. Setting outer response");
                                    ImmutableBuffer buffer = request.getOutput().getResponseList();
                                    outerResponse.setResponseList(
                                            new Buffer(buffer.getReadOnlyBytes(), 0, buffer.getCount()));
                                } else {
                                    LOG.error("###SYNC: bulk Query to history server failed.");
                                }
                                // release semaphore
                                semaphore.release();
                            }
                        });
                LOG.info("###SYNC: Loader thread blocked waiting for bulk query response");
                semaphore.acquireUninterruptibly();
                LOG.info("###SYNC: Loader thread received response from history server");

                if (outerResponse.getResponseList().getCount() == 0) {
                    LOG.error("###SYNC: History Server Bulk Query Returned NULL!!! for List:" + _listId
                            + " Segment:" + _segmentId);
                } else {
                    // ok, time to process the response and integrate the results into the fp list
                    updateFPMapFromBulkQueryResponse(segmentDetail, outerResponse);
                }
            } else {
                if (Environment.detailLogEnabled())
                    LOG.warn("### SYNC: No fingerprints found when processing segment detail for List:"
                            + _listId + " Segment:" + _segmentId);
                segmentDetail._urlsComplete = segmentDetail._urlCount;
            }
        }
        /*
        // and now walk hdfs looking for any checkpointed logs ...
        // scan based on checkpoint filename ...
        FileStatus[] remoteCheckpointFiles = hdfs.globStatus(new Path(
                CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + getListId() + "/" + getSegmentId() + "/"
                        + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getNodeName())));

        if (remoteCheckpointFiles != null) {
            LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Found Remote Checkpoint Files");

            // create a temp file to hold the reconciled log ...
            File consolidatedLogFile = null;

            if (remoteCheckpointFiles.length > 1) {
                // create temp log file ...
                consolidatedLogFile = File.createTempFile("SegmentLog", Long.toString(System.currentTimeMillis()));
                // write out header ...
                CrawlSegmentLog.writeHeader(consolidatedLogFile, 0);
            }
            // walk the files
            for (FileStatus checkpointFilePath : remoteCheckpointFiles) {
                // and reconcile them against segment ...
                itemsProcessed += reconcileLogFile(hdfs, checkpointFilePath.getPath(), getListId(), getSegmentId(),
                        segmentDetail, consolidatedLogFile);
                LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Processed Checkpoint File:"
                        + checkpointFilePath.getPath() + " Items Processed:" + itemsProcessed);
            }

            // finally ... if consolidatedLogFile is not null
            if (consolidatedLogFile != null) {
                // build a new hdfs file name ...
                Path consolidatedHDFSPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"
                        + getListId() + "/" + getSegmentId() + "/"
                        + CrawlEnvironment.buildCrawlSegmentLogCheckpointFileName(getNodeName(),
                                System.currentTimeMillis()));

                LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId
                        + " Writing Consolidated Log File:" + consolidatedHDFSPath + " to HDFS");

                // and copy local file to log ...
                hdfs.copyFromLocalFile(new Path(consolidatedLogFile.getAbsolutePath()), consolidatedHDFSPath);

                // and delete all previous log file entries ...
                for (FileStatus oldCheckPointFile : remoteCheckpointFiles) {
                    hdfs.delete(oldCheckPointFile.getPath());
                }
                consolidatedLogFile.delete();
            }
        }
        */
    }

    if (segmentDetail != null) {
        _remainingURLS += (segmentDetail._urlCount - segmentDetail._urlsComplete);
        // mark url count as valid now ...
        _urlCountValid = true;
        // now, if the remaining url count is zero ... mark the segment as complete ...
        if (_remainingURLS == 0 && _localLogItemCount == 0) {
            _segmentComplete = true;
        }
    }

    if (Environment.detailLogEnabled())
        LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Done Syncing Progress Log TotalURLS:"
                + segmentDetail._urlCount + " RemainingURLS:" + _remainingURLS + " LocalLogItemCount:"
                + _localLogItemCount);

    return itemsProcessed;
}
From source file: org.commoncrawl.service.crawlhistory.CrawlHistoryServer.java
License: Open Source License
@Override
public void bulkItemQuery(AsyncContext<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> rpcContext)
        throws RPCException {
    LOG.info("Received BulkItemQueryRequest");
    ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList();

    if (inputBuffer.getCount() != 0) {
        try {
            if (_bloomFilter == null) {
                throw new IOException("BloomFilter Not Initialized. Invalid Server State!");
            }
            DataInputStream inputStream = new DataInputStream(
                    new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(), 0, inputBuffer.getCount()));
            BitStream bitStreamOut = new BitStream();
            URLFPV2 fingerprint = new URLFPV2();

            int itemsPresent = 0;
            while (inputStream.available() != 0) {
                fingerprint.setDomainHash(WritableUtils.readVLong(inputStream));
                fingerprint.setUrlHash(WritableUtils.readVLong(inputStream));
                if (_bloomFilter.isPresent(fingerprint)) {
                    bitStreamOut.addbit(1);
                    ++itemsPresent;
                } else {
                    bitStreamOut.addbit(0);
                }
            }
            LOG.info("Received BulkItemQueryRequest Completed with " + itemsPresent + " items found");

            rpcContext.getOutput()
                    .setResponseList(new Buffer(bitStreamOut.bits, 0, (bitStreamOut.nbits + 7) / 8));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            rpcContext.setStatus(Status.Error_RequestFailed);
            rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
        }
        rpcContext.completeRequest();
    }
}
From source file: org.commoncrawl.service.listcrawler.ProxyServlet.java
License: Open Source License
private static void cacheS3ItemResult(ArcFileItem itemResult, String targetURL, long fingerprint) {
    CacheItem cacheItem = new CacheItem();

    cacheItem.setUrlFingerprint(fingerprint);
    cacheItem.setUrl(targetURL);
    cacheItem.setSource((byte) CacheItem.Source.S3Cache);
    cacheItem.setHeaderItems(itemResult.getHeaderItems());
    cacheItem.setFieldDirty(CacheItem.Field_HEADERITEMS);
    cacheItem.setContent(
            new Buffer(itemResult.getContent().getReadOnlyBytes(), 0, itemResult.getContent().getCount()));

    if ((itemResult.getFlags() & ArcFileItem.Flags.TruncatedInDownload) != 0) {
        cacheItem.setFlags(cacheItem.getFlags() | CacheItem.Flags.Flag_WasTruncatedDuringDownload);
    }
    if ((itemResult.getFlags() & ArcFileItem.Flags.TruncatedInInflate) != 0) {
        cacheItem.setFlags(cacheItem.getFlags() | CacheItem.Flags.Flag_WasTruncatedDuringInflate);
    }

    ProxyServer.getSingleton().getCache().cacheItem(cacheItem, null);
}
From source file: org.commoncrawl.service.listcrawler.ProxyServlet.java
License: Open Source License
private static void sendS3ItemResponse(final HttpServletRequest req, final HttpServletResponse response,
        ArcFileItem responseItem, String renderAs, AsyncResponse responseObject, long requestStartTime)
        throws IOException {

    CacheItem cacheItem = new CacheItem();

    // populate a cache item object ...
    cacheItem.setHeaderItems(responseItem.getHeaderItems());
    cacheItem.setFieldDirty(CacheItem.Field_HEADERITEMS);
    cacheItem.setUrl(responseItem.getUri());
    cacheItem.setUrlFingerprint(URLUtils.getCanonicalURLFingerprint(responseItem.getUri(), true));
    cacheItem.setSource((byte) CacheItem.Source.S3Cache);
    cacheItem.setContent(
            new Buffer(responseItem.getContent().getReadOnlyBytes(), 0, responseItem.getContent().getCount()));

    sendCacheItemResponse(req, response, cacheItem, true, renderAs, responseObject, requestStartTime);
}