List of usage examples for org.apache.hadoop.io MD5Hash getDigest
public byte[] getDigest()
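getDigest() returns the raw 16-byte MD5 digest backing an MD5Hash instance (the hex form comes from toString()). A minimal standalone sketch of typical usage, assuming only the Hadoop API; the class name and input string are illustrative placeholders:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.MD5Hash;

public class MD5HashGetDigestExample {
  public static void main(String[] args) {
    // compute an MD5 hash over some example bytes
    byte[] data = "example content".getBytes(StandardCharsets.UTF_8);
    MD5Hash hash = MD5Hash.digest(data);

    // getDigest() exposes the raw digest bytes; the length is MD5Hash.MD5_LEN (16)
    byte[] digest = hash.getDigest();
    System.out.println("digest length : " + digest.length);
    System.out.println("hex form      : " + hash.toString());
  }
}

The two source files below show both directions: computing a digest from downloaded content and recovering the digest bytes from a stored hex string.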
From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License:Open Source License
private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
    Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

  FlexBuffer contentOut = null;
  String textOut = null;

  NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());

  CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

  // extract information from http headers ...
  HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);

  // get the mime type ...
  String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
      ? urlMetadata.getContentType()
      : "text/html";

  metadata.addProperty("mime_type", normalizedMimeType);
  crawlMeta.setMimeType(normalizedMimeType);

  // get download size ...
  int downloadSize = value.getContentRaw().getCount();

  // set original content len ...
  metadata.addProperty("download_size", downloadSize);
  crawlMeta.setDownloadSize(downloadSize);

  // set truncation flag
  if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
    metadata.addProperty("download_truncated", true);
    crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
  }

  if (downloadSize > 0) {
    // get content type, charset and encoding
    String encoding = finalHeaders.findValue("Content-Encoding");

    boolean isGZIP = false;
    if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
      isGZIP = true;
    }

    byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
    int contentLen = value.getContentRaw().getCount();

    // assume we are going to output original data ...
    contentOut = new FlexBuffer(contentBytes, 0, contentLen);

    if (isGZIP) {
      metadata.addProperty("content_is_gzip", isGZIP);
      crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

      UnzipResult unzipResult = null;
      try {
        // LOG.info("BEFORE GUNZIP");
        unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
            CrawlEnvironment.GUNZIP_SIZE_LIMIT);
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }

      if (unzipResult != null && unzipResult.data != null) {
        if (unzipResult.wasTruncated) {
          LOG.warn("Truncated Document During GZIP:" + finalURL);
          reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
        }
        contentBytes = unzipResult.data.get();
        contentLen = unzipResult.data.getCount();
        metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
        crawlMeta.setGunzipSize(unzipResult.data.getCount());
        // update content out ...
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);
      } else {
        metadata.addProperty("gunzip_failed", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
        reporter.incrCounter(Counters.GUNZIP_FAILED, 1);
        contentBytes = null;
        contentLen = 0;
        contentOut = null;
      }
      // LOG.info("AFTER GUNZIP");
    }

    if (contentBytes != null) {
      // ok compute an md5 hash
      MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

      metadata.addProperty("md5", md5Hash.toString());
      crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));

      // get normalized mime type
      if (MimeTypeFilter.isTextType(normalizedMimeType)) {
        // ok time to decode the data into ucs2 ...
        Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
            .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);

        // ok write out decode metadata
        metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
        crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
        metadata.addProperty("charset_detector", decodeResult.e0.e0);
        crawlMeta.setCharsetDetector(decodeResult.e0.e0);

        // get the content
        String textContent = decodeResult.e1;

        // compute simhash
        long simhash = SimHash.computeOptimizedSimHashForString(textContent);
        metadata.addProperty("text_simhash", simhash);
        crawlMeta.setTextSimHash(simhash);

        // figure out simplified mime type ...
        MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
            .checkMimeTypeDisposition(normalizedMimeType);

        boolean parseComplete = false;

        Pair<JsonObject, String> tupleOut = null;

        // write it out
        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
          // ok parse as html
          tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
              new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

          if (tupleOut == null) {
            reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
            LOG.error("Unable to Parse as HTML:" + finalURL.toString());
            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
          } else {
            reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
            metadata.addProperty("parsed_as", "html");
            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
            parseComplete = true;
          }
        }

        if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
            || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {

          // ok try parse this document as a feed ...
          tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
              crawlMeta.getFeedContent(),
              ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

          if (tupleOut == null) {
            if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
              reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
              //TODO:HACK
              //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
              mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
            }
          } else {
            reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
            metadata.addProperty("parsed_as", "feed");
            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
            parseComplete = true;
          }
        }

        if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
          reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
          mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
        }

        if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
          // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
          // TODO: FIX THIS BUT PUNT FOR NOW :-(
          //tupleOut = new Pair<JsonObject,String>(null,textContent);
        }

        if (tupleOut != null) {
          if (tupleOut.e0 != null) {
            metadata.add("content", tupleOut.e0);
          }
          textOut = tupleOut.e1;
        }
      }
    }
  }
  return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
      new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}
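In this example the digest is computed over a sub-range of the (possibly gunzipped) content buffer; the hex form (toString()) goes into the JSON metadata under the "md5" key, while the raw bytes from getDigest() are wrapped in a FlexBuffer on the CrawlMetadata record. A distilled sketch of that pattern using only the Hadoop API; the class and method names here are illustrative, not part of the original source:

import org.apache.hadoop.io.MD5Hash;

class Md5RangeSketch {
  // hash a sub-range of a buffer and keep both the hex and the raw form
  static void hashRange(byte[] contentBytes, int contentLen) {
    MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);
    String hexForMetadata = md5Hash.toString(); // stored under the "md5" key above
    byte[] rawForRecord = md5Hash.getDigest();  // 16 raw bytes, copied into a FlexBuffer above
    System.out.println(hexForMetadata + " (" + rawForRecord.length + " bytes)");
  }
}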
From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License:Open Source License
@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
    throws IOException {

  if (url.getLength() == 0) {
    LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
    return;
  }

  try {
    // allocate parse output
    ParseOutput parseOutput = new ParseOutput();
    // json object out ...
    JsonObject jsonObj = new JsonObject();
    // and create a crawl metadata
    CrawlMetadata metadata = parseOutput.getCrawlMetadata();
    // and content (if available) ...
    Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

    URL originalURL = null;

    try {
      originalURL = new URL(url.toString());
    } catch (MalformedURLException e) {
      LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
      reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
      return;
    }

    URL finalURL = originalURL;

    jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
    metadata.setAttemptTime(value.getLastAttemptTime());

    // first step write status
    jsonObj.addProperty("disposition",
        (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
    metadata.setCrawlDisposition(
        (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

    // deal with redirects ...
    if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
      Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
      jsonObj.add("redirect_from", redirect.e1);
      finalURL = redirect.e0;
    }

    if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
      jsonObj.addProperty("failure_reason",
          CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
      metadata.setFailureReason(value.getLastAttemptFailureReason());
      jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
      metadata.setFailureDetail(value.getLastAttemptFailureDetail());
    } else {
      jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
      metadata.setServerIP(value.getServerIP());
      jsonObj.addProperty("http_result", value.getResultCode());
      metadata.setHttpResult(value.getResultCode());
      jsonObj.add("http_headers",
          httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
      metadata.setHttpHeaders(value.getHeaders());
      jsonObj.addProperty("content_len", value.getContentRaw().getCount());
      metadata.setContentLength(value.getContentRaw().getCount());

      if (value.getResultCode() >= 200 && value.getResultCode() <= 299
          && value.getContentRaw().getCount() > 0) {
        contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
      }
    }

    // ok ... write stuff out ...
    reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);

    //////////////////////////////////////////////////////////////
    // echo some stuff to parseOutput ...
    parseOutput.setMetadata(jsonObj.toString());

    JsonElement mimeType = jsonObj.get("mime_type");
    if (mimeType != null) {
      parseOutput.setNormalizedMimeType(mimeType.getAsString());
    }

    JsonElement md5 = jsonObj.get("md5");
    if (md5 != null) {
      MD5Hash hash = new MD5Hash(md5.getAsString());
      byte[] bytes = hash.getDigest();
      parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
    }

    JsonElement simHash = jsonObj.get("text_simhash");
    if (simHash != null) {
      parseOutput.setSimHash(simHash.getAsLong());
    }

    parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
    parseOutput.setFetchTime(value.getLastAttemptTime());

    ////////////////////////////////////////////////////////////

    if (contentOut != null) {
      if (contentOut.e0 != null) {
        parseOutput.setTextContent(contentOut.e0);
        reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
      }
      if (contentOut.e1 != null) {
        // directly set the text bytes ...
        parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
        // mark it dirty !!!
        parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
        // if content available ...
        if (contentOut.e1.e1 != null) {
          parseOutput.setRawContent(contentOut.e1.e1);
        }
        reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
      }
    }

    //buildCompactMetadata(parseOutput,jsonObj,urlMap);

    output.collect(new Text(finalURL.toString()), parseOutput);
  } catch (IOException e) {
    LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
    reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
    //TODO:HACK
    //throw e;
  } catch (Exception e) {
    LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
    reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
    //TODO: HACK
    //throw new IOException(e);
  }
}
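Here getDigest() is used on the consuming side: the hex string recorded earlier under the "md5" key is turned back into an MD5Hash via the String constructor, and the raw digest bytes are copied into the ParseOutput record. A minimal round-trip sketch using only the Hadoop API; the class name and sample input are illustrative:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.MD5Hash;

class Md5RoundTripSketch {
  public static void main(String[] args) {
    // forward direction: content bytes -> MD5Hash -> hex string (as in populateContentMetadata)
    MD5Hash original = MD5Hash.digest("example content".getBytes(StandardCharsets.UTF_8));
    String hex = original.toString();

    // reverse direction: hex string -> MD5Hash -> raw digest bytes (as in map() above)
    MD5Hash restored = new MD5Hash(hex);
    byte[] bytes = restored.getDigest();

    System.out.println(restored.equals(original)); // true: the digest bytes round-trip
    System.out.println(bytes.length);              // 16
  }
}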