List of usage examples for org.apache.hadoop.io MD5Hash getDigest
public byte[] getDigest()
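getDigest() returns the raw 16-byte MD5 digest backing an MD5Hash instance (the hex form comes from toString()). A minimal standalone sketch of typical usage, assuming only the Hadoop API; the class name and input string are illustrative placeholders:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.MD5Hash;

public class MD5HashGetDigestExample {
  public static void main(String[] args) {
    // compute an MD5 hash over some example bytes
    byte[] data = "example content".getBytes(StandardCharsets.UTF_8);
    MD5Hash hash = MD5Hash.digest(data);

    // getDigest() exposes the raw digest bytes; the length is MD5Hash.MD5_LEN (16)
    byte[] digest = hash.getDigest();
    System.out.println("digest length : " + digest.length);
    System.out.println("hex form      : " + hash.toString());
  }
}

The two source files below show both directions: computing a digest from downloaded content and recovering the digest bytes from a stored hex string.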
From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License:Open Source License
private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
    Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

  FlexBuffer contentOut = null;
  String textOut = null;

  NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());

  CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

  // extract information from http headers ...
  HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);

  // get the mime type ...
  String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
      ? urlMetadata.getContentType()
      : "text/html";

  metadata.addProperty("mime_type", normalizedMimeType);
  crawlMeta.setMimeType(normalizedMimeType);

  // get download size ...
  int downloadSize = value.getContentRaw().getCount();

  // set original content len ...
  metadata.addProperty("download_size", downloadSize);
  crawlMeta.setDownloadSize(downloadSize);

  // set truncation flag
  if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
    metadata.addProperty("download_truncated", true);
    crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
  }

  if (downloadSize > 0) {
    // get content type, charset and encoding
    String encoding = finalHeaders.findValue("Content-Encoding");

    boolean isGZIP = false;
    if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
      isGZIP = true;
    }

    byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
    int contentLen = value.getContentRaw().getCount();

    // assume we are going to output original data ...
    contentOut = new FlexBuffer(contentBytes, 0, contentLen);

    if (isGZIP) {
      metadata.addProperty("content_is_gzip", isGZIP);
      crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

      UnzipResult unzipResult = null;
      try {
        // LOG.info("BEFORE GUNZIP");
        unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
            CrawlEnvironment.GUNZIP_SIZE_LIMIT);
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }

      if (unzipResult != null && unzipResult.data != null) {
        if (unzipResult.wasTruncated) {
          LOG.warn("Truncated Document During GZIP:" + finalURL);
          reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
        }
        contentBytes = unzipResult.data.get();
        contentLen = unzipResult.data.getCount();
        metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
        crawlMeta.setGunzipSize(unzipResult.data.getCount());
        // update content out ...
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);
      } else {
        metadata.addProperty("gunzip_failed", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
        reporter.incrCounter(Counters.GUNZIP_FAILED, 1);
        contentBytes = null;
        contentLen = 0;
        contentOut = null;
      }
      // LOG.info("AFTER GUNZIP");
    }

    if (contentBytes != null) {
      // ok compute an md5 hash
      MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

      metadata.addProperty("md5", md5Hash.toString());
      crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));

      // get normalized mime type
      if (MimeTypeFilter.isTextType(normalizedMimeType)) {
        // ok time to decode the data into ucs2 ...
        Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
            .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);

        // ok write out decode metadata
        metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
        crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
        metadata.addProperty("charset_detector", decodeResult.e0.e0);
        crawlMeta.setCharsetDetector(decodeResult.e0.e0);

        // get the content
        String textContent = decodeResult.e1;

        // compute simhash
        long simhash = SimHash.computeOptimizedSimHashForString(textContent);
        metadata.addProperty("text_simhash", simhash);
        crawlMeta.setTextSimHash(simhash);

        // figure out simplified mime type ...
        MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
            .checkMimeTypeDisposition(normalizedMimeType);

        boolean parseComplete = false;

        Pair<JsonObject, String> tupleOut = null;

        // write it out
        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
          // ok parse as html
          tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
              new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

          if (tupleOut == null) {
            reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
            LOG.error("Unable to Parse as HTML:" + finalURL.toString());
            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
          } else {
            reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
            metadata.addProperty("parsed_as", "html");
            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
            parseComplete = true;
          }
        }

        if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
            || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {

          // ok try parse this document as a feed ...
          tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
              crawlMeta.getFeedContent(),
              ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

          if (tupleOut == null) {
            if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
              reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
              //TODO:HACK
              //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
              mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
            }
          } else {
            reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
            metadata.addProperty("parsed_as", "feed");
            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
            parseComplete = true;
          }
        }

        if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
          reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
          mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
        }

        if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
          // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
          // TODO: FIX THIS BUT PUNT FOR NOW :-(
          //tupleOut = new Pair<JsonObject,String>(null,textContent);
        }

        if (tupleOut != null) {
          if (tupleOut.e0 != null) {
            metadata.add("content", tupleOut.e0);
          }
          textOut = tupleOut.e1;
        }
      }
    }
  }
  return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
      new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}
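In this example the digest is computed over a sub-range of the (possibly gunzipped) content buffer; the hex form (toString()) goes into the JSON metadata under the "md5" key, while the raw bytes from getDigest() are wrapped in a FlexBuffer on the CrawlMetadata record. A distilled sketch of that pattern using only the Hadoop API; the class and method names here are illustrative, not part of the original source:

import org.apache.hadoop.io.MD5Hash;

class Md5RangeSketch {
  // hash a sub-range of a buffer and keep both the hex and the raw form
  static void hashRange(byte[] contentBytes, int contentLen) {
    MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);
    String hexForMetadata = md5Hash.toString(); // stored under the "md5" key above
    byte[] rawForRecord = md5Hash.getDigest();  // 16 raw bytes, copied into a FlexBuffer above
    System.out.println(hexForMetadata + " (" + rawForRecord.length + " bytes)");
  }
}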
From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License:Open Source License
@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
    throws IOException {

  if (url.getLength() == 0) {
    LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
    return;
  }

  try {
    // allocate parse output
    ParseOutput parseOutput = new ParseOutput();
    // json object out ...
    JsonObject jsonObj = new JsonObject();
    // and create a crawl metadata
    CrawlMetadata metadata = parseOutput.getCrawlMetadata();
    // and content (if available) ...
    Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

    URL originalURL = null;

    try {
      originalURL = new URL(url.toString());
    } catch (MalformedURLException e) {
      LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
      reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
      return;
    }

    URL finalURL = originalURL;

    jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
    metadata.setAttemptTime(value.getLastAttemptTime());

    // first step write status
    jsonObj.addProperty("disposition",
        (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
    metadata.setCrawlDisposition(
        (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

    // deal with redirects ...
    if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
      Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
      jsonObj.add("redirect_from", redirect.e1);
      finalURL = redirect.e0;
    }

    if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
      jsonObj.addProperty("failure_reason",
          CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
      metadata.setFailureReason(value.getLastAttemptFailureReason());
      jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
      metadata.setFailureDetail(value.getLastAttemptFailureDetail());
    } else {
      jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
      metadata.setServerIP(value.getServerIP());
      jsonObj.addProperty("http_result", value.getResultCode());
      metadata.setHttpResult(value.getResultCode());
      jsonObj.add("http_headers",
          httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
      metadata.setHttpHeaders(value.getHeaders());
      jsonObj.addProperty("content_len", value.getContentRaw().getCount());
      metadata.setContentLength(value.getContentRaw().getCount());

      if (value.getResultCode() >= 200 && value.getResultCode() <= 299
          && value.getContentRaw().getCount() > 0) {
        contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
      }
    }

    // ok ... write stuff out ...
    reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);

    //////////////////////////////////////////////////////////////
    // echo some stuff to parseOutput ...
    parseOutput.setMetadata(jsonObj.toString());

    JsonElement mimeType = jsonObj.get("mime_type");
    if (mimeType != null) {
      parseOutput.setNormalizedMimeType(mimeType.getAsString());
    }

    JsonElement md5 = jsonObj.get("md5");
    if (md5 != null) {
      MD5Hash hash = new MD5Hash(md5.getAsString());
      byte[] bytes = hash.getDigest();
      parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
    }

    JsonElement simHash = jsonObj.get("text_simhash");
    if (simHash != null) {
      parseOutput.setSimHash(simHash.getAsLong());
    }

    parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
    parseOutput.setFetchTime(value.getLastAttemptTime());

    ////////////////////////////////////////////////////////////

    if (contentOut != null) {
      if (contentOut.e0 != null) {
        parseOutput.setTextContent(contentOut.e0);
        reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
      }
      if (contentOut.e1 != null) {
        // directly set the text bytes ...
        parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
        // mark it dirty !!!
        parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
        // if content available ...
        if (contentOut.e1.e1 != null) {
          parseOutput.setRawContent(contentOut.e1.e1);
        }
        reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
      }
    }

    //buildCompactMetadata(parseOutput,jsonObj,urlMap);

    output.collect(new Text(finalURL.toString()), parseOutput);
  } catch (IOException e) {
    LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
    reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
    //TODO:HACK
    //throw e;
  } catch (Exception e) {
    LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
    reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
    //TODO: HACK
    //throw new IOException(e);
  }
}
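Here getDigest() is used on the consuming side: the hex string recorded earlier under the "md5" key is turned back into an MD5Hash via the String constructor, and the raw digest bytes are copied into the ParseOutput record. A minimal round-trip sketch using only the Hadoop API; the class name and sample input are illustrative:

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.MD5Hash;

class Md5RoundTripSketch {
  public static void main(String[] args) {
    // forward direction: content bytes -> MD5Hash -> hex string (as in populateContentMetadata)
    MD5Hash original = MD5Hash.digest("example content".getBytes(StandardCharsets.UTF_8));
    String hex = original.toString();

    // reverse direction: hex string -> MD5Hash -> raw digest bytes (as in map() above)
    MD5Hash restored = new MD5Hash(hex);
    byte[] bytes = restored.getDigest();

    System.out.println(restored.equals(original)); // true: the digest bytes round-trip
    System.out.println(bytes.length);              // 16
  }
}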