Example usage for org.apache.hadoop.io MD5Hash getDigest

Introduction

On this page you can find example usage of the org.apache.hadoop.io.MD5Hash getDigest() method.

Prototype

public byte[] getDigest() 

Document

Returns the digest bytes.
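
Before the full examples below, here is a minimal, self-contained sketch of how getDigest() is typically called, using only the Hadoop MD5Hash class; the input string is an illustrative value and does not come from the examples on this page.

import org.apache.hadoop.io.MD5Hash;

import java.nio.charset.StandardCharsets;

public class MD5HashGetDigestExample {
    public static void main(String[] args) {
        // Compute an MD5 hash over some raw content bytes.
        byte[] content = "hello world".getBytes(StandardCharsets.UTF_8);
        MD5Hash hash = MD5Hash.digest(content, 0, content.length);

        // getDigest() returns the 16 raw digest bytes.
        byte[] digest = hash.getDigest();
        System.out.println("digest length = " + digest.length); // prints 16
        System.out.println("hex form      = " + hash);          // lowercase hex string
    }
}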

Usage

From source file: org.commoncrawl.mapred.ec2.parser.ParserMapper.java

License: Open Source License

private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
        Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

    FlexBuffer contentOut = null;
    String textOut = null;

    NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());

    CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

    // extract information from http headers ... 
    HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);
    // get the mime type ... 
    String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
            ? urlMetadata.getContentType()
            : "text/html";

    metadata.addProperty("mime_type", normalizedMimeType);
    crawlMeta.setMimeType(normalizedMimeType);

    // get download size ... 
    int downloadSize = value.getContentRaw().getCount();

    // set original content len ... 
    metadata.addProperty("download_size", downloadSize);
    crawlMeta.setDownloadSize(downloadSize);

    // set truncation flag 
    if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
        metadata.addProperty("download_truncated", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
    }

    if (downloadSize > 0) {
        // get content type, charset and encoding 
        String encoding = finalHeaders.findValue("Content-Encoding");
        boolean isGZIP = false;
        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
            isGZIP = true;
        }

        byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
        int contentLen = value.getContentRaw().getCount();

        // assume we are going to output original data ... 
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);

        if (isGZIP) {
            metadata.addProperty("content_is_gzip", isGZIP);
            crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

            UnzipResult unzipResult = null;
            try {
                // LOG.info("BEFORE GUNZIP");
                unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
                        CrawlEnvironment.GUNZIP_SIZE_LIMIT);
            } catch (Exception e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }

            if (unzipResult != null && unzipResult.data != null) {

                if (unzipResult.wasTruncated) {
                    LOG.warn("Truncated Document During GZIP:" + finalURL);
                    reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
                }

                contentBytes = unzipResult.data.get();
                contentLen = unzipResult.data.getCount();

                metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
                crawlMeta.setGunzipSize(unzipResult.data.getCount());

                // update content out ... 
                contentOut = new FlexBuffer(contentBytes, 0, contentLen);
            } else {

                metadata.addProperty("gunzip_failed", true);
                crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);

                reporter.incrCounter(Counters.GUNZIP_FAILED, 1);

                contentBytes = null;
                contentLen = 0;

                contentOut = null;
            }
            // LOG.info("AFTER GUNZIP");
        }

        if (contentBytes != null) {

            // ok compute an md5 hash 
            MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

            metadata.addProperty("md5", md5Hash.toString());
            crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));
            // get normalized mime type 
            if (MimeTypeFilter.isTextType(normalizedMimeType)) {
                // ok time to decode the data into ucs2 ... 
                Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
                        .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);
                // ok write out decode metadata 
                metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
                crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
                metadata.addProperty("charset_detector", decodeResult.e0.e0);
                crawlMeta.setCharsetDetector(decodeResult.e0.e0);
                // get the content 
                String textContent = decodeResult.e1;
                // compute simhash 
                long simhash = SimHash.computeOptimizedSimHashForString(textContent);
                metadata.addProperty("text_simhash", simhash);
                crawlMeta.setTextSimHash(simhash);

                // figure out simplified mime type ... 
                MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
                        .checkMimeTypeDisposition(normalizedMimeType);

                boolean parseComplete = false;

                Pair<JsonObject, String> tupleOut = null;

                // write it out 
                if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
                    // ok parse as html 
                    tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
                            new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

                    if (tupleOut == null) {
                        reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
                        LOG.error("Unable to Parse as HTML:" + finalURL.toString());
                        mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                    } else {
                        reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
                        metadata.addProperty("parsed_as", "html");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
                        || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {

                    // ok try parse this document as a feed ...
                    tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
                            crawlMeta.getFeedContent(),
                            ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

                    if (tupleOut == null) {
                        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
                            reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
                            //TODO:HACK 
                            //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
                            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                        }
                    } else {
                        reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
                        metadata.addProperty("parsed_as", "feed");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
                    reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
                    mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                }
                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
                    // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
                    // TODO: FIX THIS BUT PUNT FOR NOW :-(
                    //tupleOut = new Pair<JsonObject,String>(null,textContent);
                }

                if (tupleOut != null) {
                    if (tupleOut.e0 != null) {
                        metadata.add("content", tupleOut.e0);
                    }
                    textOut = tupleOut.e1;
                }
            }
        }
    }
    return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
            new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}
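
In populateContentMetadata above, the raw bytes from getDigest() are wrapped in a FlexBuffer and stored on the CrawlMetadata record, while toString() supplies the hex form for the JSON metadata. Below is a small, self-contained sketch of that hash-then-wrap step, with java.nio.ByteBuffer standing in for the commoncrawl FlexBuffer type, and getDigest() called once rather than twice as in the line above.

import java.nio.ByteBuffer;

import org.apache.hadoop.io.MD5Hash;

public class DigestToBufferExample {
    public static void main(String[] args) {
        byte[] contentBytes = "page content".getBytes();
        int contentLen = contentBytes.length;

        // Hash a slice of the content buffer, as populateContentMetadata does.
        MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

        // Hex string for JSON metadata, raw digest bytes for the binary record.
        String hexForMetadata = md5Hash.toString();
        byte[] digest = md5Hash.getDigest();

        // ByteBuffer stands in here for the commoncrawl FlexBuffer(byte[], int, int).
        ByteBuffer wrapped = ByteBuffer.wrap(digest, 0, digest.length);
        System.out.println(hexForMetadata + " -> " + wrapped.remaining() + " digest bytes");
    }
}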

From source file: org.commoncrawl.mapred.ec2.parser.ParserMapper.java

License: Open Source License

@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
        throws IOException {

    if (url.getLength() == 0) {
        LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
        return;
    }

    try {
        // allocate parse output 
        ParseOutput parseOutput = new ParseOutput();
        // json object out ... 
        JsonObject jsonObj = new JsonObject();
        // and create a crawl metadata 
        CrawlMetadata metadata = parseOutput.getCrawlMetadata();

        // and content (if available) ... 
        Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

        URL originalURL = null;

        try {
            originalURL = new URL(url.toString());
        } catch (MalformedURLException e) {
            LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
            return;
        }

        URL finalURL = originalURL;

        jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
        metadata.setAttemptTime(value.getLastAttemptTime());

        // first step write status 
        jsonObj.addProperty("disposition",
                (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
        metadata.setCrawlDisposition(
                (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

        // deal with redirects ... 
        if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
            jsonObj.add("redirect_from", redirect.e1);
            finalURL = redirect.e0;
        }

        if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
            jsonObj.addProperty("failure_reason",
                    CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
            metadata.setFailureReason(value.getLastAttemptFailureReason());
            jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
            metadata.setFailureDetail(value.getLastAttemptFailureDetail());
        } else {
            jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
            metadata.setServerIP(value.getServerIP());
            jsonObj.addProperty("http_result", value.getResultCode());
            metadata.setHttpResult(value.getResultCode());
            jsonObj.add("http_headers",
                    httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
            metadata.setHttpHeaders(value.getHeaders());
            jsonObj.addProperty("content_len", value.getContentRaw().getCount());
            metadata.setContentLength(value.getContentRaw().getCount());
            if (value.getResultCode() >= 200 && value.getResultCode() <= 299
                    && value.getContentRaw().getCount() > 0) {
                contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
            }
        }

        // ok ... write stuff out ...
        reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);
        //////////////////////////////////////////////////////////////
        // echo some stuff to parseOutput ... 
        parseOutput.setMetadata(jsonObj.toString());
        JsonElement mimeType = jsonObj.get("mime_type");
        if (mimeType != null) {
            parseOutput.setNormalizedMimeType(mimeType.getAsString());
        }
        JsonElement md5 = jsonObj.get("md5");
        if (md5 != null) {
            MD5Hash hash = new MD5Hash(md5.getAsString());
            byte[] bytes = hash.getDigest();
            parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
        }
        JsonElement simHash = jsonObj.get("text_simhash");
        if (simHash != null) {
            parseOutput.setSimHash(simHash.getAsLong());
        }
        parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
        parseOutput.setFetchTime(value.getLastAttemptTime());
        ////////////////////////////////////////////////////////////

        if (contentOut != null) {
            if (contentOut.e0 != null) {
                parseOutput.setTextContent(contentOut.e0);
                reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
            }
            if (contentOut.e1 != null) {

                // directly set the text bytes ... 
                parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
                // mark it dirty !!!
                parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
                // if content available ... 
                if (contentOut.e1.e1 != null) {
                    parseOutput.setRawContent(contentOut.e1.e1);
                }
                reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
            }
        }

        //buildCompactMetadata(parseOutput,jsonObj,urlMap);

        output.collect(new Text(finalURL.toString()), parseOutput);
    } catch (IOException e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
        //TODO:HACK
        //throw e;
    } catch (Exception e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
        //TODO: HACK 
        //throw new IOException(e);
    }
}
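
The map() method above uses getDigest() in the other direction: it reads the "md5" property back out of the JSON metadata as a hex string, reconstructs an MD5Hash from it, and copies the raw bytes into the ParseOutput. Here is a minimal sketch of that hex round trip, using only the Hadoop MD5Hash class; the input string is an illustrative value.

import org.apache.hadoop.io.MD5Hash;

public class MD5HexRoundTripExample {
    public static void main(String[] args) {
        // Forward direction: content -> MD5Hash -> hex string (what gets stored as "md5").
        String hex = MD5Hash.digest("example content").toString();

        // Reverse direction: hex string -> MD5Hash -> raw digest bytes,
        // mirroring new MD5Hash(md5.getAsString()).getDigest() in map() above.
        MD5Hash parsed = new MD5Hash(hex);
        byte[] bytes = parsed.getDigest();

        System.out.println(hex + " -> " + bytes.length + " bytes"); // 16 bytes
    }
}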