Example usage for org.apache.hadoop.io MD5Hash digest

Introduction

This page collects example usages of the org.apache.hadoop.io.MD5Hash digest method, drawn from open-source projects.

Prototype

The examples on this page use the following overloads:

public static MD5Hash digest(byte[] data)
public static MD5Hash digest(byte[] data, int start, int len)
public static MD5Hash digest(String string)
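
As a quick orientation before the full examples, here is a minimal, self-contained sketch of how these overloads are typically called. It only uses calls that appear in the examples below; the class and variable names are illustrative.

import org.apache.hadoop.io.MD5Hash;

public class MD5HashDigestSketch {
    public static void main(String[] args) {
        byte[] data = "hello world".getBytes();

        // Digest a whole byte array.
        MD5Hash fromBytes = MD5Hash.digest(data);

        // Digest a sub-range of a byte array.
        MD5Hash fromRange = MD5Hash.digest(data, 0, data.length);

        // Digest a String directly (as the DumpFileUtil example below does).
        MD5Hash fromString = MD5Hash.digest("hello world");

        // getDigest() exposes the raw 16-byte MD5 value;
        // toString() renders the digest as a hex string.
        byte[] raw = fromBytes.getDigest();
        System.out.println(raw.length);           // 16
        System.out.println(fromBytes.toString()); // hex form of the digest
        System.out.println(fromRange.toString());
        System.out.println(fromString.toString());
    }
}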

Usage

From source file:org.apache.nutch.util.DumpFileUtil.java

License:Apache License

public static String getUrlMD5(String url) {
    byte[] digest = MD5Hash.digest(url).getDigest();

    StringBuffer sb = new StringBuffer();
    for (byte b : digest) {
        sb.append(String.format("%02x", b & 0xff));
    }

    return sb.toString();
}
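
Note that the manual "%02x" loop above builds the lower-case hex string by hand. The same result can likely be obtained more directly, since MD5Hash.toString() renders the digest as a hex string (the ARC-file examples further down rely on exactly that). A hypothetical shortened version:

import org.apache.hadoop.io.MD5Hash;

public class UrlMD5Sketch {
    // Hypothetical equivalent of getUrlMD5 above, assuming MD5Hash.toString()
    // produces the same lower-case hex rendering as the manual "%02x" loop.
    public static String getUrlMD5(String url) {
        return MD5Hash.digest(url).toString();
    }

    public static void main(String[] args) {
        System.out.println(getUrlMD5("https://example.org/"));
    }
}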

From source file:org.apache.nutchbase.crawl.MD5SignatureHbase.java

License:Apache License

@Override
public byte[] calculate(ImmutableRowPart row, ParseHbase parse) {
    byte[] data = row.getContent();
    return MD5Hash.digest(data).getDigest();
}

From source file:org.apache.nutchbase.crawl.TextProfileSignatureHbase.java

License:Apache License

public byte[] calculate(ImmutableRowPart row, ParseHbase parse) {
    int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
    float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
    HashMap<String, Token> tokens = new HashMap<String, Token>();
    String text = null;
    if (parse != null)
        text = parse.getText();
    if (text == null || text.length() == 0)
        return fallback.calculate(row, parse);
    StringBuffer curToken = new StringBuffer();
    int maxFreq = 0;
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (Character.isLetterOrDigit(c)) {
            curToken.append(Character.toLowerCase(c));
        } else {
            if (curToken.length() > 0) {
                if (curToken.length() > MIN_TOKEN_LEN) {
                    // add it
                    String s = curToken.toString();
                    Token tok = tokens.get(s);
                    if (tok == null) {
                        tok = new Token(0, s);
                        tokens.put(s, tok);
                    }
                    tok.cnt++;
                    if (tok.cnt > maxFreq)
                        maxFreq = tok.cnt;
                }
                curToken.setLength(0);
            }
        }
    }
    // check the last token
    if (curToken.length() > MIN_TOKEN_LEN) {
        // add it
        String s = curToken.toString();
        Token tok = tokens.get(s);
        if (tok == null) {
            tok = new Token(0, s);
            tokens.put(s, tok);
        }
        tok.cnt++;
        if (tok.cnt > maxFreq)
            maxFreq = tok.cnt;
    }
    Iterator<Token> it = tokens.values().iterator();
    ArrayList<Token> profile = new ArrayList<Token>();
    // calculate the QUANT value
    int QUANT = Math.round(maxFreq * QUANT_RATE);
    if (QUANT < 2) {
        if (maxFreq > 1)
            QUANT = 2;
        else
            QUANT = 1;
    }
    while (it.hasNext()) {
        Token t = it.next();
        // round down to the nearest QUANT
        t.cnt = (t.cnt / QUANT) * QUANT;
        // discard the frequencies below the QUANT
        if (t.cnt < QUANT) {
            continue;
        }
        profile.add(t);
    }
    Collections.sort(profile, new TokenComparator());
    StringBuffer newText = new StringBuffer();
    it = profile.iterator();
    while (it.hasNext()) {
        Token t = it.next();
        if (newText.length() > 0)
            newText.append("\n");
        newText.append(t.toString());
    }
    return MD5Hash.digest(newText.toString()).getDigest();
}

From source file:org.archive.access.nutch.jobs.ImportArcs.java

License:LGPL

public void map(final WritableComparable key, final Writable value, final OutputCollector output,
        final Reporter r) throws IOException {
    // Assumption is that this map is being run by ARCMapRunner.
    // Otherwise, the below casts fail.
    String url = key.toString();

    ARCRecord rec = (ARCRecord) ((ObjectWritable) value).get();
    ARCReporter reporter = (ARCReporter) r;

    // It's null the first time map is called on an ARC.
    checkArcName(rec);
    if (!isIndex(rec)) {
        return;
    }
    checkCollectionName();

    final ARCRecordMetaData arcData = rec.getMetaData();
    String oldUrl = url;

    try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
        url = filters.filter(url); // filter the url
    } catch (Exception e) {
        LOG.warn("Skipping record. Didn't pass normalization/filter " + oldUrl + ": " + e.toString());

        return;
    }

    final long b = arcData.getContentBegin();
    final long l = arcData.getLength();
    final long recordLength = (l > b) ? (l - b) : l;

    // Look at the ARCRecord metadata line mimetype. It can be empty. If so,
    // we get two more chances at figuring it out: by looking at the HTTP
    // headers or at the first couple of bytes of the file. See below.
    String mimetype = getMimetype(arcData.getMimetype(), this.mimeTypes, url);

    if (skip(mimetype)) {
        return;
    }

    // Copy http headers to nutch metadata.
    final Metadata metaData = new Metadata();
    final Header[] headers = rec.getHttpHeaders();
    for (int j = 0; j < headers.length; j++) {
        final Header header = headers[j];

        if (mimetype == null) {
            // Special handling. If mimetype is still null, try getting it
            // from the http header. I've seen arc record lines with empty
            // content-type and a MIME unparseable file ending; i.e. .MID.
            if ((header.getName() != null)
                    && header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY)) {
                mimetype = getMimetype(header.getValue(), null, null);

                if (skip(mimetype)) {
                    return;
                }
            }
        }

        metaData.set(header.getName(), header.getValue());
    }

    // This call to reporter setStatus pings the tasktracker telling it our
    // status and telling the task tracker we're still alive (so it doesn't
    // time us out).
    final String noSpacesMimetype = TextUtils.replaceAll(ImportArcs.WHITESPACE,
            ((mimetype == null || mimetype.length() <= 0) ? "TODO" : mimetype), "-");
    final String recordLengthAsStr = Long.toString(recordLength);

    reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));

    // This is a nutch 'more' field.
    metaData.set("contentLength", recordLengthAsStr);

    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // TODO: Skip if unindexable type.
    int total = 0;

    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);

    if (mimetype == null) {
        MimeType mt = this.mimeTypes.getMimeType(this.buffer);

        if (mt == null || mt.getName() == null) {
            LOG.warn("Failed to get mimetype for: " + url);

            return;
        }

        mimetype = mt.getName();
    }

    metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);

    // How much do we read total? If pdf, we will read more. If equal to -1,
    // read all.
    int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype)) ? this.pdfContentLimit : this.contentLimit;

    // Reset our contentBuffer so can reuse.  Over the life of an ARC
    // processing will grow to maximum record size.
    this.contentBuffer.reset();

    while ((len != -1) && ((readLimit == -1) || (total < readLimit))) {
        total += len;
        this.contentBuffer.write(this.buffer, 0, len);
        len = rec.read(this.buffer, 0, this.buffer.length);
        reporter.setStatusIfElapse("reading " + url);
    }

    // Close the Record.  We're done with it.  Side-effect is calculation
    // of digest -- if we're digesting.
    rec.close();
    reporter.setStatusIfElapse("closed " + url);

    final byte[] contentBytes = this.contentBuffer.toByteArray();
    final CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);

    // Calculate digest or use precalculated sha1.
    String digest = (this.sha1) ? rec.getDigestStr() : MD5Hash.digest(contentBytes).toString();
    metaData.set(Nutch.SIGNATURE_KEY, digest);

    // Set digest back into the arcData so available later when we write
    // CDX line.
    arcData.setDigest(digest);

    metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);

    // Score at this stage is 1.0f.
    metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

    final long startTime = System.currentTimeMillis();
    final Content content = new Content(url, url, contentBytes, mimetype, metaData, getConf());
    datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));

    MapWritable mw = datum.getMetaData();

    if (mw == null) {
        mw = new MapWritable();
    }

    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
        mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY),
                new Text(SqlSearcher.getCollectionNameWithTimestamp(collectionName, arcData.getDate())));
    } else {
        mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
    }
    mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
    mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY), new Text(Long.toString(arcData.getOffset())));
    datum.setMetaData(mw);

    TimeoutParsingThread tout = threadPool.getThread(Thread.currentThread().getId(), timeoutIndexingDocument);
    tout.setUrl(url);
    tout.setContent(content);
    tout.setParseUtil(parseUtil);
    tout.wakeupAndWait();

    ParseStatus parseStatus = tout.getParseStatus();
    Parse parse = tout.getParse();
    reporter.setStatusIfElapse("parsed " + url);

    if (!parseStatus.isSuccess()) {
        final String status = formatToOneLine(parseStatus.toString());
        LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
        parse = null;
    } else {
        // Was it a slow parse?
        final double kbPerSecond = getParseRate(startTime, (contentBytes != null) ? contentBytes.length : 0);

        if (LOG.isDebugEnabled()) {
            LOG.debug(getParseRateLogMessage(url, noSpacesMimetype, kbPerSecond));
        } else if (kbPerSecond < this.parseThreshold) {
            LOG.warn(getParseRateLogMessage(url, noSpacesMimetype, kbPerSecond));
        }
    }

    Writable v = new FetcherOutput(datum, null, parse != null ? new ParseImpl(parse) : null);
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
        LOG.info("multiple: "
                + SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate()) + " "
                + url);
        output.collect(Nutchwax.generateWaxKey(url,
                SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate())), v);
    } else {
        output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v);
    }
}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
        throws IOException {
    List<TestRecord> recordSet = ArcFileReaderTests
            .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
    Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
    FSDataOutputStream os = fs.create(filePath);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
        long streamPos = os.getPos();

        long testAttemptTime = System.currentTimeMillis();
        NIOHttpHeaders testHeaders = new NIOHttpHeaders();
        testHeaders.add("test", "test-value");

        for (TestRecord record : recordSet) {
            long preWritePos = os.getPos();
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
            long postWritePos = os.getPos();
            record.streamPos = (int) preWritePos;
            record.rawSize = (int) (postWritePos - preWritePos);
        }
        os.flush();
    } finally {
        os.close();
    }
    return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    JobConf conf = new JobConf();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(conf, split);

    int index = 0;

    // iterate and validate stuff ... 
    Text key = reader.createKey();
    BytesWritable value = reader.createValue();

    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would break our test case, which deliberately uses invalid
        // characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
        // this terminator. We search for that specific byte pattern to locate the start of the content, then
        // compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
        throws IOException {
    List<TestRecord> recordSet = ArcFileReaderTests
            .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
    Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
    FSDataOutputStream os = fs.create(filePath);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        NIOHttpHeaders testHeaders = new NIOHttpHeaders();
        testHeaders.add("test", "test-value");

        for (TestRecord record : recordSet) {
            long preWritePos = os.getPos();
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
            long postWritePos = os.getPos();
            record.streamPos = (int) preWritePos;
            record.rawSize = (int) (postWritePos - preWritePos);
        }
        os.flush();
    } finally {
        os.close();
    }
    return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java

License:Apache License

@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

    int index = 0;

    // iterate and validate stuff ... 
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would break our test case, which deliberately uses invalid
        // characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
        // this terminator. We search for that specific byte pattern to locate the start of the content, then
        // compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}

From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java

License:Open Source License

private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
        Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

    FlexBuffer contentOut = null;
    String textOut = null;

    NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());

    CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

    // extract information from http headers ... 
    HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);
    // get the mime type ... 
    String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
            ? urlMetadata.getContentType()
            : "text/html";

    metadata.addProperty("mime_type", normalizedMimeType);
    crawlMeta.setMimeType(normalizedMimeType);

    // get download size ... 
    int downloadSize = value.getContentRaw().getCount();

    // set original content len ... 
    metadata.addProperty("download_size", downloadSize);
    crawlMeta.setDownloadSize(downloadSize);

    // set truncation flag 
    if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
        metadata.addProperty("download_truncated", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
    }

    if (downloadSize > 0) {
        // get content type, charset and encoding 
        String encoding = finalHeaders.findValue("Content-Encoding");
        boolean isGZIP = false;
        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
            isGZIP = true;
        }

        byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
        int contentLen = value.getContentRaw().getCount();

        // assume we are going to output original data ... 
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);

        if (isGZIP) {
            metadata.addProperty("content_is_gzip", isGZIP);
            crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

            UnzipResult unzipResult = null;
            try {
                // LOG.info("BEFORE GUNZIP");
                unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
                        CrawlEnvironment.GUNZIP_SIZE_LIMIT);
            } catch (Exception e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }

            if (unzipResult != null && unzipResult.data != null) {

                if (unzipResult.wasTruncated) {
                    LOG.warn("Truncated Document During GZIP:" + finalURL);
                    reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
                }

                contentBytes = unzipResult.data.get();
                contentLen = unzipResult.data.getCount();

                metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
                crawlMeta.setGunzipSize(unzipResult.data.getCount());

                // update content out ... 
                contentOut = new FlexBuffer(contentBytes, 0, contentLen);
            } else {

                metadata.addProperty("gunzip_failed", true);
                crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);

                reporter.incrCounter(Counters.GUNZIP_FAILED, 1);

                contentBytes = null;
                contentLen = 0;

                contentOut = null;
            }
            // LOG.info("AFTER GUNZIP");
        }

        if (contentBytes != null) {

            // ok compute an md5 hash 
            MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

            metadata.addProperty("md5", md5Hash.toString());
            crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));
            // get normalized mime type 
            if (MimeTypeFilter.isTextType(normalizedMimeType)) {
                // ok time to decode the data into ucs2 ... 
                Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
                        .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);
                // ok write out decode metadata 
                metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
                crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
                metadata.addProperty("charset_detector", decodeResult.e0.e0);
                crawlMeta.setCharsetDetector(decodeResult.e0.e0);
                // get the content 
                String textContent = decodeResult.e1;
                // compute simhash 
                long simhash = SimHash.computeOptimizedSimHashForString(textContent);
                metadata.addProperty("text_simhash", simhash);
                crawlMeta.setTextSimHash(simhash);

                // figure out simplified mime type ... 
                MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
                        .checkMimeTypeDisposition(normalizedMimeType);

                boolean parseComplete = false;

                Pair<JsonObject, String> tupleOut = null;

                // write it out 
                if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
                    // ok parse as html 
                    tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
                            new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

                    if (tupleOut == null) {
                        reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
                        LOG.error("Unable to Parse as HTML:" + finalURL.toString());
                        mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                    } else {
                        reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
                        metadata.addProperty("parsed_as", "html");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
                        || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {

                    // ok try parse this document as a feed ...
                    tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
                            crawlMeta.getFeedContent(),
                            ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

                    if (tupleOut == null) {
                        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
                            reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
                            //TODO:HACK 
                            //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
                            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                        }
                    } else {
                        reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
                        metadata.addProperty("parsed_as", "feed");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
                    reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
                    mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                }
                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
                    // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
                    // TODO: FIX THIS BUT PUNT FOR NOW :-(
                    //tupleOut = new Pair<JsonObject,String>(null,textContent);
                }

                if (tupleOut != null) {
                    if (tupleOut.e0 != null) {
                        metadata.add("content", tupleOut.e0);
                    }
                    textOut = tupleOut.e1;
                }
            }
        }
    }
    return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
            new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java

License:Open Source License

@Test
public void validateLinkKey() throws Exception {
    // allocate scan arrays 
    FlexBuffer[] scanArray = allocateScanArray();

    URLFPV2 fp = URLUtils.getURLFPV2FromURL("http://www.google.com/");
    if (fp != null) {
        TextBytes key = generateLinkKey(fp, CrawlDBKey.Type.KEY_TYPE_HTML_LINK, "FOOBAR");
        // get it the hard way
        scanForComponents(key, ':', scanArray);

        System.out.println("Key is:" + key.toString());
        System.out.println("Check Root Domain Key");
        Assert.assertTrue(fp.getRootDomainHash() == getLongComponentFromKey(key,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        Assert.assertTrue(fp.getRootDomainHash() == getLongComponentFromComponentArray(scanArray,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
        System.out.println("Check Domain Key");
        Assert.assertTrue(fp.getDomainHash() == getLongComponentFromKey(key,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        Assert.assertTrue(fp.getDomainHash() == getLongComponentFromComponentArray(scanArray,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
        System.out.println("Check URL Hash Key");
        Assert.assertTrue(
                fp.getUrlHash() == getLongComponentFromKey(key, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
        Assert.assertTrue(fp.getUrlHash() == getLongComponentFromComponentArray(scanArray,
                CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
        System.out.println("Check Type");
        Assert.assertTrue(CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() == getLongComponentFromKey(key,
                CrawlDBKey.ComponentId.TYPE_COMPONENT_ID));
        Assert.assertTrue(CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() == getLongComponentFromComponentArray(
                scanArray, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID));
        System.out.println("Check ExtraData");
        Assert.assertTrue(new FlexBuffer("FOOBAR".getBytes()).compareTo(
                getByteArrayComponentFromKey(key, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID)) == 0);
        Assert.assertTrue(
                new FlexBuffer("FOOBAR".getBytes()).compareTo(getByteArrayFromComponentArray(scanArray,
                        CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID)) == 0);

        TextBytes statusKey1 = generateCrawlStatusKey(new Text("http://www.google.com/"), 12345L);
        TextBytes statusKey2 = generateCrawlStatusKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                12345L);
        TextBytes statusKey3 = generateCrawlStatusKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                12346L);
        TextBytes linkKey1 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                CrawlDBKey.Type.KEY_TYPE_HTML_LINK, MD5Hash.digest("123").toString());
        TextBytes linkKey2 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                CrawlDBKey.Type.KEY_TYPE_HTML_LINK, MD5Hash.digest("1234").toString());
        URLFPV2 fpLink3 = URLUtils.getURLFPV2FromURL("http://www.google.com/");
        fpLink3.setUrlHash(fpLink3.getUrlHash() + 1);
        TextBytes linkKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_HTML_LINK, "12345");
        TextBytes linkKey4 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                CrawlDBKey.Type.KEY_TYPE_ATOM_LINK, "1234");
        TextBytes linkKey5 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_ATOM_LINK, "12345");

        TextBytes mergeKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, "12345");
        TextBytes rootDomainKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD,
                "12345");
        TextBytes subDomainKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD,
                "12345");

        LinkKeyComparator comparator = new LinkKeyComparator();
        CrawlDBKeyGroupByURLComparator gcomparator = new CrawlDBKeyGroupByURLComparator();

        System.out.println("Comparing Similar status Keys");
        compareKeys(comparator, statusKey1, statusKey2, 0);
        compareKeys(comparator, statusKey2, statusKey1, 0);
        System.out.println("Comparing Similar status Keys w/Grouping C");
        compareKeys(gcomparator, statusKey1, statusKey2, 0);
        compareKeys(gcomparator, statusKey2, statusKey1, 0);
        System.out.println("Comparing Similar status Keys with different timestamps");
        compareKeys(comparator, statusKey2, statusKey3, -1);
        compareKeys(comparator, statusKey3, statusKey2, 1);
        System.out.println("Comparing Similar status Keys with different timestamps w/Grouping C");
        compareKeys(gcomparator, statusKey2, statusKey3, 0);
        compareKeys(gcomparator, statusKey3, statusKey2, 0);
        System.out.println("Comparing Status Key to Link Key");
        compareKeys(comparator, statusKey1, linkKey1, -1);
        compareKeys(comparator, linkKey1, statusKey1, 1);
        System.out.println("Comparing Status Key to Link Key Grouping C");
        compareKeys(gcomparator, statusKey1, linkKey1, 0);
        compareKeys(gcomparator, linkKey1, statusKey1, 0);
        System.out.println("Comparing TWO Link Keys with same hash value");
        compareKeys(comparator, linkKey1, linkKey1, 0);
        compareKeys(comparator, linkKey1, linkKey1, 0);
        System.out.println("Comparing TWO Link Keys with same type but different hash values");
        compareKeys(comparator, linkKey2, linkKey3, -1);
        compareKeys(comparator, linkKey3, linkKey2, 1);
        System.out.println("Comparing TWO Link Keys with same type but different hash values - Grouping  C");
        compareKeys(gcomparator, linkKey2, linkKey3, -1);
        compareKeys(gcomparator, linkKey3, linkKey2, 1);
        System.out.println("Comparing TWO Link Keys with different types but same hash values");
        compareKeys(comparator, linkKey2, linkKey4, -1);
        compareKeys(comparator, linkKey4, linkKey2, 1);
        System.out.println("Comparing TWO Link Keys with different types but same hash values - Grouping C ");
        compareKeys(gcomparator, linkKey2, linkKey4, 0);
        compareKeys(gcomparator, linkKey4, linkKey2, 0);
        System.out.println("Comparing TWO Link Keys with similar types but different hash values");
        compareKeys(comparator, linkKey4, linkKey5, -1);
        compareKeys(comparator, linkKey5, linkKey4, 1);
        System.out.println("Comparing TWO Link Keys with similar types but different hash values - Grouping C");
        compareKeys(gcomparator, linkKey4, linkKey5, -1);
        compareKeys(gcomparator, linkKey5, linkKey4, 1);

        compareKeys(comparator, mergeKey3, linkKey3, -1);
        compareKeys(comparator, rootDomainKey3, mergeKey3, -1);
        compareKeys(comparator, subDomainKey3, mergeKey3, -1);
        compareKeys(comparator, rootDomainKey3, subDomainKey3, -1);
        compareKeys(comparator, subDomainKey3, rootDomainKey3, 1);
        compareKeys(comparator, rootDomainKey3, rootDomainKey3, 0);

        TextBytes mergeKey = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, "12345");
        TextBytes rootDomainKey = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD,
                "12345");
        TextBytes subDomainKey = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD,
                "12345");

        TextBytes linkKeyTest = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                CrawlDBKey.Type.KEY_TYPE_HTML_LINK, "");
        Assert.assertTrue(scanForComponents(linkKeyTest, ':', scanArray) == scanArray.length - 1);
        for (FlexBuffer buffer : scanArray)
            LOG.info("Scan Item:" + buffer.toString());
        TextBytes linkKeyTest2 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
                CrawlDBKey.Type.KEY_TYPE_HTML_LINK,
                MD5Hash.digest("REALLY LONG SOMETHING OR ANOTHER").toString());
        Assert.assertTrue(scanForComponents(linkKeyTest2, ':', scanArray) == scanArray.length);
        for (FlexBuffer buffer : scanArray)
            LOG.info("Scan Item:" + buffer.toString());

    }
}