List of usage examples for org.apache.hadoop.io MD5Hash digest
The examples below are collected from open-source projects and show how the static MD5Hash.digest methods are used to compute MD5 checksums of URLs, text signatures, and raw content bytes.
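For orientation, here is a minimal sketch of the API the examples exercise. It assumes only that org.apache.hadoop.io.MD5Hash from hadoop-common is on the classpath; the class name and input string are illustrative.

import org.apache.hadoop.io.MD5Hash;

public class MD5HashSketch {
  public static void main(String[] args) {
    // digest(String) hashes the bytes of the string and returns an MD5Hash wrapper
    MD5Hash hash = MD5Hash.digest("http://example.com/");

    // toString() renders the 16-byte digest as a 32-character hex string
    System.out.println(hash.toString());

    // getDigest() exposes the raw 16 digest bytes, which several examples below rely on
    byte[] raw = hash.getDigest();
    System.out.println(raw.length); // prints 16
  }
}

MD5Hash.digest is also overloaded for byte arrays, a byte-array range, and an InputStream, which is why the examples pass it strings in some places and raw content buffers in others.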
From source file:org.apache.nutch.util.DumpFileUtil.java
License:Apache License
public static String getUrlMD5(String url) {
  byte[] digest = MD5Hash.digest(url).getDigest();

  StringBuffer sb = new StringBuffer();
  for (byte b : digest) {
    sb.append(String.format("%02x", b & 0xff));
  }

  return sb.toString();
}
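A hypothetical call such as DumpFileUtil.getUrlMD5("http://www.example.com/") therefore returns a 32-character lowercase hex string, since each of the 16 digest bytes is formatted with %02x.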
From source file:org.apache.nutchbase.crawl.MD5SignatureHbase.java
License:Apache License
@Override
public byte[] calculate(ImmutableRowPart row, ParseHbase parse) {
  byte[] data = row.getContent();
  return MD5Hash.digest(data).getDigest();
}
From source file:org.apache.nutchbase.crawl.TextProfileSignatureHbase.java
License:Apache License
public byte[] calculate(ImmutableRowPart row, ParseHbase parse) {
  int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
  float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
  HashMap<String, Token> tokens = new HashMap<String, Token>();
  String text = null;
  if (parse != null)
    text = parse.getText();
  if (text == null || text.length() == 0)
    return fallback.calculate(row, parse);
  StringBuffer curToken = new StringBuffer();
  int maxFreq = 0;
  for (int i = 0; i < text.length(); i++) {
    char c = text.charAt(i);
    if (Character.isLetterOrDigit(c)) {
      curToken.append(Character.toLowerCase(c));
    } else {
      if (curToken.length() > 0) {
        if (curToken.length() > MIN_TOKEN_LEN) {
          // add it
          String s = curToken.toString();
          Token tok = tokens.get(s);
          if (tok == null) {
            tok = new Token(0, s);
            tokens.put(s, tok);
          }
          tok.cnt++;
          if (tok.cnt > maxFreq)
            maxFreq = tok.cnt;
        }
        curToken.setLength(0);
      }
    }
  }
  // check the last token
  if (curToken.length() > MIN_TOKEN_LEN) {
    // add it
    String s = curToken.toString();
    Token tok = tokens.get(s);
    if (tok == null) {
      tok = new Token(0, s);
      tokens.put(s, tok);
    }
    tok.cnt++;
    if (tok.cnt > maxFreq)
      maxFreq = tok.cnt;
  }
  Iterator<Token> it = tokens.values().iterator();
  ArrayList<Token> profile = new ArrayList<Token>();
  // calculate the QUANT value
  int QUANT = Math.round(maxFreq * QUANT_RATE);
  if (QUANT < 2) {
    if (maxFreq > 1)
      QUANT = 2;
    else
      QUANT = 1;
  }
  while (it.hasNext()) {
    Token t = it.next();
    // round down to the nearest QUANT
    t.cnt = (t.cnt / QUANT) * QUANT;
    // discard the frequencies below the QUANT
    if (t.cnt < QUANT) {
      continue;
    }
    profile.add(t);
  }
  Collections.sort(profile, new TokenComparator());
  StringBuffer newText = new StringBuffer();
  it = profile.iterator();
  while (it.hasNext()) {
    Token t = it.next();
    if (newText.length() > 0)
      newText.append("\n");
    newText.append(t.toString());
  }
  return MD5Hash.digest(newText.toString()).getDigest();
}
From source file:org.archive.access.nutch.jobs.ImportArcs.java
License:LGPL
public void map(final WritableComparable key, final Writable value, final OutputCollector output,
    final Reporter r) throws IOException {
  // Assumption is that this map is being run by ARCMapRunner.
  // Otherwise, the below casts fail.
  String url = key.toString();
  ARCRecord rec = (ARCRecord) ((ObjectWritable) value).get();
  ARCReporter reporter = (ARCReporter) r;
  // Its null first time map is called on an ARC.
  checkArcName(rec);
  if (!isIndex(rec)) {
    return;
  }
  checkCollectionName();
  final ARCRecordMetaData arcData = rec.getMetaData();
  String oldUrl = url;
  try {
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    LOG.warn("Skipping record. Didn't pass normalization/filter " + oldUrl + ": " + e.toString());
    return;
  }
  final long b = arcData.getContentBegin();
  final long l = arcData.getLength();
  final long recordLength = (l > b) ? (l - b) : l;
  // Look at ARCRecord meta data line mimetype. It can be empty. If so,
  // two more chances at figuring it either by looking at HTTP headers or
  // by looking at first couple of bytes of the file. See below.
  String mimetype = getMimetype(arcData.getMimetype(), this.mimeTypes, url);
  if (skip(mimetype)) {
    return;
  }
  // Copy http headers to nutch metadata.
  final Metadata metaData = new Metadata();
  final Header[] headers = rec.getHttpHeaders();
  for (int j = 0; j < headers.length; j++) {
    final Header header = headers[j];
    if (mimetype == null) {
      // Special handling. If mimetype is still null, try getting it
      // from the http header. I've seen arc record lines with empty
      // content-type and a MIME unparseable file ending; i.e. .MID.
      if ((header.getName() != null)
          && header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY)) {
        mimetype = getMimetype(header.getValue(), null, null);
        if (skip(mimetype)) {
          return;
        }
      }
    }
    metaData.set(header.getName(), header.getValue());
  }
  // This call to reporter setStatus pings the tasktracker telling it our
  // status and telling the task tracker we're still alive (so it doesn't
  // time us out).
  final String noSpacesMimetype = TextUtils.replaceAll(ImportArcs.WHITESPACE,
      ((mimetype == null || mimetype.length() <= 0) ? "TODO" : mimetype), "-");
  final String recordLengthAsStr = Long.toString(recordLength);
  reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));
  // This is a nutch 'more' field.
  metaData.set("contentLength", recordLengthAsStr);
  rec.skipHttpHeader();
  reporter.setStatusIfElapse("read headers on " + url);
  // TODO: Skip if unindexable type.
  int total = 0;
  // Read in first block. If mimetype still null, look for MAGIC.
  int len = rec.read(this.buffer, 0, this.buffer.length);
  if (mimetype == null) {
    MimeType mt = this.mimeTypes.getMimeType(this.buffer);
    if (mt == null || mt.getName() == null) {
      LOG.warn("Failed to get mimetype for: " + url);
      return;
    }
    mimetype = mt.getName();
  }
  metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);
  // How much do we read total? If pdf, we will read more. If equal to -1,
  // read all.
  int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype)) ? this.pdfContentLimit : this.contentLimit;
  // Reset our contentBuffer so can reuse. Over the life of an ARC
  // processing will grow to maximum record size.
  this.contentBuffer.reset();
  while ((len != -1) && ((readLimit == -1) || (total < readLimit))) {
    total += len;
    this.contentBuffer.write(this.buffer, 0, len);
    len = rec.read(this.buffer, 0, this.buffer.length);
    reporter.setStatusIfElapse("reading " + url);
  }
  // Close the Record. We're done with it. Side-effect is calculation
  // of digest -- if we're digesting.
  rec.close();
  reporter.setStatusIfElapse("closed " + url);
  final byte[] contentBytes = this.contentBuffer.toByteArray();
  final CrawlDatum datum = new CrawlDatum();
  datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
  // Calculate digest or use precalculated sha1.
  String digest = (this.sha1) ? rec.getDigestStr() : MD5Hash.digest(contentBytes).toString();
  metaData.set(Nutch.SIGNATURE_KEY, digest);
  // Set digest back into the arcData so available later when we write
  // CDX line.
  arcData.setDigest(digest);
  metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
  // Score at this stage is 1.0f.
  metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));
  final long startTime = System.currentTimeMillis();
  final Content content = new Content(url, url, contentBytes, mimetype, metaData, getConf());
  datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));
  MapWritable mw = datum.getMetaData();
  if (mw == null) {
    mw = new MapWritable();
  }
  if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
    mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY),
        new Text(SqlSearcher.getCollectionNameWithTimestamp(collectionName, arcData.getDate())));
  } else {
    mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
  }
  mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
  mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY), new Text(Long.toString(arcData.getOffset())));
  datum.setMetaData(mw);
  TimeoutParsingThread tout = threadPool.getThread(Thread.currentThread().getId(), timeoutIndexingDocument);
  tout.setUrl(url);
  tout.setContent(content);
  tout.setParseUtil(parseUtil);
  tout.wakeupAndWait();
  ParseStatus parseStatus = tout.getParseStatus();
  Parse parse = tout.getParse();
  reporter.setStatusIfElapse("parsed " + url);
  if (!parseStatus.isSuccess()) {
    final String status = formatToOneLine(parseStatus.toString());
    LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
    parse = null;
  } else {
    // Was it a slow parse?
    final double kbPerSecond = getParseRate(startTime, (contentBytes != null) ? contentBytes.length : 0);
    if (LOG.isDebugEnabled()) {
      LOG.debug(getParseRateLogMessage(url, noSpacesMimetype, kbPerSecond));
    } else if (kbPerSecond < this.parseThreshold) {
      LOG.warn(getParseRateLogMessage(url, noSpacesMimetype, kbPerSecond));
    }
  }
  Writable v = new FetcherOutput(datum, null, parse != null ? new ParseImpl(parse) : null);
  if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
    LOG.info("multiple: " + SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate())
        + " " + url);
    output.collect(Nutchwax.generateWaxKey(url,
        SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate())), v);
  } else {
    output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v);
  }
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
    throws IOException {
  List<TestRecord> recordSet = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
  Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
  FSDataOutputStream os = fs.create(filePath);
  try {
    // write the ARC File into memory
    ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
    long streamPos = os.getPos();
    long testAttemptTime = System.currentTimeMillis();
    NIOHttpHeaders testHeaders = new NIOHttpHeaders();
    testHeaders.add("test", "test-value");
    for (TestRecord record : recordSet) {
      long preWritePos = os.getPos();
      ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
          testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
      long postWritePos = os.getPos();
      record.streamPos = (int) preWritePos;
      record.rawSize = (int) (postWritePos - preWritePos);
    }
    os.flush();
  } finally {
    os.close();
  }
  return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {
  JobConf conf = new JobConf();
  FileSystem fs = LocalFileSystem.get(conf);
  Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
  List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
  FSDataOutputStream os = fs.create(path);
  try {
    // write the ARC File into memory
    ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
    long testAttemptTime = System.currentTimeMillis();
    for (TestRecord record : records) {
      ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
          new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
    }
    os.flush();
  } finally {
    os.close();
  }
  FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
  ARCFileRecordReader reader = new ARCFileRecordReader();
  reader.initialize(conf, split);
  int index = 0;
  // iterate and validate stuff ...
  Text key = reader.createKey();
  BytesWritable value = reader.createValue();
  while (reader.next(key, value)) {
    TestRecord testRecord = records.get(index++);
    // get test key bytes as utf-8 bytes ...
    byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
    // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code replaces
    // invalid characters with ?, which would cause our test case, which does use invalid characters to
    // form the key, to break)
    Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
        key.getLength()) == 0);
    // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows
    // this terminator; we search for this specific byte pattern to locate the start of content, then compare
    // it against the source ...
    int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
        "\r\n\r\n".getBytes());
    indexofHeaderTerminator += 4;
    Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
        value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
  }
  reader.close();
  Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
  fs.delete(path, false);
}
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
    throws IOException {
  List<TestRecord> recordSet = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
  Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
  FSDataOutputStream os = fs.create(filePath);
  try {
    // write the ARC File into memory
    ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
    long testAttemptTime = System.currentTimeMillis();
    NIOHttpHeaders testHeaders = new NIOHttpHeaders();
    testHeaders.add("test", "test-value");
    for (TestRecord record : recordSet) {
      long preWritePos = os.getPos();
      ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
          testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
      long postWritePos = os.getPos();
      record.streamPos = (int) preWritePos;
      record.rawSize = (int) (postWritePos - preWritePos);
    }
    os.flush();
  } finally {
    os.close();
  }
  return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  FileSystem fs = LocalFileSystem.get(conf);
  Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));
  List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
  FSDataOutputStream os = fs.create(path);
  try {
    // write the ARC File into memory
    ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
    long testAttemptTime = System.currentTimeMillis();
    for (TestRecord record : records) {
      ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
          new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
    }
    os.flush();
  } finally {
    os.close();
  }
  FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
  ARCFileRecordReader reader = new ARCFileRecordReader();
  reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));
  int index = 0;
  // iterate and validate stuff ...
  while (reader.nextKeyValue()) {
    Text key = reader.getCurrentKey();
    BytesWritable value = reader.getCurrentValue();
    TestRecord testRecord = records.get(index++);
    // get test key bytes as utf-8 bytes ...
    byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
    // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code replaces
    // invalid characters with ?, which would cause our test case, which does use invalid characters to
    // form the key, to break)
    Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
        key.getLength()) == 0);
    // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows
    // this terminator; we search for this specific byte pattern to locate the start of content, then compare
    // it against the source ...
    int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
        "\r\n\r\n".getBytes());
    indexofHeaderTerminator += 4;
    Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
        value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
  }
  reader.close();
  Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
  fs.delete(path, false);
}
From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License:Open Source License
private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
    Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

  FlexBuffer contentOut = null;
  String textOut = null;

  NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());
  CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

  // extract information from http headers ...
  HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);

  // get the mime type ...
  String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
      ? urlMetadata.getContentType()
      : "text/html";

  metadata.addProperty("mime_type", normalizedMimeType);
  crawlMeta.setMimeType(normalizedMimeType);

  // get download size ...
  int downloadSize = value.getContentRaw().getCount();

  // set original content len ...
  metadata.addProperty("download_size", downloadSize);
  crawlMeta.setDownloadSize(downloadSize);

  // set truncation flag
  if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
    metadata.addProperty("download_truncated", true);
    crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
  }

  if (downloadSize > 0) {
    // get content type, charset and encoding
    String encoding = finalHeaders.findValue("Content-Encoding");

    boolean isGZIP = false;
    if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
      isGZIP = true;
    }

    byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
    int contentLen = value.getContentRaw().getCount();

    // assume we are going to output original data ...
    contentOut = new FlexBuffer(contentBytes, 0, contentLen);

    if (isGZIP) {
      metadata.addProperty("content_is_gzip", isGZIP);
      crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

      UnzipResult unzipResult = null;
      try {
        // LOG.info("BEFORE GUNZIP");
        unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
            CrawlEnvironment.GUNZIP_SIZE_LIMIT);
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }

      if (unzipResult != null && unzipResult.data != null) {
        if (unzipResult.wasTruncated) {
          LOG.warn("Truncated Document During GZIP:" + finalURL);
          reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
        }
        contentBytes = unzipResult.data.get();
        contentLen = unzipResult.data.getCount();

        metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
        crawlMeta.setGunzipSize(unzipResult.data.getCount());

        // update content out ...
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);
      } else {
        metadata.addProperty("gunzip_failed", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
        reporter.incrCounter(Counters.GUNZIP_FAILED, 1);

        contentBytes = null;
        contentLen = 0;
        contentOut = null;
      }
      // LOG.info("AFTER GUNZIP");
    }

    if (contentBytes != null) {
      // ok compute an md5 hash
      MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

      metadata.addProperty("md5", md5Hash.toString());
      crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));

      // get normalized mime type
      if (MimeTypeFilter.isTextType(normalizedMimeType)) {
        // ok time to decode the data into ucs2 ...
        Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
            .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);

        // ok write out decode metadata
        metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
        crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
        metadata.addProperty("charset_detector", decodeResult.e0.e0);
        crawlMeta.setCharsetDetector(decodeResult.e0.e0);

        // get the content
        String textContent = decodeResult.e1;

        // compute simhash
        long simhash = SimHash.computeOptimizedSimHashForString(textContent);
        metadata.addProperty("text_simhash", simhash);
        crawlMeta.setTextSimHash(simhash);

        // figure out simplified mime type ...
        MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
            .checkMimeTypeDisposition(normalizedMimeType);

        boolean parseComplete = false;

        Pair<JsonObject, String> tupleOut = null;

        // write it out
        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
          // ok parse as html
          tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
              new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

          if (tupleOut == null) {
            reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
            LOG.error("Unable to Parse as HTML:" + finalURL.toString());
            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
          } else {
            reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
            metadata.addProperty("parsed_as", "html");
            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
            parseComplete = true;
          }
        }

        if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
            || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {
          // ok try parse this document as a feed ...
          tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent, crawlMeta.getFeedContent(),
              ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

          if (tupleOut == null) {
            if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
              reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
              // TODO:HACK
              // LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount()
              //     + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
              mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
            }
          } else {
            reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
            metadata.addProperty("parsed_as", "feed");
            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
            parseComplete = true;
          }
        }

        if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
          reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
          mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
        }

        if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
          // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
          // TODO: FIX THIS BUT PUNT FOR NOW :-(
          // tupleOut = new Pair<JsonObject,String>(null,textContent);
        }

        if (tupleOut != null) {
          if (tupleOut.e0 != null) {
            metadata.add("content", tupleOut.e0);
          }
          textOut = tupleOut.e1;
        }
      }
    }
  }
  return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
      new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java
License:Open Source License
@Test
public void validateLinkKey() throws Exception {
  // allocate scan arrays
  FlexBuffer[] scanArray = allocateScanArray();

  URLFPV2 fp = URLUtils.getURLFPV2FromURL("http://www.google.com/");
  if (fp != null) {
    TextBytes key = generateLinkKey(fp, CrawlDBKey.Type.KEY_TYPE_HTML_LINK, "FOOBAR");
    // get it the hard way
    scanForComponents(key, ':', scanArray);

    System.out.println("Key is:" + key.toString());

    System.out.println("Check Root Domain Key");
    Assert.assertTrue(fp.getRootDomainHash() == getLongComponentFromKey(key,
        CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
    Assert.assertTrue(fp.getRootDomainHash() == getLongComponentFromComponentArray(scanArray,
        CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
    System.out.println("Check Domain Key");
    Assert.assertTrue(fp.getDomainHash() == getLongComponentFromKey(key,
        CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
    Assert.assertTrue(fp.getDomainHash() == getLongComponentFromComponentArray(scanArray,
        CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
    System.out.println("Check URL Hash Key");
    Assert.assertTrue(fp.getUrlHash() == getLongComponentFromKey(key,
        CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
    Assert.assertTrue(fp.getUrlHash() == getLongComponentFromComponentArray(scanArray,
        CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
    System.out.println("Check Type");
    Assert.assertTrue(CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() == getLongComponentFromKey(key,
        CrawlDBKey.ComponentId.TYPE_COMPONENT_ID));
    Assert.assertTrue(CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() == getLongComponentFromComponentArray(
        scanArray, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID));
    System.out.println("Check ExtraData");
    Assert.assertTrue(new FlexBuffer("FOOBAR".getBytes()).compareTo(
        getByteArrayComponentFromKey(key, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID)) == 0);
    Assert.assertTrue(new FlexBuffer("FOOBAR".getBytes()).compareTo(getByteArrayFromComponentArray(scanArray,
        CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID)) == 0);

    TextBytes statusKey1 = generateCrawlStatusKey(new Text("http://www.google.com/"), 12345L);
    TextBytes statusKey2 = generateCrawlStatusKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        12345L);
    TextBytes statusKey3 = generateCrawlStatusKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        12346L);
    TextBytes linkKey1 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        CrawlDBKey.Type.KEY_TYPE_HTML_LINK, MD5Hash.digest("123").toString());
    TextBytes linkKey2 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        CrawlDBKey.Type.KEY_TYPE_HTML_LINK, MD5Hash.digest("1234").toString());
    URLFPV2 fpLink3 = URLUtils.getURLFPV2FromURL("http://www.google.com/");
    fpLink3.setUrlHash(fpLink3.getUrlHash() + 1);
    TextBytes linkKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_HTML_LINK, "12345");
    TextBytes linkKey4 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        CrawlDBKey.Type.KEY_TYPE_ATOM_LINK, "1234");
    TextBytes linkKey5 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_ATOM_LINK, "12345");
    TextBytes mergeKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, "12345");
    TextBytes rootDomainKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD,
        "12345");
    TextBytes subDomainKey3 = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD,
        "12345");

    LinkKeyComparator comparator = new LinkKeyComparator();
    CrawlDBKeyGroupByURLComparator gcomparator = new CrawlDBKeyGroupByURLComparator();

    System.out.println("Comparing Similar status Keys");
    compareKeys(comparator, statusKey1, statusKey2, 0);
    compareKeys(comparator, statusKey2, statusKey1, 0);
    System.out.println("Comparing Similar status Keys w/Grouping C");
    compareKeys(gcomparator, statusKey1, statusKey2, 0);
    compareKeys(gcomparator, statusKey2, statusKey1, 0);
    System.out.println("Comparing Similar status Keys with different timestamps");
    compareKeys(comparator, statusKey2, statusKey3, -1);
    compareKeys(comparator, statusKey3, statusKey2, 1);
    System.out.println("Comparing Similar status Keys with different timestamps w/Grouping C");
    compareKeys(gcomparator, statusKey2, statusKey3, 0);
    compareKeys(gcomparator, statusKey3, statusKey2, 0);
    System.out.println("Comparing Status Key to Link Key");
    compareKeys(comparator, statusKey1, linkKey1, -1);
    compareKeys(comparator, linkKey1, statusKey1, 1);
    System.out.println("Comparing Status Key to Link Key Grouping C");
    compareKeys(gcomparator, statusKey1, linkKey1, 0);
    compareKeys(gcomparator, linkKey1, statusKey1, 0);
    System.out.println("Comparing TWO Link Keys with same hash value");
    compareKeys(comparator, linkKey1, linkKey1, 0);
    compareKeys(comparator, linkKey1, linkKey1, 0);
    System.out.println("Comparing TWO Link Keys with same type but different hash values");
    compareKeys(comparator, linkKey2, linkKey3, -1);
    compareKeys(comparator, linkKey3, linkKey2, 1);
    System.out.println("Comparing TWO Link Keys with same type but different hash values - Grouping C");
    compareKeys(gcomparator, linkKey2, linkKey3, -1);
    compareKeys(gcomparator, linkKey3, linkKey2, 1);
    System.out.println("Comparing TWO Link Keys with different types but same hash values");
    compareKeys(comparator, linkKey2, linkKey4, -1);
    compareKeys(comparator, linkKey4, linkKey2, 1);
    System.out.println("Comparing TWO Link Keys with different types but same hash values - Grouping C ");
    compareKeys(gcomparator, linkKey2, linkKey4, 0);
    compareKeys(gcomparator, linkKey4, linkKey2, 0);
    System.out.println("Comparing TWO Link Keys with similar types but different hash values");
    compareKeys(comparator, linkKey4, linkKey5, -1);
    compareKeys(comparator, linkKey5, linkKey4, 1);
    System.out.println("Comparing TWO Link Keys with similar types but different hash values - Grouping C");
    compareKeys(gcomparator, linkKey4, linkKey5, -1);
    compareKeys(gcomparator, linkKey5, linkKey4, 1);

    compareKeys(comparator, mergeKey3, linkKey3, -1);
    compareKeys(comparator, rootDomainKey3, mergeKey3, -1);
    compareKeys(comparator, subDomainKey3, mergeKey3, -1);
    compareKeys(comparator, rootDomainKey3, subDomainKey3, -1);
    compareKeys(comparator, subDomainKey3, rootDomainKey3, 1);
    compareKeys(comparator, rootDomainKey3, rootDomainKey3, 0);

    TextBytes mergeKey = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, "12345");
    TextBytes rootDomainKey = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD,
        "12345");
    TextBytes subDomainKey = generateLinkKey(fpLink3, CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD,
        "12345");

    TextBytes linkKeyTest = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        CrawlDBKey.Type.KEY_TYPE_HTML_LINK, "");
    Assert.assertTrue(scanForComponents(linkKeyTest, ':', scanArray) == scanArray.length - 1);
    for (FlexBuffer buffer : scanArray)
      LOG.info("Scan Item:" + buffer.toString());

    TextBytes linkKeyTest2 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),
        CrawlDBKey.Type.KEY_TYPE_HTML_LINK, MD5Hash.digest("REALLY LONG SOMETHING OR ANOTHER").toString());
    Assert.assertTrue(scanForComponents(linkKeyTest2, ':', scanArray) == scanArray.length);
    for (FlexBuffer buffer : scanArray)
      LOG.info("Scan Item:" + buffer.toString());
  }
}