List of usage examples for org.apache.hadoop.io.MD5Hash.toString()
@Override
public String toString()
From source file: com.shmsoft.dmass.main.Reduce.java
License: Apache License

@Override
public void reduce(MD5Hash key, Iterable<MapWritable> values, Context context)
        throws IOException, InterruptedException {
    String outputKey = key.toString();
    masterKey = outputKey;
    isMaster = true;
    for (MapWritable value : values) {
        columnMetadata.reinit();
        ++outputFileCount;
        processMap(value);
        // write this all to the reduce map
        context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
        isMaster = false;
    }
}
From source file: edu.umd.lib.hadoopapps.MD5Sum.java
License: Apache License

/**
 * Runs the app.
 */
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        usage();
        return -1;
    }
    Path filePath = new Path(args[0]);
    FileSystem hdfs = FileSystem.get(new Configuration());
    if (hdfs.exists(filePath)) {
        InputStream in = (InputStream) (hdfs.open(filePath));
        MD5Hash md5hash = MD5Hash.digest(in);
        System.out.println("MD5Sum: " + md5hash.toString());
    } else {
        System.out.println("Invalid path!");
    }
    return 0;
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License

public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License

public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file: org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License: Open Source License

private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
        Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

    FlexBuffer contentOut = null;
    String textOut = null;

    NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());
    CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

    // extract information from http headers ...
    HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);

    // get the mime type ...
    String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
            ? urlMetadata.getContentType()
            : "text/html";

    metadata.addProperty("mime_type", normalizedMimeType);
    crawlMeta.setMimeType(normalizedMimeType);

    // get download size ...
    int downloadSize = value.getContentRaw().getCount();

    // set original content len ...
    metadata.addProperty("download_size", downloadSize);
    crawlMeta.setDownloadSize(downloadSize);

    // set truncation flag
    if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
        metadata.addProperty("download_truncated", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
    }

    if (downloadSize > 0) {
        // get content type, charset and encoding
        String encoding = finalHeaders.findValue("Content-Encoding");
        boolean isGZIP = false;
        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
            isGZIP = true;
        }

        byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
        int contentLen = value.getContentRaw().getCount();

        // assume we are going to output original data ...
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);

        if (isGZIP) {
            metadata.addProperty("content_is_gzip", isGZIP);
            crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

            UnzipResult unzipResult = null;
            try {
                // LOG.info("BEFORE GUNZIP");
                unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
                        CrawlEnvironment.GUNZIP_SIZE_LIMIT);
            } catch (Exception e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }

            if (unzipResult != null && unzipResult.data != null) {
                if (unzipResult.wasTruncated) {
                    LOG.warn("Truncated Document During GZIP:" + finalURL);
                    reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
                }
                contentBytes = unzipResult.data.get();
                contentLen = unzipResult.data.getCount();
                metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
                crawlMeta.setGunzipSize(unzipResult.data.getCount());
                // update content out ...
                contentOut = new FlexBuffer(contentBytes, 0, contentLen);
            } else {
                metadata.addProperty("gunzip_failed", true);
                crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
                reporter.incrCounter(Counters.GUNZIP_FAILED, 1);
                contentBytes = null;
                contentLen = 0;
                contentOut = null;
            }
            // LOG.info("AFTER GUNZIP");
        }

        if (contentBytes != null) {
            // ok compute an md5 hash
            MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

            metadata.addProperty("md5", md5Hash.toString());
            crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));

            // get normalized mime type
            if (MimeTypeFilter.isTextType(normalizedMimeType)) {
                // ok time to decode the data into ucs2 ...
                Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
                        .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);

                // ok write out decode metadata
                metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
                crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
                metadata.addProperty("charset_detector", decodeResult.e0.e0);
                crawlMeta.setCharsetDetector(decodeResult.e0.e0);

                // get the content
                String textContent = decodeResult.e1;

                // compute simhash
                long simhash = SimHash.computeOptimizedSimHashForString(textContent);
                metadata.addProperty("text_simhash", simhash);
                crawlMeta.setTextSimHash(simhash);

                // figure out simplified mime type ...
                MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
                        .checkMimeTypeDisposition(normalizedMimeType);

                boolean parseComplete = false;
                Pair<JsonObject, String> tupleOut = null;

                // write it out
                if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
                    // ok parse as html
                    tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
                            new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

                    if (tupleOut == null) {
                        reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
                        LOG.error("Unable to Parse as HTML:" + finalURL.toString());
                        mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                    } else {
                        reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
                        metadata.addProperty("parsed_as", "html");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
                        || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {
                    // ok try parse this document as a feed ...
                    tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
                            crawlMeta.getFeedContent(),
                            ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

                    if (tupleOut == null) {
                        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
                            reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
                            //TODO:HACK
                            //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
                            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                        }
                    } else {
                        reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
                        metadata.addProperty("parsed_as", "feed");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
                    reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
                    mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                }

                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
                    // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
                    // TODO: FIX THIS BUT PUNT FOR NOW :-(
                    //tupleOut = new Pair<JsonObject,String>(null,textContent);
                }

                if (tupleOut != null) {
                    if (tupleOut.e0 != null) {
                        metadata.add("content", tupleOut.e0);
                    }
                    textOut = tupleOut.e1;
                }
            }
        }
    }
    return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
            new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}
From source file: org.freeeed.main.FileProcessor.java
License: Apache License

/**
 * Cull, then emit responsive files.
 *
 * @param discoveryFile object with info for processing discovery.
 * @throws IOException on any IO problem.
 * @throws InterruptedException thrown by Hadoop.
 */
protected void processFileEntry(DiscoveryFile discoveryFile) throws IOException, InterruptedException {
    Project project = Project.getCurrentProject();
    if (project.isStopThePresses()) {
        return;
    }
    // update application log
    LOGGER.trace("Processing file: {}", discoveryFile.getRealFileName());
    // set to true if file matches any query params
    boolean isResponsive = false;
    // exception message to place in output if error occurs
    String exceptionMessage = null;
    // ImageTextParser metadata, derived from Tika metadata class
    DocumentMetadata metadata = new DocumentMetadata();
    discoveryFile.setMetadata(metadata);
    String extension = Util.getExtension(discoveryFile.getRealFileName());
    if ("jl".equalsIgnoreCase(extension)) {
        extractJlFields(discoveryFile);
    }
    try {
        metadata.setOriginalPath(getOriginalDocumentPath(discoveryFile));
        metadata.setHasAttachments(discoveryFile.isHasAttachments());
        metadata.setHasParent(discoveryFile.isHasParent());
        // extract file contents with Tika
        // Tika metadata class contains references to metadata and file text
        // TODO discoveryFile has the pointer to the same metadata - simplify this
        extractMetadata(discoveryFile, metadata);
        if (project.isRemoveSystemFiles() && Util.isSystemFile(metadata)) {
            LOGGER.info("File {} is recognized as system file and is not processed further",
                    discoveryFile.getPath().getPath());
            return;
        }
        metadata.setCustodian(project.getCurrentCustodian());
        // add Hash to metadata
        MD5Hash hash = Util.createKeyHash(discoveryFile.getPath(), metadata);
        metadata.setHash(hash.toString());
        metadata.acquireUniqueId();
        // search through Tika results using Lucene
        isResponsive = isResponsive(metadata);
        if (isResponsive) {
            addToES(metadata);
        }
    } catch (IOException | ParseException e) {
        LOGGER.warn("Exception processing file ", e);
        exceptionMessage = e.getMessage();
    }
    // update exception message if error
    if (exceptionMessage != null) {
        metadata.set(DocumentMetadataKeys.PROCESSING_EXCEPTION, exceptionMessage);
    }
    if (isResponsive || exceptionMessage != null) {
        createImage(discoveryFile);
        if (isPreview()) {
            try {
                createHtmlForDocument(discoveryFile);
            } catch (Exception e) {
                metadata.set(DocumentMetadataKeys.PROCESSING_EXCEPTION, e.getMessage());
            }
        }
        writeMetadata(discoveryFile, metadata);
    }
    LOGGER.trace("Is the file responsive: {}", isResponsive);
}