List of usage examples for org.apache.hadoop.io.MD5Hash.toString()
@Override
public String toString()
From source file: com.shmsoft.dmass.main.Reduce.java
License: Apache License

@Override
public void reduce(MD5Hash key, Iterable<MapWritable> values, Context context)
        throws IOException, InterruptedException {
    String outputKey = key.toString();
    masterKey = outputKey;
    isMaster = true;
    for (MapWritable value : values) {
        columnMetadata.reinit();
        ++outputFileCount;
        processMap(value);
        // write this all to the reduce map
        context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
        isMaster = false;
    }
}
From source file: edu.umd.lib.hadoopapps.MD5Sum.java
License: Apache License

/**
 * Runs the app.
 */
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        usage();
        return -1;
    }
    Path filePath = new Path(args[0]);
    FileSystem hdfs = FileSystem.get(new Configuration());
    if (hdfs.exists(filePath)) {
        InputStream in = (InputStream) (hdfs.open(filePath));
        MD5Hash md5hash = MD5Hash.digest(in);
        System.out.println("MD5Sum: " + md5hash.toString());
    } else {
        System.out.println("Invalid path!");
    }
    return 0;
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License

public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License

public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file: org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License: Open Source License

private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
        Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

    FlexBuffer contentOut = null;
    String textOut = null;

    NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());
    CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

    // extract information from http headers ...
    HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);

    // get the mime type ...
    String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
            ? urlMetadata.getContentType()
            : "text/html";

    metadata.addProperty("mime_type", normalizedMimeType);
    crawlMeta.setMimeType(normalizedMimeType);

    // get download size ...
    int downloadSize = value.getContentRaw().getCount();

    // set original content len ...
    metadata.addProperty("download_size", downloadSize);
    crawlMeta.setDownloadSize(downloadSize);

    // set truncation flag
    if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
        metadata.addProperty("download_truncated", true);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
    }

    if (downloadSize > 0) {
        // get content type, charset and encoding
        String encoding = finalHeaders.findValue("Content-Encoding");
        boolean isGZIP = false;
        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
            isGZIP = true;
        }

        byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
        int contentLen = value.getContentRaw().getCount();

        // assume we are going to output original data ...
        contentOut = new FlexBuffer(contentBytes, 0, contentLen);

        if (isGZIP) {
            metadata.addProperty("content_is_gzip", isGZIP);
            crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

            UnzipResult unzipResult = null;
            try {
                // LOG.info("BEFORE GUNZIP");
                unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
                        CrawlEnvironment.GUNZIP_SIZE_LIMIT);
            } catch (Exception e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }

            if (unzipResult != null && unzipResult.data != null) {
                if (unzipResult.wasTruncated) {
                    LOG.warn("Truncated Document During GZIP:" + finalURL);
                    reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
                }
                contentBytes = unzipResult.data.get();
                contentLen = unzipResult.data.getCount();
                metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
                crawlMeta.setGunzipSize(unzipResult.data.getCount());
                // update content out ...
                contentOut = new FlexBuffer(contentBytes, 0, contentLen);
            } else {
                metadata.addProperty("gunzip_failed", true);
                crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
                reporter.incrCounter(Counters.GUNZIP_FAILED, 1);
                contentBytes = null;
                contentLen = 0;
                contentOut = null;
            }
            // LOG.info("AFTER GUNZIP");
        }

        if (contentBytes != null) {
            // ok compute an md5 hash
            MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

            metadata.addProperty("md5", md5Hash.toString());
            crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));

            // get normalized mime type
            if (MimeTypeFilter.isTextType(normalizedMimeType)) {
                // ok time to decode the data into ucs2 ...
                Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
                        .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);

                // ok write out decode metadata
                metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
                crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
                metadata.addProperty("charset_detector", decodeResult.e0.e0);
                crawlMeta.setCharsetDetector(decodeResult.e0.e0);

                // get the content
                String textContent = decodeResult.e1;

                // compute simhash
                long simhash = SimHash.computeOptimizedSimHashForString(textContent);
                metadata.addProperty("text_simhash", simhash);
                crawlMeta.setTextSimHash(simhash);

                // figure out simplified mime type ...
                MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
                        .checkMimeTypeDisposition(normalizedMimeType);

                boolean parseComplete = false;
                Pair<JsonObject, String> tupleOut = null;

                // write it out
                if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
                    // ok parse as html
                    tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
                            new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

                    if (tupleOut == null) {
                        reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
                        LOG.error("Unable to Parse as HTML:" + finalURL.toString());
                        mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                    } else {
                        reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
                        metadata.addProperty("parsed_as", "html");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
                        || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {
                    // ok try parse this document as a feed ...
                    tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
                            crawlMeta.getFeedContent(),
                            ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

                    if (tupleOut == null) {
                        if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
                            reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
                            //TODO:HACK
                            //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
                            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                        }
                    } else {
                        reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
                        metadata.addProperty("parsed_as", "feed");
                        crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                        parseComplete = true;
                    }
                }

                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
                    reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
                    mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                }

                if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
                    // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
                    // TODO: FIX THIS BUT PUNT FOR NOW :-(
                    //tupleOut = new Pair<JsonObject,String>(null,textContent);
                }

                if (tupleOut != null) {
                    if (tupleOut.e0 != null) {
                        metadata.add("content", tupleOut.e0);
                    }
                    textOut = tupleOut.e1;
                }
            }
        }
    }
    return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
            new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
}
From source file: org.freeeed.main.FileProcessor.java
License: Apache License

/**
 * Cull, then emit responsive files.
 *
 * @param discoveryFile object with info for processing discovery.
 * @throws IOException on any IO problem.
 * @throws InterruptedException thrown by Hadoop.
 */
protected void processFileEntry(DiscoveryFile discoveryFile) throws IOException, InterruptedException {
    Project project = Project.getCurrentProject();
    if (project.isStopThePresses()) {
        return;
    }
    // update application log
    LOGGER.trace("Processing file: {}", discoveryFile.getRealFileName());
    // set to true if file matches any query params
    boolean isResponsive = false;
    // exception message to place in output if error occurs
    String exceptionMessage = null;
    // ImageTextParser metadata, derived from Tika metadata class
    DocumentMetadata metadata = new DocumentMetadata();
    discoveryFile.setMetadata(metadata);
    String extension = Util.getExtension(discoveryFile.getRealFileName());
    if ("jl".equalsIgnoreCase(extension)) {
        extractJlFields(discoveryFile);
    }
    try {
        metadata.setOriginalPath(getOriginalDocumentPath(discoveryFile));
        metadata.setHasAttachments(discoveryFile.isHasAttachments());
        metadata.setHasParent(discoveryFile.isHasParent());
        // extract file contents with Tika
        // Tika metadata class contains references to metadata and file text
        // TODO discoveryFile has the pointer to the same metadata - simplify this
        extractMetadata(discoveryFile, metadata);
        if (project.isRemoveSystemFiles() && Util.isSystemFile(metadata)) {
            LOGGER.info("File {} is recognized as system file and is not processed further",
                    discoveryFile.getPath().getPath());
            return;
        }
        metadata.setCustodian(project.getCurrentCustodian());
        // add Hash to metadata
        MD5Hash hash = Util.createKeyHash(discoveryFile.getPath(), metadata);
        metadata.setHash(hash.toString());
        metadata.acquireUniqueId();
        // search through Tika results using Lucene
        isResponsive = isResponsive(metadata);
        if (isResponsive) {
            addToES(metadata);
        }
    } catch (IOException | ParseException e) {
        LOGGER.warn("Exception processing file ", e);
        exceptionMessage = e.getMessage();
    }
    // update exception message if error
    if (exceptionMessage != null) {
        metadata.set(DocumentMetadataKeys.PROCESSING_EXCEPTION, exceptionMessage);
    }
    if (isResponsive || exceptionMessage != null) {
        createImage(discoveryFile);
        if (isPreview()) {
            try {
                createHtmlForDocument(discoveryFile);
            } catch (Exception e) {
                metadata.set(DocumentMetadataKeys.PROCESSING_EXCEPTION, e.getMessage());
            }
        }
        writeMetadata(discoveryFile, metadata);
    }
    LOGGER.trace("Is the file responsive: {}", isResponsive);
}