Example usage for org.apache.hadoop.io MD5Hash MD5Hash

Introduction

On this page you can find example usage of the org.apache.hadoop.io.MD5Hash constructor MD5Hash(byte[] digest).

Prototype

public MD5Hash(byte[] digest) 

Document

Constructs an MD5Hash with a specified value.
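
A minimal, self-contained sketch of this constructor in isolation (the input string and class name are illustrative, not taken from the examples below):

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

import org.apache.hadoop.io.MD5Hash;

public class MD5HashConstructorSketch {
    public static void main(String[] args) throws Exception {
        // Compute a raw 16-byte MD5 digest with the JDK, then wrap it in MD5Hash.
        byte[] digest = MessageDigest.getInstance("MD5")
                .digest("hello".getBytes(StandardCharsets.UTF_8));
        MD5Hash hash = new MD5Hash(digest); // the constructor documented above

        // MD5Hash.digest(...) computes and wraps the digest in one step.
        MD5Hash same = MD5Hash.digest("hello".getBytes(StandardCharsets.UTF_8));

        System.out.println(hash);              // 32-character hex representation
        System.out.println(hash.equals(same)); // true
    }
}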

Usage

From source file:com.mellanox.r4h.DFSClient.java

License:Apache License

/**
 * Get the checksum of the whole file or a range of the file. Note that the
 * range always starts from the beginning of the file.
 *
 * @param src
 *            The file path
 * @param length
 *            the length of the range, i.e., the range is [0, length]
 * @return The checksum
 * @see DistributedFileSystem#getFileChecksum(Path)
 */
public MD5MD5CRC32FileChecksum getFileChecksum(String src, long length) throws IOException {
    checkOpen();
    Preconditions.checkArgument(length >= 0);
    // get block locations for the file range
    LocatedBlocks blockLocations = callGetBlockLocations(namenode, src, 0, length);
    if (null == blockLocations) {
        throw new FileNotFoundException("File does not exist: " + src);
    }
    List<LocatedBlock> locatedblocks = blockLocations.getLocatedBlocks();
    final DataOutputBuffer md5out = new DataOutputBuffer();
    int bytesPerCRC = -1;
    DataChecksum.Type crcType = DataChecksum.Type.DEFAULT;
    long crcPerBlock = 0;
    boolean refetchBlocks = false;
    int lastRetriedIndex = -1;

    // get block checksum for each block
    long remaining = length;
    if (src.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
        remaining = Math.min(length, blockLocations.getFileLength());
    }
    for (int i = 0; i < locatedblocks.size() && remaining > 0; i++) {
        if (refetchBlocks) { // refetch to get fresh tokens
            blockLocations = callGetBlockLocations(namenode, src, 0, length);
            if (null == blockLocations) {
                throw new FileNotFoundException("File does not exist: " + src);
            }
            locatedblocks = blockLocations.getLocatedBlocks();
            refetchBlocks = false;
        }
        LocatedBlock lb = locatedblocks.get(i);
        final ExtendedBlock block = lb.getBlock();
        if (remaining < block.getNumBytes()) {
            block.setNumBytes(remaining);
        }
        remaining -= block.getNumBytes();
        final DatanodeInfo[] datanodes = lb.getLocations();

        // try each datanode location of the block
        final int timeout = 3000 * datanodes.length + dfsClientConf.socketTimeout();
        boolean done = false;
        for (int j = 0; !done && j < datanodes.length; j++) {
            DataOutputStream out = null;
            DataInputStream in = null;

            try {
                // connect to a datanode
                IOStreamPair pair = connectToDN(datanodes[j], timeout, lb);
                out = new DataOutputStream(new BufferedOutputStream(pair.out, HdfsConstants.SMALL_BUFFER_SIZE));
                in = new DataInputStream(pair.in);

                if (LOG.isDebugEnabled()) {
                    LOG.debug("write to " + datanodes[j] + ": " + Op.BLOCK_CHECKSUM + ", block=" + block);
                }
                // get block MD5
                new Sender(out).blockChecksum(block, lb.getBlockToken());

                final BlockOpResponseProto reply = BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));

                String logInfo = "for block " + block + " from datanode " + datanodes[j];
                DataTransferProtoUtil.checkBlockOpStatus(reply, logInfo);

                OpBlockChecksumResponseProto checksumData = reply.getChecksumResponse();

                // read byte-per-checksum
                final int bpc = checksumData.getBytesPerCrc();
                if (i == 0) { // first block
                    bytesPerCRC = bpc;
                } else if (bpc != bytesPerCRC) {
                    throw new IOException(
                            "Byte-per-checksum not matched: bpc=" + bpc + " but bytesPerCRC=" + bytesPerCRC);
                }

                // read crc-per-block
                final long cpb = checksumData.getCrcPerBlock();
                if (locatedblocks.size() > 1 && i == 0) {
                    crcPerBlock = cpb;
                }

                // read md5
                final MD5Hash md5 = new MD5Hash(checksumData.getMd5().toByteArray());
                md5.write(md5out);

                // read crc-type
                final DataChecksum.Type ct;
                if (checksumData.hasCrcType()) {
                    ct = PBHelper.convert(checksumData.getCrcType());
                } else {
                    LOG.debug("Retrieving checksum from an earlier-version DataNode: "
                            + "inferring checksum by reading first byte");
                    ct = inferChecksumTypeByReading(lb, datanodes[j]);
                }

                if (i == 0) { // first block
                    crcType = ct;
                } else if (crcType != DataChecksum.Type.MIXED && crcType != ct) {
                    // if crc types are mixed in a file
                    crcType = DataChecksum.Type.MIXED;
                }

                done = true;

                if (LOG.isDebugEnabled()) {
                    if (i == 0) {
                        LOG.debug("set bytesPerCRC=" + bytesPerCRC + ", crcPerBlock=" + crcPerBlock);
                    }
                    LOG.debug("got reply from " + datanodes[j] + ": md5=" + md5);
                }
            } catch (InvalidBlockTokenException ibte) {
                if (i > lastRetriedIndex) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Got access token error in response to OP_BLOCK_CHECKSUM " + "for file " + src
                                + " for block " + block + " from datanode " + datanodes[j]
                                + ". Will retry the block once.");
                    }
                    lastRetriedIndex = i;
                    done = true; // actually it's not done; but we'll retry
                    i--; // repeat at i-th block
                    refetchBlocks = true;
                    break;
                }
            } catch (IOException ie) {
                LOG.warn("src=" + src + ", datanodes[" + j + "]=" + datanodes[j], ie);
            } finally {
                IOUtils.closeStream(in);
                IOUtils.closeStream(out);
            }
        }

        if (!done) {
            throw new IOException("Fail to get block MD5 for " + block);
        }
    }

    // compute file MD5
    final MD5Hash fileMD5 = MD5Hash.digest(md5out.getData());
    switch (crcType) {
    case CRC32:
        return new MD5MD5CRC32GzipFileChecksum(bytesPerCRC, crcPerBlock, fileMD5);
    case CRC32C:
        return new MD5MD5CRC32CastagnoliFileChecksum(bytesPerCRC, crcPerBlock, fileMD5);
    default:
        // If there is no block allocated for the file,
        // return one with the magic entry that matches what previous
        // hdfs versions return.
        if (locatedblocks.size() == 0) {
            return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
        }

        // we should never get here since the validity was checked
        // when getCrcType() was called above.
        return null;
    }
}
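
The method above accumulates each block's MD5 into md5out and then digests that buffer, producing an "MD5 of block MD5s" for the whole file. A minimal sketch of just that combination step, with illustrative per-block digests standing in for the datanode responses:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;

public class FileMd5CombinationSketch {
    public static void main(String[] args) throws Exception {
        // Stand-ins for the block-level digests returned by the datanodes.
        List<MD5Hash> blockMd5s = Arrays.asList(
                MD5Hash.digest("block-0".getBytes(StandardCharsets.UTF_8)),
                MD5Hash.digest("block-1".getBytes(StandardCharsets.UTF_8)));

        // Serialize each 16-byte block digest into one buffer, as md5out does above.
        DataOutputBuffer md5out = new DataOutputBuffer();
        for (MD5Hash blockMd5 : blockMd5s) {
            blockMd5.write(md5out);
        }

        // The file-level MD5 is the digest of the concatenated block digests.
        MD5Hash fileMD5 = MD5Hash.digest(md5out.getData());
        System.out.println("fileMD5=" + fileMD5);
    }
}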

From source file:com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java

License:Apache License

@Test
public void testSingleFileInListDontFailOnWrongChecksum() throws Exception {
    String fileName = "000000_0";

    Mockito.doReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .when(fileSystem).getFileChecksum(any(Path.class));

    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "False");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "]", fileName, "AAACAAAAAAAAAAAArRnBpxcZ9ze14XqfLMB4yA=="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check whether checksum was passed correctly to attributes
    String filesJSON = runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).get(0)
            .getAttribute("files");
    Gson jsonParser = new Gson();
    ComputeHDFSChecksums.File[] files = jsonParser.fromJson(filesJSON, ComputeHDFSChecksums.File[].class);
    Assert.assertEquals(files[0].getComputedChecksum().getValue(),
            "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=");

    // Check file system calls
    verifyGetFileChecksumCall(fileName);
}

From source file:com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java

License:Apache License

@Test
public void testSingleFileInListFailOnWrongChecksum() throws Exception {
    String fileName = "000000_0";

    Mockito.doReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .when(fileSystem).getFileChecksum(any(Path.class));

    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "]", fileName, "AAACAAAAAAAAAAAArRnBpxcZ9ze14XqfLMB4yA=="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    verifyGetFileChecksumCall(fileName);
}

From source file:com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java

License:Apache License

@Test
public void testSingleFileProperChecksum() throws Exception {
    String fileName = "000000_0";

    Mockito.doReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .when(fileSystem).getFileChecksum(any(Path.class));

    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "]", fileName, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    verifyGetFileChecksumCall(fileName);
}

From source file:com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java

License:Apache License

@Test
public void testMultipleFilesFailOnSingleWrongChecksum() throws Exception {
    String fileName = "000000_0";
    String fileName2 = "000000_1";
    String fileName3 = "000000_2";

    Mockito.when(fileSystem.getFileChecksum(any(Path.class)))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff01")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff02")));

    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "," + fileEntry + "," + fileEntry + "]", fileName,
                    "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=", fileName2,
                    "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=", fileName3,
                    "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AgAAAAA="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    InOrder inOrder = Mockito.inOrder(fileSystem);
    inOrder.verify(fileSystem).getFileChecksum(new Path(fileName));
    inOrder.verify(fileSystem).getFileChecksum(new Path(fileName2));
    inOrder.verifyNoMoreInteractions();
}

From source file:com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java

License:Apache License

@Test
public void testMultipleFilesWithDirectoryDefined() throws Exception {
    String fileName = "000000_0";
    String fileName2 = "000000_1";
    String fileName3 = "000000_2";
    String directory = "/dropzone";

    Mockito.when(fileSystem.getFileChecksum(any(Path.class)))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff01")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff02")));

    runner.setProperty(ComputeHDFSChecksums.DIRECTORY, directory);
    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "," + fileEntry + "," + fileEntry + "]", fileName,
                    "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=", fileName2,
                    "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AQAAAAA=", fileName3,
                    "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AgAAAAA="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    InOrder inOrder = Mockito.inOrder(fileSystem);
    inOrder.verify(fileSystem).getFileChecksum(new Path(directory, fileName));
    inOrder.verify(fileSystem).getFileChecksum(new Path(directory, fileName2));
    inOrder.verify(fileSystem).getFileChecksum(new Path(directory, fileName3));
    inOrder.verifyNoMoreInteractions();
}

From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java

License:Open Source License

@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
        throws IOException {

    if (url.getLength() == 0) {
        LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
        return;
    }

    try {
        // allocate parse output 
        ParseOutput parseOutput = new ParseOutput();
        // json object out ... 
        JsonObject jsonObj = new JsonObject();
        // and create a crawl metadata 
        CrawlMetadata metadata = parseOutput.getCrawlMetadata();

        // and content (if available) ... 
        Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

        URL originalURL = null;

        try {
            originalURL = new URL(url.toString());
        } catch (MalformedURLException e) {
            LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
            return;
        }

        URL finalURL = originalURL;

        jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
        metadata.setAttemptTime(value.getLastAttemptTime());

        // first step write status 
        jsonObj.addProperty("disposition",
                (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
        metadata.setCrawlDisposition(
                (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

        // deal with redirects ... 
        if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
            jsonObj.add("redirect_from", redirect.e1);
            finalURL = redirect.e0;
        }

        if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
            jsonObj.addProperty("failure_reason",
                    CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
            metadata.setFailureReason(value.getLastAttemptFailureReason());
            jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
            metadata.setFailureDetail(value.getLastAttemptFailureDetail());
        } else {
            jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
            metadata.setServerIP(value.getServerIP());
            jsonObj.addProperty("http_result", value.getResultCode());
            metadata.setHttpResult(value.getResultCode());
            jsonObj.add("http_headers",
                    httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
            metadata.setHttpHeaders(value.getHeaders());
            jsonObj.addProperty("content_len", value.getContentRaw().getCount());
            metadata.setContentLength(value.getContentRaw().getCount());
            if (value.getResultCode() >= 200 && value.getResultCode() <= 299
                    && value.getContentRaw().getCount() > 0) {
                contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
            }
        }

        // ok ... write stuff out ...
        reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);
        //////////////////////////////////////////////////////////////
        // echo some stuff to parseOutput ... 
        parseOutput.setMetadata(jsonObj.toString());
        JsonElement mimeType = jsonObj.get("mime_type");
        if (mimeType != null) {
            parseOutput.setNormalizedMimeType(mimeType.getAsString());
        }
        JsonElement md5 = jsonObj.get("md5");
        if (md5 != null) {
            MD5Hash hash = new MD5Hash(md5.getAsString());
            byte[] bytes = hash.getDigest();
            parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
        }
        JsonElement simHash = jsonObj.get("text_simhash");
        if (simHash != null) {
            parseOutput.setSimHash(simHash.getAsLong());
        }
        parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
        parseOutput.setFetchTime(value.getLastAttemptTime());
        ////////////////////////////////////////////////////////////

        if (contentOut != null) {
            if (contentOut.e0 != null) {
                parseOutput.setTextContent(contentOut.e0);
                reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
            }
            if (contentOut.e1 != null) {

                // directly set the text bytes ... 
                parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
                // mark it dirty !!!
                parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
                // if content available ... 
                if (contentOut.e1.e1 != null) {
                    parseOutput.setRawContent(contentOut.e1.e1);
                }
                reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
            }
        }

        //buildCompactMetadata(parseOutput,jsonObj,urlMap);

        output.collect(new Text(finalURL.toString()), parseOutput);
    } catch (IOException e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
        //TODO:HACK
        //throw e;
    } catch (Exception e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
        //TODO: HACK 
        //throw new IOException(e);
    }
}
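
The map() method above round-trips the "md5" JSON field through MD5Hash to obtain raw digest bytes. A minimal sketch of that conversion on its own (the hex value is illustrative):

import org.apache.hadoop.io.MD5Hash;

public class Md5HexRoundTripSketch {
    public static void main(String[] args) {
        // A 32-character hex string, like the "md5" field read above.
        String hex = "112233445566778899aabbccddeeff00";

        MD5Hash hash = new MD5Hash(hex);  // parse the hex representation
        byte[] digest = hash.getDigest(); // 16 raw bytes, e.g. to wrap in a FlexBuffer

        // Wrapping the raw bytes again reproduces the same hash and hex form.
        MD5Hash roundTripped = new MD5Hash(digest);
        System.out.println(roundTripped);              // prints the hex string
        System.out.println(hash.equals(roundTripped)); // true
    }
}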