List of usage examples for the org.apache.hadoop.io.MD5Hash constructor:

public MD5Hash(byte[] digest)
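Before the full examples below, here is a minimal, self-contained sketch of the constructor itself (illustrative only, not taken from any of the sources below): the 16-byte digest comes from java.security.MessageDigest, and MD5Hash wraps it, rejecting any array that is not exactly 16 bytes long.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import org.apache.hadoop.io.MD5Hash;

public class Md5HashConstructorSketch {
    public static void main(String[] args) throws NoSuchAlgorithmException {
        // compute a raw 16-byte MD5 digest with the JDK
        byte[] digest = MessageDigest.getInstance("MD5")
                .digest("hello".getBytes(StandardCharsets.UTF_8));
        // wrap it; the constructor throws IllegalArgumentException
        // if the array is not exactly 16 bytes long
        MD5Hash hash = new MD5Hash(digest);
        System.out.println(hash); // prints the digest as a 32-char hex string
    }
}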
From source file: com.mellanox.r4h.DFSClient.java
License: Apache License
/**
 * Get the checksum of the whole file or a range of the file. Note that the
 * range always starts from the beginning of the file.
 *
 * @param src The file path
 * @param length the length of the range, i.e., the range is [0, length]
 * @return The checksum
 * @see DistributedFileSystem#getFileChecksum(Path)
 */
public MD5MD5CRC32FileChecksum getFileChecksum(String src, long length) throws IOException {
    checkOpen();
    Preconditions.checkArgument(length >= 0);

    // get block locations for the file range
    LocatedBlocks blockLocations = callGetBlockLocations(namenode, src, 0, length);
    if (null == blockLocations) {
        throw new FileNotFoundException("File does not exist: " + src);
    }
    List<LocatedBlock> locatedblocks = blockLocations.getLocatedBlocks();
    final DataOutputBuffer md5out = new DataOutputBuffer();
    int bytesPerCRC = -1;
    DataChecksum.Type crcType = DataChecksum.Type.DEFAULT;
    long crcPerBlock = 0;
    boolean refetchBlocks = false;
    int lastRetriedIndex = -1;

    // get block checksum for each block
    long remaining = length;
    if (src.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
        remaining = Math.min(length, blockLocations.getFileLength());
    }
    for (int i = 0; i < locatedblocks.size() && remaining > 0; i++) {
        if (refetchBlocks) { // refetch to get fresh tokens
            blockLocations = callGetBlockLocations(namenode, src, 0, length);
            if (null == blockLocations) {
                throw new FileNotFoundException("File does not exist: " + src);
            }
            locatedblocks = blockLocations.getLocatedBlocks();
            refetchBlocks = false;
        }
        LocatedBlock lb = locatedblocks.get(i);
        final ExtendedBlock block = lb.getBlock();
        if (remaining < block.getNumBytes()) {
            block.setNumBytes(remaining);
        }
        remaining -= block.getNumBytes();
        final DatanodeInfo[] datanodes = lb.getLocations();

        // try each datanode location of the block
        final int timeout = 3000 * datanodes.length + dfsClientConf.socketTimeout();
        boolean done = false;
        for (int j = 0; !done && j < datanodes.length; j++) {
            DataOutputStream out = null;
            DataInputStream in = null;
            try {
                // connect to a datanode
                IOStreamPair pair = connectToDN(datanodes[j], timeout, lb);
                out = new DataOutputStream(
                        new BufferedOutputStream(pair.out, HdfsConstants.SMALL_BUFFER_SIZE));
                in = new DataInputStream(pair.in);

                if (LOG.isDebugEnabled()) {
                    LOG.debug("write to " + datanodes[j] + ": " + Op.BLOCK_CHECKSUM + ", block=" + block);
                }
                // get block MD5
                new Sender(out).blockChecksum(block, lb.getBlockToken());

                final BlockOpResponseProto reply = BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));
                String logInfo = "for block " + block + " from datanode " + datanodes[j];
                DataTransferProtoUtil.checkBlockOpStatus(reply, logInfo);

                OpBlockChecksumResponseProto checksumData = reply.getChecksumResponse();

                // read byte-per-checksum
                final int bpc = checksumData.getBytesPerCrc();
                if (i == 0) { // first block
                    bytesPerCRC = bpc;
                } else if (bpc != bytesPerCRC) {
                    throw new IOException(
                            "Byte-per-checksum not matched: bpc=" + bpc + " but bytesPerCRC=" + bytesPerCRC);
                }

                // read crc-per-block
                final long cpb = checksumData.getCrcPerBlock();
                if (locatedblocks.size() > 1 && i == 0) {
                    crcPerBlock = cpb;
                }

                // read md5
                final MD5Hash md5 = new MD5Hash(checksumData.getMd5().toByteArray());
                md5.write(md5out);

                // read crc-type
                final DataChecksum.Type ct;
                if (checksumData.hasCrcType()) {
                    ct = PBHelper.convert(checksumData.getCrcType());
                } else {
                    LOG.debug("Retrieving checksum from an earlier-version DataNode: "
                            + "inferring checksum by reading first byte");
                    ct = inferChecksumTypeByReading(lb, datanodes[j]);
                }

                if (i == 0) { // first block
                    crcType = ct;
                } else if (crcType != DataChecksum.Type.MIXED && crcType != ct) {
                    // if crc types are mixed in a file
                    crcType = DataChecksum.Type.MIXED;
                }

                done = true;

                if (LOG.isDebugEnabled()) {
                    if (i == 0) {
                        LOG.debug("set bytesPerCRC=" + bytesPerCRC + ", crcPerBlock=" + crcPerBlock);
                    }
                    LOG.debug("got reply from " + datanodes[j] + ": md5=" + md5);
                }
            } catch (InvalidBlockTokenException ibte) {
                if (i > lastRetriedIndex) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Got access token error in response to OP_BLOCK_CHECKSUM "
                                + "for file " + src + " for block " + block + " from datanode "
                                + datanodes[j] + ". Will retry the block once.");
                    }
                    lastRetriedIndex = i;
                    done = true; // actually it's not done; but we'll retry
                    i--; // repeat at i-th block
                    refetchBlocks = true;
                    break;
                }
            } catch (IOException ie) {
                LOG.warn("src=" + src + ", datanodes[" + j + "]=" + datanodes[j], ie);
            } finally {
                IOUtils.closeStream(in);
                IOUtils.closeStream(out);
            }
        }

        if (!done) {
            throw new IOException("Fail to get block MD5 for " + block);
        }
    }

    // compute file MD5
    final MD5Hash fileMD5 = MD5Hash.digest(md5out.getData());
    switch (crcType) {
    case CRC32:
        return new MD5MD5CRC32GzipFileChecksum(bytesPerCRC, crcPerBlock, fileMD5);
    case CRC32C:
        return new MD5MD5CRC32CastagnoliFileChecksum(bytesPerCRC, crcPerBlock, fileMD5);
    default:
        // If there is no block allocated for the file,
        // return one with the magic entry that matches what previous
        // hdfs versions return.
        if (locatedblocks.size() == 0) {
            return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
        }
        // we should never get here since the validity was checked
        // when getCrcType() was called above.
        return null;
    }
}
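The MD5Hash pattern worth noting above: each per-block MD5 arriving from a datanode is wrapped in an MD5Hash and serialized into md5out, and the file-level checksum is then the MD5 of those concatenated block digests. A minimal sketch of just that composition step (illustrative; the class and method names here are hypothetical, not part of DFSClient):

import java.io.IOException;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;

public class FileMd5Sketch {
    /** MD5-of-block-MD5s, mirroring the composition step in getFileChecksum(). */
    public static MD5Hash fileMd5(byte[][] blockDigests) throws IOException {
        DataOutputBuffer md5out = new DataOutputBuffer();
        for (byte[] blockDigest : blockDigests) {
            // wrap each 16-byte per-block digest and append its bytes
            new MD5Hash(blockDigest).write(md5out);
        }
        // digest only the bytes actually written (getData() returns the
        // whole backing array, which may be larger than getLength())
        return MD5Hash.digest(md5out.getData(), 0, md5out.getLength());
    }
}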
From source file: com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java
License: Apache License
@Test
public void testSingleFileInListDontFailOnWrongChecksum() throws Exception {
    String fileName = "000000_0";
    Mockito.doReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .when(fileSystem).getFileChecksum(any(Path.class));
    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "False");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "]", fileName, "AAACAAAAAAAAAAAArRnBpxcZ9ze14XqfLMB4yA=="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check whether checksum was passed correctly to attributes
    String filesJSON = runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).get(0)
            .getAttribute("files");
    Gson jsonParser = new Gson();
    ComputeHDFSChecksums.File[] files = jsonParser.fromJson(filesJSON, ComputeHDFSChecksums.File[].class);
    Assert.assertEquals(files[0].getComputedChecksum().getValue(),
            "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=");

    // Check file system calls
    verifyGetFileChecksumCall(fileName);
}
From source file: com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java
License: Apache License
@Test
public void testSingleFileInListFailOnWrongChecksum() throws Exception {
    String fileName = "000000_0";
    Mockito.doReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .when(fileSystem).getFileChecksum(any(Path.class));
    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "]", fileName, "AAACAAAAAAAAAAAArRnBpxcZ9ze14XqfLMB4yA=="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    verifyGetFileChecksumCall(fileName);
}
From source file: com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java
License: Apache License
@Test
public void testSingleFileProperChecksum() throws Exception {
    String fileName = "000000_0";
    Mockito.doReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .when(fileSystem).getFileChecksum(any(Path.class));
    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "]", fileName, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    verifyGetFileChecksumCall(fileName);
}
From source file: com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java
License: Apache License
@Test
public void testMultipleFilesFailOnSingleWrongChecksum() throws Exception {
    String fileName = "000000_0";
    String fileName2 = "000000_1";
    String fileName3 = "000000_2";
    Mockito.when(fileSystem.getFileChecksum(any(Path.class)))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff01")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff02")));
    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "," + fileEntry + "," + fileEntry + "]",
                    fileName, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=",
                    fileName2, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=",
                    fileName3, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AgAAAAA="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    InOrder inOrder = Mockito.inOrder(fileSystem);
    inOrder.verify(fileSystem).getFileChecksum(new Path(fileName));
    inOrder.verify(fileSystem).getFileChecksum(new Path(fileName2));
    inOrder.verifyNoMoreInteractions();
}
From source file: com.thinkbiganalytics.nifi.v2.hdfs.ComputeHDFSChecksumsTest.java
License: Apache License
@Test
public void testMultipleFilesWithDirectoryDefined() throws Exception {
    String fileName = "000000_0";
    String fileName2 = "000000_1";
    String fileName3 = "000000_2";
    String directory = "/dropzone";
    Mockito.when(fileSystem.getFileChecksum(any(Path.class)))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff00")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff01")))
            .thenReturn(new MD5MD5CRC32FileChecksum(0, 512, new MD5Hash("112233445566778899aabbccddeeff02")));
    runner.setProperty(ComputeHDFSChecksums.DIRECTORY, directory);
    runner.setProperty(ComputeHDFSChecksums.FAIL_IF_INCORRECT_CHECKSUM, "True");
    runner.setProperty(ComputeHDFSChecksums.FILES,
            String.format("[" + fileEntry + "," + fileEntry + "," + fileEntry + "]",
                    fileName, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AAAAAAA=",
                    fileName2, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AQAAAAA=",
                    fileName3, "AAAAAAAAAAAAAAIAESIzRFVmd4iZqrvM3e7/AgAAAAA="));
    runner.enqueue(new byte[0]);
    runner.run();

    // Check relationships
    Assert.assertEquals(0, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_FAILURE).size());
    Assert.assertEquals(1, runner.getFlowFilesForRelationship(ComputeHDFSChecksums.REL_SUCCESS).size());

    // Check file system calls
    InOrder inOrder = Mockito.inOrder(fileSystem);
    inOrder.verify(fileSystem).getFileChecksum(new Path(directory, fileName));
    inOrder.verify(fileSystem).getFileChecksum(new Path(directory, fileName2));
    inOrder.verify(fileSystem).getFileChecksum(new Path(directory, fileName3));
    inOrder.verifyNoMoreInteractions();
}
From source file: org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License: Open Source License
@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
        throws IOException {
    if (url.getLength() == 0) {
        LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
        return;
    }
    try {
        // allocate parse output
        ParseOutput parseOutput = new ParseOutput();
        // json object out ...
        JsonObject jsonObj = new JsonObject();
        // and create a crawl metadata
        CrawlMetadata metadata = parseOutput.getCrawlMetadata();
        // and content (if available) ...
        Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

        URL originalURL = null;
        try {
            originalURL = new URL(url.toString());
        } catch (MalformedURLException e) {
            LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
            return;
        }
        URL finalURL = originalURL;

        jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
        metadata.setAttemptTime(value.getLastAttemptTime());

        // first step write status
        jsonObj.addProperty("disposition",
                (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
        metadata.setCrawlDisposition(
                (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

        // deal with redirects ...
        if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
            jsonObj.add("redirect_from", redirect.e1);
            finalURL = redirect.e0;
        }

        if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
            jsonObj.addProperty("failure_reason",
                    CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
            metadata.setFailureReason(value.getLastAttemptFailureReason());
            jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
            metadata.setFailureDetail(value.getLastAttemptFailureDetail());
        } else {
            jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
            metadata.setServerIP(value.getServerIP());
            jsonObj.addProperty("http_result", value.getResultCode());
            metadata.setHttpResult(value.getResultCode());
            jsonObj.add("http_headers",
                    httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
            metadata.setHttpHeaders(value.getHeaders());
            jsonObj.addProperty("content_len", value.getContentRaw().getCount());
            metadata.setContentLength(value.getContentRaw().getCount());
            if (value.getResultCode() >= 200 && value.getResultCode() <= 299
                    && value.getContentRaw().getCount() > 0) {
                contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
            }
        }

        // ok ... write stuff out ...
        reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);

        //////////////////////////////////////////////////////////////
        // echo some stuff to parseOutput ...
        parseOutput.setMetadata(jsonObj.toString());
        JsonElement mimeType = jsonObj.get("mime_type");
        if (mimeType != null) {
            parseOutput.setNormalizedMimeType(mimeType.getAsString());
        }
        JsonElement md5 = jsonObj.get("md5");
        if (md5 != null) {
            MD5Hash hash = new MD5Hash(md5.getAsString());
            byte[] bytes = hash.getDigest();
            parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
        }
        JsonElement simHash = jsonObj.get("text_simhash");
        if (simHash != null) {
            parseOutput.setSimHash(simHash.getAsLong());
        }
        parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
        parseOutput.setFetchTime(value.getLastAttemptTime());
        ////////////////////////////////////////////////////////////

        if (contentOut != null) {
            if (contentOut.e0 != null) {
                parseOutput.setTextContent(contentOut.e0);
                reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
            }
            if (contentOut.e1 != null) {
                // directly set the text bytes ...
                parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
                // mark it dirty !!!
                parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
                // if content available ...
                if (contentOut.e1.e1 != null) {
                    parseOutput.setRawContent(contentOut.e1.e1);
                }
                reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
            }
        }

        //buildCompactMetadata(parseOutput,jsonObj,urlMap);
        output.collect(new Text(finalURL.toString()), parseOutput);
    } catch (IOException e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
        //TODO:HACK
        //throw e;
    } catch (Exception e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
        //TODO: HACK
        //throw new IOException(e);
    }
}
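Since the mapper above goes from the hex string stored in JSON back to raw digest bytes via getDigest(), here is a minimal round-trip sketch (illustrative only, not from the sources above) tying the hex-String constructor used in the tests and the mapper to the byte[] constructor this page documents:

import org.apache.hadoop.io.MD5Hash;

public class Md5RoundTripSketch {
    public static void main(String[] args) {
        // hex-string constructor, as used in the test and mapper examples above
        MD5Hash fromHex = new MD5Hash("112233445566778899aabbccddeeff00");
        byte[] raw = fromHex.getDigest();     // the underlying 16 digest bytes
        MD5Hash roundTrip = new MD5Hash(raw); // byte[] constructor from this page
        System.out.println(fromHex.equals(roundTrip)); // true
    }
}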