Example usage for org.apache.hadoop.io MD5Hash digest

List of usage examples for org.apache.hadoop.io MD5Hash digest

Introduction

On this page you can find example usages of org.apache.hadoop.io MD5Hash digest.

Prototype

null digest

To view the source code for org.apache.hadoop.io MD5Hash digest:

Click Source Link

Usage

From source file:co.nubetech.hiho.dedup.HashUtility.java

License:Apache License

/**
 * Hashes the textual form of a {@code BooleanWritable}.
 *
 * @param key the writable whose {@code toString()} output is digested
 * @return the value produced by {@code MD5Hash.digest} for that string
 * @throws IOException declared by the signature for callers to handle
 */
public static MD5Hash getMD5Hash(BooleanWritable key) throws IOException {
    final String text = key.toString();
    return MD5Hash.digest(text);
}

From source file:co.nubetech.hiho.dedup.HashUtility.java

License:Apache License

/**
 * Hashes the textual form of a {@code FloatWritable}.
 *
 * @param key the writable whose {@code toString()} output is digested
 * @return the value produced by {@code MD5Hash.digest} for that string
 * @throws IOException declared by the signature for callers to handle
 */
public static MD5Hash getMD5Hash(FloatWritable key) throws IOException {
    final String text = key.toString();
    return MD5Hash.digest(text);
}

From source file:co.nubetech.hiho.dedup.HashUtility.java

License:Apache License

/**
 * Hashes the textual form of a {@code ByteWritable}.
 *
 * @param key the writable whose {@code toString()} output is digested
 * @return the value produced by {@code MD5Hash.digest} for that string
 * @throws IOException declared by the signature for callers to handle
 */
public static MD5Hash getMD5Hash(ByteWritable key) throws IOException {
    final String text = key.toString();
    return MD5Hash.digest(text);
}

From source file:co.nubetech.hiho.dedup.HashUtility.java

License:Apache License

/**
 * Hashes the textual form of a {@code DoubleWritable}.
 *
 * @param key the writable whose {@code toString()} output is digested
 * @return the value produced by {@code MD5Hash.digest} for that string
 * @throws IOException declared by the signature for callers to handle
 */
public static MD5Hash getMD5Hash(DoubleWritable key) throws IOException {
    final String text = key.toString();
    return MD5Hash.digest(text);
}

From source file:com.iflytek.spider.crawl.MD5Signature.java

License:Apache License

/**
 * Computes the MD5 signature of the raw content bytes.
 *
 * @param content the content object; only {@code getContent()} is read
 * @param parse   not used by this implementation
 * @return the digest bytes of the MD5 hash of {@code content.getContent()}
 */
public byte[] calculate(Content content, Parse parse) {
    // NOTE(review): the bytes are handed to MD5Hash.digest as-is; there is no
    // fallback here for a null content array.
    final byte[] rawBytes = content.getContent();
    final MD5Hash hash = MD5Hash.digest(rawBytes);
    return hash.getDigest();
}

From source file:com.lightboxtechnologies.spectrum.BlockHashMapper.java

License:Apache License

/**
 * Emits one (offset, MD5) pair for every BLOCK_SIZE-byte piece of the split.
 *
 * The split is walked in steps of {@code Buffer.length}: each step fills
 * {@code Buffer} from the image file at the current offset, then digests the
 * buffer in consecutive {@code BLOCK_SIZE} slices and writes each digest keyed
 * by its absolute offset.
 *
 * @param k       unused key
 * @param split   the file split supplying the path, start offset and length
 * @param context the task context used for status, configuration and output
 * @throws IOException          declared for the file system and image reads
 * @throws InterruptedException declared for {@code context.write}
 */
@Override
protected void map(NullWritable k, FileSplit split, Context context) throws IOException, InterruptedException {
    final long first = split.getStart();
    final long limit = first + split.getLength();
    LOG.info("startOffset = " + first + "; endOffset = " + limit);
    context.setStatus("Offset " + first);

    final FileSystem fs = FileSystem.get(context.getConfiguration());
    openImgFile(split.getPath(), fs);

    long blockCount = 0;
    // Advance a whole buffer at a time; the step does not depend on the limit,
    // so the final read always requests a full buffer.
    for (long chunkStart = first; chunkStart < limit; chunkStart += Buffer.length) {
        ImgFile.readFully(chunkStart, Buffer);
        int within = 0;
        while (within < Buffer.length) {
            BlockOffset.set(chunkStart + within);
            context.write(BlockOffset, MD5Hash.digest(Buffer, within, BLOCK_SIZE));
            blockCount++;
            within += BLOCK_SIZE;
        }
    }
    LOG.info("This split had " + blockCount + " blocks in it");
}

From source file:com.mellanox.r4h.DFSClient.java

License:Apache License

/**
 * Get the checksum of the whole file of a range of the file. Note that the
 * range always starts from the beginning of the file.
 * /*from   w w w.  ja va  2  s.  c  om*/
 * @param src
 *            The file path
 * @param length
 *            the length of the range, i.e., the range is [0, length]
 * @return The checksum
 * @see DistributedFileSystem#getFileChecksum(Path)
 */
public MD5MD5CRC32FileChecksum getFileChecksum(String src, long length) throws IOException {
    checkOpen();
    Preconditions.checkArgument(length >= 0);
    // get block locations for the file range
    LocatedBlocks blockLocations = callGetBlockLocations(namenode, src, 0, length);
    if (null == blockLocations) {
        throw new FileNotFoundException("File does not exist: " + src);
    }
    List<LocatedBlock> locatedblocks = blockLocations.getLocatedBlocks();
    final DataOutputBuffer md5out = new DataOutputBuffer();
    int bytesPerCRC = -1;
    DataChecksum.Type crcType = DataChecksum.Type.DEFAULT;
    long crcPerBlock = 0;
    boolean refetchBlocks = false;
    int lastRetriedIndex = -1;

    // get block checksum for each block
    long remaining = length;
    if (src.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
        remaining = Math.min(length, blockLocations.getFileLength());
    }
    for (int i = 0; i < locatedblocks.size() && remaining > 0; i++) {
        if (refetchBlocks) { // refetch to get fresh tokens
            blockLocations = callGetBlockLocations(namenode, src, 0, length);
            if (null == blockLocations) {
                throw new FileNotFoundException("File does not exist: " + src);
            }
            locatedblocks = blockLocations.getLocatedBlocks();
            refetchBlocks = false;
        }
        LocatedBlock lb = locatedblocks.get(i);
        final ExtendedBlock block = lb.getBlock();
        if (remaining < block.getNumBytes()) {
            block.setNumBytes(remaining);
        }
        remaining -= block.getNumBytes();
        final DatanodeInfo[] datanodes = lb.getLocations();

        // try each datanode location of the block
        final int timeout = 3000 * datanodes.length + dfsClientConf.socketTimeout();
        boolean done = false;
        for (int j = 0; !done && j < datanodes.length; j++) {
            DataOutputStream out = null;
            DataInputStream in = null;

            try {
                // connect to a datanode
                IOStreamPair pair = connectToDN(datanodes[j], timeout, lb);
                out = new DataOutputStream(new BufferedOutputStream(pair.out, HdfsConstants.SMALL_BUFFER_SIZE));
                in = new DataInputStream(pair.in);

                if (LOG.isDebugEnabled()) {
                    LOG.debug("write to " + datanodes[j] + ": " + Op.BLOCK_CHECKSUM + ", block=" + block);
                }
                // get block MD5
                new Sender(out).blockChecksum(block, lb.getBlockToken());

                final BlockOpResponseProto reply = BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));

                String logInfo = "for block " + block + " from datanode " + datanodes[j];
                DataTransferProtoUtil.checkBlockOpStatus(reply, logInfo);

                OpBlockChecksumResponseProto checksumData = reply.getChecksumResponse();

                // read byte-per-checksum
                final int bpc = checksumData.getBytesPerCrc();
                if (i == 0) { // first block
                    bytesPerCRC = bpc;
                } else if (bpc != bytesPerCRC) {
                    throw new IOException(
                            "Byte-per-checksum not matched: bpc=" + bpc + " but bytesPerCRC=" + bytesPerCRC);
                }

                // read crc-per-block
                final long cpb = checksumData.getCrcPerBlock();
                if (locatedblocks.size() > 1 && i == 0) {
                    crcPerBlock = cpb;
                }

                // read md5
                final MD5Hash md5 = new MD5Hash(checksumData.getMd5().toByteArray());
                md5.write(md5out);

                // read crc-type
                final DataChecksum.Type ct;
                if (checksumData.hasCrcType()) {
                    ct = PBHelper.convert(checksumData.getCrcType());
                } else {
                    LOG.debug("Retrieving checksum from an earlier-version DataNode: "
                            + "inferring checksum by reading first byte");
                    ct = inferChecksumTypeByReading(lb, datanodes[j]);
                }

                if (i == 0) { // first block
                    crcType = ct;
                } else if (crcType != DataChecksum.Type.MIXED && crcType != ct) {
                    // if crc types are mixed in a file
                    crcType = DataChecksum.Type.MIXED;
                }

                done = true;

                if (LOG.isDebugEnabled()) {
                    if (i == 0) {
                        LOG.debug("set bytesPerCRC=" + bytesPerCRC + ", crcPerBlock=" + crcPerBlock);
                    }
                    LOG.debug("got reply from " + datanodes[j] + ": md5=" + md5);
                }
            } catch (InvalidBlockTokenException ibte) {
                if (i > lastRetriedIndex) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Got access token error in response to OP_BLOCK_CHECKSUM " + "for file " + src
                                + " for block " + block + " from datanode " + datanodes[j]
                                + ". Will retry the block once.");
                    }
                    lastRetriedIndex = i;
                    done = true; // actually it's not done; but we'll retry
                    i--; // repeat at i-th block
                    refetchBlocks = true;
                    break;
                }
            } catch (IOException ie) {
                LOG.warn("src=" + src + ", datanodes[" + j + "]=" + datanodes[j], ie);
            } finally {
                IOUtils.closeStream(in);
                IOUtils.closeStream(out);
            }
        }

        if (!done) {
            throw new IOException("Fail to get block MD5 for " + block);
        }
    }

    // compute file MD5
    final MD5Hash fileMD5 = MD5Hash.digest(md5out.getData());
    switch (crcType) {
    case CRC32:
        return new MD5MD5CRC32GzipFileChecksum(bytesPerCRC, crcPerBlock, fileMD5);
    case CRC32C:
        return new MD5MD5CRC32CastagnoliFileChecksum(bytesPerCRC, crcPerBlock, fileMD5);
    default:
        // If there is no block allocated for the file,
        // return one with the magic entry that matches what previous
        // hdfs versions return.
        if (locatedblocks.size() == 0) {
            return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
        }

        // we should never get here since the validity was checked
        // when getCrcType() was called above.
        return null;
    }
}

From source file:com.shmsoft.dmass.main.FileProcessor.java

License:Apache License

/**
 * Builds the MD5 key used for a file.
 *
 * For files whose original name has the "eml" extension (case-insensitive),
 * the key is the MD5 of the configured metadata fields, each non-null value
 * followed by a single space. For every other file the key is the MD5 of the
 * file's bytes.
 *
 * @param fileName         path of the file whose bytes are hashed in the non-eml case
 * @param metadata         metadata supplying the field values hashed in the eml case
 * @param originalFileName name used only to determine the extension
 * @return the MD5 hash serving as the key
 * @throws IOException if the file cannot be opened, read or closed
 */
public static MD5Hash createKeyHash(String fileName, Metadata metadata, String originalFileName)
        throws IOException {
    String extension = Util.getExtension(originalFileName);

    if ("eml".equalsIgnoreCase(extension)) {
        String hashNames = EmailProperties.getInstance().getProperty(EmailProperties.EMAIL_HASH_NAMES);
        String[] hashNamesArr = hashNames.split(",");

        // purely local accumulator, so the unsynchronized builder suffices
        StringBuilder data = new StringBuilder();

        for (String hashName : hashNamesArr) {
            String value = metadata.get(hashName);
            if (value != null) {
                data.append(value);
                data.append(" ");
            }
        }

        return MD5Hash.digest(data.toString());
    }

    // use MD5 of the input file as Hadoop key
    FileInputStream fileInputStream = new FileInputStream(fileName);
    try {
        return MD5Hash.digest(fileInputStream);
    } finally {
        // close even when digest() throws, so the file handle is not leaked
        fileInputStream.close();
    }
}

From source file:com.shmsoft.dmass.main.ZipFileProcessor.java

License:Apache License

/**
 * Emit the map with all metadata, native, and text.
 *
 * The output key is the MD5 of the file's bytes. On *nix platforms the pair
 * is written to the task context; otherwise it is handed to
 * {@code WindowsReduce}. Nothing is emitted when the project reports that
 * the item should be skipped.
 *
 * @param fileName path of the file whose bytes are hashed to form the key
 * @param metadata metadata converted into the emitted {@code MapWritable}
 * @throws IOException if the file cannot be opened, read or closed
 * @throws InterruptedException declared for the context write
 */
@SuppressWarnings("unchecked")
private void emitAsMap(String fileName, Metadata metadata) throws IOException, InterruptedException {
    Project project = Project.getProject();
    if (project.checkSkip()) {
        return;
    }
    System.out.println("emitAsMap: fileName = " + fileName + " metadata = " + metadata.toString());
    MapWritable mapWritable = createMapWritable(metadata);

    MD5Hash key;
    FileInputStream fileInputStream = new FileInputStream(fileName);
    try {
        key = MD5Hash.digest(fileInputStream);
    } finally {
        // the stream was previously never closed, leaking one handle per call
        fileInputStream.close();
    }

    if (PlatformUtil.isNix()) {
        getContext().write(key, mapWritable);
        getContext().progress();
    } else {
        List<MapWritable> values = new ArrayList<MapWritable>();
        values.add(mapWritable);
        WindowsReduce.getInstance().reduce(key, values, null);
    }
    // update stats
    Stats.getInstance().increaseItemCount();
}

From source file:com.siriuser.weibocrawl.MD5Signature.java

License:Apache License

/**
 * Computes the hexadecimal MD5 signature of a piece of text.
 *
 * The text is encoded as UTF-8 before hashing so that the same content
 * yields the same signature on every JVM. The no-argument
 * {@code getBytes()} depends on the platform default charset.
 * NOTE(review): signatures previously computed on a host whose default
 * charset is not UTF-8 will differ for non-ASCII content; confirm whether
 * any stored signatures need recomputing.
 *
 * @param content the text to sign
 * @return the MD5 digest rendered by {@code toHexString}
 */
public String calculate(String content) {

    byte[] data = content.getBytes(java.nio.charset.Charset.forName("UTF-8"));

    return toHexString(MD5Hash.digest(data).getDigest());
}