List of usage examples for org.apache.hadoop.io.MD5Hash.digest

The examples below, collected from open-source projects, show how MD5Hash.digest is called on input streams, byte arrays (with and without an offset/length), and strings.
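As a quick orientation before the examples, here is a minimal, self-contained sketch of the digest overloads they rely on (the class name and input values are made up for illustration):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.MD5Hash;

public class MD5HashDigestDemo {
    public static void main(String[] args) throws IOException {
        byte[] data = "hello world".getBytes("UTF-8");

        // digest a whole byte array
        MD5Hash h1 = MD5Hash.digest(data);

        // digest a slice of a byte array (offset/length overload)
        MD5Hash h2 = MD5Hash.digest(data, 0, data.length);

        // digest the UTF-8 bytes of a String
        MD5Hash h3 = MD5Hash.digest("hello world");

        // digest an InputStream, reading it to the end
        InputStream in = new ByteArrayInputStream(data);
        MD5Hash h4 = MD5Hash.digest(in);

        // all four produce the same 16-byte (MD5Hash.MD5_LEN) digest;
        // toString() renders it as a 32-character hex string
        System.out.println(h1 + " " + h1.equals(h4));
    }
}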
From source file: com.tripadvisor.hadoop.ExternalHDFSChecksumGenerator.java
License: Apache License
/**
 * Calculates the HDFS-style checksum for a local file in the same way that
 * HDFS does it, in a parallel fashion, on all of the blocks in HDFS.
 *
 * @param strPath     path of the local file to checksum
 * @param bytesPerCRC number of bytes covered by each CRC
 * @param lBlockSize  HDFS block size to emulate
 * @return the MD5-of-MD5s-of-CRC32s checksum
 * @throws IOException
 */
public MD5MD5CRC32FileChecksum getLocalFilesystemHDFSStyleChecksum(String strPath, int bytesPerCRC,
        long lBlockSize) throws IOException {
    long lFileSize = 0;
    int iBlockCount = 0;
    DataOutputBuffer md5outDataBuffer = new DataOutputBuffer();
    DataChecksum chksm = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_CRC32, 512);
    InputStream in = null;
    MD5MD5CRC32FileChecksum returnChecksum = null;
    long crc_per_block = lBlockSize / bytesPerCRC;

    java.io.File file = new java.io.File(strPath);
    lFileSize = file.length();
    iBlockCount = (int) Math.ceil((double) lFileSize / (double) lBlockSize);

    if (file.isDirectory()) {
        throw new IOException("Cannot compute local hdfs hash, " + strPath + " is a directory! ");
    }

    try {
        in = new FileInputStream(file);
        long lTotalBytesRead = 0;

        for (int x = 0; x < iBlockCount; x++) {
            ByteArrayOutputStream ar_CRC_Bytes = new ByteArrayOutputStream();
            byte[] crc = new byte[4];
            byte[] buf = new byte[512];

            try {
                int bytesRead = 0;
                while ((bytesRead = in.read(buf)) > 0) {
                    lTotalBytesRead += bytesRead;
                    chksm.reset();
                    chksm.update(buf, 0, bytesRead);
                    chksm.writeValue(crc, 0, true);
                    ar_CRC_Bytes.write(crc);
                    if (lTotalBytesRead >= (x + 1) * lBlockSize) {
                        break;
                    }
                } // while

                DataInputStream inputStream = new DataInputStream(
                        new ByteArrayInputStream(ar_CRC_Bytes.toByteArray()));

                // this actually computes one block's MD5 --- run on the server
                // (DataXceiver) side
                final MD5Hash md5_dataxceiver = MD5Hash.digest(inputStream);
                md5_dataxceiver.write(md5outDataBuffer);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            }
        } // for

        // this is in 0.19.0 style with the extra padding bug
        final MD5Hash md5_of_md5 = MD5Hash.digest(md5outDataBuffer.getData());
        returnChecksum = new MD5MD5CRC32FileChecksum(bytesPerCRC, crc_per_block, md5_of_md5);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (in != null) {
            in.close();
        }
    } // try

    return returnChecksum;
}
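A possible invocation of the helper above, assuming the class has a usable no-arg constructor (an assumption; only this method is shown). The parameter values mirror common HDFS defaults and must match the cluster that produced the remote checksum:

// hypothetical caller; the file path and parameter values are illustrative
ExternalHDFSChecksumGenerator gen = new ExternalHDFSChecksumGenerator();
MD5MD5CRC32FileChecksum local = gen.getLocalFilesystemHDFSStyleChecksum(
        "/tmp/local-copy.dat",  // local file to hash
        512,                    // bytesPerCRC: the cluster's io.bytes.per.checksum
        64L * 1024 * 1024);     // lBlockSize: the HDFS block size of the remote file
// comparable against FileSystem#getFileChecksum(path) for the HDFS copy
System.out.println(local);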
From source file: edu.umd.lib.hadoopapps.MD5Sum.java
License: Apache License
/**
 * Runs the app.
 */
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        usage();
        return -1;
    }
    Path filePath = new Path(args[0]);
    FileSystem hdfs = FileSystem.get(new Configuration());
    if (hdfs.exists(filePath)) {
        InputStream in = hdfs.open(filePath);
        try {
            // digest reads the stream to EOF
            MD5Hash md5hash = MD5Hash.digest(in);
            System.out.println("MD5Sum: " + md5hash.toString());
        } finally {
            in.close();
        }
    } else {
        System.out.println("Invalid path!");
    }
    return 0;
}
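Since run(String[]) follows the Hadoop Tool convention (an assumption; the class declaration is not shown), the app would typically be launched through ToolRunner so that generic options such as -conf are parsed:

// hypothetical entry point; assumes MD5Sum implements org.apache.hadoop.util.Tool
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new Configuration(), new MD5Sum(), args));
}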
From source file: org.apache.nutch.crawl.MD5Signature.java
License: Apache License
@Override
public byte[] calculate(WebPage page) {
    ByteBuffer buf = page.getContent();
    byte[] data;
    int of;
    int cb;
    if (buf == null) {
        Utf8 baseUrl = (Utf8) page.getBaseUrl();
        if (baseUrl == null) {
            data = null;
            of = 0;
            cb = 0;
        } else {
            data = baseUrl.getBytes();
            of = 0;
            cb = baseUrl.length();
        }
    } else {
        data = buf.array();
        of = buf.arrayOffset() + buf.position();
        cb = buf.remaining();
    }
    return MD5Hash.digest(data, of, cb).getDigest();
}
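The buf.array()/arrayOffset()/position() arithmetic hands the buffer's readable region to the offset/length overload of digest without copying it first. A small sketch of the equivalence (the buffer contents are made up):

// wrap positions the buffer over a slice of the backing array
ByteBuffer buf = ByteBuffer.wrap("xxhelloyy".getBytes(), 2, 5); // readable region: "hello"

MD5Hash inPlace = MD5Hash.digest(buf.array(), buf.arrayOffset() + buf.position(), buf.remaining());
MD5Hash copied = MD5Hash.digest("hello".getBytes());
// both digest the same five bytes
assert inPlace.equals(copied);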
From source file: org.apache.nutch.crawl.TextMD5Signature.java
License: Apache License
@Override
public byte[] calculate(WebPage page) {
    CharSequence text = page.getText();
    if (text == null || text.length() == 0) {
        return fallback.calculate(page);
    }
    return MD5Hash.digest(text.toString()).getDigest();
}
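The point of hashing the extracted text instead of the raw content is that two captures of a page whose markup changed but whose text did not will deliberately collide. A trivial sketch of the underlying property (the strings are illustrative):

// same extracted text => same signature, even if the raw HTML differed
byte[] sigA = MD5Hash.digest("Breaking news: it rained").getDigest();
byte[] sigB = MD5Hash.digest("Breaking news: it rained").getDigest();
assert java.util.Arrays.equals(sigA, sigB); // 16 bytes each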
From source file: org.apache.nutch.crawl.TextProfileSignature.java
License: Apache License
@Override
public byte[] calculate(WebPage page) {
    int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
    float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
    HashMap<String, Token> tokens = new HashMap<String, Token>();
    String text = null;
    if (page.getText() != null)
        text = page.getText().toString();
    if (text == null || text.length() == 0)
        return fallback.calculate(page);
    StringBuffer curToken = new StringBuffer();
    int maxFreq = 0;
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (Character.isLetterOrDigit(c)) {
            curToken.append(Character.toLowerCase(c));
        } else {
            if (curToken.length() > 0) {
                if (curToken.length() > MIN_TOKEN_LEN) {
                    // add it
                    String s = curToken.toString();
                    Token tok = tokens.get(s);
                    if (tok == null) {
                        tok = new Token(0, s);
                        tokens.put(s, tok);
                    }
                    tok.cnt++;
                    if (tok.cnt > maxFreq)
                        maxFreq = tok.cnt;
                }
                curToken.setLength(0);
            }
        }
    }
    // check the last token
    if (curToken.length() > MIN_TOKEN_LEN) {
        // add it
        String s = curToken.toString();
        Token tok = tokens.get(s);
        if (tok == null) {
            tok = new Token(0, s);
            tokens.put(s, tok);
        }
        tok.cnt++;
        if (tok.cnt > maxFreq)
            maxFreq = tok.cnt;
    }
    Iterator<Token> it = tokens.values().iterator();
    ArrayList<Token> profile = new ArrayList<Token>();
    // calculate the QUANT value
    int QUANT = Math.round(maxFreq * QUANT_RATE);
    if (QUANT < 2) {
        if (maxFreq > 1)
            QUANT = 2;
        else
            QUANT = 1;
    }
    while (it.hasNext()) {
        Token t = it.next();
        // round down to the nearest QUANT
        t.cnt = (t.cnt / QUANT) * QUANT;
        // discard the frequencies below the QUANT
        if (t.cnt < QUANT) {
            continue;
        }
        profile.add(t);
    }
    Collections.sort(profile, new TokenComparator());
    StringBuffer newText = new StringBuffer();
    it = profile.iterator();
    while (it.hasNext()) {
        Token t = it.next();
        if (newText.length() > 0)
            newText.append("\n");
        newText.append(t.toString());
    }
    return MD5Hash.digest(newText.toString()).getDigest();
}
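The quantization step is what makes this profile robust to small edits: token frequencies are rounded down to a multiple of QUANT before the profile is digested, so near-identical pages produce identical signatures. A standalone sketch of just that rounding (the counts are made up):

int maxFreq = 100;
float QUANT_RATE = 0.01f;                        // db.signature.text_profile.quant_rate default
int QUANT = Math.round(maxFreq * QUANT_RATE);    // 1
if (QUANT < 2) QUANT = (maxFreq > 1) ? 2 : 1;    // bumped to 2

int[] counts = { 1, 2, 3, 7, 100 };
for (int cnt : counts) {
    int quantized = (cnt / QUANT) * QUANT;       // round down to nearest multiple of QUANT
    if (quantized < QUANT) continue;             // 1 -> 0: token dropped from the profile
    System.out.println(cnt + " -> " + quantized); // 2->2, 3->2, 7->6, 100->100
}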
From source file: org.apache.nutch.indexer.blacklight.BlacklightIndexingFilter.java
License: Apache License
/**
 * The {@link BasicIndexingFilter} filter object which supports a few
 * configuration settings for adding basic searchable fields.
 * See {@code indexer.add.domain}, {@code indexer.max.title.length},
 * {@code indexer.max.content.length} in nutch-default.xml.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
        throws IndexingException {
    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
    String urlString = url.toString();

    // add a URL digest as the document id: the MD5 hex avoids problems with
    // dots inside the URL (the natural id) in Blacklight/Ruby
    /* doc.add("digest", digest); */

    if (doc.getField("offerdate") == null)
        return null;

    Object pdObject = null;
    String pub_date_t = null;
    if (doc.getField("offerdate") != null)
        pdObject = doc.getField("offerdate").getValues().get(0);
    try {
        if (pdObject == null)
            pdObject = new Date(0);
        if (pdObject instanceof String) {
            pdObject = (Date) new SimpleDateFormat("dd.MM.yyyy").parseObject((String) pdObject);
        }
        pub_date_t = new SimpleDateFormat("dd/MM/yyyy").format(pdObject);
    } catch (ParseException e) {
        pdObject = new Date(0);
        LOG.warn("failed to parse pub_date for blacklight");
    }
    Date pubDate = (Date) pdObject;
    long pubdateSort = pubDate.getTime() / 1000;
    doc.add("pub_date_sort", pubdateSort);
    doc.add("pub_date_t", pub_date_t);
    doc.removeField("offerdate");

    String cleanedUrl = ((String) doc.getField("url").getValues().get(0)).replaceFirst("http(s)?://", "");
    if (doc.getField("Location") != null) {
        cleanedUrl = ((String) (doc.getField("Location").getValues().get(0))).replaceFirst("http(s)?://", "");
        doc.removeField("Location");
    }
    doc.getField("url").getValues().set(0, cleanedUrl);

    String digest = StringUtil.toHexString(MD5Hash.digest(cleanedUrl).getDigest());
    doc.getField("digest").getValues().set(0, digest);

    if (doc.getField("host") != null) {
        String host = (String) doc.getField("host").getValues().get(0);
        String host_facet = host.split("\\.")[1];
        doc.add("host_facet", host_facet);
    }

    // TODO: specify remove cache flag in conf
    doc.removeField("cache");
    // getSpotlightAnnotations(doc);

    /* migrated to the schema
    // sort fields
    if (key.equals("jobplace_t") || key.equals("societyname_t")) {
        String fieldFacet = (String) val;
        inputDoc.addField(key.replace("_t", "_facet"), fieldFacet);
    }
    // facets
    if (key.equals("offertitle_t") || key.equals("societyname_t")) {
        String fieldSort = (String) val;
        inputDoc.addField(key.replace("_t", "_sort"), fieldSort);
    }
    // display
    if (key.endsWith("_t")) {
        String fieldDisplay = (String) val;
        inputDoc.addField(key.replace("_t", "_display"), fieldDisplay);
    }
    */

    return doc;
}
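The digest-as-document-id trick used here can be reproduced in isolation; Nutch's StringUtil.toHexString and MD5Hash's own toString() both yield the 32-character hex form (the URL is illustrative):

String cleanedUrl = "https://www.example.com/jobs/123".replaceFirst("http(s)?://", "");
// => "www.example.com/jobs/123"

// hex digest used as a dot-free, Blacklight-safe document id
String digest = StringUtil.toHexString(MD5Hash.digest(cleanedUrl).getDigest());
// equivalently: MD5Hash.digest(cleanedUrl).toString()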
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst)
        throws Exception {
    Path idx = new Path(root, name);
    Path sub = new Path(idx, "part-0000");
    Directory dir = FSDirectory.getDirectory(sub.toString());
    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
    Document doc = makeDoc(name, MD5Hash.digest("1").toString(), "http://www.example.com/1",
            1.0f + (incFirst ? inc : 0.0f), time);
    writer.addDocument(doc);
    if (hashDup) {
        // same hash, different URL
        doc = makeDoc(name, MD5Hash.digest("1").toString(), "http://www.example.com/2",
                1.0f + (!incFirst ? inc : 0.0f), time + 1);
    } else {
        // different hash, same URL
        doc = makeDoc(name, MD5Hash.digest("2").toString(), "http://www.example.com/1",
                1.0f + (!incFirst ? inc : 0.0f), time + 1);
    }
    writer.addDocument(doc);
    writer.close();
    return idx;
}
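These fixtures lean on MD5Hash.digest being deterministic, so the tests below can recompute the expected "digest" field value at assertion time rather than storing it:

// deterministic: the same input always yields the same hex digest
String expected = MD5Hash.digest("1").toString();
// "c4ca4238a0b923820dcc509a6f75849b" (the MD5 of the ASCII string "1")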
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
    Path idx = new Path(root, name);
    Path sub = new Path(idx, "part-0000");
    Directory dir = FSDirectory.getDirectory(sub.toString());
    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
    Document doc = makeDoc(name, MD5Hash.digest("1").toString(), "http://www.example.com/1", 1.0f + inc,
            time + 1);
    writer.addDocument(doc);
    writer.close();
    return idx;
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}