List of usage examples for org.apache.hadoop.io.MD5Hash.digest

The examples below, collected from open-source projects, show how MD5Hash.digest is called on input streams, byte arrays (with and without an offset/length), and strings.
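As a quick orientation before the examples, here is a minimal, self-contained sketch of the digest overloads they rely on (the class name and input values are made up for illustration):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.MD5Hash;

public class MD5HashDigestDemo {
    public static void main(String[] args) throws IOException {
        byte[] data = "hello world".getBytes("UTF-8");

        // digest a whole byte array
        MD5Hash h1 = MD5Hash.digest(data);

        // digest a slice of a byte array (offset/length overload)
        MD5Hash h2 = MD5Hash.digest(data, 0, data.length);

        // digest the UTF-8 bytes of a String
        MD5Hash h3 = MD5Hash.digest("hello world");

        // digest an InputStream, reading it to the end
        InputStream in = new ByteArrayInputStream(data);
        MD5Hash h4 = MD5Hash.digest(in);

        // all four produce the same 16-byte (MD5Hash.MD5_LEN) digest;
        // toString() renders it as a 32-character hex string
        System.out.println(h1 + " " + h1.equals(h4));
    }
}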
From source file: com.tripadvisor.hadoop.ExternalHDFSChecksumGenerator.java
License: Apache License
/**
 * Calculates the HDFS-style checksum for a local file in the same way that
 * HDFS does it, in a parallel fashion, on all of the blocks in HDFS.
 *
 * @param strPath     path of the local file to checksum
 * @param bytesPerCRC number of bytes covered by each CRC
 * @param lBlockSize  HDFS block size to emulate
 * @return the MD5-of-MD5s-of-CRC32s checksum
 * @throws IOException
 */
public MD5MD5CRC32FileChecksum getLocalFilesystemHDFSStyleChecksum(String strPath, int bytesPerCRC,
        long lBlockSize) throws IOException {
    long lFileSize = 0;
    int iBlockCount = 0;
    DataOutputBuffer md5outDataBuffer = new DataOutputBuffer();
    DataChecksum chksm = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_CRC32, 512);
    InputStream in = null;
    MD5MD5CRC32FileChecksum returnChecksum = null;
    long crc_per_block = lBlockSize / bytesPerCRC;

    java.io.File file = new java.io.File(strPath);
    lFileSize = file.length();
    iBlockCount = (int) Math.ceil((double) lFileSize / (double) lBlockSize);

    if (file.isDirectory()) {
        throw new IOException("Cannot compute local hdfs hash, " + strPath + " is a directory! ");
    }

    try {
        in = new FileInputStream(file);
        long lTotalBytesRead = 0;

        for (int x = 0; x < iBlockCount; x++) {
            ByteArrayOutputStream ar_CRC_Bytes = new ByteArrayOutputStream();
            byte[] crc = new byte[4];
            byte[] buf = new byte[512];

            try {
                int bytesRead = 0;
                while ((bytesRead = in.read(buf)) > 0) {
                    lTotalBytesRead += bytesRead;
                    chksm.reset();
                    chksm.update(buf, 0, bytesRead);
                    chksm.writeValue(crc, 0, true);
                    ar_CRC_Bytes.write(crc);
                    if (lTotalBytesRead >= (x + 1) * lBlockSize) {
                        break;
                    }
                } // while

                DataInputStream inputStream = new DataInputStream(
                        new ByteArrayInputStream(ar_CRC_Bytes.toByteArray()));

                // this actually computes one block's MD5 --- run on the server
                // (DataXceiver) side
                final MD5Hash md5_dataxceiver = MD5Hash.digest(inputStream);
                md5_dataxceiver.write(md5outDataBuffer);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            }
        } // for

        // this is in 0.19.0 style with the extra padding bug
        final MD5Hash md5_of_md5 = MD5Hash.digest(md5outDataBuffer.getData());
        returnChecksum = new MD5MD5CRC32FileChecksum(bytesPerCRC, crc_per_block, md5_of_md5);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (in != null) {
            in.close();
        }
    } // try

    return returnChecksum;
}
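A possible invocation of the helper above, assuming the class has a usable no-arg constructor (an assumption; only this method is shown). The parameter values mirror common HDFS defaults and must match the cluster that produced the remote checksum:

// hypothetical caller; the file path and parameter values are illustrative
ExternalHDFSChecksumGenerator gen = new ExternalHDFSChecksumGenerator();
MD5MD5CRC32FileChecksum local = gen.getLocalFilesystemHDFSStyleChecksum(
        "/tmp/local-copy.dat",  // local file to hash
        512,                    // bytesPerCRC: the cluster's io.bytes.per.checksum
        64L * 1024 * 1024);     // lBlockSize: the HDFS block size of the remote file
// comparable against FileSystem#getFileChecksum(path) for the HDFS copy
System.out.println(local);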
From source file: edu.umd.lib.hadoopapps.MD5Sum.java
License: Apache License
/**
 * Runs the app.
 */
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        usage();
        return -1;
    }
    Path filePath = new Path(args[0]);
    FileSystem hdfs = FileSystem.get(new Configuration());
    if (hdfs.exists(filePath)) {
        InputStream in = hdfs.open(filePath);
        try {
            // digest reads the stream to EOF
            MD5Hash md5hash = MD5Hash.digest(in);
            System.out.println("MD5Sum: " + md5hash.toString());
        } finally {
            in.close();
        }
    } else {
        System.out.println("Invalid path!");
    }
    return 0;
}
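Since run(String[]) follows the Hadoop Tool convention (an assumption; the class declaration is not shown), the app would typically be launched through ToolRunner so that generic options such as -conf are parsed:

// hypothetical entry point; assumes MD5Sum implements org.apache.hadoop.util.Tool
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new Configuration(), new MD5Sum(), args));
}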
From source file: org.apache.nutch.crawl.MD5Signature.java
License: Apache License
@Override
public byte[] calculate(WebPage page) {
    ByteBuffer buf = page.getContent();
    byte[] data;
    int of;
    int cb;
    if (buf == null) {
        Utf8 baseUrl = (Utf8) page.getBaseUrl();
        if (baseUrl == null) {
            data = null;
            of = 0;
            cb = 0;
        } else {
            data = baseUrl.getBytes();
            of = 0;
            cb = baseUrl.length();
        }
    } else {
        data = buf.array();
        of = buf.arrayOffset() + buf.position();
        cb = buf.remaining();
    }
    return MD5Hash.digest(data, of, cb).getDigest();
}
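The buf.array()/arrayOffset()/position() arithmetic hands the buffer's readable region to the offset/length overload of digest without copying it first. A small sketch of the equivalence (the buffer contents are made up):

// wrap positions the buffer over a slice of the backing array
ByteBuffer buf = ByteBuffer.wrap("xxhelloyy".getBytes(), 2, 5); // readable region: "hello"

MD5Hash inPlace = MD5Hash.digest(buf.array(), buf.arrayOffset() + buf.position(), buf.remaining());
MD5Hash copied = MD5Hash.digest("hello".getBytes());
// both digest the same five bytes
assert inPlace.equals(copied);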
From source file: org.apache.nutch.crawl.TextMD5Signature.java
License: Apache License
@Override
public byte[] calculate(WebPage page) {
    CharSequence text = page.getText();
    if (text == null || text.length() == 0) {
        return fallback.calculate(page);
    }
    return MD5Hash.digest(text.toString()).getDigest();
}
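The point of hashing the extracted text instead of the raw content is that two captures of a page whose markup changed but whose text did not will deliberately collide. A trivial sketch of the underlying property (the strings are illustrative):

// same extracted text => same signature, even if the raw HTML differed
byte[] sigA = MD5Hash.digest("Breaking news: it rained").getDigest();
byte[] sigB = MD5Hash.digest("Breaking news: it rained").getDigest();
assert java.util.Arrays.equals(sigA, sigB); // 16 bytes each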
From source file: org.apache.nutch.crawl.TextProfileSignature.java
License: Apache License
@Override
public byte[] calculate(WebPage page) {
    int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
    float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
    HashMap<String, Token> tokens = new HashMap<String, Token>();
    String text = null;
    if (page.getText() != null)
        text = page.getText().toString();
    if (text == null || text.length() == 0)
        return fallback.calculate(page);
    StringBuffer curToken = new StringBuffer();
    int maxFreq = 0;
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (Character.isLetterOrDigit(c)) {
            curToken.append(Character.toLowerCase(c));
        } else {
            if (curToken.length() > 0) {
                if (curToken.length() > MIN_TOKEN_LEN) {
                    // add it
                    String s = curToken.toString();
                    Token tok = tokens.get(s);
                    if (tok == null) {
                        tok = new Token(0, s);
                        tokens.put(s, tok);
                    }
                    tok.cnt++;
                    if (tok.cnt > maxFreq)
                        maxFreq = tok.cnt;
                }
                curToken.setLength(0);
            }
        }
    }
    // check the last token
    if (curToken.length() > MIN_TOKEN_LEN) {
        // add it
        String s = curToken.toString();
        Token tok = tokens.get(s);
        if (tok == null) {
            tok = new Token(0, s);
            tokens.put(s, tok);
        }
        tok.cnt++;
        if (tok.cnt > maxFreq)
            maxFreq = tok.cnt;
    }
    Iterator<Token> it = tokens.values().iterator();
    ArrayList<Token> profile = new ArrayList<Token>();
    // calculate the QUANT value
    int QUANT = Math.round(maxFreq * QUANT_RATE);
    if (QUANT < 2) {
        if (maxFreq > 1)
            QUANT = 2;
        else
            QUANT = 1;
    }
    while (it.hasNext()) {
        Token t = it.next();
        // round down to the nearest QUANT
        t.cnt = (t.cnt / QUANT) * QUANT;
        // discard the frequencies below the QUANT
        if (t.cnt < QUANT) {
            continue;
        }
        profile.add(t);
    }
    Collections.sort(profile, new TokenComparator());
    StringBuffer newText = new StringBuffer();
    it = profile.iterator();
    while (it.hasNext()) {
        Token t = it.next();
        if (newText.length() > 0)
            newText.append("\n");
        newText.append(t.toString());
    }
    return MD5Hash.digest(newText.toString()).getDigest();
}
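The quantization step is what makes this profile robust to small edits: token frequencies are rounded down to a multiple of QUANT before the profile is digested, so near-identical pages produce identical signatures. A standalone sketch of just that rounding (the counts are made up):

int maxFreq = 100;
float QUANT_RATE = 0.01f;                        // db.signature.text_profile.quant_rate default
int QUANT = Math.round(maxFreq * QUANT_RATE);    // 1
if (QUANT < 2) QUANT = (maxFreq > 1) ? 2 : 1;    // bumped to 2

int[] counts = { 1, 2, 3, 7, 100 };
for (int cnt : counts) {
    int quantized = (cnt / QUANT) * QUANT;       // round down to nearest multiple of QUANT
    if (quantized < QUANT) continue;             // 1 -> 0: token dropped from the profile
    System.out.println(cnt + " -> " + quantized); // 2->2, 3->2, 7->6, 100->100
}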
From source file: org.apache.nutch.indexer.blacklight.BlacklightIndexingFilter.java
License: Apache License
/**
 * The {@link BasicIndexingFilter} filter object which supports a few
 * configuration settings for adding basic searchable fields.
 * See {@code indexer.add.domain}, {@code indexer.max.title.length},
 * {@code indexer.max.content.length} in nutch-default.xml.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
        throws IndexingException {
    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
    String urlString = url.toString();

    // add a URL digest as the document id: the MD5 hex avoids problems with
    // dots inside the URL (the natural id) in Blacklight/Ruby
    /* doc.add("digest", digest); */

    if (doc.getField("offerdate") == null)
        return null;

    Object pdObject = null;
    String pub_date_t = null;
    if (doc.getField("offerdate") != null)
        pdObject = doc.getField("offerdate").getValues().get(0);
    try {
        if (pdObject == null)
            pdObject = new Date(0);
        if (pdObject instanceof String) {
            pdObject = (Date) new SimpleDateFormat("dd.MM.yyyy").parseObject((String) pdObject);
        }
        pub_date_t = new SimpleDateFormat("dd/MM/yyyy").format(pdObject);
    } catch (ParseException e) {
        pdObject = new Date(0);
        LOG.warn("failed to parse pub_date for blacklight");
    }
    Date pubDate = (Date) pdObject;
    long pubdateSort = pubDate.getTime() / 1000;
    doc.add("pub_date_sort", pubdateSort);
    doc.add("pub_date_t", pub_date_t);
    doc.removeField("offerdate");

    String cleanedUrl = ((String) doc.getField("url").getValues().get(0)).replaceFirst("http(s)?://", "");
    if (doc.getField("Location") != null) {
        cleanedUrl = ((String) (doc.getField("Location").getValues().get(0))).replaceFirst("http(s)?://", "");
        doc.removeField("Location");
    }
    doc.getField("url").getValues().set(0, cleanedUrl);

    String digest = StringUtil.toHexString(MD5Hash.digest(cleanedUrl).getDigest());
    doc.getField("digest").getValues().set(0, digest);

    if (doc.getField("host") != null) {
        String host = (String) doc.getField("host").getValues().get(0);
        String host_facet = host.split("\\.")[1];
        doc.add("host_facet", host_facet);
    }

    // TODO: specify remove cache flag in conf
    doc.removeField("cache");
    // getSpotlightAnnotations(doc);

    /* migrated to the schema
    // sort fields
    if (key.equals("jobplace_t") || key.equals("societyname_t")) {
        String fieldFacet = (String) val;
        inputDoc.addField(key.replace("_t", "_facet"), fieldFacet);
    }
    // facets
    if (key.equals("offertitle_t") || key.equals("societyname_t")) {
        String fieldSort = (String) val;
        inputDoc.addField(key.replace("_t", "_sort"), fieldSort);
    }
    // display
    if (key.endsWith("_t")) {
        String fieldDisplay = (String) val;
        inputDoc.addField(key.replace("_t", "_display"), fieldDisplay);
    }
    */

    return doc;
}
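The digest-as-document-id trick used here can be reproduced in isolation; Nutch's StringUtil.toHexString and MD5Hash's own toString() both yield the 32-character hex form (the URL is illustrative):

String cleanedUrl = "https://www.example.com/jobs/123".replaceFirst("http(s)?://", "");
// => "www.example.com/jobs/123"

// hex digest used as a dot-free, Blacklight-safe document id
String digest = StringUtil.toHexString(MD5Hash.digest(cleanedUrl).getDigest());
// equivalently: MD5Hash.digest(cleanedUrl).toString()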
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst)
        throws Exception {
    Path idx = new Path(root, name);
    Path sub = new Path(idx, "part-0000");
    Directory dir = FSDirectory.getDirectory(sub.toString());
    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
    Document doc = makeDoc(name, MD5Hash.digest("1").toString(), "http://www.example.com/1",
            1.0f + (incFirst ? inc : 0.0f), time);
    writer.addDocument(doc);
    if (hashDup) {
        // same hash, different URL
        doc = makeDoc(name, MD5Hash.digest("1").toString(), "http://www.example.com/2",
                1.0f + (!incFirst ? inc : 0.0f), time + 1);
    } else {
        // different hash, same URL
        doc = makeDoc(name, MD5Hash.digest("2").toString(), "http://www.example.com/1",
                1.0f + (!incFirst ? inc : 0.0f), time + 1);
    }
    writer.addDocument(doc);
    writer.close();
    return idx;
}
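These fixtures lean on MD5Hash.digest being deterministic, so the tests below can recompute the expected "digest" field value at assertion time rather than storing it:

// deterministic: the same input always yields the same hex digest
String expected = MD5Hash.digest("1").toString();
// "c4ca4238a0b923820dcc509a6f75849b" (the MD5 of the ASCII string "1")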
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
    Path idx = new Path(root, name);
    Path sub = new Path(idx, "part-0000");
    Directory dir = FSDirectory.getDirectory(sub.toString());
    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
    Document doc = makeDoc(name, MD5Hash.digest("1").toString(), "http://www.example.com/1", 1.0f + inc,
            time + 1);
    writer.addDocument(doc);
    writer.close();
    return idx;
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file: org.apache.nutch.indexer.TestDeleteDuplicates.java
License: Apache License
public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}