List of usage examples for org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream
public CBZip2OutputStream(final OutputStream out, final int blockSize) throws IOException
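The second argument is the bzip2 block size in units of 100 kB; valid values are 1 through 9, mirroring the command-line tool's -1 through -9 flags. One caveat worth knowing before reading the examples below: like the Ant code it derives from, CBZip2OutputStream does not write the two-byte "BZ" magic header itself. Hadoop's BZip2Codec writes those bytes before constructing the stream, and standalone callers who want files readable by the bzip2 tool generally must do the same. A minimal, hedged sketch (not taken from the source files below; the file name and payload are illustrative):

import java.io.FileOutputStream;
import java.io.OutputStream;
import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;

public class Bzip2WriteSketch {
    public static void main(String[] args) throws Exception {
        try (OutputStream raw = new FileOutputStream("example.bz2")) {
            // The "BZ" magic is the caller's responsibility (assumption:
            // matching the Ant-derived behavior described above).
            raw.write('B');
            raw.write('Z');
            CBZip2OutputStream bz = new CBZip2OutputStream(raw, 9); // 9 => 900 kB blocks
            bz.write("hello, bzip2".getBytes("UTF-8"));
            bz.close(); // finishes the compressed stream and closes raw
        }
    }
}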
From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License: Apache License
/** Compress bytes with CBZip2OutputStream at the given 100 kB block size (1-9). */
private static byte[] bzip2(byte[] bytes, int bsize) throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    CBZip2OutputStream c = new CBZip2OutputStream(os, bsize);
    c.write(bytes);
    c.finish(); // emit the final block and stream trailer without closing os
    c.flush();
    return os.toByteArray();
}
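For reference, output produced by this helper (which writes no "BZ" header) can be read back with the matching CBZip2InputStream from the same package. A hedged counterpart, assuming the single-argument constructor of older Hadoop releases (newer releases also take a READ_MODE parameter); the method name bunzip2 is illustrative:

private static byte[] bunzip2(byte[] compressed) throws IOException {
    // No "BZ" header to skip, matching the bzip2() helper above.
    CBZip2InputStream in = new CBZip2InputStream(new ByteArrayInputStream(compressed));
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    byte[] buf = new byte[4096];
    for (int n = in.read(buf); n > 0; n = in.read(buf)) {
        out.write(buf, 0, n);
    }
    return out.toByteArray();
}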
From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License: Apache License
@Test
public void testSplitCompressed() throws IOException {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path txtFile = new Path(dir, "testSplitCompressed.bz2");
    fs.delete(dir, true);
    StreamWikiDumpInputFormat.setInputPaths(job, dir);
    // Repeat with several bzip2 block sizes (units of 100 kB).
    for (int bsize : new int[] { 1, 5, 9 }) {
        // Write a synthetic wiki dump through CBZip2OutputStream.
        OutputStreamWriter writer = new OutputStreamWriter(new CBZip2OutputStream(fs.create(txtFile), bsize));
        int pagenum = 0;
        int revnum = 0;
        try {
            writer.write("<page><revision>AB</revision> \n" + "<revision>C</revision> <revisio"
                    + "n>DER</revision></page> <page><revision>" + "long-long-long-long-long-long-long-long-"
                    + "long-long-long-revision. </revision></pa");
            pagenum += 2;
            revnum += 4;
            for (Integer len : new Integer[] { 2000, 81920, 5001, 2002, 1003, 1004, 1005 }) {
                writer.write("ge> <page><revision>long-long-long-long-");
                for (int i = 0; i < len; ++i) {
                    writer.write(upperCaseRegion(String.format("long-long-long--No%5d/%5d-long-long ", i, len),
                            rand(40), rand(40)));
                    writer.write(upperCaseRegion(String.format("long revision</revision>\n<revision>%5d", i), 0, 0));
                    writer.write(upperCaseRegion(String.format("long-long-long-long-%4d-long-long-long-", rand(1000)),
                            rand(40), rand(40)));
                }
                writer.write("long-long-long-revision. </revision></pa");
                revnum += len;
            }
            ++pagenum;
            writer.write("ge>\n");
        } finally {
            writer.flush();
            writer.close();
        }
        StreamWikiDumpInputFormat format = new StreamWikiDumpInputFormat();
        format.configure(job);
        // Every split must contain whole <page> elements, and together the
        // splits must cover the whole file.
        for (Integer len : new Integer[] { 10000, 1000, 80000 }) {
            int bcount = 0;
            int ecount = 0;
            long size = 0;
            for (InputSplit is : format.getSplits(job, fs.getFileStatus(txtFile), "</page>", len)) {
                FileSplit split = (FileSplit) is;
                System.err.println("split " + len + ": " + split);
                size += split.getLength();
                String str = new String(read(split, job));
                str = str.trim();
                if (str.length() == 0)
                    continue;
                System.err.println("str: " + snip(str, 200));
                assertTrue("no </page> in \"" + snip(str, 200) + "\"", str.indexOf("</page>") >= 0);
                assertTrue("no <page> in \"" + snip(str, 200) + "\"", str.indexOf("<page>") >= 0);
                bcount += count(str, "<page>");
                ecount += count(str, "</page>");
            }
            assertTrue("total size is too small: expected: " + fs.getFileStatus(txtFile).getLen() + ", found: "
                    + size, fs.getFileStatus(txtFile).getLen() <= size);
            assertTrue("number of page beginnings is too small: expected: " + pagenum + ", found: " + bcount,
                    pagenum <= bcount);
            assertTrue("number of page endings is too small: expected: " + pagenum + ", found: " + ecount,
                    pagenum <= ecount);
        }
        // Each revision must be produced at least once, regardless of the
        // number of mappers.
        for (Integer n : new Integer[] { 1, 2 }) {
            List<String> found_ = collect(format, job, n, Reporter.NULL);
            Set<String> found = new HashSet<String>(found_);
            assertTrue("number of revisions is too small: expected: " + revnum + ", found: " + found.size(),
                    revnum <= found.size());
        }
    }
}
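A closing note on the design: this test constructs CBZip2OutputStream directly, presumably so it can vary the block size per iteration. Application code more commonly goes through org.apache.hadoop.io.compress.BZip2Codec, which writes the "BZ" header and wraps these streams internally. A hedged sketch (not from the test above; assumes a Hadoop 2.x-era BZip2Codec, which implements Configurable):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionOutputStream;

public class CodecWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        BZip2Codec codec = new BZip2Codec();
        codec.setConf(conf); // assumption: Configurable codec, as in Hadoop 2.x
        CompressionOutputStream cos = codec.createOutputStream(fs.create(new Path("codec.bz2")));
        cos.write("hello via BZip2Codec".getBytes("UTF-8"));
        cos.finish();
        cos.close();
    }
}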