List of usage examples for org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream
public CBZip2OutputStream(final OutputStream out, final int blockSize) throws IOException
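The second argument is the bzip2 block size in units of 100 kB; valid values are 1 through 9, mirroring the command-line tool's -1 through -9 flags. One caveat worth knowing before reading the examples below: like the Ant code it derives from, CBZip2OutputStream does not write the two-byte "BZ" magic header itself. Hadoop's BZip2Codec writes those bytes before constructing the stream, and standalone callers who want files readable by the bzip2 tool generally must do the same. A minimal, hedged sketch (not taken from the source files below; the file name and payload are illustrative):

import java.io.FileOutputStream;
import java.io.OutputStream;
import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;

public class Bzip2WriteSketch {
    public static void main(String[] args) throws Exception {
        try (OutputStream raw = new FileOutputStream("example.bz2")) {
            // The "BZ" magic is the caller's responsibility (assumption:
            // matching the Ant-derived behavior described above).
            raw.write('B');
            raw.write('Z');
            CBZip2OutputStream bz = new CBZip2OutputStream(raw, 9); // 9 => 900 kB blocks
            bz.write("hello, bzip2".getBytes("UTF-8"));
            bz.close(); // finishes the compressed stream and closes raw
        }
    }
}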
From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License: Apache License
/** Compress bytes with CBZip2OutputStream at the given 100 kB block size (1-9). */
private static byte[] bzip2(byte[] bytes, int bsize) throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    CBZip2OutputStream c = new CBZip2OutputStream(os, bsize);
    c.write(bytes);
    c.finish(); // emit the final block and stream trailer without closing os
    c.flush();
    return os.toByteArray();
}
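For reference, output produced by this helper (which writes no "BZ" header) can be read back with the matching CBZip2InputStream from the same package. A hedged counterpart, assuming the single-argument constructor of older Hadoop releases (newer releases also take a READ_MODE parameter); the method name bunzip2 is illustrative:

private static byte[] bunzip2(byte[] compressed) throws IOException {
    // No "BZ" header to skip, matching the bzip2() helper above.
    CBZip2InputStream in = new CBZip2InputStream(new ByteArrayInputStream(compressed));
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    byte[] buf = new byte[4096];
    for (int n = in.read(buf); n > 0; n = in.read(buf)) {
        out.write(buf, 0, n);
    }
    return out.toByteArray();
}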
From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License: Apache License
@Test
public void testSplitCompressed() throws IOException {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path txtFile = new Path(dir, "testSplitCompressed.bz2");
    fs.delete(dir, true);
    StreamWikiDumpInputFormat.setInputPaths(job, dir);
    // Repeat with several bzip2 block sizes (units of 100 kB).
    for (int bsize : new int[] { 1, 5, 9 }) {
        // Write a synthetic wiki dump through CBZip2OutputStream.
        OutputStreamWriter writer = new OutputStreamWriter(new CBZip2OutputStream(fs.create(txtFile), bsize));
        int pagenum = 0;
        int revnum = 0;
        try {
            writer.write("<page><revision>AB</revision> \n" + "<revision>C</revision> <revisio"
                    + "n>DER</revision></page> <page><revision>" + "long-long-long-long-long-long-long-long-"
                    + "long-long-long-revision. </revision></pa");
            pagenum += 2;
            revnum += 4;
            for (Integer len : new Integer[] { 2000, 81920, 5001, 2002, 1003, 1004, 1005 }) {
                writer.write("ge> <page><revision>long-long-long-long-");
                for (int i = 0; i < len; ++i) {
                    writer.write(upperCaseRegion(String.format("long-long-long--No%5d/%5d-long-long ", i, len),
                            rand(40), rand(40)));
                    writer.write(upperCaseRegion(String.format("long revision</revision>\n<revision>%5d", i), 0, 0));
                    writer.write(upperCaseRegion(String.format("long-long-long-long-%4d-long-long-long-", rand(1000)),
                            rand(40), rand(40)));
                }
                writer.write("long-long-long-revision. </revision></pa");
                revnum += len;
            }
            ++pagenum;
            writer.write("ge>\n");
        } finally {
            writer.flush();
            writer.close();
        }
        StreamWikiDumpInputFormat format = new StreamWikiDumpInputFormat();
        format.configure(job);
        // Every split must contain whole <page> elements, and together the
        // splits must cover the whole file.
        for (Integer len : new Integer[] { 10000, 1000, 80000 }) {
            int bcount = 0;
            int ecount = 0;
            long size = 0;
            for (InputSplit is : format.getSplits(job, fs.getFileStatus(txtFile), "</page>", len)) {
                FileSplit split = (FileSplit) is;
                System.err.println("split " + len + ": " + split);
                size += split.getLength();
                String str = new String(read(split, job));
                str = str.trim();
                if (str.length() == 0)
                    continue;
                System.err.println("str: " + snip(str, 200));
                assertTrue("no </page> in \"" + snip(str, 200) + "\"", str.indexOf("</page>") >= 0);
                assertTrue("no <page> in \"" + snip(str, 200) + "\"", str.indexOf("<page>") >= 0);
                bcount += count(str, "<page>");
                ecount += count(str, "</page>");
            }
            assertTrue("total size is too small: expected: " + fs.getFileStatus(txtFile).getLen() + ", found: "
                    + size, fs.getFileStatus(txtFile).getLen() <= size);
            assertTrue("number of page beginnings is too small: expected: " + pagenum + ", found: " + bcount,
                    pagenum <= bcount);
            assertTrue("number of page endings is too small: expected: " + pagenum + ", found: " + ecount,
                    pagenum <= ecount);
        }
        // Each revision must be produced at least once, regardless of the
        // number of mappers.
        for (Integer n : new Integer[] { 1, 2 }) {
            List<String> found_ = collect(format, job, n, Reporter.NULL);
            Set<String> found = new HashSet<String>(found_);
            assertTrue("number of revisions is too small: expected: " + revnum + ", found: " + found.size(),
                    revnum <= found.size());
        }
    }
}
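A closing note on the design: this test constructs CBZip2OutputStream directly, presumably so it can vary the block size per iteration. Application code more commonly goes through org.apache.hadoop.io.compress.BZip2Codec, which writes the "BZ" header and wraps these streams internally. A hedged sketch (not from the test above; assumes a Hadoop 2.x-era BZip2Codec, which implements Configurable):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionOutputStream;

public class CodecWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        BZip2Codec codec = new BZip2Codec();
        codec.setConf(conf); // assumption: Configurable codec, as in Hadoop 2.x
        CompressionOutputStream cos = codec.createOutputStream(fs.create(new Path("codec.bz2")));
        cos.write("hello via BZip2Codec".getBytes("UTF-8"));
        cos.finish();
        cos.close();
    }
}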