Example usage for the org.apache.hadoop.io.compress.bzip2.CBZip2InputStream constructor

List of usage examples for the org.apache.hadoop.io.compress.bzip2.CBZip2InputStream constructor

Introduction

On this page you can find example usages of the org.apache.hadoop.io.compress.bzip2.CBZip2InputStream constructor.

Prototype

public CBZip2InputStream(final InputStream in) throws IOException 

Source Link

Usage

From source file:Importer.java

License:Open Source License

/**
 * Converts one text file (optionally bzip2-compressed) into a block-compressed SequenceFile.
 *
 * <p>The input is read line by line as US-ASCII. Each line, followed by a single space, is appended
 * to the current document until a line starting with {@code ---END.OF.DOCUMENT---} is seen; the
 * accumulated text is then written as one record keyed by {@code hash(content)}. Trailing text
 * longer than five bytes is written as a final record. The output is created at the absolute path
 * of {@code outDir/<file name>.seq} on the file system named by {@code fs.default.name}, and the
 * running totals {@code totalBytes} and {@code totalRecords} are updated on success.
 *
 * <p>All streams opened here are closed even when reading or writing fails.
 *
 * @param file the input file; a name ending in {@code .bz2} is decompressed on the fly
 * @throws Exception if the file system cannot be reached, the input cannot be read (including a
 *         {@code .bz2} file that does not start with the {@code BZ} magic), or a record cannot be
 *         written
 */
public static void copyFile(File file) throws Exception {
    File destFile = new File(outDir, file.getName() + ".seq");
    Path dest = new Path(destFile.getAbsolutePath());

    Configuration conf = new Configuration();
    FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")),
            conf);
    CompressionCodec codec = new DefaultCodec();
    fileSys.mkdirs(dest.getParent());

    FSDataOutputStream outputStr = null;
    InputStream in = null;
    boolean writerOpened = false;
    try {
        outputStr = fileSys.create(dest);
        seqFileWriter = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class,
                SequenceFile.CompressionType.BLOCK, codec);
        writerOpened = true;

        String filename = file.getName();
        in = new BufferedInputStream(new FileInputStream(file));
        if (filename.endsWith(".bz2")) {
            // CBZip2InputStream expects the caller to have consumed the two-byte "BZ" magic already.
            int magic1 = in.read();
            int magic2 = in.read();
            if (magic1 != 'B' || magic2 != 'Z') {
                throw new java.io.IOException("Not a bzip2 file (missing 'BZ' header): " + file);
            }
            in = new CBZip2InputStream(in);
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII"));

        System.out.println("working on file " + file);
        int records = 0;
        long bytes = 0;
        long startTime = System.currentTimeMillis();
        String s = null;
        Text content = new Text();
        while ((s = br.readLine()) != null) {
            if (s.startsWith("---END.OF.DOCUMENT---")) {
                Text name = new Text(hash(content));
                seqFileWriter.append(name, content);
                records++;
                content = new Text();
            } else {
                // Encode explicitly instead of relying on the platform default charset; for pure
                // ASCII input the resulting bytes are identical on every platform.
                byte[] line_as_bytes = (s + " ").getBytes("UTF-8");
                for (byte b : line_as_bytes) {
                    // Java bytes are signed, so a set high bit shows up as a negative value.
                    assert b >= 0 : "found an unexpected high-bit set";
                }

                content.append(line_as_bytes, 0, line_as_bytes.length);
                bytes += line_as_bytes.length;
            }
        } //end while
        if (content.getLength() > 5) {
            Text name = new Text(hash(content));
            seqFileWriter.append(name, content);
            records++;
        }
        totalBytes += bytes;
        totalRecords += records;
        // "+ 1" keeps the divisor non-zero for files processed in under a second.
        long time = (System.currentTimeMillis() - startTime) / 1000 + 1;
        long kbSec = bytes / 1024 / time;
        System.out.println(new java.util.Date());
        System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time
                + " seconds (" + kbSec + " KB/sec).");
    } finally {
        // Close in the same order as before (input, writer, raw output), but make sure a failure
        // in one close does not prevent the remaining resources from being released.
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            try {
                if (writerOpened) {
                    seqFileWriter.close();
                }
            } finally {
                if (outputStr != null) {
                    outputStr.close();
                }
            }
        }
    }
}

From source file:bme.iclef.hadoop.file2seq.TarToSeqFile.java

License:Apache License

/**
 * Opens {@code inputFile} as a tar stream, transparently decompressing it based on its file name.
 *
 * <p>Names ending in {@code .tar.gz}/{@code .tgz} are gunzipped, names ending in
 * {@code .tar.bz2}/{@code .tbz2} are bunzipped, and anything else is assumed to be a plain tar file.
 * If setting up the stream chain fails, the underlying file stream is closed before the exception
 * propagates.
 *
 * @return a tar stream positioned at the first entry
 * @throws Exception if the file cannot be opened, is not a valid gzip stream, or claims to be
 *         bzip2 but does not start with the {@code BZ} magic
 */
private TarInputStream openInputFile() throws Exception {
    InputStream fileStream = new FileInputStream(inputFile);
    boolean opened = false;
    try {
        String name = inputFile.getName();
        InputStream theStream;
        if (name.endsWith(".tar.gz") || name.endsWith(".tgz")) {
            theStream = new GZIPInputStream(fileStream);
        } else if (name.endsWith(".tar.bz2") || name.endsWith(".tbz2")) {
            /*
             * Consume the "BZ" header added by bzip2. read() is used instead of skip(2) because
             * skip() is allowed to skip fewer bytes than requested.
             */
            int magic1 = fileStream.read();
            int magic2 = fileStream.read();
            if (magic1 != 'B' || magic2 != 'Z') {
                throw new java.io.IOException("Not a bzip2 file (missing 'BZ' header): " + inputFile);
            }
            /* Buffered so the decompressor's reads do not each go straight to the file stream. */
            theStream = new CBZip2InputStream(new java.io.BufferedInputStream(fileStream));
        } else {
            /* Assume uncompressed tar file. */
            theStream = fileStream;
        }
        TarInputStream tarStream = new TarInputStream(theStream);
        opened = true;
        return tarStream;
    } finally {
        if (!opened) {
            try {
                fileStream.close();
            } catch (java.io.IOException ignored) {
                // The original failure is more useful to the caller than a close error.
            }
        }
    }
}

From source file:com.endgame.binarypig.util.BuildSequenceFileFromArchive.java

License:Apache License

/**
 * Writes every regular file contained in an archive into a record-compressed SequenceFile.
 *
 * <p>The output is {@code outputDir/<archive name>.seq}. Each entry becomes one record whose key is
 * the MD5 hex digest of the entry's bytes and whose value is the bytes themselves. Supported
 * formats, chosen by the lower-cased archive path, are {@code .tar.gz}/{@code .tgz},
 * {@code .tar.bz}/{@code .tar.bz2}/{@code .tbz}, {@code .tar} and {@code .zip}. If reading an
 * entry's contents fails with an {@link IOException}, a warning is printed and processing stops
 * with the records written so far.
 *
 * @param fs        file system on which the SequenceFile is created
 * @param conf      Hadoop configuration used to create the writer
 * @param archive   local archive to read
 * @param outputDir directory that receives the {@code .seq} file
 * @throws RuntimeException if the archive's extension is not one of the supported formats
 * @throws Exception if the archive or the SequenceFile cannot be opened
 */
public void load(FileSystem fs, Configuration conf, File archive, Path outputDir) throws Exception {
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    SequenceFile.Writer writer = null;
    ArchiveInputStream archiveInputStream = null;
    // Tracked separately so it can be closed if wrapping it in an archive stream fails.
    FileInputStream rawInput = null;

    try {
        Path sequenceName = new Path(outputDir, archive.getName() + ".seq");
        System.out.println("Writing to " + sequenceName);
        writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class, BytesWritable.class,
                CompressionType.RECORD);
        // Locale.ROOT keeps extension matching independent of the default locale.
        String lowerName = archive.toString().toLowerCase(java.util.Locale.ROOT);

        if (lowerName.endsWith(".tar.gz") || lowerName.endsWith(".tgz")) {
            rawInput = new FileInputStream(archive);
            archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("tar",
                    new GZIPInputStream(rawInput));
        } else if (lowerName.endsWith(".tar.bz") || lowerName.endsWith(".tar.bz2")
                || lowerName.endsWith(".tbz")) {
            rawInput = new FileInputStream(archive);
            // CBZip2InputStream expects the caller to have consumed the two-byte "BZ" magic already.
            int magic1 = rawInput.read();
            int magic2 = rawInput.read();
            if (magic1 != 'B' || magic2 != 'Z') {
                throw new IOException("Not a bzip2 file (missing 'BZ' header): " + archive);
            }
            // Buffered so the decompressor's reads do not each go straight to the file stream.
            archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("tar",
                    new CBZip2InputStream(new java.io.BufferedInputStream(rawInput)));
        } else if (lowerName.endsWith(".tar")) {
            rawInput = new FileInputStream(archive);
            archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("tar", rawInput);
        } else if (lowerName.endsWith(".zip")) {
            rawInput = new FileInputStream(archive);
            archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("zip", rawInput);
        } else {
            throw new RuntimeException("Can't handle archive format for: " + archive);
        }

        ArchiveEntry entry = null;
        while ((entry = archiveInputStream.getNextEntry()) != null) {
            if (!entry.isDirectory()) {
                try {
                    byte[] outputFile = IOUtils.toByteArray(archiveInputStream);
                    val.set(outputFile, 0, outputFile.length);
                    key.set(DigestUtils.md5Hex(outputFile));

                    writer.append(key, val);
                } catch (IOException e) {
                    System.err.println("Warning: archive may be truncated: " + archive);
                    // Truncated Archive
                    break;
                }
            }
        }
    } finally {
        // Either resource may still be null if setup failed; guard so the original exception is
        // not replaced by a NullPointerException, and always attempt to close the writer.
        try {
            if (archiveInputStream != null) {
                archiveInputStream.close();
            } else if (rawInput != null) {
                rawInput.close();
            }
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }
}

From source file:com.mucommander.commons.file.impl.bzip2.Bzip2ArchiveFile.java

License:Open Source License

/**
 * Returns a stream of the decompressed contents of this bzip2 file.
 *
 * <p>Neither {@code entry} nor {@code entryIterator} is consulted; the whole underlying stream is
 * decompressed.
 *
 * @param entry         unused
 * @param entryIterator unused
 * @return a stream that decompresses the underlying file
 * @throws IOException if the underlying stream cannot be opened or is not valid bzip2 data; the
 *         original failure is attached as the cause
 */
@Override
public InputStream getEntryInputStream(ArchiveEntry entry, ArchiveEntryIterator entryIterator)
        throws IOException {
    InputStream in = null;
    try {
        in = getInputStream();

        // Skips the 2 magic bytes 'BZ', as required by CBZip2InputStream. Quoted from CBZip2InputStream's Javadoc:
        // "Although BZip2 headers are marked with the magic 'Bz'. this constructor expects the next byte in the
        // stream to be the first one after the magic.  Thus callers have to skip the first two bytes. Otherwise
        // this constructor will throw an exception."
        // Note: the return value of read() is unchecked. In the unlikely event that EOF is reached in the first
        // 2 bytes, CBZip2InputStream will throw an IOException.
        in.read();
        in.read();

        // Quoted from CBZip2InputStream's Javadoc:
        // "CBZip2InputStream reads bytes from the compressed source stream via the single byte {@link java.io.InputStream#read()
        // read()} method exclusively. Thus you should consider to use a buffered source stream."
        return new CBZip2InputStream(new BufferedInputStream(in));
    } catch (Exception e) {
        // CBZip2InputStream is known to throw NullPointerException if file is not properly Bzip2-encoded
        // so we need to catch those and throw them as IOException
        LOGGER.info("Exception caught while creating CBZip2InputStream, throwing IOException", e);

        // No stream is handed back to the caller on this path, so release the one opened above.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // The original failure is more useful to the caller than a close error.
            }
        }

        // Keep the cause so callers can see why the stream could not be created.
        throw new IOException("Unable to create bzip2 input stream", e);
    }
}

From source file:com.mucommander.commons.file.impl.tar.TarArchiveFile.java

License:Open Source License

/**
 * Returns a TarInputStream which can be used to read TAR entries.
 *
 * @param entryOffset offset from the start of the archive to an entry. Must be a multiple of recordSize, or
 * <code>0</code> to start at the first entry.
 * @return a TarInputStream which can be used to read TAR entries
 * @throws IOException if an error occurred while create the stream
 * @throws UnsupportedFileOperationException if this operation is not supported by the underlying filesystem,
 * or is not implemented./*from w w w  .  j a  v  a 2s . c om*/
 */
private TarInputStream createTarStream(long entryOffset) throws IOException, UnsupportedFileOperationException {
    InputStream in = file.getInputStream();

    String name = getName();
    // Gzip-compressed file
    if (StringUtils.endsWithIgnoreCase(name, "tgz") || StringUtils.endsWithIgnoreCase(name, "tar.gz"))
        // Note: this will fail for gz/tgz entries inside a tar file (IOException: Not in GZIP format),
        // why is a complete mystery: the gz/tgz entry can be extracted and then properly browsed
        in = new GZIPInputStream(in);

    // Bzip2-compressed file
    else if (StringUtils.endsWithIgnoreCase(name, "tbz2") || StringUtils.endsWithIgnoreCase(name, "tar.bz2")) {
        try {
            // Skips the 2 magic bytes 'BZ', as required by CBZip2InputStream. Quoted from CBZip2InputStream's Javadoc:
            // "Although BZip2 headers are marked with the magic 'Bz'. this constructor expects the next byte in the
            // stream to be the first one after the magic.  Thus callers have to skip the first two bytes. Otherwise
            // this constructor will throw an exception."
            StreamUtils.skipFully(in, 2);

            // Quoted from CBZip2InputStream's Javadoc:
            // "CBZip2InputStream reads bytes from the compressed source stream via the single byte {@link java.io.InputStream#read()
            // read()} method exclusively. Thus you should consider to use a buffered source stream."
            in = new CBZip2InputStream(new BufferedInputStream(in));
        } catch (Exception e) {
            // CBZip2InputStream is known to throw NullPointerException if file is not properly Bzip2-encoded
            // so we need to catch those and throw them as IOException
            LOGGER.info("Exception caught while creating CBZip2InputStream, throwing IOException", e);

            throw new IOException();
        }
    }

    return new TarInputStream(in, entryOffset);
}

From source file:io.covert.binary.analysis.BuildSequenceFileFromTarball.java

License:Apache License

/**
 * Writes every regular file contained in a tarball into a record-compressed SequenceFile.
 *
 * <p>The output is {@code outputDir/<tarball name>.seq}. Each entry becomes one record whose key is
 * the lower-case MD5 hex digest of the entry's bytes and whose value is the bytes themselves. The
 * tarball is gunzipped when its path ends in {@code .gz} and bunzipped when it ends in
 * {@code .bz} or {@code .bz2} (all matched case-insensitively); otherwise it is read as a plain
 * tar. If copying an entry's contents fails with an {@link IOException}, a warning is printed and
 * processing stops with the records written so far. The input stream and the writer are closed
 * whether or not processing succeeds.
 *
 * @param fs           file system on which the SequenceFile is created
 * @param conf         Hadoop configuration used to create the writer
 * @param inputTarball local tarball to read
 * @param outputDir    directory that receives the {@code .seq} file
 * @throws Exception if the tarball or the SequenceFile cannot be opened, or the tarball claims to
 *         be bzip2 but does not start with the {@code BZ} magic
 */
public void load(FileSystem fs, Configuration conf, File inputTarball, Path outputDir) throws Exception {
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Path sequenceName = new Path(outputDir, inputTarball.getName() + ".seq");
    System.out.println("Writing to " + sequenceName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
            BytesWritable.class, CompressionType.RECORD);

    // Always refers to the outermost stream built so far, so closing it releases the whole chain.
    InputStream is = null;
    try {
        is = new FileInputStream(inputTarball);
        // Lower-case once so every extension check is case-insensitive, including ".bz2".
        String lowerName = inputTarball.toString().toLowerCase(java.util.Locale.ROOT);
        if (lowerName.endsWith(".gz")) {
            is = new GZIPInputStream(is);
        } else if (lowerName.endsWith(".bz") || lowerName.endsWith(".bz2")) {
            // CBZip2InputStream expects the caller to have consumed the two-byte "BZ" magic already.
            int magic1 = is.read();
            int magic2 = is.read();
            if (magic1 != 'B' || magic2 != 'Z') {
                throw new IOException("Not a bzip2 file (missing 'BZ' header): " + inputTarball);
            }
            // Buffered so the decompressor's reads do not each go straight to the file stream.
            is = new CBZip2InputStream(new java.io.BufferedInputStream(is));
        }

        final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
                .createArchiveInputStream("tar", is);
        is = debInputStream;

        // digest() resets the instance, so one MessageDigest can be reused for every entry.
        MessageDigest md = MessageDigest.getInstance("MD5");
        TarArchiveEntry entry = null;
        while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
            if (!entry.isDirectory()) {

                try {
                    final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                    IOUtils.copy(debInputStream, outputFileStream);
                    outputFileStream.close();
                    byte[] outputFile = outputFileStream.toByteArray();
                    val.set(outputFile, 0, outputFile.length);

                    byte[] digest = md.digest(outputFile);
                    StringBuilder hexdigest = new StringBuilder(digest.length * 2);
                    for (int i = 0; i < digest.length; i++) {
                        // Adding 0x100 and dropping the leading digit zero-pads each byte to two hex chars.
                        hexdigest.append(Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1));
                    }
                    key.set(hexdigest.toString());
                    writer.append(key, val);
                } catch (IOException e) {
                    System.err.println("Warning: tarball may be truncated: " + inputTarball);
                    // Truncated Tarball
                    break;
                }
            }
        }
    } finally {
        try {
            if (is != null) {
                is.close();
            }
        } finally {
            writer.close();
        }
    }
}