com.hadoop.compression.lzo.LzopInputStream.java Source code

Introduction

Here is the source code for com.hadoop.compression.lzo.LzopInputStream.java
Source

/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

package com.hadoop.compression.lzo;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Map;
import java.util.zip.Adler32;
import java.util.zip.CRC32;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.compress.BlockDecompressorStream;
import org.apache.hadoop.io.compress.Decompressor;

/**
 * Decompressing input stream for lzop-format data.
 *
 * <p>The lzop file header is read and validated when the stream is constructed.
 * Afterwards data is consumed block by block: each block header (sizes and
 * checksums) is read here, the payload is handed to the {@link LzopDecompressor},
 * and the checksums recorded from the block header are verified against the
 * decompressor before the next block is loaded.
 */
public class LzopInputStream extends BlockDecompressorStream {

    private static final Log LOG = LogFactory.getLog(LzopInputStream.class);

    /** Upper bound on the scratch array used to skip variable-length header fields. */
    private static final int SKIP_CHUNK_SIZE = 4096;

    // Checksum kinds in effect for this stream. Both sets start out full and are
    // pruned in readHeader() according to the flag word of the lzop header.
    private final EnumSet<DChecksum> dflags = EnumSet.allOf(DChecksum.class);
    private final EnumSet<CChecksum> cflags = EnumSet.allOf(CChecksum.class);

    // Scratch buffer; 9 bytes so that it can hold the magic compared in readHeader().
    private final byte[] buf = new byte[9];
    // Checksum values for the current block, as read from its block header
    // (decompressed-data and compressed-data checksums respectively).
    private final EnumMap<DChecksum, Integer> dcheck = new EnumMap<DChecksum, Integer>(DChecksum.class);
    private final EnumMap<CChecksum, Integer> ccheck = new EnumMap<CChecksum, Integer>(CChecksum.class);

    // Bytes of the current block already returned to the caller.
    private int noUncompressedBytes = 0;
    // Bytes consumed from the underlying stream after the file header. Kept as a
    // long so that the counter does not wrap on streams larger than 2 GiB.
    private long noCompressedBytes = 0;
    // Uncompressed size of the current block, as declared in its block header.
    private int uncompressedBlockSize = 0;

    /**
     * Creates the stream and immediately reads the lzop header from {@code in}.
     *
     * <p>Note that {@link #readHeader(InputStream)} is overridable and is invoked
     * from this constructor, i.e. before a subclass constructor body has run.
     *
     * @param in the underlying compressed stream
     * @param decompressor the decompressor to use; it is cast to
     *        {@link LzopDecompressor} while the header is read
     * @param bufferSize buffer size passed on to the superclass
     * @throws IOException if the header cannot be read or is invalid
     */
    public LzopInputStream(InputStream in, Decompressor decompressor, int bufferSize) throws IOException {
        super(in, decompressor, bufferSize);
        readHeader(in);
    }

    /**
     * Reads len bytes in a loop.
     *
     * This is copied from IOUtils.readFully except that it throws an EOFException
     * instead of generic IOException on EOF.
     *
     * @param in The InputStream to read from
     * @param buf The buffer to fill
     * @param off offset from the buffer
     * @param len the length of bytes to read
     * @throws EOFException if the stream ends before len bytes were read
     * @throws IOException on any other read failure
     */
    private static void readFully(InputStream in, byte buf[], int off, int len) throws IOException, EOFException {
        int toRead = len;
        while (toRead > 0) {
            int ret = in.read(buf, off, toRead);
            if (ret < 0) {
                throw new EOFException("Premature EOF from inputStream");
            }
            toRead -= ret;
            off += ret;
        }
    }

    /**
     * Reads len bytes into buf and returns the leading bytes as a big-endian int.
     *
     * <p>For len of four or more, the first four bytes read form the result. For
     * a shorter read the len bytes are right-aligned, so that the least
     * significant byte of the result is the last byte read.
     *
     * @param in the stream to read from
     * @param buf scratch buffer; must hold at least max(4, len) bytes because
     *        indices 0..3 are always accessed
     * @param len number of bytes to read
     * @return the value assembled from the bytes read
     */
    private static int readInt(InputStream in, byte[] buf, int len) throws IOException {
        readFully(in, buf, 0, len);
        int ret = (0xFF & buf[0]) << 24;
        ret |= (0xFF & buf[1]) << 16;
        ret |= (0xFF & buf[2]) << 8;
        ret |= (0xFF & buf[3]);
        // For short reads the low bytes are stale buffer content; shift them out.
        return (len > 3) ? ret : (ret >>> (8 * (4 - len)));
    }

    /**
     * Read bytes, update checksums, return first four bytes as an int, first
     * byte read in the MSB. The scratch buffer is zeroed again before returning.
     */
    private static int readHeaderItem(InputStream in, byte[] buf, int len, Adler32 adler, CRC32 crc32)
            throws IOException {
        int ret = readInt(in, buf, len);
        adler.update(buf, 0, len);
        crc32.update(buf, 0, len);
        Arrays.fill(buf, (byte) 0);
        return ret;
    }

    /**
     * Consumes len bytes of header data whose content is not needed, feeding
     * them to both running header checksums.
     *
     * <p>The data is read through a bounded scratch array, so the length (which
     * comes from the file) never determines the size of an allocation.
     *
     * @param len number of bytes to consume; nothing is read when it is zero
     */
    private static void skipChecksummed(InputStream in, int len, Adler32 adler, CRC32 crc32)
            throws IOException {
        byte[] chunk = new byte[Math.min(len, SKIP_CHUNK_SIZE)];
        int remaining = len;
        while (remaining > 0) {
            int n = Math.min(remaining, chunk.length);
            readFully(in, chunk, 0, n);
            adler.update(chunk, 0, n);
            crc32.update(chunk, 0, n);
            remaining -= n;
        }
    }

    /**
     * Read and verify an lzo header, setting relevant block checksum options
     * and ignoring most everything else.
     *
     * @param in the stream positioned at the start of the lzop magic
     * @throws IOException if the magic, versions, method, flags or a header
     *         checksum are not acceptable, or if the stream ends prematurely
     */
    protected void readHeader(InputStream in) throws IOException {
        readFully(in, buf, 0, 9);
        if (!Arrays.equals(buf, LzopCodec.LZO_MAGIC)) {
            throw new IOException("Invalid LZO header");
        }
        Arrays.fill(buf, (byte) 0);
        Adler32 adler = new Adler32();
        CRC32 crc32 = new CRC32();
        int hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzop version
        if (hitem > LzopCodec.LZOP_VERSION) {
            LOG.debug("Compressed with later version of lzop: " + Integer.toHexString(hitem) + " (expected 0x"
                    + Integer.toHexString(LzopCodec.LZOP_VERSION) + ")");
        }
        hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzo library version
        if (hitem < LzoDecompressor.MINIMUM_LZO_VERSION) {
            throw new IOException("Compressed with incompatible lzo version: 0x" + Integer.toHexString(hitem)
                    + " (expected at least 0x" + Integer.toHexString(LzoDecompressor.MINIMUM_LZO_VERSION) + ")");
        }
        hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzop extract version
        if (hitem > LzopCodec.LZOP_VERSION) {
            throw new IOException("Compressed with incompatible lzop version: 0x" + Integer.toHexString(hitem)
                    + " (expected 0x" + Integer.toHexString(LzopCodec.LZOP_VERSION) + ")");
        }
        hitem = readHeaderItem(in, buf, 1, adler, crc32); // method
        if (hitem < 1 || hitem > 3) {
            throw new IOException("Invalid strategy: " + Integer.toHexString(hitem));
        }
        readHeaderItem(in, buf, 1, adler, crc32); // ignore level

        // flags
        hitem = readHeaderItem(in, buf, 4, adler, crc32);
        try {
            // Iterate over snapshots so that pruning the live sets does not
            // depend on the iterator tolerating concurrent removal.
            for (DChecksum f : EnumSet.copyOf(dflags)) {
                if (0 == (f.getHeaderMask() & hitem)) {
                    dflags.remove(f);
                } else {
                    // Seed the entry with the value of a freshly created checksum.
                    dcheck.put(f, (int) f.getChecksumClass().newInstance().getValue());
                }
            }
            for (CChecksum f : EnumSet.copyOf(cflags)) {
                if (0 == (f.getHeaderMask() & hitem)) {
                    cflags.remove(f);
                } else {
                    ccheck.put(f, (int) f.getChecksumClass().newInstance().getValue());
                }
            }
        } catch (InstantiationException e) {
            throw new RuntimeException("Internal error", e);
        } catch (IllegalAccessException e) {
            throw new RuntimeException("Internal error", e);
        }
        ((LzopDecompressor) decompressor).initHeaderFlags(dflags, cflags);
        boolean useCRC32 = 0 != (hitem & 0x00001000); // F_H_CRC32
        boolean extraField = 0 != (hitem & 0x00000040); // F_H_EXTRA_FIELD
        if (0 != (hitem & 0x400)) { // F_MULTIPART
            throw new IOException("Multipart lzop not supported");
        }
        if (0 != (hitem & 0x800)) { // F_H_FILTER
            throw new IOException("lzop filter not supported");
        }
        if (0 != (hitem & 0x000FC000)) { // F_RESERVED
            throw new IOException("Unknown flags in header");
        }
        // known !F_H_FILTER, so no optional block

        readHeaderItem(in, buf, 4, adler, crc32); // ignore mode
        readHeaderItem(in, buf, 4, adler, crc32); // ignore mtime
        readHeaderItem(in, buf, 4, adler, crc32); // ignore gmtdiff
        hitem = readHeaderItem(in, buf, 1, adler, crc32); // fn len
        if (hitem > 0) {
            // skip filename, but keep it in the header checksum
            skipChecksummed(in, hitem, adler, crc32);
        }
        // Capture the running checksum before the stored one is read, since
        // readHeaderItem() also feeds the bytes it reads into the checksums.
        int checksum = (int) (useCRC32 ? crc32.getValue() : adler.getValue());
        hitem = readHeaderItem(in, buf, 4, adler, crc32); // read checksum
        if (hitem != checksum) {
            // Integer.toHexString: the value is a 32-bit checksum and must not be
            // sign-extended to 64 bits when its top bit is set.
            throw new IOException("Invalid header checksum: " + Integer.toHexString(checksum) + " (expected 0x"
                    + Integer.toHexString(hitem) + ")");
        }
        if (extraField) { // lzop 1.08 ultimately ignores this
            LOG.debug("Extra header field not processed");
            adler.reset();
            crc32.reset();
            // The extra-field checksum covers the length word as well as the data.
            hitem = readHeaderItem(in, buf, 4, adler, crc32);
            if (hitem < 0) {
                throw new IOException("Invalid extra header field length: " + hitem);
            }
            skipChecksummed(in, hitem, adler, crc32);
            checksum = (int) (useCRC32 ? crc32.getValue() : adler.getValue());
            if (checksum != readHeaderItem(in, buf, 4, adler, crc32)) {
                throw new IOException("Invalid checksum for extra header field");
            }
        }
    }

    /**
     * Take checksums recorded from block header and verify them against
     * those recorded by the decompressor.
     *
     * <p>Compressed-data checksums are only checked when the decompressor
     * reports that the current block was not stored uncompressed.
     *
     * @throws IOException if any checksum does not match
     */
    private void verifyChecksums() throws IOException {
        LzopDecompressor ldecompressor = ((LzopDecompressor) decompressor);
        for (Map.Entry<DChecksum, Integer> chk : dcheck.entrySet()) {
            if (!ldecompressor.verifyDChecksum(chk.getKey(), chk.getValue())) {
                throw new IOException("Corrupted uncompressed block");
            }
        }
        if (!ldecompressor.isCurrentBlockUncompressed()) {
            for (Map.Entry<CChecksum, Integer> chk : ccheck.entrySet()) {
                if (!ldecompressor.verifyCChecksum(chk.getKey(), chk.getValue())) {
                    throw new IOException("Corrupted compressed block");
                }
            }
        }
    }

    /**
     * Decompresses up to len bytes into b, reading the next block header and
     * block payload from the underlying stream as needed.
     *
     * @return the number of bytes written to b, or -1 at end of data
     * @throws IOException if reading a block fails for a reason other than EOF
     */
    @Override
    protected int decompress(byte[] b, int off, int len) throws IOException {
        // Check if we are the beginning of a block
        if (noUncompressedBytes == uncompressedBlockSize) {
            // Get original data size
            try {
                byte[] tempBuf = new byte[4];
                uncompressedBlockSize = readInt(in, tempBuf, 4);
                noCompressedBytes += 4;
            } catch (EOFException e) {
                // Record end of stream here too, as the other EOF paths below do.
                eof = true;
                return -1;
            }
            noUncompressedBytes = 0;
        }

        int n = 0;
        while ((n = decompressor.decompress(b, off, len)) == 0) {
            if (decompressor.finished() || decompressor.needsDictionary()) {
                if (noUncompressedBytes >= uncompressedBlockSize) {
                    eof = true;
                    return -1;
                }
            }
            if (decompressor.needsInput()) {
                try {
                    getCompressedData();
                } catch (EOFException e) {
                    eof = true;
                    return -1;
                } catch (IOException e) {
                    LOG.warn("IOException in getCompressedData; likely LZO corruption.", e);
                    throw e;
                }
            }
        }

        // Note the no. of decompressed bytes read from 'current' block
        noUncompressedBytes += n;

        return n;
    }

    /**
     * Read checksums and feed compressed block data into decompressor.
     *
     * <p>Before the next block is loaded, the checksums of the previous block
     * are verified against the decompressor.
     *
     * @throws IOException if the previous block fails verification, the block
     *         length is out of range, or the stream ends prematurely
     */
    @Override
    protected void getCompressedData() throws IOException {
        checkStream();
        verifyChecksums();

        // Get the size of the compressed chunk
        int compressedLen = readInt(in, buf, 4);
        noCompressedBytes += 4;

        if (compressedLen < 0) {
            // A set top bit cannot be a real length; fail before it is used as one.
            throw new IOException("Compressed length " + compressedLen + " is negative (probably corrupt file)");
        }
        if (compressedLen > LzoCodec.MAX_BLOCK_SIZE) {
            throw new IOException("Compressed length " + compressedLen + " exceeds max block size "
                    + LzoCodec.MAX_BLOCK_SIZE + " (probably corrupt file)");
        }

        LzopDecompressor ldecompressor = (LzopDecompressor) decompressor;
        // If the lzo compressor compresses a block of data, and that compression
        // actually makes the block larger, it writes the block as uncompressed instead.
        // In this case, the compressed size and the uncompressed size in the header
        // are identical, and there is NO compressed checksum written.
        ldecompressor.setCurrentBlockUncompressed(compressedLen >= uncompressedBlockSize);

        // Replace the stored values in place; the key sets were fixed by readHeader().
        for (Map.Entry<DChecksum, Integer> chk : dcheck.entrySet()) {
            chk.setValue(readInt(in, buf, 4));
            noCompressedBytes += 4;
        }

        if (!ldecompressor.isCurrentBlockUncompressed()) {
            for (Map.Entry<CChecksum, Integer> chk : ccheck.entrySet()) {
                chk.setValue(readInt(in, buf, 4));
                noCompressedBytes += 4;
            }
        }

        ldecompressor.resetChecksum();

        // Read len bytes from underlying stream
        if (compressedLen > buffer.length) {
            buffer = new byte[compressedLen];
        }
        readFully(in, buffer, 0, compressedLen);
        noCompressedBytes += compressedLen;

        // Send the read data to the decompressor.
        ldecompressor.setInput(buffer, 0, compressedLen);
    }

    /**
     * Returns the number of bytes consumed from the underlying stream for block
     * data: block size words, block checksums and block payloads. Bytes of the
     * file header read during construction are not included.
     */
    public long getCompressedBytesRead() {
        return noCompressedBytes;
    }

    /**
     * Drains whatever the decompressor still holds, closes the stream, and then
     * checks the checksums of the last block. A checksum failure at this point
     * is logged rather than thrown.
     */
    @Override
    public void close() throws IOException {
        byte[] b = new byte[4096];
        try {
            while (!decompressor.finished()) {
                decompressor.decompress(b, 0, b.length);
            }
        } finally {
            // Release the underlying stream even if draining fails.
            super.close();
        }
        try {
            verifyChecksums();
        } catch (IOException e) {
            // LZO requires that each file ends with 4 trailing zeroes.  If we are here,
            // the file didn't.  It's not critical, though, so log and eat it in this case.
            LOG.warn("Incorrect LZO file format: file did not end with four trailing zeroes.", e);
        }
    }
}