org.apache.hadoop.io.compress.BZip2Codec.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.io.compress.BZip2Codec.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io.compress;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;
import org.apache.hadoop.io.compress.bzip2.Bzip2Factory;

import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;

/**
 * This class provides output and input streams for bzip2 compression
 * and decompression.  It uses the native bzip2 library on the system
 * if possible, else it uses a pure-Java implementation of the bzip2
 * algorithm.  The configuration parameter
 * io.compression.codec.bzip2.library can be used to control this
 * behavior.
 *
 * In the pure-Java mode, the Compressor and Decompressor interfaces
 * are not implemented.  Therefore, in that mode, those methods of
 * CompressionCodec which have a Compressor or Decompressor type
 * argument, throw UnsupportedOperationException.
 *
 * Currently, support for splittability is available only in the
 * pure-Java mode; therefore, if a SplitCompressionInputStream is
 * requested, the pure-Java implementation is used, regardless of the
 * setting of the configuration parameter mentioned above.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class BZip2Codec implements Configurable, SplittableCompressionCodec {

    private static final String HEADER = "BZ";
    private static final int HEADER_LEN = HEADER.length();
    private static final String SUB_HEADER = "h9";
    private static final int SUB_HEADER_LEN = SUB_HEADER.length();

    private Configuration conf;

    /**
     * Set the configuration to be used by this object.
     *
     * @param conf the configuration object.
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Return the configuration used by this object.
     *
     * @return the configuration object used by this objec.
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
    * Creates a new instance of BZip2Codec.
    */
    public BZip2Codec() {
    }

    /**
     * Create a {@link CompressionOutputStream} that will write to the given
     * {@link OutputStream}.
     *
     * @param out        the location for the final output stream
     * @return a stream the user can write uncompressed data to, to have it 
     *         compressed
     * @throws IOException
     */
    @Override
    public CompressionOutputStream createOutputStream(OutputStream out) throws IOException {
        return CompressionCodec.Util.createOutputStreamWithCodecPool(this, conf, out);
    }

    /**
     * Create a {@link CompressionOutputStream} that will write to the given
     * {@link OutputStream} with the given {@link Compressor}.
     *
     * @param out        the location for the final output stream
     * @param compressor compressor to use
     * @return a stream the user can write uncompressed data to, to have it 
     *         compressed
     * @throws IOException
     */
    @Override
    public CompressionOutputStream createOutputStream(OutputStream out, Compressor compressor) throws IOException {
        return Bzip2Factory.isNativeBzip2Loaded(conf)
                ? new CompressorStream(out, compressor,
                        conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT))
                : new BZip2CompressionOutputStream(out);
    }

    /**
     * Get the type of {@link Compressor} needed by this {@link CompressionCodec}.
     *
     * @return the type of compressor needed by this codec.
     */
    @Override
    public Class<? extends Compressor> getCompressorType() {
        return Bzip2Factory.getBzip2CompressorType(conf);
    }

    /**
     * Create a new {@link Compressor} for use by this {@link CompressionCodec}.
     *
     * @return a new compressor for use by this codec
     */
    @Override
    public Compressor createCompressor() {
        return Bzip2Factory.getBzip2Compressor(conf);
    }

    /**
     * Create a {@link CompressionInputStream} that will read from the given
     * input stream and return a stream for uncompressed data.
     *
     * @param in the stream to read compressed bytes from
     * @return a stream to read uncompressed bytes from
     * @throws IOException
     */
    @Override
    public CompressionInputStream createInputStream(InputStream in) throws IOException {
        return CompressionCodec.Util.createInputStreamWithCodecPool(this, conf, in);
    }

    /**
     * Create a {@link CompressionInputStream} that will read from the given
     * {@link InputStream} with the given {@link Decompressor}, and return a 
     * stream for uncompressed data.
     *
     * @param in           the stream to read compressed bytes from
     * @param decompressor decompressor to use
     * @return a stream to read uncompressed bytes from
     * @throws IOException
     */
    @Override
    public CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException {
        return Bzip2Factory.isNativeBzip2Loaded(conf)
                ? new DecompressorStream(in, decompressor,
                        conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT))
                : new BZip2CompressionInputStream(in, 0L, Long.MAX_VALUE, READ_MODE.BYBLOCK);
    }

    /**
     * Creates CompressionInputStream to be used to read off uncompressed data
     * in one of the two reading modes. i.e. Continuous or Blocked reading modes
     *
     * @param seekableIn The InputStream
     * @param start The start offset into the compressed stream
     * @param end The end offset into the compressed stream
     * @param readMode Controls whether progress is reported continuously or
     *                 only at block boundaries.
     *
     * @return CompressionInputStream for BZip2 aligned at block boundaries
     */
    public SplitCompressionInputStream createInputStream(InputStream seekableIn, Decompressor decompressor,
            long start, long end, READ_MODE readMode) throws IOException {

        if (!(seekableIn instanceof Seekable)) {
            throw new IOException("seekableIn must be an instance of " + Seekable.class.getName());
        }

        ((Seekable) seekableIn).seek(start);
        return new BZip2CompressionInputStream(seekableIn, start, end, readMode);
    }

    /**
     * Get the type of {@link Decompressor} needed by this {@link CompressionCodec}.
     *
     * @return the type of decompressor needed by this codec.
     */
    @Override
    public Class<? extends Decompressor> getDecompressorType() {
        return Bzip2Factory.getBzip2DecompressorType(conf);
    }

    /**
     * Create a new {@link Decompressor} for use by this {@link CompressionCodec}.
     *
     * @return a new decompressor for use by this codec
     */
    @Override
    public Decompressor createDecompressor() {
        return Bzip2Factory.getBzip2Decompressor(conf);
    }

    /**
    * .bz2 is recognized as the default extension for compressed BZip2 files
    *
    * @return A String telling the default bzip2 file extension
    */
    @Override
    public String getDefaultExtension() {
        return ".bz2";
    }

    private static class BZip2CompressionOutputStream extends CompressionOutputStream {

        // class data starts here//
        private CBZip2OutputStream output;
        private boolean needsReset;
        // class data ends here//

        public BZip2CompressionOutputStream(OutputStream out) throws IOException {
            super(out);
            needsReset = true;
        }

        private void writeStreamHeader() throws IOException {
            if (super.out != null) {
                // The compressed bzip2 stream should start with the
                // identifying characters BZ. Caller of CBZip2OutputStream
                // i.e. this class must write these characters.
                out.write(HEADER.getBytes(StandardCharsets.UTF_8));
            }
        }

        public void finish() throws IOException {
            if (needsReset) {
                // In the case that nothing is written to this stream, we still need to
                // write out the header before closing, otherwise the stream won't be
                // recognized by BZip2CompressionInputStream.
                internalReset();
            }
            this.output.finish();
            needsReset = true;
        }

        private void internalReset() throws IOException {
            if (needsReset) {
                needsReset = false;
                writeStreamHeader();
                this.output = new CBZip2OutputStream(out);
            }
        }

        public void resetState() throws IOException {
            // Cannot write to out at this point because out might not be ready
            // yet, as in SequenceFile.Writer implementation.
            needsReset = true;
        }

        public void write(int b) throws IOException {
            if (needsReset) {
                internalReset();
            }
            this.output.write(b);
        }

        public void write(byte[] b, int off, int len) throws IOException {
            if (needsReset) {
                internalReset();
            }
            this.output.write(b, off, len);
        }

        public void close() throws IOException {
            try {
                super.close();
            } finally {
                output.close();
            }
        }

    }// end of class BZip2CompressionOutputStream

    /**
     * This class is capable to de-compress BZip2 data in two modes;
     * CONTINOUS and BYBLOCK.  BYBLOCK mode makes it possible to
     * do decompression starting any arbitrary position in the stream.
     *
     * So this facility can easily be used to parallelize decompression
     * of a large BZip2 file for performance reasons.  (It is exactly
     * done so for Hadoop framework.  See LineRecordReader for an
     * example).  So one can break the file (of course logically) into
     * chunks for parallel processing.  These "splits" should be like
     * default Hadoop splits (e.g as in FileInputFormat getSplit metod).
     * So this code is designed and tested for FileInputFormat's way
     * of splitting only.
     */

    private static class BZip2CompressionInputStream extends SplitCompressionInputStream {

        // class data starts here//
        private CBZip2InputStream input;
        boolean needsReset;
        private BufferedInputStream bufferedIn;
        private boolean isHeaderStripped = false;
        private boolean isSubHeaderStripped = false;
        private READ_MODE readMode = READ_MODE.CONTINUOUS;
        private long startingPos = 0L;

        // Following state machine handles different states of compressed stream
        // position
        // HOLD : Don't advertise compressed stream position
        // ADVERTISE : Read 1 more character and advertise stream position
        // See more comments about it before updatePos method.
        private enum POS_ADVERTISEMENT_STATE_MACHINE {
            HOLD, ADVERTISE
        };

        POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
        long compressedStreamPosition = 0;

        // class data ends here//

        public BZip2CompressionInputStream(InputStream in) throws IOException {
            this(in, 0L, Long.MAX_VALUE, READ_MODE.CONTINUOUS);
        }

        public BZip2CompressionInputStream(InputStream in, long start, long end, READ_MODE readMode)
                throws IOException {
            super(in, start, end);
            needsReset = false;
            bufferedIn = new BufferedInputStream(super.in);
            this.startingPos = super.getPos();
            this.readMode = readMode;
            long numSkipped = 0;
            if (this.startingPos == 0) {
                // We only strip header if it is start of file
                bufferedIn = readStreamHeader();
            } else if (this.readMode == READ_MODE.BYBLOCK && this.startingPos <= HEADER_LEN + SUB_HEADER_LEN) {
                // When we're in BYBLOCK mode and the start position is >=0
                // and < HEADER_LEN + SUB_HEADER_LEN, we should skip to after
                // start of the first bz2 block to avoid duplicated records
                numSkipped = HEADER_LEN + SUB_HEADER_LEN + 1 - this.startingPos;
                long skipBytes = numSkipped;
                while (skipBytes > 0) {
                    long s = bufferedIn.skip(skipBytes);
                    if (s > 0) {
                        skipBytes -= s;
                    } else {
                        if (bufferedIn.read() == -1) {
                            break; // end of the split
                        } else {
                            skipBytes--;
                        }
                    }
                }
            }
            input = new CBZip2InputStream(bufferedIn, readMode);
            if (this.isHeaderStripped) {
                input.updateReportedByteCount(HEADER_LEN);
            }

            if (this.isSubHeaderStripped) {
                input.updateReportedByteCount(SUB_HEADER_LEN);
            }

            if (numSkipped > 0) {
                input.updateReportedByteCount((int) numSkipped);
            }

            // To avoid dropped records, not advertising a new byte position
            // when we are in BYBLOCK mode and the start position is 0
            if (!(this.readMode == READ_MODE.BYBLOCK && this.startingPos == 0)) {
                this.updatePos(false);
            }
        }

        private BufferedInputStream readStreamHeader() throws IOException {
            // We are flexible enough to allow the compressed stream not to
            // start with the header of BZ. So it works fine either we have
            // the header or not.
            if (super.in != null) {
                bufferedIn.mark(HEADER_LEN);
                byte[] headerBytes = new byte[HEADER_LEN];
                int actualRead = bufferedIn.read(headerBytes, 0, HEADER_LEN);
                if (actualRead != -1) {
                    String header = new String(headerBytes, StandardCharsets.UTF_8);
                    if (header.compareTo(HEADER) != 0) {
                        bufferedIn.reset();
                    } else {
                        this.isHeaderStripped = true;
                        // In case of BYBLOCK mode, we also want to strip off
                        // remaining two character of the header.
                        if (this.readMode == READ_MODE.BYBLOCK) {
                            actualRead = bufferedIn.read(headerBytes, 0, SUB_HEADER_LEN);
                            if (actualRead != -1) {
                                this.isSubHeaderStripped = true;
                            }
                        }
                    }
                }
            }

            if (bufferedIn == null) {
                throw new IOException("Failed to read bzip2 stream.");
            }

            return bufferedIn;

        }// end of method

        public void close() throws IOException {
            if (!needsReset) {
                try {
                    input.close();
                    needsReset = true;
                } finally {
                    super.close();
                }
            }
        }

        /**
        * This method updates compressed stream position exactly when the
        * client of this code has read off at least one byte passed any BZip2
        * end of block marker.
        *
        * This mechanism is very helpful to deal with data level record
        * boundaries. Please see constructor and next methods of
        * org.apache.hadoop.mapred.LineRecordReader as an example usage of this
        * feature.  We elaborate it with an example in the following:
        *
        * Assume two different scenarios of the BZip2 compressed stream, where
        * [m] represent end of block, \n is line delimiter and . represent compressed
        * data.
        *
        * ............[m]......\n.......
        *
        * ..........\n[m]......\n.......
        *
        * Assume that end is right after [m].  In the first case the reading
        * will stop at \n and there is no need to read one more line.  (To see the
        * reason of reading one more line in the next() method is explained in LineRecordReader.)
        * While in the second example LineRecordReader needs to read one more line
        * (till the second \n).  Now since BZip2Codecs only update position
        * at least one byte passed a maker, so it is straight forward to differentiate
        * between the two cases mentioned.
        *
        */

        public int read(byte[] b, int off, int len) throws IOException {
            if (needsReset) {
                internalReset();
            }

            int result = 0;
            result = this.input.read(b, off, len);
            if (result == BZip2Constants.END_OF_BLOCK) {
                this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
            }

            if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
                result = this.input.read(b, off, off + 1);
                // This is the precise time to update compressed stream position
                // to the client of this code.
                this.updatePos(true);
                this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
            }

            return result;

        }

        public int read() throws IOException {
            byte b[] = new byte[1];
            int result = this.read(b, 0, 1);
            return (result < 0) ? result : (b[0] & 0xff);
        }

        private void internalReset() throws IOException {
            if (needsReset) {
                needsReset = false;
                BufferedInputStream bufferedIn = readStreamHeader();
                input = new CBZip2InputStream(bufferedIn, this.readMode);
            }
        }

        public void resetState() throws IOException {
            // Cannot read from bufferedIn at this point because bufferedIn
            // might not be ready
            // yet, as in SequenceFile.Reader implementation.
            needsReset = true;
        }

        public long getPos() {
            return this.compressedStreamPosition;
        }

        /*
         * As the comments before read method tell that
         * compressed stream is advertised when at least
         * one byte passed EOB have been read off.  But
         * there is an exception to this rule.  When we
         * construct the stream we advertise the position
         * exactly at EOB.  In the following method
         * shouldAddOn boolean captures this exception.
         *
         */
        private void updatePos(boolean shouldAddOn) {
            int addOn = shouldAddOn ? 1 : 0;
            this.compressedStreamPosition = this.startingPos + this.input.getProcessedByteCount() + addOn;
        }

    }// end of BZip2CompressionInputStream

}