Example usage for org.apache.hadoop.io.compress CompressionCodec createDecompressor

List of usage examples for org.apache.hadoop.io.compress CompressionCodec createDecompressor

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec createDecompressor.

Prototype

Decompressor createDecompressor();

Source Link

Document

Create a new Decompressor for use by this CompressionCodec.
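
A typical pattern is to look up a codec with CompressionCodecFactory, create a fresh Decompressor via createDecompressor(), and pass it to createInputStream(). The following is a minimal sketch of that pattern; the file path is an assumption for illustration.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CreateDecompressorSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("data/input.gz"); // hypothetical compressed file
        FileSystem fs = path.getFileSystem(conf);

        // Choose a codec by file extension (e.g. GzipCodec for ".gz").
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            throw new IllegalArgumentException("No codec found for " + path);
        }

        // createDecompressor() returns a new Decompressor for this codec.
        Decompressor decompressor = codec.createDecompressor();
        try (InputStream in = codec.createInputStream(fs.open(path), decompressor)) {
            // read decompressed bytes from 'in'
        }
    }
}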

Usage

From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java

License:Apache License

public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
        throws IOException {

    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null)
        return in;
    System.out.println("Getting InputStream : " + codec.getDefaultExtension());
    System.out.println("Getting InputStream : " + codec);
    Decompressor decompressor = codec.createDecompressor();
    in = codec.createInputStream(in, decompressor);

    return in;
}
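
A hypothetical call site for this helper (the path and the read loop are assumptions, not part of the original source) might look like this:

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
String path = "/data/part-00000.gz"; // hypothetical compressed file
try (InputStream in = MirrorUtils.getCodecInputStream(conf, path, fs.open(new Path(path)))) {
    byte[] buffer = new byte[8192];
    int n;
    while ((n = in.read(buffer)) > 0) {
        // process decompressed bytes
    }
}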

From source file:com.hadoop.compression.lzo.LzoIndex.java

License:Open Source License

/**
 * Index an LZO file to allow the input format to split it into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index.  For filename.lzo, the created index file will be
 * filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }

        if (os != null) {
            os.close();
        }

        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}
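
Invoking the indexer is a single call; a hypothetical example (the file name is an assumption) follows.

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
// On success this writes /data/logs/events.lzo.index next to the input file.
LzoIndex.createIndex(fs, new Path("/data/logs/events.lzo"));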

From source file:com.hadoop.mapreduce.LzoTextInputFormat.java

License:Open Source License

/**
 * Index an LZO file to allow the input format to split it into separate map
 * jobs.
 * 
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        FSDataInputStream is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        if (lzoIs != null) {
            lzoIs.close();
        }

        if (os != null) {
            os.close();
        }
    }

    fs.rename(tmpOutputFile, outputFile);
}

From source file:data.intelligence.platform.yarn.etl.io.CodecPool.java

License:Apache License

/**
 * Get a {@link Decompressor} for the given {@link CompressionCodec} from
 * the pool or a new one.
 * 
 * @param codec
 *            the <code>CompressionCodec</code> for which to get the
 *            <code>Decompressor</code>
 * @return <code>Decompressor</code> for the given
 *         <code>CompressionCodec</code>, from the pool or a new one
 */
public static Decompressor getDecompressor(CompressionCodec codec) {
    Decompressor decompressor = borrow(DECOMPRESSOR_POOL, codec.getDecompressorType());
    if (decompressor == null) {
        decompressor = codec.createDecompressor();
        LOG.info("Got brand-new decompressor");
    } else {
        LOG.debug("Got recycled decompressor");
    }
    return decompressor;
}
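
Hadoop also ships a ready-made pool with the same borrow/return pattern in org.apache.hadoop.io.compress.CodecPool. A minimal sketch of using it (the codec class is chosen only for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CodecPoolSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // Borrow a Decompressor from the shared pool; CodecPool falls back to
        // codec.createDecompressor() when none is available for recycling.
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try {
            // ... use codec.createInputStream(in, decompressor) here ...
        } finally {
            // Return it so later callers can reuse the same instance.
            CodecPool.returnDecompressor(decompressor);
        }
    }
}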

From source file:io.airlift.compress.HadoopNative.java

License:Apache License

private static void requireNativeZlib() {
    Configuration conf = new Configuration();
    if (!ZlibFactory.isNativeZlibLoaded(conf)) {
        throw new RuntimeException("native zlib is not loaded");
    }

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodecByClassName(GzipCodec.class.getName());
    if (codec == null) {
        throw new RuntimeException("failed to load GzipCodec");
    }
    org.apache.hadoop.io.compress.Decompressor decompressor = codec.createDecompressor();
    if (!(decompressor instanceof ZlibDecompressor)) {
        throw new RuntimeException("wrong gzip decompressor: " + decompressor.getClass().getName());
    }
}

From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java

License:Apache License

/**
 * Extracts a text file.
 * @param file the file to extract records from
 * @param start the byte offset at which to start reading
 * @param length the number of bytes to read from the start offset
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
    LOG.info("Extracting text file");
    long end = start + length;
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream filestream = fs.open(file);
    CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
    LineReader filereader;
    Seekable fileseeker = filestream;

    // Hadoop 1.0 does not support custom record delimiters, so we only
    // support the default one.
    // We might add another "else if" case for SplittableCompressionCodec once
    // we drop support for Hadoop 1.0.
    if (codec == null) {
        filestream.seek(start);
        filereader = new LineReader(filestream);
    } else {
        filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
        fileseeker = filestream;
    }
    if (start != 0) {
        // always throw away first record because
        // one extra line is read in previous split
        start += filereader.readLine(new Text(), 0);
    }
    int size;
    LOG.info("Start position: " + String.valueOf(start));
    long next = start;
    while (next <= end) {
        Text line = new Text();
        size = filereader.readLine(line, Integer.MAX_VALUE);
        if (size == 0) {
            break;
        }
        if (codec == null) {
            next += size;
        } else {
            next = fileseeker.getPos();
        }
        rowRead++;
        dataWriter.writeStringRecord(line.toString());
    }
    LOG.info("Extracting ended on position: " + fileseeker.getPos());
    filestream.close();
}

From source file:org.apache.sqoop.connector.hdfs.TestLoader.java

License:Apache License

private void verifyOutput(FileSystem fs, Path file) throws IOException {
    Configuration conf = new Configuration();
    FSDataInputStream fsin = fs.open(file);
    CompressionCodec codec;

    switch (outputFormat) {
    case TEXT_FILE:
        codec = (new CompressionCodecFactory(conf)).getCodec(file);

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;

        case DEFAULT:
            if (org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            } else {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Deflate") != -1);
            }
            break;

        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        InputStreamReader in;
        if (codec == null) {
            in = new InputStreamReader(fsin);
        } else {
            in = new InputStreamReader(codec.createInputStream(fsin, codec.createDecompressor()));
        }
        BufferedReader textReader = new BufferedReader(in);

        for (int i = 1; i <= NUMBER_OF_ROWS_PER_FILE; ++i) {
            Assert.assertEquals(i + "," + (double) i + ",'" + i + "'", textReader.readLine());
        }
        break;

    case SEQUENCE_FILE:
        SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, file, conf);
        codec = sequenceReader.getCompressionCodec();

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;

        case DEFAULT:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            break;

        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        Text line = new Text();
        int index = 1;
        while (sequenceReader.next(line)) {
            Assert.assertEquals(index + "," + (double) index + ",'" + index++ + "'", line.toString());
            line = new Text();
        }
        break;
    }
}

From source file:org.apache.tajo.storage.compress.CodecPool.java

License:Apache License

/**
 * Get a {@link Decompressor} for the given {@link CompressionCodec} from the
 * pool or a new one.
 *
 * @param codec
 *          the <code>CompressionCodec</code> for which to get the
 *          <code>Decompressor</code>
 * @return <code>Decompressor</code> for the given
 *         <code>CompressionCodec</code>, from the pool or a new one
 */
public static Decompressor getDecompressor(CompressionCodec codec) {
    Decompressor decompressor = borrow(DECOMPRESSOR_POOL, codec.getDecompressorType());
    if (decompressor == null) {
        decompressor = codec.createDecompressor();
        LOG.info("Got brand-new decompressor [" + codec.getDefaultExtension() + "]");
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Got recycled decompressor");
        }
    }
    return decompressor;
}

From source file:org.apache.tez.runtime.library.common.shuffle.TestShuffleUtils.java

License:Apache License

@Test
public void testInternalErrorTranslation() throws Exception {
    String codecErrorMsg = "codec failure";
    CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
    when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt()))
            .thenThrow(new InternalError(codecErrorMsg));
    Decompressor mockDecoder = mock(Decompressor.class);
    CompressionCodec mockCodec = mock(CompressionCodec.class);
    when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
    when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class)))
            .thenReturn(mockCodecStream);
    byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
    try {
        ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec,
                false, 0, mock(Logger.class), "identifier");
        Assert.fail("shuffle was supposed to throw!");
    } catch (IOException e) {
        Assert.assertTrue(e.getCause() instanceof InternalError);
        Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
    }
}

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *   to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);

    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    // if our codec is splittable, we can (tentatively) say that
    // we too are splittable.
    //
    // if we get a bgzfenhancedcodec, the codec might not actually
    // be splittable. however, if we get a non-splittable gz file,
    // several things happen:
    //
    // 1. the input format will detect this, and will not split the
    //    file
    // 2. the bgzfenhancedcodec will check the underlying data type
    //    (BGZF vs GZIP) at input stream creation time, and will
    //    apply the appropriate codec.
    //
    // if we get an unsplittable codec, really all that we do differently
    // is skip the positioning check, since we know that we're at the
    // start of the file and can get to reading immediately
    isSplittable = (codec instanceof SplittableCompressionCodec);

    if (codec == null) {
        // no codec.  Uncompressed file.
        int bytesToSkip = positionAtFirstRecord(fileIn, null);
        inputStream = fileIn;
        inputStream.skip(bytesToSkip);
        lineReader = new LineReader(inputStream);
    } else if (isSplittable) {
        // file is compressed, but uses a splittable codec
        isCompressed = true;
        int bytesToSkip = positionAtFirstRecord(fileIn, codec);

        // apparent fun finding: if you don't seek back to 0,
        // SplittableCompressionCodec.createInputStream will seek in the stream
        // to a start position, and funny things happen..
        fileIn.seek(0);
        inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(),
                start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

        inputStream.skip(bytesToSkip);
        lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
    } else {
        // unsplittable compressed file
        // expect a single split, first record at offset 0
        isCompressed = true;
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
        lineReader = new LineReader(inputStream);
    }
}