List of usage examples for org.apache.hadoop.io.compress CompressionCodec createDecompressor
Decompressor createDecompressor();
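Before the project-specific examples below, a minimal self-contained sketch of the usual call pattern: resolve a codec from the file name, call createDecompressor(), and wrap the raw stream. This is not taken from any of the projects below; the input path and class name are illustrative only.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CreateDecompressorSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Hypothetical input path; the codec is resolved from the file extension.
    Path input = new Path("/data/input.gz");

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(input);
    if (codec == null) {
      throw new IOException("No codec found for " + input);
    }

    FileSystem fs = input.getFileSystem(conf);
    // createDecompressor() hands back a fresh Decompressor instance;
    // a long-lived program would normally pool and reuse it.
    Decompressor decompressor = codec.createDecompressor();
    try (InputStream in = codec.createInputStream(fs.open(input), decompressor)) {
      // Copy the decompressed bytes to stdout.
      IOUtils.copyBytes(in, System.out, 4096, false);
    }
  }
}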
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
    throws IOException {
  CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
  CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
  if (codec == null)
    return in;
  System.out.println("Getting InputStream : " + codec.getDefaultExtension());
  System.out.println("Getting InputStream : " + codec);
  Decompressor decompressor = codec.createDecompressor();
  in = codec.createInputStream(in, decompressor);
  return in;
}
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be
 *                filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
  Configuration conf = fs.getConf();
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodec(lzoFile);
  if (null == codec) {
    throw new IOException("Could not find codec for file " + lzoFile
        + " - you may need to add the LZO codec to your io.compression.codecs "
        + "configuration in core-site.xml");
  }
  ((Configurable) codec).setConf(conf);

  FSDataInputStream is = null;
  FSDataOutputStream os = null;
  Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
  Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

  // Track whether an exception was thrown or not, so we know to either
  // delete the tmp index file on failure, or rename it to the new index file on success.
  boolean indexingSucceeded = false;
  try {
    is = fs.open(lzoFile);
    os = fs.create(tmpOutputFile);
    LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
    // Solely for reading the header
    codec.createInputStream(is, decompressor);
    int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
    int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

    while (true) {
      // read and ignore, we just want to get to the next int
      int uncompressedBlockSize = is.readInt();
      if (uncompressedBlockSize == 0) {
        break;
      } else if (uncompressedBlockSize < 0) {
        throw new EOFException();
      }

      int compressedBlockSize = is.readInt();
      if (compressedBlockSize <= 0) {
        throw new IOException("Could not read compressed block size");
      }

      // See LzopInputStream.getCompressedData
      boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
      int numChecksumsToSkip = isUncompressedBlock
          ? numDecompressedChecksums
          : numDecompressedChecksums + numCompressedChecksums;
      long pos = is.getPos();
      // write the pos of the block start
      os.writeLong(pos - 8);
      // seek to the start of the next block, skip any checksums
      is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
    }
    // If we're here, indexing was successful.
    indexingSucceeded = true;
  } finally {
    // Close any open streams.
    if (is != null) {
      is.close();
    }
    if (os != null) {
      os.close();
    }

    if (!indexingSucceeded) {
      // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
      fs.delete(tmpOutputFile, false);
    } else {
      // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
      fs.rename(tmpOutputFile, outputFile);
    }
  }
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
  Configuration conf = fs.getConf();
  CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
  CompressionCodec codec = factory.getCodec(lzoFile);
  ((Configurable) codec).setConf(conf);

  InputStream lzoIs = null;
  FSDataOutputStream os = null;
  Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
  Path tmpOutputFile = outputFile.suffix(".tmp");

  try {
    FSDataInputStream is = fs.open(lzoFile);
    os = fs.create(tmpOutputFile);
    LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
    // for reading the header
    lzoIs = codec.createInputStream(is, decompressor);

    int numChecksums = decompressor.getChecksumsCount();
    while (true) {
      // read and ignore, we just want to get to the next int
      int uncompressedBlockSize = is.readInt();
      if (uncompressedBlockSize == 0) {
        break;
      } else if (uncompressedBlockSize < 0) {
        throw new EOFException();
      }

      int compressedBlockSize = is.readInt();
      if (compressedBlockSize <= 0) {
        throw new IOException("Could not read compressed block size");
      }

      long pos = is.getPos();
      // write the pos of the block start
      os.writeLong(pos - 8);
      // seek to the start of the next block, skip any checksums
      is.seek(pos + compressedBlockSize + (4 * numChecksums));
    }
  } finally {
    if (lzoIs != null) {
      lzoIs.close();
    }
    if (os != null) {
      os.close();
    }
  }

  fs.rename(tmpOutputFile, outputFile);
}
From source file:data.intelligence.platform.yarn.etl.io.CodecPool.java
License:Apache License
/**
 * Get a {@link Decompressor} for the given {@link CompressionCodec} from
 * the pool or a new one.
 *
 * @param codec
 *          the <code>CompressionCodec</code> for which to get the
 *          <code>Decompressor</code>
 * @return <code>Decompressor</code> for the given
 *         <code>CompressionCodec</code> from the pool or a new one
 */
public static Decompressor getDecompressor(CompressionCodec codec) {
  Decompressor decompressor = borrow(DECOMPRESSOR_POOL, codec.getDecompressorType());
  if (decompressor == null) {
    decompressor = codec.createDecompressor();
    LOG.info("Got brand-new decompressor");
  } else {
    LOG.debug("Got recycled decompressor");
  }
  return decompressor;
}
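Hadoop itself ships an equivalent pool in org.apache.hadoop.io.compress.CodecPool. A brief sketch of pairing the borrow with the matching return, so a decompressor created by createDecompressor() is recycled rather than leaked; the class name and byte-counting body are illustrative, the CodecPool calls are the real API.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;

public final class PooledDecompressRead {
  // Reads a compressed stream while recycling the decompressor through CodecPool.
  static long countBytes(CompressionCodec codec, InputStream raw) throws IOException {
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    try (CompressionInputStream in = codec.createInputStream(raw, decompressor)) {
      long total = 0;
      byte[] buf = new byte[8192];
      int n;
      while ((n = in.read(buf)) > 0) {
        total += n;
      }
      return total;
    } finally {
      // Return the decompressor so later readers can reuse it instead of
      // calling codec.createDecompressor() again.
      CodecPool.returnDecompressor(decompressor);
    }
  }
}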
From source file:io.airlift.compress.HadoopNative.java
License:Apache License
private static void requireNativeZlib() {
  Configuration conf = new Configuration();
  if (!ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new RuntimeException("native zlib is not loaded");
  }

  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodecByClassName(GzipCodec.class.getName());
  if (codec == null) {
    throw new RuntimeException("failed to load GzipCodec");
  }
  org.apache.hadoop.io.compress.Decompressor decompressor = codec.createDecompressor();
  if (!(decompressor instanceof ZlibDecompressor)) {
    throw new RuntimeException("wrong gzip decompressor: " + decompressor.getClass().getName());
  }
}
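A note on the final check above: when the native zlib bindings are unavailable, GzipCodec.createDecompressor() generally falls back to the pure-Java BuiltInGzipDecompressor, so the instanceof ZlibDecompressor test is what actually confirms that decompression will take the native path.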
From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java
License:Apache License
/**
 * Extracts Text file
 * @param file
 * @param start
 * @param length
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
  LOG.info("Extracting text file");
  long end = start + length;
  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream filestream = fs.open(file);
  CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
  LineReader filereader;
  Seekable fileseeker = filestream;

  // Hadoop 1.0 does not have support for custom record delimiter and thus we
  // are supporting only default one.
  // We might add another "else if" case for SplittableCompressionCodec once
  // we drop support for Hadoop 1.0.
  if (codec == null) {
    filestream.seek(start);
    filereader = new LineReader(filestream);
  } else {
    filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
    fileseeker = filestream;
  }

  if (start != 0) {
    // always throw away first record because
    // one extra line is read in previous split
    start += filereader.readLine(new Text(), 0);
  }

  int size;
  LOG.info("Start position: " + String.valueOf(start));
  long next = start;
  while (next <= end) {
    Text line = new Text();
    size = filereader.readLine(line, Integer.MAX_VALUE);
    if (size == 0) {
      break;
    }
    if (codec == null) {
      next += size;
    } else {
      next = fileseeker.getPos();
    }
    rowRead++;
    dataWriter.writeStringRecord(line.toString());
  }
  LOG.info("Extracting ended on position: " + fileseeker.getPos());
  filestream.close();
}
From source file:org.apache.sqoop.connector.hdfs.TestLoader.java
License:Apache License
private void verifyOutput(FileSystem fs, Path file) throws IOException {
  Configuration conf = new Configuration();
  FSDataInputStream fsin = fs.open(file);
  CompressionCodec codec;

  switch (outputFormat) {
  case TEXT_FILE:
    codec = (new CompressionCodecFactory(conf)).getCodec(file);

    // Verify compression
    switch (compression) {
    case BZIP2:
      Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
      break;

    case DEFAULT:
      if (org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
        Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
      } else {
        Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Deflate") != -1);
      }
      break;

    case NONE:
    default:
      Assert.assertNull(codec);
      break;
    }

    InputStreamReader in;
    if (codec == null) {
      in = new InputStreamReader(fsin);
    } else {
      in = new InputStreamReader(codec.createInputStream(fsin, codec.createDecompressor()));
    }
    BufferedReader textReader = new BufferedReader(in);

    for (int i = 1; i <= NUMBER_OF_ROWS_PER_FILE; ++i) {
      Assert.assertEquals(i + "," + (double) i + ",'" + i + "'", textReader.readLine());
    }
    break;

  case SEQUENCE_FILE:
    SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, file, conf);
    codec = sequenceReader.getCompressionCodec();

    // Verify compression
    switch (compression) {
    case BZIP2:
      Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
      break;

    case DEFAULT:
      Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
      break;

    case NONE:
    default:
      Assert.assertNull(codec);
      break;
    }

    Text line = new Text();
    int index = 1;
    while (sequenceReader.next(line)) {
      Assert.assertEquals(index + "," + (double) index + ",'" + index++ + "'", line.toString());
      line = new Text();
    }
    break;
  }
}
From source file:org.apache.tajo.storage.compress.CodecPool.java
License:Apache License
/**
 * Get a {@link Decompressor} for the given {@link CompressionCodec} from the
 * pool or a new one.
 *
 * @param codec
 *          the <code>CompressionCodec</code> for which to get the
 *          <code>Decompressor</code>
 * @return <code>Decompressor</code> for the given
 *         <code>CompressionCodec</code> from the pool or a new one
 */
public static Decompressor getDecompressor(CompressionCodec codec) {
  Decompressor decompressor = borrow(DECOMPRESSOR_POOL, codec.getDecompressorType());
  if (decompressor == null) {
    decompressor = codec.createDecompressor();
    LOG.info("Got brand-new decompressor [" + codec.getDefaultExtension() + "]");
  } else {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Got recycled decompressor");
    }
  }
  return decompressor;
}
From source file:org.apache.tez.runtime.library.common.shuffle.TestShuffleUtils.java
License:Apache License
@Test
public void testInternalErrorTranslation() throws Exception {
  String codecErrorMsg = "codec failure";
  CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
  when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt()))
      .thenThrow(new InternalError(codecErrorMsg));
  Decompressor mockDecoder = mock(Decompressor.class);
  CompressionCodec mockCodec = mock(CompressionCodec.class);
  when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
  when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class)))
      .thenReturn(mockCodecStream);
  byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
  try {
    ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec,
        false, 0, mock(Logger.class), "identifier");
    Assert.fail("shuffle was supposed to throw!");
  } catch (IOException e) {
    Assert.assertTrue(e.getCause() instanceof InternalError);
    Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
  }
}
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
  maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);

  file = split.getPath();
  start = split.getStart();
  end = start + split.getLength();

  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream fileIn = fs.open(file);

  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
  CompressionCodec codec = codecFactory.getCodec(file);

  // if our codec is splittable, we can (tentatively) say that
  // we too are splittable.
  //
  // if we get a bgzfenhancedcodec, the codec might not actually
  // be splittable. however, if we get a non-splittable gz file,
  // several things happen:
  //
  // 1. the input format will detect this, and will not split the
  //    file
  // 2. the bgzfenhancedcodec will check the underlying data type
  //    (BGZF vs GZIP) at input stream creation time, and will
  //    apply the appropriate codec.
  //
  // if we get an unsplittable codec, really all that we do differently
  // is skip the positioning check, since we know that we're at the
  // start of the file and can get to reading immediately
  isSplittable = (codec instanceof SplittableCompressionCodec);

  if (codec == null) {
    // no codec. Uncompressed file.
    int bytesToSkip = positionAtFirstRecord(fileIn, null);
    inputStream = fileIn;
    inputStream.skip(bytesToSkip);
    lineReader = new LineReader(inputStream);
  } else if (isSplittable) {
    // file is compressed, but uses a splittable codec
    isCompressed = true;
    int bytesToSkip = positionAtFirstRecord(fileIn, codec);

    // apparent fun finding: if you don't seek back to 0,
    // SplittableCompressionCodec.createInputStream will seek in the stream
    // to a start position, and funny things happen..
    fileIn.seek(0);
    inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(),
        start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
    inputStream.skip(bytesToSkip);
    lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
  } else {
    // unsplittable compressed file
    // expect a single split, first record at offset 0
    isCompressed = true;
    inputStream = codec.createInputStream(fileIn);
    end = Long.MAX_VALUE; // read until the end of the file
    lineReader = new LineReader(inputStream);
  }
}