Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException;

Document

Create a CompressionInputStream that will read from the given InputStream with the given Decompressor.
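
Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the projects on this page) of the pattern most of them share: resolve the codec for a file, borrow a Decompressor from CodecPool, wrap the raw stream with createInputStream, and return the decompressor when finished. The path is a placeholder.

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CompressedReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.txt.gz"); // placeholder path
        FileSystem fs = file.getFileSystem(conf);

        // Resolve the codec from the file name; returns null for uncompressed files
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        InputStream in = null;
        try {
            // Wrap the raw stream so reads return decompressed bytes
            in = codec.createInputStream(fs.open(file), decompressor);
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
            // Pooled decompressors must be returned after use
            CodecPool.returnDecompressor(decompressor);
        }
    }
}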

Usage

From source file:mapreduce.CustomTemporalLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    Text text = new Text();
    String str = null;
    int prevTime = -1;
    int currentTime = -1;
    if (start != 0) {
        start += in.readLine(text, 0, maxBytesToConsume(start));
        start += in.readLine(text, maxLineLength, maxBytesToConsume(start));
        str = text.toString();
        currentTime = Integer.parseInt(str.split(",")[1]);
        prevTime = currentTime;
        text = new Text();
        int offset = 0;
        while ((offset = in.readLine(text, maxLineLength, maxBytesToConsume(start))) >= 0) {
            start += offset;
            str = text.toString();
            currentTime = Integer.parseInt(str.split(",")[1]);
            if (currentTime != prevTime) {
                useRecordReadInInitialize = true;
                key = new LongWritable(start - offset);
                value = text;
                break;
            } else {
                prevTime = currentTime;
                text = new Text();
            }
        }
    }
    this.pos = start;
}

From source file:org.apache.ben.FileCleaningRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(cIn, job);
            } else {
                in = new QuotationLineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new QuotationLineReader(fileIn, job);
        } else {
            in = new QuotationLineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java

License:Apache License

/**
 * Extracts a text file
 * @param file
 * @param start
 * @param length
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
    LOG.info("Extracting text file");
    long end = start + length;
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream filestream = fs.open(file);
    CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
    LineReader filereader;
    Seekable fileseeker = filestream;

    // Hadoop 1.0 does not have support for custom record delimiters and thus
    // we are supporting only the default one.
    // We might add another "else if" case for SplittableCompressionCodec once
    // we drop support for Hadoop 1.0.
    if (codec == null) {
        filestream.seek(start);
        filereader = new LineReader(filestream);
    } else {
        filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
        fileseeker = filestream;
    }
    if (start != 0) {
        // always throw away first record because
        // one extra line is read in previous split
        start += filereader.readLine(new Text(), 0);
    }
    int size;
    LOG.info("Start position: " + String.valueOf(start));
    long next = start;
    while (next <= end) {
        Text line = new Text();
        size = filereader.readLine(line, Integer.MAX_VALUE);
        if (size == 0) {
            break;
        }
        if (codec == null) {
            next += size;
        } else {
            next = fileseeker.getPos();
        }
        rowRead++;
        dataWriter.writeStringRecord(line.toString());
    }
    LOG.info("Extracting ended on position: " + fileseeker.getPos());
    filestream.close();
}

From source file:org.apache.sqoop.connector.hdfs.TestLoader.java

License:Apache License

private void verifyOutput(FileSystem fs, Path file) throws IOException {
    Configuration conf = new Configuration();
    FSDataInputStream fsin = fs.open(file);
    CompressionCodec codec;

    switch (outputFormat) {
    case TEXT_FILE:
        codec = (new CompressionCodecFactory(conf)).getCodec(file);

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;

        case DEFAULT:
            if (org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            } else {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Deflate") != -1);
            }
            break;

        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        InputStreamReader in;
        if (codec == null) {
            in = new InputStreamReader(fsin);
        } else {
            in = new InputStreamReader(codec.createInputStream(fsin, codec.createDecompressor()));
        }
        BufferedReader textReader = new BufferedReader(in);

        for (int i = 1; i <= NUMBER_OF_ROWS_PER_FILE; ++i) {
            Assert.assertEquals(i + "," + (double) i + ",'" + i + "'", textReader.readLine());
        }
        break;

    case SEQUENCE_FILE:
        SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, file, conf);
        codec = sequenceReader.getCompressionCodec();

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;

        case DEFAULT:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            break;

        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        Text line = new Text();
        int index = 1;
        while (sequenceReader.next(line)) {
            Assert.assertEquals(index + "," + (double) index + ",'" + index++ + "'", line.toString());
            line = new Text();
        }
        break;
    }
}

From source file:org.apache.tez.runtime.library.common.shuffle.TestShuffleUtils.java

License:Apache License

@Test
public void testInternalErrorTranslation() throws Exception {
    String codecErrorMsg = "codec failure";
    CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
    when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt()))
            .thenThrow(new InternalError(codecErrorMsg));
    Decompressor mockDecoder = mock(Decompressor.class);
    CompressionCodec mockCodec = mock(CompressionCodec.class);
    when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
    when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class)))
            .thenReturn(mockCodecStream);
    byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
    try {
        ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec,
                false, 0, mock(Logger.class), "identifier");
        Assert.fail("shuffle was supposed to throw!");
    } catch (IOException e) {
        Assert.assertTrue(e.getCause() instanceof InternalError);
        Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
    }
}

From source file:org.apache.tez.runtime.library.shuffle.common.ShuffleUtils.java

License:Apache License

@SuppressWarnings("resource")
public static void shuffleToMemory(MemoryFetchedInput fetchedInput, InputStream input, int decompressedLength,
        int compressedLength, CompressionCodec codec, boolean ifileReadAhead, int ifileReadAheadLength, Log LOG)
        throws IOException {
    IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, ifileReadAhead,
            ifileReadAheadLength);

    input = checksumIn;

    // Are map-outputs compressed?
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        decompressor.reset();
        input = codec.createInputStream(input, decompressor);
    }
    // Copy map-output into an in-memory buffer
    byte[] shuffleData = fetchedInput.getBytes();

    try {
        IOUtils.readFully(input, shuffleData, 0, shuffleData.length);
        // metrics.inputBytes(shuffleData.length);
        LOG.info("Read " + shuffleData.length + " bytes from input for "
                + fetchedInput.getInputAttemptIdentifier());
    } catch (IOException ioe) {
        // Close the streams
        IOUtils.cleanup(LOG, input);
        // Re-throw
        throw ioe;
    }
}

From source file:org.hedera.util.SeekableInputStream.java

License:Apache License

public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    CompressionCodec codec = compressionCodecs.getCodec(path);
    FSDataInputStream din = fs.open(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec;
            SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            return new SeekableInputStream(cin);
        } else {
            // non-splittable compression input stream
            // no seeking or offsetting is needed
            assert start == 0;
            CompressionInputStream cin = codec.createInputStream(din, decompressor);
            return new SeekableInputStream(cin, din);
        }
    } else {
        // non compression input stream
        // we seek to the start of the split
        din.seek(start);
        return new SeekableInputStream(din);
    }
}

From source file:org.mrgeo.data.raster.RasterWritable.java

License:Apache License

public static MrGeoRaster toMrGeoRaster(final RasterWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.bytes, 0, writable.getSize());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);

    return toMrGeoRaster(new RasterWritable(baos.toByteArray()));
}

From source file:org.mrgeo.vector.mrsvector.VectorTileWritable.java

License:Apache License

public static VectorTile toMrsVector(final VectorTileWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.getBytes(), 0, writable.getLength());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);

    byte[] data = baos.toByteArray();
    return VectorTile.fromProtobuf(data, 0, data.length);
}

From source file:org.springframework.data.hadoop.store.AbstractStorage.java

License:Apache License

protected synchronized StreamsHolder<InputStream> getInput(Path inputPath) throws IOException {
    if (inputHolder == null) {
        log.info("Creating new InputStream");
        inputHolder = new StreamsHolder<InputStream>();
        final FileSystem fs = basePath.getFileSystem(configuration);
        // TODO: hadoop2 isUriPathAbsolute() ?
        Path p = inputPath.isAbsolute() ? inputPath : new Path(getPath(), inputPath);
        if (!isCompressed()) {
            InputStream input = fs.open(p);
            inputHolder.setStream(input);
        } else {
            Class<?> clazz = ClassUtils.resolveClassName(codecInfo.getCodecClass(),
                    getClass().getClassLoader());
            CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz,
                    getConfiguration());
            Decompressor decompressor = CodecPool.getDecompressor(compressionCodec);
            FSDataInputStream winput = fs.open(p);
            InputStream input = compressionCodec.createInputStream(winput, decompressor);
            inputHolder.setWrappedStream(winput);
            inputHolder.setStream(input);
        }
    }
    return inputHolder;
}
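
The getInput method above borrows its Decompressor from CodecPool but only shows the open path. As a sketch of the complementary cleanup (not part of the original class: it assumes the decompressor is kept in a field and that StreamsHolder exposes getStream()/getWrappedStream() getters), closing could look like this:

// Sketch only: "decompressor" as a field and the StreamsHolder getters are assumptions.
protected synchronized void closeInput() {
    if (inputHolder != null) {
        // Close the codec stream first, then the wrapped HDFS stream
        IOUtils.closeStream(inputHolder.getStream());
        IOUtils.closeStream(inputHolder.getWrappedStream());
        if (decompressor != null) {
            // Return the pooled decompressor borrowed in getInput()
            CodecPool.returnDecompressor(decompressor);
            decompressor = null;
        }
        inputHolder = null;
    }
}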