Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException;

Document

Create a CompressionInputStream that will read from the given InputStream with the given Decompressor.
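
Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the projects on this page) of the pattern most of them share: resolve the codec for a file, borrow a Decompressor from CodecPool, wrap the raw stream with createInputStream, and return the decompressor when finished. The path is a placeholder.

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CompressedReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.txt.gz"); // placeholder path
        FileSystem fs = file.getFileSystem(conf);

        // Resolve the codec from the file name; returns null for uncompressed files
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        InputStream in = null;
        try {
            // Wrap the raw stream so reads return decompressed bytes
            in = codec.createInputStream(fs.open(file), decompressor);
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
            // Pooled decompressors must be returned after use
            CodecPool.returnDecompressor(decompressor);
        }
    }
}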

Usage

From source file:mapreduce.CustomTemporalLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    Text text = new Text();
    String str = null;
    int prevTime = -1;
    int currentTime = -1;
    if (start != 0) {
        start += in.readLine(text, 0, maxBytesToConsume(start));
        start += in.readLine(text, maxLineLength, maxBytesToConsume(start));
        str = text.toString();
        currentTime = Integer.parseInt(str.split(",")[1]);
        prevTime = currentTime;
        text = new Text();
        int offset = 0;
        while ((offset = in.readLine(text, maxLineLength, maxBytesToConsume(start))) >= 0) {
            start += offset;
            str = text.toString();
            currentTime = Integer.parseInt(str.split(",")[1]);
            if (currentTime != prevTime) {
                useRecordReadInInitialize = true;
                key = new LongWritable(start - offset);
                value = text;
                break;
            } else {
                prevTime = currentTime;
                text = new Text();
            }
        }
    }
    this.pos = start;
}

From source file:org.apache.ben.FileCleaningRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(cIn, job);
            } else {
                in = new QuotationLineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new QuotationLineReader(fileIn, job);
        } else {
            in = new QuotationLineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java

License:Apache License

/**
 * Extracts a text file
 * @param file
 * @param start
 * @param length
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
    LOG.info("Extracting text file");
    long end = start + length;
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream filestream = fs.open(file);
    CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
    LineReader filereader;
    Seekable fileseeker = filestream;

    // Hadoop 1.0 does not have support for custom record delimiters and thus
    // we are supporting only the default one.
    // We might add another "else if" case for SplittableCompressionCodec once
    // we drop support for Hadoop 1.0.
    if (codec == null) {
        filestream.seek(start);
        filereader = new LineReader(filestream);
    } else {
        filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
        fileseeker = filestream;
    }
    if (start != 0) {
        // always throw away first record because
        // one extra line is read in previous split
        start += filereader.readLine(new Text(), 0);
    }
    int size;
    LOG.info("Start position: " + String.valueOf(start));
    long next = start;
    while (next <= end) {
        Text line = new Text();
        size = filereader.readLine(line, Integer.MAX_VALUE);
        if (size == 0) {
            break;
        }
        if (codec == null) {
            next += size;
        } else {
            next = fileseeker.getPos();
        }
        rowRead++;
        dataWriter.writeStringRecord(line.toString());
    }
    LOG.info("Extracting ended on position: " + fileseeker.getPos());
    filestream.close();
}

From source file:org.apache.sqoop.connector.hdfs.TestLoader.java

License:Apache License

private void verifyOutput(FileSystem fs, Path file) throws IOException {
    Configuration conf = new Configuration();
    FSDataInputStream fsin = fs.open(file);
    CompressionCodec codec;

    switch (outputFormat) {
    case TEXT_FILE:
        codec = (new CompressionCodecFactory(conf)).getCodec(file);

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;

        case DEFAULT:
            if (org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            } else {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Deflate") != -1);
            }
            break;

        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        InputStreamReader in;
        if (codec == null) {
            in = new InputStreamReader(fsin);
        } else {
            in = new InputStreamReader(codec.createInputStream(fsin, codec.createDecompressor()));
        }
        BufferedReader textReader = new BufferedReader(in);

        for (int i = 1; i <= NUMBER_OF_ROWS_PER_FILE; ++i) {
            Assert.assertEquals(i + "," + (double) i + ",'" + i + "'", textReader.readLine());
        }
        break;

    case SEQUENCE_FILE:
        SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, file, conf);
        codec = sequenceReader.getCompressionCodec();

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;

        case DEFAULT:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            break;

        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        Text line = new Text();
        int index = 1;
        while (sequenceReader.next(line)) {
            Assert.assertEquals(index + "," + (double) index + ",'" + index++ + "'", line.toString());
            line = new Text();
        }
        break;
    }
}

From source file:org.apache.tez.runtime.library.common.shuffle.TestShuffleUtils.java

License:Apache License

@Test
public void testInternalErrorTranslation() throws Exception {
    String codecErrorMsg = "codec failure";
    CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
    when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt()))
            .thenThrow(new InternalError(codecErrorMsg));
    Decompressor mockDecoder = mock(Decompressor.class);
    CompressionCodec mockCodec = mock(CompressionCodec.class);
    when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
    when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class)))
            .thenReturn(mockCodecStream);
    byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
    try {
        ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec,
                false, 0, mock(Logger.class), "identifier");
        Assert.fail("shuffle was supposed to throw!");
    } catch (IOException e) {
        Assert.assertTrue(e.getCause() instanceof InternalError);
        Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
    }
}

From source file:org.apache.tez.runtime.library.shuffle.common.ShuffleUtils.java

License:Apache License

@SuppressWarnings("resource")
public static void shuffleToMemory(MemoryFetchedInput fetchedInput, InputStream input, int decompressedLength,
        int compressedLength, CompressionCodec codec, boolean ifileReadAhead, int ifileReadAheadLength, Log LOG)
        throws IOException {
    IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, ifileReadAhead,
            ifileReadAheadLength);

    input = checksumIn;

    // Are map-outputs compressed?
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        decompressor.reset();
        input = codec.createInputStream(input, decompressor);
    }
    // Copy map-output into an in-memory buffer
    byte[] shuffleData = fetchedInput.getBytes();

    try {
        IOUtils.readFully(input, shuffleData, 0, shuffleData.length);
        // metrics.inputBytes(shuffleData.length);
        LOG.info("Read " + shuffleData.length + " bytes from input for "
                + fetchedInput.getInputAttemptIdentifier());
    } catch (IOException ioe) {
        // Close the streams
        IOUtils.cleanup(LOG, input);
        // Re-throw
        throw ioe;
    }
}

From source file:org.hedera.util.SeekableInputStream.java

License:Apache License

public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    CompressionCodec codec = compressionCodecs.getCodec(path);
    FSDataInputStream din = fs.open(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec;
            SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            return new SeekableInputStream(cin);
        } else {
            // non-splittable compression input stream
            // no seeking or offsetting is needed
            assert start == 0;
            CompressionInputStream cin = codec.createInputStream(din, decompressor);
            return new SeekableInputStream(cin, din);
        }
    } else {
        // non compression input stream
        // we seek to the start of the split
        din.seek(start);
        return new SeekableInputStream(din);
    }
}

From source file:org.mrgeo.data.raster.RasterWritable.java

License:Apache License

public static MrGeoRaster toMrGeoRaster(final RasterWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.bytes, 0, writable.getSize());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);

    return toMrGeoRaster(new RasterWritable(baos.toByteArray()));
}

From source file:org.mrgeo.vector.mrsvector.VectorTileWritable.java

License:Apache License

public static VectorTile toMrsVector(final VectorTileWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.getBytes(), 0, writable.getLength());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);

    byte[] data = baos.toByteArray();
    return VectorTile.fromProtobuf(data, 0, data.length);
}

From source file:org.springframework.data.hadoop.store.AbstractStorage.java

License:Apache License

protected synchronized StreamsHolder<InputStream> getInput(Path inputPath) throws IOException {
    if (inputHolder == null) {
        log.info("Creating new InputStream");
        inputHolder = new StreamsHolder<InputStream>();
        final FileSystem fs = basePath.getFileSystem(configuration);
        // TODO: hadoop2 isUriPathAbsolute() ?
        Path p = inputPath.isAbsolute() ? inputPath : new Path(getPath(), inputPath);
        if (!isCompressed()) {
            InputStream input = fs.open(p);
            inputHolder.setStream(input);
        } else {
            Class<?> clazz = ClassUtils.resolveClassName(codecInfo.getCodecClass(),
                    getClass().getClassLoader());
            CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz,
                    getConfiguration());
            Decompressor decompressor = CodecPool.getDecompressor(compressionCodec);
            FSDataInputStream winput = fs.open(p);
            InputStream input = compressionCodec.createInputStream(winput, decompressor);
            inputHolder.setWrappedStream(winput);
            inputHolder.setStream(input);
        }
    }
    return inputHolder;
}
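
The getInput method above borrows its Decompressor from CodecPool but only shows the open path. As a sketch of the complementary cleanup (not part of the original class: it assumes the decompressor is kept in a field and that StreamsHolder exposes getStream()/getWrappedStream() getters), closing could look like this:

// Sketch only: "decompressor" as a field and the StreamsHolder getters are assumptions.
protected synchronized void closeInput() {
    if (inputHolder != null) {
        // Close the codec stream first, then the wrapped HDFS stream
        IOUtils.closeStream(inputHolder.getStream());
        IOUtils.closeStream(inputHolder.getWrappedStream());
        if (decompressor != null) {
            // Return the pooled decompressor borrowed in getInput()
            CodecPool.returnDecompressor(decompressor);
            decompressor = null;
        }
        inputHolder = null;
    }
}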