Usage examples for `org.apache.hadoop.io.compress.CompressionCodec#createInputStream`
CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException;
From source file:ca.sparkera.adapters.mapreduce.MainframeVBRecordReader.java
License:Apache License
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException { start = splitStart;/* w w w .ja v a 2s .co m*/ end = start + splitLength; LOG.info("Start of the split:" + start + "-End of split:" + end); LOG.debug("VLR initialize started: start pos:" + start + "endpos:" + end); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null != codec) { isCompressedInput = true; decompressor = CodecPool.getDecompressor(codec); CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor); filePosition = cIn; inputStream = cIn; LOG.info("Compressed input; cannot compute number of records in the split"); } else { fileIn.seek(start); filePosition = fileIn; inputStream = fileIn; numBytesRemainingInSplit = splitLength; LOG.info("Variable length input; cannot compute number of records in the split"); } this.pos = start; }
From source file:com.bonc.mr_roamRecognition_hjpt.comm.PathRecordReader.java
License:Apache License
/**
 * Initializes this record reader for one input split.
 * Opens the split's file and selects a line reader appropriate to the
 * compression in use: a splittable codec stream (with codec-adjusted split
 * boundaries), a whole-file codec stream, or the raw stream seeked to the
 * split start. Non-first splits discard the first (partial) line, since the
 * previous split reads one line past its end.
 *
 * @param genericSplit the split to read (must be a {@link FileSplit})
 * @param context      task context supplying the job configuration
 * @throws IOException if the file or compression stream cannot be opened
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    path = split.getPath().toString();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // Splittable codec (e.g. bzip2): the codec may move the effective
            // split boundaries to block edges, so adopt its adjusted start/end.
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            // Non-splittable codec: the whole file is one split; decompress
            // from the beginning. Position is tracked on the raw stream.
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        // Uncompressed: seek straight to this split's byte offset.
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.cloudera.ByteBufferRecordReader.java
License:Apache License
private void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException { start = splitStart;//ww w . ja v a 2 s. com end = start + splitLength; pos = start; // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); this.readStats = new ReadStatistics(); this.bufferPool = new ElasticByteBufferPool(); boolean skipChecksums = job.getBoolean("bytecount.skipChecksums", false); this.readOption = skipChecksums ? EnumSet.of(ReadOption.SKIP_CHECKSUMS) : EnumSet.noneOf(ReadOption.class); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null != codec) { isCompressedInput = true; decompressor = CodecPool.getDecompressor(codec); CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor); filePosition = cIn; inputStream = cIn; LOG.info("Compressed input; cannot compute number of records in the split"); } else { fileIn.seek(start); filePosition = fileIn; inputStream = fileIn; LOG.info("Split pos = " + start + " length " + splitLength); } }
From source file:com.cloudera.sqoop.TestExport.java
License:Apache License
/**
 * Sanity-checks a compressed export file by decompressing it and counting
 * its lines, asserting that the count matches the expectation.
 *
 * @param f                path of the compressed file to verify
 * @param expectedNumLines number of lines the decompressed file must contain
 * @throws IOException if the file cannot be read or decompressed
 */
private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException {
    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    InputStream is = fs.open(f);

    // Resolve the codec from the file name and wrap the stream with it.
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(f);
    LOG.info("gzip check codec is " + codec);

    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (decompressor != null) {
        LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString());
    } else {
        LOG.info("Verifying gzip sanity with null decompressor");
    }
    is = codec.createInputStream(is, decompressor);

    // Count decompressed lines.
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    int lineCount = 0;
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        lineCount++;
    }
    reader.close();

    assertEquals("Did not read back correct number of lines", expectedNumLines, lineCount);
    LOG.info("gzip sanity check returned " + lineCount + " lines; ok.");
}
From source file:com.datascience.hadoop.CsvInputFormat.java
License:Apache License
@Override public RecordReader<LongWritable, ListWritable<Text>> getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException { String charsetName = conf.get(CHARSET); Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8; FileSplit split = (FileSplit) inputSplit; Path path = split.getPath();//w w w .j ava2 s . c o m FileSystem fs = path.getFileSystem(conf); InputStream is = fs.open(path); // If the input is compressed, load the compression codec. CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(path); if (codec != null) { Decompressor decompressor = CodecPool.getDecompressor(codec); is = codec.createInputStream(is, decompressor); } return new CsvRecordReader(new InputStreamReader(is, charset), createFormat(conf), split.getLength(), conf.getBoolean(STRICT_MODE, true)); }
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
/**
 * Wraps the given stream with a decompressing stream when the path's
 * extension maps to a configured compression codec; otherwise returns the
 * stream unchanged.
 *
 * @param conf configuration used to resolve the codec factory
 * @param path file path whose extension determines the codec (may match none)
 * @param in   raw input stream to wrap
 * @return a decompressing stream, or {@code in} itself if no codec applies
 * @throws IOException if the codec cannot create the decompression stream
 */
public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return in;
    }
    System.out.println("Getting InputStream : " + codec.getDefaultExtension());
    System.out.println("Getting InputStream : " + codec);
    // Fixed misleading name: this is a Decompressor (was previously named "compressor").
    Decompressor decompressor = codec.createDecompressor();
    in = codec.createInputStream(in, decompressor);
    return in;
}
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * Writes the byte offset of every compressed block to filename.lzo.index by
 * walking the lzop block headers (no data is decompressed). The index is
 * first written to a .tmp file and only renamed into place on success, so a
 * failed run never leaves a partial index behind.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be
 * filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        // After the header is consumed, the decompressor knows how many
        // checksum words follow each block, which we must skip over below.
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                // A zero uncompressed size marks the lzop end-of-stream trailer.
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            // A block stored uncompressed (sizes equal) carries only the
            // decompressed checksums; otherwise both checksum sets are present.
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;

            long pos = is.getPos();
            // write the pos of the block start (pos is currently just past the two 4-byte size ints)
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }
        if (os != null) {
            os.close();
        }

        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/** * Index an lzo file to allow the input format to split them into separate map * jobs.// w w w .j av a 2s .c o m * * @param fs * File system that contains the file. * @param lzoFile * the lzo file to index. * @throws IOException */ public static void createIndex(FileSystem fs, Path lzoFile) throws IOException { Configuration conf = fs.getConf(); CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf()); CompressionCodec codec = factory.getCodec(lzoFile); ((Configurable) codec).setConf(conf); InputStream lzoIs = null; FSDataOutputStream os = null; Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX); Path tmpOutputFile = outputFile.suffix(".tmp"); try { FSDataInputStream is = fs.open(lzoFile); os = fs.create(tmpOutputFile); LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor(); // for reading the header lzoIs = codec.createInputStream(is, decompressor); int numChecksums = decompressor.getChecksumsCount(); while (true) { // read and ignore, we just want to get to the next int int uncompressedBlockSize = is.readInt(); if (uncompressedBlockSize == 0) { break; } else if (uncompressedBlockSize < 0) { throw new EOFException(); } int compressedBlockSize = is.readInt(); if (compressedBlockSize <= 0) { throw new IOException("Could not read compressed block size"); } long pos = is.getPos(); // write the pos of the block start os.writeLong(pos - 8); // seek to the start of the next block, skip any checksums is.seek(pos + compressedBlockSize + (4 * numChecksums)); } } finally { if (lzoIs != null) { lzoIs.close(); } if (os != null) { os.close(); } } fs.rename(tmpOutputFile, outputFile); }
From source file:com.hadoop.mapreduce.TestLzoLazyLoading.java
License:Open Source License
public static String readFile(String name) throws IOException { Path file = new Path(TEST_ROOT_DIR + "/" + name); CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file); InputStream f;//from w w w. java 2 s . co m Decompressor decompressor = null; if (codec == null) { f = localFs.open(file); } else { decompressor = CodecPool.getDecompressor(codec); f = codec.createInputStream(localFs.open(file), decompressor); } BufferedReader b = new BufferedReader(new InputStreamReader(f)); StringBuilder result = new StringBuilder(); String line = b.readLine(); while (line != null) { result.append(line); result.append('\n'); line = b.readLine(); } b.close(); if (decompressor != null) { CodecPool.returnDecompressor(decompressor); } return result.toString(); }
From source file:com.mycompany.keywordsearch.LineRecordReaderV2.java
/**
 * Initializes this record reader for one input split.
 * Opens the split's file, records its path in {@code locationKey}, and
 * selects a line reader appropriate to the compression in use: a splittable
 * codec stream (with codec-adjusted split boundaries), a whole-file codec
 * stream, or the raw stream seeked to the split start. Non-first splits
 * discard the first (partial) line, since the previous split reads one line
 * past its end.
 *
 * @param genericSplit the split to read (must be a {@link FileSplit})
 * @param context      task context supplying the job configuration
 * @throws IOException if the file or compression stream cannot be opened
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    locationKey.set(file.toString());

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // Splittable codec (e.g. bzip2): the codec may move the effective
            // split boundaries to block edges, so adopt its adjusted start/end.
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            // Non-splittable codec: the whole file is one split; decompress
            // from the beginning. Position is tracked on the raw stream.
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        // Uncompressed: seek straight to this split's byte offset.
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}