Example usage for org.apache.hadoop.io.compress CodecPool getDecompressor

Introduction

On this page you can find example usage of org.apache.hadoop.io.compress.CodecPool.getDecompressor.

Prototype

public static Decompressor getDecompressor(CompressionCodec codec) 

Document

Get a Decompressor for the given CompressionCodec from the pool, or a new one if the pool is empty.
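
Before the examples below, here is a minimal, self-contained sketch of the borrow/return lifecycle. The class name, file path, and configuration are placeholders, not taken from any example on this page:

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolSketch {

    public static void readCompressed(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        // Pick the codec from the file suffix (e.g. .gz, .bz2); null means "not compressed".
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        if (codec == null) {
            return;
        }
        // Borrow a Decompressor from the pool (or get a fresh one if the pool is empty).
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(file), decompressor)) {
            // ... consume the decompressed stream ...
        } finally {
            // Always hand the instance back so it can be reused.
            CodecPool.returnDecompressor(decompressor);
        }
    }
}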

Usage

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    done = false;

    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();

    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    String homeHdfs = context.getConfiguration().get("HDFS_HOME_DIR");
    //maxK = HadoopUtil.getMaxkFromPatterns(fs, new Path(homeHdfs+Constant.HDFS_PATTERNS_FILE_HDFS));

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); //Set currentKey

    nextMyKeyValue(); // Read the first record, if it exists.

}

From source file:hadoop.TweetRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }

    this.pos = start;
}

From source file:hivemall.utils.hadoop.HadoopUtils.java

License:Open Source License

public static BufferedReader getBufferedReader(File file, MapredContext context) throws IOException {
    URI fileuri = file.toURI();
    Path path = new Path(fileuri);

    Configuration conf = context.getJobConf();
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(path);

    if (codec == null) {
        return new BufferedReader(new FileReader(file));
    } else {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        FileInputStream fis = new FileInputStream(file);
        CompressionInputStream cis = codec.createInputStream(fis, decompressor);
        BufferedReader br = new BufferedReaderExt(new InputStreamReader(cis), decompressor);
        return br;
    }
}
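
Note that instead of a plain BufferedReader, the compressed stream is wrapped in BufferedReaderExt, which receives the Decompressor in its constructor; presumably it returns the instance to the pool when the reader is closed, which a plain BufferedReader could not do.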

From source file:io.sanfran.wikiTrends.extraction.hadoop.FileNameLineRecordReader.java

License:Open Source License

public FileNameLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.LineRecordReader.maxlength", Integer.MAX_VALUE);
    fileName = split.getPath().getName();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:mapred.io.CustomRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:mapreduce.CustomTemporalLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    Text text = new Text();
    String str = null;
    int prevTime = -1;
    int currentTime = -1;
    if (start != 0) {
        start += in.readLine(text, 0, maxBytesToConsume(start));
        start += in.readLine(text, maxLineLength, maxBytesToConsume(start));
        str = text.toString();
        currentTime = Integer.parseInt(str.split(",")[1]);
        prevTime = currentTime;
        text = new Text();
        int offset = 0;
        while ((offset = in.readLine(text, maxLineLength, maxBytesToConsume(start))) >= 0) {
            start += offset;
            str = text.toString();
            currentTime = Integer.parseInt(str.split(",")[1]);
            if (currentTime != prevTime) {
                useRecordReadInInitialize = true;
                key = new LongWritable(start - offset);
                value = text;
                break;
            } else {
                prevTime = currentTime;
                text = new Text();
            }
        }
    }
    this.pos = start;
}

From source file:mr.MyFileRecordReader2.java

License:Apache License

public MyFileRecordReader2(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    //this.maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.MyFileRecordReader2.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new MyLineReader(cIn, job, recordDelimiter);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new MyLineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter);
            //in = new MyLineReader(codec.createInputStream(fileIn,decompressor), recordDelimiter);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new MyLineReader(fileIn, job, recordDelimiter);
        //in = new MyLineReader(fileIn, recordDelimiter);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:nl.basjes.hadoop.io.compress.TestSplittableCodecSeams.java

License:Apache License

/**
 * This test checks that reading the file as a sequence of splits
 * yields the same lines as reading the file as a single 'split'.
 */
private void validateSplitSeams(final Configuration conf, final FileSystem fs, final Path filename,
        final Class<? extends SplittableCompressionCodec> codecClass, final long splitSize,
        final long recordsInFile, final long lastSplitSizeLimit) throws IOException {
    // To make the test predictable
    conf.setInt("io.file.buffer.size", BUFFER_SIZE);

    final FileStatus infile = fs.getFileStatus(filename);
    final long inputLength = infile.getLen();

    if (inputLength > Integer.MAX_VALUE) {
        fail("Bad test file length.");
    }

    LOG.info("Input is " + inputLength + " bytes. " + "making a split every " + splitSize + " bytes.");

    if (inputLength <= splitSize) {
        fail("The compressed test file is too small to do any useful testing.");
    }

    final SplittableCompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    /*
     * The validation is done as follows:
     * 1) We open the entire file as a single split as the reference
     * 2) We create a sequence of splits and validate each line with the
     *    reference split.
     * The lines from these two must match 100%.
     */

    final Text refLine = new Text();
    final Decompressor refDcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", refDcmp);

    final SplitCompressionInputStream refStream = codec.createInputStream(fs.open(infile.getPath()), refDcmp, 0,
            inputLength, SplittableCompressionCodec.READ_MODE.BYBLOCK);
    final LineReader refReader = new LineReader(refStream, conf);

    final Text line = new Text();
    final Decompressor dcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", dcmp);

    try {
        long start = 0;
        long end = splitSize;
        int splitCount = 0;
        long refLineNumber = 0;
        long splitLineNumber;

        while (end <= inputLength) {
            splitLineNumber = 0;
            ++splitCount;
            LOG.debug("-------------------------------------------------------");
            dcmp.reset(); // Reset the Decompressor for reuse with the new stream

            final SplitCompressionInputStream splitStream = codec.createInputStream(fs.open(infile.getPath()),
                    dcmp, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

            final long adjustedStart = splitStream.getAdjustedStart();
            final long adjustedEnd = splitStream.getAdjustedEnd();

            if (LOG.isDebugEnabled()) {
                LOG.debug("Doing split " + splitCount + " on range " + " (" + start + "-" + end + ")"
                        + " adjusted to (" + adjustedStart + "-" + adjustedEnd + ")");
            }

            final LineReader lreader = new LineReader(splitStream, conf);

            if (start != 0) {
                // Not the first split so we discard the first (incomplete) line.
                int readChars = lreader.readLine(line);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("DISCARD LINE " + 0 + " in split " + splitCount + " pos=" + splitStream.getPos()
                            + " length=" + readChars + ": \"" + line + "\"");
                }
            }

            // Now read until the end of this split
            while (nextKeyValue(splitStream, lreader, adjustedEnd, line)) {
                ++splitLineNumber;

                // Get the reference value
                if (!nextKeyValue(refStream, refReader, inputLength, refLine)) {
                    LOG.error(String.format("S>%05d: %s", splitLineNumber, line));
                    fail("Split goes beyond the end of the reference with line number " + splitLineNumber);
                }
                ++refLineNumber;

                if (LOG.isDebugEnabled() && refLineNumber > (recordsInFile - 10)) {
                    LOG.debug(String.format("R<%05d: %s", refLineNumber, refLine));
                    LOG.debug(String.format("S>%05d: %s", splitLineNumber, line));
                }

                assertEquals("Line must be same in reference and in split at line " + refLineNumber, refLine,
                        line);

                if (LOG.isTraceEnabled()) {
                    LOG.trace("LINE " + splitLineNumber + " in split " + splitCount + " (" + refLineNumber
                            + ") pos=" + splitStream.getPos() + " length=" + line.getLength() + ": \"" + line
                            + "\"");
                }
            }

            // We just read through the entire split
            LOG.debug("Checked split " + splitCount + " (" + adjustedStart + "-" + adjustedEnd + ") "
                    + "containing " + splitLineNumber + " lines.");

            if (end == inputLength) {
                LOG.info("====================> Finished the last split <====================");
                break; // We've reached the end of the last split
            }

            // Determine start and end for the next split
            start = end;

            if ((end + lastSplitSizeLimit) > inputLength) {
                end = inputLength;
                LOG.info("====================> Starting the last split (" + start + " - " + end
                        + ") <====================");
            } else {
                end += splitSize;
                LOG.info("====================> Starting the next split (" + start + " - " + end
                        + ") <====================");
            }

        }

        if (nextKeyValue(refStream, refReader, inputLength, refLine)) {
            ++refLineNumber;
            LOG.error(String.format("R<%05d: %s", refLineNumber, refLine));
            fail("The reference is at least one line longer than the last split ( " + "splitSize=" + splitSize
                    + ", " + "inputLength= " + inputLength + ", " + "split start=" + start + ", " + "split end="
                    + end + ", " + "line=" + refLineNumber + ")");
        }

        LOG.info("Verified " + refLineNumber + " lines in " + splitCount + " splits.");

    } finally {
        CodecPool.returnDecompressor(dcmp);
        CodecPool.returnDecompressor(refDcmp);
    }
}
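
Note the finally block: both pooled Decompressor instances are handed back with CodecPool.returnDecompressor, the cleanup that the record readers above would typically perform in their close() methods.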

From source file:nl.cwi.kba2013.thrift.bin.ThriftRecordReader.java

License:Apache License

/** Boilerplate initialization code for file input streams. */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {

    conf = context.getConfiguration();
    fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    length = fileSplit.getLength();
    position = start;

    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(path);

    compressionCodecs = new CompressionCodecFactory(conf);
    codec = compressionCodecs.getCodec(path);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        in = new DataInputStream(codec.createInputStream(fileIn, decompressor));
        filePosition = fileIn;
        //LOG.info("Successfully initialized input stream for compressed data.");
    } else {
        fileIn.seek(start);
        in = fileIn;
        filePosition = fileIn;
    }

    tp = new TBinaryProtocol.Factory().getProtocol(new TIOStreamTransport(in));
}

From source file:org.apache.ben.FileCleaningRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(cIn, job);
            } else {
                in = new QuotationLineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new QuotationLineReader(fileIn, job);
        } else {
            in = new QuotationLineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}