Example usage for org.apache.hadoop.io.compress CodecPool getDecompressor

Introduction

On this page you can find example usage of org.apache.hadoop.io.compress.CodecPool.getDecompressor.

Prototype

public static Decompressor getDecompressor(CompressionCodec codec) 

Document

Get a Decompressor for the given CompressionCodec from the pool, or a new one if the pool is empty.
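
Before the examples below, here is a minimal, self-contained sketch of the borrow/return lifecycle. The class name, file path, and configuration are placeholders, not taken from any example on this page:

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolSketch {

    public static void readCompressed(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        // Pick the codec from the file suffix (e.g. .gz, .bz2); null means "not compressed".
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        if (codec == null) {
            return;
        }
        // Borrow a Decompressor from the pool (or get a fresh one if the pool is empty).
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(file), decompressor)) {
            // ... consume the decompressed stream ...
        } finally {
            // Always hand the instance back so it can be reused.
            CodecPool.returnDecompressor(decompressor);
        }
    }
}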

Usage

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    done = false;

    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();

    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    String homeHdfs = context.getConfiguration().get("HDFS_HOME_DIR");
    //maxK = HadoopUtil.getMaxkFromPatterns(fs, new Path(homeHdfs+Constant.HDFS_PATTERNS_FILE_HDFS));

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); //Set currentKey

    nextMyKeyValue(); // Read the first record, if it exists.

}

From source file:hadoop.TweetRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }

    this.pos = start;
}

From source file:hivemall.utils.hadoop.HadoopUtils.java

License:Open Source License

public static BufferedReader getBufferedReader(File file, MapredContext context) throws IOException {
    URI fileuri = file.toURI();
    Path path = new Path(fileuri);

    Configuration conf = context.getJobConf();
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(path);

    if (codec == null) {
        return new BufferedReader(new FileReader(file));
    } else {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        FileInputStream fis = new FileInputStream(file);
        CompressionInputStream cis = codec.createInputStream(fis, decompressor);
        BufferedReader br = new BufferedReaderExt(new InputStreamReader(cis), decompressor);
        return br;
    }
}
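
Note that instead of a plain BufferedReader, the compressed stream is wrapped in BufferedReaderExt, which receives the Decompressor in its constructor; presumably it returns the instance to the pool when the reader is closed, which a plain BufferedReader could not do.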

From source file:io.sanfran.wikiTrends.extraction.hadoop.FileNameLineRecordReader.java

License:Open Source License

public FileNameLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.LineRecordReader.maxlength", Integer.MAX_VALUE);
    fileName = split.getPath().getName();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:mapred.io.CustomRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:mapreduce.CustomTemporalLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    Text text = new Text();
    String str = null;
    int prevTime = -1;
    int currentTime = -1;
    if (start != 0) {
        start += in.readLine(text, 0, maxBytesToConsume(start));
        start += in.readLine(text, maxLineLength, maxBytesToConsume(start));
        str = text.toString();
        currentTime = Integer.parseInt(str.split(",")[1]);
        prevTime = currentTime;
        text = new Text();
        int offset = 0;
        while ((offset = in.readLine(text, maxLineLength, maxBytesToConsume(start))) >= 0) {
            start += offset;
            str = text.toString();
            currentTime = Integer.parseInt(str.split(",")[1]);
            if (currentTime != prevTime) {
                useRecordReadInInitialize = true;
                key = new LongWritable(start - offset);
                value = text;
                break;
            } else {
                prevTime = currentTime;
                text = new Text();
            }
        }
    }
    this.pos = start;
}

From source file:mr.MyFileRecordReader2.java

License:Apache License

public MyFileRecordReader2(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    //this.maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.MyFileRecordReader2.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new MyLineReader(cIn, job, recordDelimiter);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new MyLineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter);
            //in = new MyLineReader(codec.createInputStream(fileIn,decompressor), recordDelimiter);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new MyLineReader(fileIn, job, recordDelimiter);
        //in = new MyLineReader(fileIn, recordDelimiter);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:nl.basjes.hadoop.io.compress.TestSplittableCodecSeams.java

License:Apache License

/**
 * This test checks that reading the file as a sequence of splits
 * yields the same lines as reading the file as a single 'split'.
 */
private void validateSplitSeams(final Configuration conf, final FileSystem fs, final Path filename,
        final Class<? extends SplittableCompressionCodec> codecClass, final long splitSize,
        final long recordsInFile, final long lastSplitSizeLimit) throws IOException {
    // To make the test predictable
    conf.setInt("io.file.buffer.size", BUFFER_SIZE);

    final FileStatus infile = fs.getFileStatus(filename);
    final long inputLength = infile.getLen();

    if (inputLength > Integer.MAX_VALUE) {
        fail("Bad test file length.");
    }

    LOG.info("Input is " + inputLength + " bytes. " + "making a split every " + splitSize + " bytes.");

    if (inputLength <= splitSize) {
        fail("The compressed test file is too small to do any useful testing.");
    }

    final SplittableCompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    /*
     * The validation is done as follows:
     * 1) We open the entire file as a single split as the reference
     * 2) We create a sequence of splits and validate each line with the
     *    reference split.
     * The lines from these two must match 100%.
     */

    final Text refLine = new Text();
    final Decompressor refDcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", refDcmp);

    final SplitCompressionInputStream refStream = codec.createInputStream(fs.open(infile.getPath()), refDcmp, 0,
            inputLength, SplittableCompressionCodec.READ_MODE.BYBLOCK);
    final LineReader refReader = new LineReader(refStream, conf);

    final Text line = new Text();
    final Decompressor dcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", dcmp);

    try {
        long start = 0;
        long end = splitSize;
        int splitCount = 0;
        long refLineNumber = 0;
        long splitLineNumber;

        while (end <= inputLength) {
            splitLineNumber = 0;
            ++splitCount;
            LOG.debug("-------------------------------------------------------");
            dcmp.reset(); // Reset the Decompressor for reuse with the new stream

            final SplitCompressionInputStream splitStream = codec.createInputStream(fs.open(infile.getPath()),
                    dcmp, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

            final long adjustedStart = splitStream.getAdjustedStart();
            final long adjustedEnd = splitStream.getAdjustedEnd();

            if (LOG.isDebugEnabled()) {
                LOG.debug("Doing split " + splitCount + " on range " + " (" + start + "-" + end + ")"
                        + " adjusted to (" + adjustedStart + "-" + adjustedEnd + ")");
            }

            final LineReader lreader = new LineReader(splitStream, conf);

            if (start != 0) {
                // Not the first split so we discard the first (incomplete) line.
                int readChars = lreader.readLine(line);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("DISCARD LINE " + 0 + " in split " + splitCount + " pos=" + splitStream.getPos()
                            + " length=" + readChars + ": \"" + line + "\"");
                }
            }

            // Now read until the end of this split
            while (nextKeyValue(splitStream, lreader, adjustedEnd, line)) {
                ++splitLineNumber;

                // Get the reference value
                if (!nextKeyValue(refStream, refReader, inputLength, refLine)) {
                    LOG.error(String.format("S>%05d: %s", splitLineNumber, line));
                    fail("Split goes beyond the end of the reference with line number " + splitLineNumber);
                }
                ++refLineNumber;

                if (LOG.isDebugEnabled() && refLineNumber > (recordsInFile - 10)) {
                    LOG.debug(String.format("R<%05d: %s", refLineNumber, refLine));
                    LOG.debug(String.format("S>%05d: %s", splitLineNumber, line));
                }

                assertEquals("Line must be same in reference and in split at line " + refLineNumber, refLine,
                        line);

                if (LOG.isTraceEnabled()) {
                    LOG.trace("LINE " + splitLineNumber + " in split " + splitCount + " (" + refLineNumber
                            + ") pos=" + splitStream.getPos() + " length=" + line.getLength() + ": \"" + line
                            + "\"");
                }
            }

            // We just read through the entire split
            LOG.debug("Checked split " + splitCount + " (" + adjustedStart + "-" + adjustedEnd + ") "
                    + "containing " + splitLineNumber + " lines.");

            if (end == inputLength) {
                LOG.info("====================> Finished the last split <====================");
                break; // We've reached the end of the last split
            }

            // Determine start and end for the next split
            start = end;

            if ((end + lastSplitSizeLimit) > inputLength) {
                end = inputLength;
                LOG.info("====================> Starting the last split (" + start + " - " + end
                        + ") <====================");
            } else {
                end += splitSize;
                LOG.info("====================> Starting the next split (" + start + " - " + end
                        + ") <====================");
            }

        }

        if (nextKeyValue(refStream, refReader, inputLength, refLine)) {
            ++refLineNumber;
            LOG.error(String.format("R<%05d: %s", refLineNumber, refLine));
            fail("The reference is at least one line longer than the last split ( " + "splitSize=" + splitSize
                    + ", " + "inputLength= " + inputLength + ", " + "split start=" + start + ", " + "split end="
                    + end + ", " + "line=" + refLineNumber + ")");
        }

        LOG.info("Verified " + refLineNumber + " lines in " + splitCount + " splits.");

    } finally {
        CodecPool.returnDecompressor(dcmp);
        CodecPool.returnDecompressor(refDcmp);
    }
}
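
Note the finally block: both pooled Decompressor instances are handed back with CodecPool.returnDecompressor, the cleanup that the record readers above would typically perform in their close() methods.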

From source file:nl.cwi.kba2013.thrift.bin.ThriftRecordReader.java

License:Apache License

/** Boilerplate initialization code for file input streams. */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {

    conf = context.getConfiguration();
    fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    length = fileSplit.getLength();
    position = start;

    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(path);

    compressionCodecs = new CompressionCodecFactory(conf);
    codec = compressionCodecs.getCodec(path);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        in = new DataInputStream(codec.createInputStream(fileIn, decompressor));
        filePosition = fileIn;
        //LOG.info("Successfully initialized input stream for compressed data.");
    } else {
        fileIn.seek(start);
        in = fileIn;
        filePosition = fileIn;
    }

    tp = new TBinaryProtocol.Factory().getProtocol(new TIOStreamTransport(in));
}

From source file:org.apache.ben.FileCleaningRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(cIn, job);
            } else {
                in = new QuotationLineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new QuotationLineReader(fileIn, job);
        } else {
            in = new QuotationLineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}