Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

This page collects real-world usage examples of org.apache.hadoop.io.compress.CompressionCodec.createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
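
Before the project examples below, here is a minimal, self-contained sketch of the typical pattern: resolve a codec from the file extension with CompressionCodecFactory, then wrap the raw stream with createInputStream only when a codec is found. The input path is hypothetical.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CreateInputStreamExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/data/input.txt.gz"); // hypothetical input file
        FileSystem fs = path.getFileSystem(conf);

        // the factory matches a codec (gzip, bzip2, ...) against the file extension
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);

        InputStream raw = fs.open(path);
        // wrap with a decompressing stream only when a codec was found
        InputStream in = (codec != null) ? codec.createInputStream(raw) : raw;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

Closing the outermost reader closes the wrapped compression stream and the underlying file stream. The record readers below all follow the same codec-or-raw branch.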

Usage

From source file: kogiri.common.hadoop.io.reader.fasta.FastaRawReadReader.java

License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    this.filename = file.getName();

    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;

        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            // gzip stores the uncompressed length (mod 2^32) in the trailing four bytes, little-endian;
            // mask each byte to avoid sign extension while assembling the value
            this.uncompressedSize = ((len[3] & 0xff) << 24) | ((len[2] & 0xff) << 16) | ((len[1] & 0xff) << 8)
                    | (len[0] & 0xff);
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }

        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);

    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new read start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }

        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRead = true;
            this.pos = this.start;
            break;
        }

        this.start += newSize;

        if (this.start >= this.end) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
    }

    this.key = null;
    this.value = null;
}

From source file: kogiri.common.hadoop.io.reader.fasta.FastaReadDescriptionReader.java

License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    this.filename = file.getName();

    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;

        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            // mask each byte to avoid sign extension (gzip trailer: uncompressed length mod 2^32, little-endian)
            this.uncompressedSize = ((len[3] & 0xff) << 24) | ((len[2] & 0xff) << 16) | ((len[1] & 0xff) << 8)
                    | (len[0] & 0xff);
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }

        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);

    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new record start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }

        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRecord = true;
            this.pos = this.start;
            break;
        }

        this.start += newSize;

        if (this.start >= this.end) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
    }

    this.key = null;
    this.value = null;
}

From source file: lennard.PiRecordReader.java

License: Apache License

public PiRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;

    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
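        // stream codecs such as gzip are not splittable, so this reader consumes the stream to EOF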
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: libra.common.hadoop.io.reader.fasta.FastaKmerReader.java

License: Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    this.kmersize = FastaKmerInputFormat.getKmerSize(conf);
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;

        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            // mask each byte to avoid sign extension (gzip trailer: uncompressed length mod 2^32, little-endian)
            this.uncompressedSize = ((len[3] & 0xff) << 24) | ((len[2] & 0xff) << 16) | ((len[1] & 0xff) << 8)
                    | (len[0] & 0xff);
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }

        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    boolean inTheMiddle = false;
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        if (this.start != 0) {
            this.start--;
            fileIn.seek(this.start);

            inTheMiddle = true;
        }
        this.in = new LineReader(fileIn, conf);
    }

    this.buffer = new Text();

    if (inTheMiddle) {
        // find new start line
        this.start += this.in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start));

        // back off up to 1,000 bytes so the line preceding the new start can be re-read
        // (clamped at 0 so the seek cannot go negative near the beginning of the file)
        long backoff = Math.max(0, this.start - 1000);
        FSDataInputStream fileIn2 = fs.open(file);
        fileIn2.seek(backoff);

        LineReader in2 = new LineReader(fileIn2, conf);
        Text tempLine = new Text();
        long curpos = backoff;
        while (curpos < this.start) {
            curpos += in2.readLine(tempLine, 0, (int) (this.start - curpos));
        }

        if (tempLine.charAt(0) == READ_DELIMITER) {
            // clean start
            this.buffer.clear();
        } else {
            // leave k-1 seq in the buffer
            String seq = tempLine.toString().trim();
            String left = seq.substring(seq.length() - this.kmersize + 1);
            this.buffer.set(left);
        }

        in2.close();
    }

    this.pos = this.start;

    this.key = null;
    this.value = null;
}

From source file: ml.shifu.guagua.hadoop.io.GuaguaLineRecordReader.java

License: Apache License

@Override
public void initialize(GuaguaFileSplit genericSplit) throws IOException {
    this.maxLineLength = Integer.MAX_VALUE;
    start = genericSplit.getOffset();
    end = start + genericSplit.getLength();
    final Path file = new Path(genericSplit.getPath());
    compressionCodecs = new CompressionCodecFactory(this.conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(this.conf);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE,
                    this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
        } else {
            in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE, this.recordDelimiterBytes);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: ml.shifu.guagua.mapreduce.GuaguaLineRecordReader.java

License: Apache License

@Override
public void initialize(GuaguaFileSplit split) throws IOException {
    this.maxLineLength = Integer.MAX_VALUE;
    start = split.getOffset();
    end = start + split.getLength();
    final Path file = new Path(split.getPath());
    compressionCodecs = new CompressionCodecFactory(this.conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
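    // note: a fresh Configuration is created here instead of reusing this.conf from the codec factory above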
    FileSystem fs = file.getFileSystem(new Configuration());
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: ml.shifu.guagua.yarn.GuaguaLineRecordReader.java

License: Apache License

@Override
public void initialize(GuaguaFileSplit genericSplit) throws IOException {
    this.maxLineLength = Integer.MAX_VALUE;
    start = genericSplit.getOffset();
    end = start + genericSplit.getLength();
    final Path file = new Path(genericSplit.getPath());
    compressionCodecs = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(new Configuration());
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE,
                    this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
        } else {
            in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE, this.recordDelimiterBytes);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: ml.shifu.shifu.core.mr.input.CombineRecordReader.java

License: Apache License

private void initializeOne(TaskAttemptContext context, FileSplit split) throws IOException {
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: ml.shifu.shifu.util.HdfsGlobalFile.java

License: Apache License

private InputStream openPartFileAsStream(FileStatus fileStatus) throws IOException {
    CompressionCodecFactory compressionFactory = new CompressionCodecFactory(new Configuration());
    InputStream is = null;

    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    CompressionCodec codec = compressionFactory.getCodec(fileStatus.getPath());
    if (codec != null) {
        is = codec.createInputStream(fs.open(fileStatus.getPath()));
    } else {
        is = fs.open(fileStatus.getPath());
    }
    return is;
}
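
The method above opens each part file separately. For call sites like this, CompressionCodec also offers a two-argument overload, createInputStream(InputStream, Decompressor), which reuses a pooled decompressor instead of allocating one per file. A minimal sketch, assuming the same codec-detection setup; the class and method names here are illustrative:

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledOpenExample {
    /** Opens a possibly-compressed file, borrowing a pooled decompressor when a codec matches. */
    public static InputStream openMaybeCompressed(FileSystem fs, Path path, Configuration conf)
            throws IOException {
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            return fs.open(path); // no codec matched: return the raw stream
        }
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try {
            // the two-argument overload reuses the borrowed decompressor
            return codec.createInputStream(fs.open(path), decompressor);
        } catch (IOException e) {
            CodecPool.returnDecompressor(decompressor);
            throw e;
        }
    }
}

The borrowed decompressor must be handed back with CodecPool.returnDecompressor after the stream is closed; record readers typically keep it in a field and return it in close().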

From source file: name.abhijitsarkar.hadoop.io.IOUtils.java

License: Open Source License

/**
 * Uncompresses the given archive using a codec inferred from its file extension.
 *
 * @param archiveURI
 *            The archive to be extracted
 * @param conf
 *            Job configuration
 * @return Extracted file URI
 * @throws IOException
 *             If fails to extract the archive
 */
public static URI uncompressFile(final URI archiveURI, final Configuration conf) throws IOException {
    Path archivePath = new Path(archiveURI);
    OutputStream outputStream = null;
    InputStream inputStream = null;
    Path uncompressionPath = null;

    try {
        final FileSystem fs = FileSystem.getLocal(conf);

        FileStatus[] statuses = new FileStatus[] { fs.getFileStatus(archivePath) };
        if (statuses[0].isDir()) {
            statuses = fs.listStatus(archivePath);

            LOGGER.debug("Archive is a directory and contains {} elements.", statuses.length);

            archivePath = statuses[0].getPath();
        }

        LOGGER.debug("archiveURI: {}.", archivePath.toUri());

        final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(archivePath);
        if (codec == null) {
            LOGGER.debug("Not an archive: {}.", archivePath.toUri());
            return archivePath.toUri();
        }

        LOGGER.debug("Using codec: {}.", codec.getClass().getName());
        uncompressionPath = new Path(addExtension(archivePath.toUri().getPath(), ".new", true));

        LOGGER.debug("uncompressedURI: {}.", uncompressionPath.toUri());

        outputStream = new FileOutputStream(uncompressionPath.toUri().getPath());

        inputStream = new FileInputStream(archivePath.toUri().getPath());
        final CompressionInputStream in = codec.createInputStream(inputStream);
        org.apache.hadoop.io.IOUtils.copyBytes(in, outputStream, conf, false);
    } finally {
        closeStreams(inputStream, outputStream);
    }

    return uncompressionPath.toUri();
}