Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usage of org.apache.hadoop.io.compress.CompressionCodec#createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
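
Before the per-project examples below, here is a minimal, self-contained sketch of the typical pattern: ask CompressionCodecFactory for a codec matching the file name, then wrap the raw filesystem stream with createInputStream so that reads return decompressed bytes. The class name and the sample path are illustrative assumptions, not taken from any of the projects listed under Usage.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CreateInputStreamExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical input path; the .gz extension lets the factory resolve GzipCodec.
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/sample.txt.gz");

        Configuration conf = new Configuration();
        FileSystem fs = path.getFileSystem(conf);

        // Pick a codec from the file extension; null means the file is not compressed.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        InputStream stream = fs.open(path);
        if (codec != null) {
            // Wrap the raw stream so reads return decompressed bytes.
            stream = codec.createInputStream(stream);
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(stream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}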

Usage

From source file:org.mitre.bio.mapred.io.FastaRecordReader.java

License:Open Source License

public FastaRecordReader(FileSplit split, JobConf job) throws IOException {
    this.pushBackString = null;
    this.pushBackSize = 0;

    this.maxLineLength = job.getInt("io.file.buffer.size", // mapred.linereader.maxlength
            Integer.MAX_VALUE);

    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();

    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
        this.end = Long.MAX_VALUE;
    } else {
        /**
         * Same trick as Hadoop's LineRecordReader: a non-initial split usually
         * starts mid-line, so back up one byte and skip ahead to the next
         * newline below, re-establishing "start" on a line boundary.
         */
        if (this.start != 0) {
            LOG.info("Skipping first line in split");
            skipFirstLine = true;
            --this.start;
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        /**
         * Skip the first (partial) line to re-establish "start".
         */
        this.start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:org.mrgeo.hdfs.ingest.HdfsImageIngestDataProvider.java

License:Apache License

@Override
public InputStream openImage() throws IOException {

    Path path = new Path(getResourceName());

    final FileSystem fs = HadoopFileUtils.getFileSystem(conf, path);

    if (fs.exists(path)) {
        final InputStream stream = fs.open(path, 131072); // give open a 128K buffer

        Configuration localConf = HadoopUtils.createConfiguration();
        // see if we're compressed
        final CompressionCodecFactory factory = new CompressionCodecFactory(localConf);
        final CompressionCodec codec = factory.getCodec(path);

        if (codec != null) {
            return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream));
        }

        return stream;
    }

    throw new FileNotFoundException("File not found: " + path.toUri().toString());
}

From source file:org.mrgeo.image.geotools.GeotoolsRasterUtils.java

License:Apache License

private static InputStream openImageStream(String name) throws IOException {

    Path path = new Path(name);

    final FileSystem fs = HadoopFileUtils.getFileSystem(path);

    if (fs.exists(path)) {
        final InputStream stream = fs.open(path, 131072); // give open a 128K buffer

        Configuration localConf = HadoopUtils.createConfiguration();
        // see if we're compressed
        final CompressionCodecFactory factory = new CompressionCodecFactory(localConf);
        final CompressionCodec codec = factory.getCodec(path);

        if (codec != null) {
            return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream));
        }

        return stream;
    }

    throw new FileNotFoundException("File not found: " + path.toUri().toString());
}

From source file:org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java

License:Open Source License

public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {

    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:org.rassee.omniture.hadoop.mapreduce.OmnitureDataFileRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:org.springframework.data.hadoop.fs.HdfsResource.java

License:Apache License

@Override
public InputStream getInputStream() throws IOException {
    if (exists) {
        InputStream stream = fs.open(path);

        if (codecsFactory != null) {
            CompressionCodec codec = codecsFactory.getCodec(path);
            if (codec != null) {
                // the pool is not used since the returned inputstream needs to be decorated
                // to return the decompressor on close which can mask the actual stream
                // it's also unclear whether the pool is actually useful or not
                // Decompressor decompressor = CodecPool.getDecompressor(codec);
                // stream = (decompressor != null ? codec.createInputStream(stream, decompressor) : codec.createInputStream(stream));
                stream = codec.createInputStream(stream);
            }
        }

        return stream;
    }
    throw new IOException("Cannot open stream for " + getDescription());
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.FileCollectionRecordReader.java

License:Mozilla Public License

/** Opens a collection on the next file. */
@Override
protected Collection openCollectionSplit(int index) throws IOException {
    if (index >= ((CombineFileSplit) split.getSplit()).getNumPaths()) {
        //no more splits left to process
        return null;
    }
    Path file = ((CombineFileSplit) split.getSplit()).getPath(index);
    logger.info("Opening " + file);
    long offset = 0;//TODO populate from split?
    FileSystem fs = file.getFileSystem(config);

    // The WT2G collection has incorrectly named extensions (.GZ). Terrier can deal
    // with this, but Hadoop can't, so normalize the name before looking up a codec.
    CompressionCodec codec = compressionCodecs.getCodec(new Path(file.toString().replaceAll("\\.GZ$", ".gz")));

    length = fs.getFileStatus(file).getLen();
    //TODO: we could use utility.Files here if no codec was found
    FSDataInputStream _input = fs.open(file);
    InputStream internalInputStream = null;
    start = offset;

    if (codec != null) {
        start = 0;
        inputStream = new CountingInputStream(_input);
        internalInputStream = codec.createInputStream(inputStream);
    } else {
        if (start != 0) //TODO: start is always zero? 
        {
            --start;
            _input.seek(start);
        }
        internalInputStream = inputStream = new CountingInputStream(_input, start);
    }
    Collection rtr = CollectionFactory.loadCollection(
            ApplicationSetup.getProperty("trec.collection.class", "TRECCollection"),
            new Class[] { InputStream.class }, new Object[] { internalInputStream });

    if (rtr == null) {
        throw new IOException("Collection did not load properly");
    }
    return rtr;
}

From source file:org.utils.TarballReader.java

License:Apache License

@Override
public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        pos = 0;
        end = Long.MAX_VALUE;
        key = new TarballEntry();
        value = new Text();

        FileSplit split = (FileSplit) isplit;
        Path file = split.getPath();
        tarball = file.getName();

        Configuration conf = context.getConfiguration();
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(file);

        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream fileIn = fs.open(split.getPath());

        // Assumes the tarball name maps to a known codec (e.g. .tar.gz); codec must not be null here.
        in = new TarInputStream(codec.createInputStream(fileIn));
    } catch (IOException ex) {
        Logger.getLogger(TarballReader.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:redpoll.examples.sogou.SogouRecordReader.java

License:Apache License

public SogouRecordReader(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        in = new SogouCorpusReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0)
            fileIn.seek(start);
        in = new SogouCorpusReader(fileIn, job);
    }
    this.pos = start;
}

From source file:StorageEngineClient.FormatStorageIRecordReader.java

License:Open Source License

public FormatStorageIRecordReader(CombineFileSplit split, Configuration conf, Reporter report, Integer idx)
        throws IOException {
    int id = idx.intValue();
    this.conf = conf;
    Path p = split.getPath(id);
    file = p.toString();
    if (file.toLowerCase().endsWith(".gz")) {
        int index = file.lastIndexOf("_");
        String sub = file.substring(index + 1, file.length() - 3);
        this.recnum = Integer.valueOf(sub);
        isGZ = true;
        compressionCodecs = new CompressionCodecFactory(conf);
        final CompressionCodec codec = compressionCodecs.getCodec(p);
        FileSystem fs = new Path(file).getFileSystem(conf);
        FSDataInputStream fileIn = fs.open(p);
        in = new LineReader(codec.createInputStream(fileIn), conf);
        Text t = new Text();
        in.readLine(t);
        StringTokenizer stk = new StringTokenizer(t.toString(), new String(new char[] { '\01' }));
        int k = 0;
        while (stk.hasMoreTokens()) {
            String str = stk.nextToken();
            byte b = Byte.valueOf(str);
            IRecord.IFType type = new IRecord.IFType(b, k);
            fieldtypes.put(k, type);
            k++;
        }
        maxLineLength = Integer.MAX_VALUE;
        currentrec = 0;
    } else {
        ifdf = new IFormatDataFile(conf);
        ifdf.open(file);

        ISegmentIndex isi = ifdf.segIndex();
        if (isi.getSegnum() == 0) {
            this.recnum = 0;
        } else {
            long offset = split.getOffset(id);
            long len = split.getLength(id);
            int[] segids = isi.getsigidsinoffsetrange(offset, (int) len);
            System.out.println("fsplit:\toffset:  " + offset + "  len:  " + len + "  segids[0]:  " + segids[0]
                    + "  segids[1]:  " + segids[1]);
            if (segids[0] >= 0 && segids[0] < isi.getSegnum() && segids[1] <= isi.getSegnum()
                    && segids[1] > segids[0]) {
                int line = isi.getILineIndex(segids[0]).beginline();
                this.beginline = line;
                ifdf.seek(line);
                this.recnum = 0;
                for (int i = segids[0]; i < segids[1]; i++) {
                    this.recnum += isi.getILineIndex(i).recnum();
                }
            } else {
                this.recnum = 0;
            }
        }
    }
}