Example usage for org.apache.hadoop.io.compress.CompressionCodec.createInputStream

Introduction

This page collects example usages of org.apache.hadoop.io.compress.CompressionCodec.createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Source Link

Document

Create a CompressionInputStream that will read from the given input stream.

Usage

From source file:org.mitre.bio.mapred.io.FastaRecordReader.java

License:Open Source License

/**
 * Creates a reader over one split of an input file.
 *
 * <p>When a compression codec matches the file, the decompressed stream is wrapped and
 * {@code end} is set to {@link Long#MAX_VALUE}. Otherwise, when the split does not begin
 * at offset 0, the stream is positioned one byte before the split start and one line is
 * read and discarded, so that {@code start} is advanced by the number of bytes consumed.
 *
 * @param split the file split to read
 * @param job   the job configuration used for the codec lookup, file system and line reader
 * @throws IOException if the file cannot be opened, positioned or read
 */
public FastaRecordReader(FileSplit split, JobConf job) throws IOException {
    this.pushBackString = null;
    this.pushBackSize = 0;

    // NOTE(review): the limit is read from "io.file.buffer.size"; the original inline
    // comment mentioned mapred.linereader.maxlength -- confirm which key is intended.
    this.maxLineLength = job.getInt("io.file.buffer.size", Integer.MAX_VALUE);

    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();

    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    // Open the file; seeking (if any) happens below.
    final FileSystem fs = file.getFileSystem(job);
    final FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec != null) {
        // Compressed input: read the decompressed stream without an upper bound.
        this.in = new LineReader(codec.createInputStream(fileIn), job);
        this.end = Long.MAX_VALUE;
    } else if (this.start == 0) {
        // Split begins at the start of the file: nothing to skip.
        this.in = new LineReader(fileIn, job);
    } else {
        // Split begins mid-file: back up one byte and discard one line so that "start"
        // is re-established after a line terminator. Presumably the reader of the
        // preceding split consumes that line -- verify against LineRecordReader.
        LOG.info("Skipping first line in split");
        this.start -= 1;
        fileIn.seek(this.start);
        this.in = new LineReader(fileIn, job);

        final int maxBytesToConsume = (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start);
        this.start += this.in.readLine(new Text(), 0, maxBytesToConsume);
    }
    this.pos = this.start;
}

From source file:org.mrgeo.hdfs.ingest.HdfsImageIngestDataProvider.java

License:Apache License

/**
 * Opens the image resource, wrapping it in a decompressing stream when a compression
 * codec matches the resource path.
 *
 * @return the raw stream, or a {@code CompressedSeekableStream} over the codec's input stream
 * @throws FileNotFoundException if the path does not exist on the file system
 * @throws IOException if the file system or stream cannot be opened
 */
@Override
public InputStream openImage() throws IOException {
    final Path path = new Path(getResourceName());
    final FileSystem fs = HadoopFileUtils.getFileSystem(conf, path);

    if (!fs.exists(path)) {
        throw new FileNotFoundException("File not found: " + path.toUri().toString());
    }

    // Open with a 128K buffer.
    final InputStream rawStream = fs.open(path, 131072);

    // Look up a codec for the path to see whether the file is compressed.
    final Configuration localConf = HadoopUtils.createConfiguration();
    final CompressionCodec codec = new CompressionCodecFactory(localConf).getCodec(path);

    if (codec == null) {
        return rawStream;
    }
    return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(rawStream));
}

From source file:org.mrgeo.image.geotools.GeotoolsRasterUtils.java

License:Apache License

/**
 * Opens the named image, wrapping it in a decompressing stream when a compression codec
 * matches the path.
 *
 * @param name path of the image to open
 * @return the raw stream, or a {@code CompressedSeekableStream} over the codec's input stream
 * @throws FileNotFoundException if the path does not exist on the file system
 * @throws IOException if the file system or stream cannot be opened
 */
private static InputStream openImageStream(String name) throws IOException {
    final Path path = new Path(name);
    final FileSystem fs = HadoopFileUtils.getFileSystem(path);

    if (!fs.exists(path)) {
        throw new FileNotFoundException("File not found: " + path.toUri().toString());
    }

    // Open with a 128K buffer.
    final InputStream rawStream = fs.open(path, 131072);

    // Look up a codec for the path to see whether the file is compressed.
    final Configuration localConf = HadoopUtils.createConfiguration();
    final CompressionCodec codec = new CompressionCodecFactory(localConf).getCodec(path);

    if (codec == null) {
        return rawStream;
    }
    return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(rawStream));
}

From source file:org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java

License:Open Source License

/**
 * Creates a reader over one split of a data file.
 *
 * <p>When a compression codec matches the file, the decompressed stream is read and
 * {@code end} becomes {@link Long#MAX_VALUE}. Otherwise, for a split that does not begin
 * at offset 0, the stream is positioned one byte before the split start and one line is
 * read and discarded, advancing {@code start} by the number of bytes consumed.
 *
 * @param job   configuration used for the line-length limit, codec lookup and file system
 * @param split the file split to read
 * @throws IOException if the file cannot be opened, positioned or read
 */
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();

    final Path file = split.getPath();
    final CompressionCodecFactory codecFactory = new CompressionCodecFactory(job);
    final CompressionCodec codec = codecFactory.getCodec(file);

    // Open the file; seeking (if any) happens below.
    final FileSystem fs = file.getFileSystem(job);
    final FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec != null) {
        // Compressed input: read the decompressed stream without an upper bound.
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else if (start == 0) {
        // Split begins at the start of the file: nothing to skip.
        lineReader = new EscapedLineReader(fileIn, job);
    } else {
        // Split begins mid-file: back up one byte, then discard one line.
        start -= 1;
        fileIn.seek(start);
        lineReader = new EscapedLineReader(fileIn, job);

        final int maxBytesToConsume = (int) Math.min((long) Integer.MAX_VALUE, end - start);
        start += lineReader.readLine(new Text(), 0, maxBytesToConsume);
    }
    this.pos = start;
}

From source file:org.rassee.omniture.hadoop.mapreduce.OmnitureDataFileRecordReader.java

License:Open Source License

/**
 * Prepares this reader for the given split.
 *
 * <p>When a compression codec matches the file, the decompressed stream is read and
 * {@code end} becomes {@link Long#MAX_VALUE}. Otherwise, for a split that does not begin
 * at offset 0, the stream is positioned one byte before the split start and one line is
 * read and discarded, advancing {@code start} by the number of bytes consumed.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context      task context supplying the configuration
 * @throws IOException if the file cannot be opened, positioned or read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration job = context.getConfiguration();

    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();

    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file; seeking (if any) happens below.
    final FileSystem fs = file.getFileSystem(job);
    final FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec == null) {
        final boolean startsMidFile = start != 0;
        if (startsMidFile) {
            // Back up one byte so the line read below ends at a line boundary.
            start -= 1;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
        if (startsMidFile) {
            // Discard one line and move "start" past the bytes it consumed.
            final int maxBytesToConsume = (int) Math.min((long) Integer.MAX_VALUE, end - start);
            start += lineReader.readLine(new Text(), 0, maxBytesToConsume);
        }
    } else {
        // Compressed input: read the decompressed stream without an upper bound.
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    }
    this.pos = start;
}

From source file:org.springframework.data.hadoop.fs.HdfsResource.java

License:Apache License

/**
 * Opens the resource for reading, decompressing it when a codec factory is configured
 * and a codec matches the path.
 *
 * @return the raw stream, or the codec's decompressing stream over it
 * @throws IOException if the resource does not exist or cannot be opened
 */
@Override
public InputStream getInputStream() throws IOException {
    if (!exists) {
        throw new IOException("Cannot open stream for " + getDescription());
    }

    final InputStream rawStream = fs.open(path);
    if (codecsFactory == null) {
        return rawStream;
    }

    final CompressionCodec codec = codecsFactory.getCodec(path);
    if (codec == null) {
        return rawStream;
    }

    // The decompressor pool (CodecPool) is deliberately not used: the returned stream
    // would need to be decorated to give the decompressor back on close, which can mask
    // the actual stream, and it is unclear whether the pool is actually useful here.
    return codec.createInputStream(rawStream);
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.FileCollectionRecordReader.java

License:Mozilla Public License

/**
 * Opens a collection on the file at the given position of the combined split.
 *
 * @param index position of the file within the combined split
 * @return the loaded collection, or {@code null} when {@code index} is past the last path
 * @throws IOException if the file cannot be opened or the collection fails to load
 */
@Override
protected Collection openCollectionSplit(int index) throws IOException {
    final CombineFileSplit combined = (CombineFileSplit) split.getSplit();
    if (index >= combined.getNumPaths()) {
        // No more splits left to process.
        return null;
    }

    final Path file = combined.getPath(index);
    logger.info("Opening " + file);
    final long offset = 0; // TODO populate from split?
    final FileSystem fs = file.getFileSystem(config);

    // The WT2G collection has incorrectly named extensions. Terrier can deal with this,
    // Hadoop can't, so the codec lookup uses a path with the suffix normalised.
    final Path codecLookupPath = new Path(file.toString().replaceAll("\\.GZ$", ".gz"));
    final CompressionCodec codec = compressionCodecs.getCodec(codecLookupPath);

    length = fs.getFileStatus(file).getLen();
    // TODO: we could use utility.Files here if no codec was found
    final FSDataInputStream rawInput = fs.open(file);
    start = offset;

    final InputStream collectionStream;
    if (codec == null) {
        if (start != 0) { // TODO: start is always zero?
            start -= 1;
            rawInput.seek(start);
        }
        inputStream = new CountingInputStream(rawInput, start);
        collectionStream = inputStream;
    } else {
        start = 0;
        inputStream = new CountingInputStream(rawInput);
        collectionStream = codec.createInputStream(inputStream);
    }

    final String collectionClass = ApplicationSetup.getProperty("trec.collection.class", "TRECCollection");
    final Collection loaded = CollectionFactory.loadCollection(
            collectionClass,
            new Class[] { InputStream.class }, new Object[] { collectionStream });

    if (loaded == null) {
        throw new IOException("Collection did not load properly");
    }
    return loaded;
}

From source file:org.utils.TarballReader.java

License:Apache License

/**
 * Prepares this reader to iterate over the entries of the tarball in the given split.
 *
 * <p>The file is opened and, when a compression codec matches its name, wrapped in the
 * codec's decompressing stream. When no codec matches (the codec lookup yields
 * {@code null}), the file is read as an uncompressed tar stream instead of failing with a
 * {@link NullPointerException}.
 *
 * <p>I/O failures are propagated to the caller rather than logged and swallowed, so the
 * reader is never left initialised without an input stream.
 *
 * @param isplit  the split to read; cast to {@code FileSplit}
 * @param context task context supplying the configuration
 * @throws IOException if the file cannot be opened or the decompressing stream cannot be created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
    pos = 0;
    end = Long.MAX_VALUE;
    key = new TarballEntry();
    value = new Text();

    FileSplit split = (FileSplit) isplit;
    Path file = split.getPath();
    tarball = file.getName();

    Configuration conf = context.getConfiguration();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(file);

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    try {
        if (codec != null) {
            in = new TarInputStream(codec.createInputStream(fileIn));
        } else {
            // No codec matched the file name: treat the file as a plain tar archive.
            in = new TarInputStream(fileIn);
        }
    } catch (IOException ex) {
        // Do not leak the opened file when the stream chain cannot be built.
        try {
            fileIn.close();
        } catch (IOException closeEx) {
            ex.addSuppressed(closeEx);
        }
        throw ex;
    }
}

From source file:redpoll.examples.sogou.SogouRecordReader.java

License:Apache License

/**
 * Creates a reader over one split of a corpus file.
 *
 * <p>When a compression codec matches the file, the decompressed stream is read and
 * {@code end} becomes {@link Long#MAX_VALUE}; otherwise the raw stream is positioned at
 * the split start.
 *
 * @param job   configuration used for the codec lookup, file system and corpus reader
 * @param split the file split to read
 * @throws IOException if the file cannot be opened or positioned
 */
public SogouRecordReader(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();

    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file; seeking (if any) happens below.
    final FileSystem fs = file.getFileSystem(job);
    final FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec == null) {
        if (start != 0) {
            fileIn.seek(start);
        }
        in = new SogouCorpusReader(fileIn, job);
    } else {
        // Compressed input: read the decompressed stream without an upper bound.
        in = new SogouCorpusReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    }
    this.pos = start;
}

From source file:StorageEngineClient.FormatStorageIRecordReader.java

License:Open Source License

/**
 * Creates a reader over one file of a combined split.
 *
 * <p>Files whose lower-cased name ends in {@code ".gz"} are read line by line through a
 * compression codec: the record count is parsed from the text between the last
 * {@code '_'} and the {@code ".gz"} suffix, and the first line is split on the
 * {@code '\01'} character into byte codes that populate {@code fieldtypes}.
 *
 * <p>All other files are opened as an {@code IFormatDataFile}; the segment index is used
 * to position the file and to sum the record counts of the segments reported for the
 * split's offset range.
 *
 * @param split  the combined split
 * @param conf   configuration used for the codec lookup, file system and data file
 * @param report not used by this constructor
 * @param idx    position of the file within the combined split
 * @throws IOException if the file cannot be opened or read
 */
public FormatStorageIRecordReader(CombineFileSplit split, Configuration conf, Reporter report, Integer idx)
        throws IOException {
    final int id = idx.intValue();
    this.conf = conf;
    final Path p = split.getPath(id);
    file = p.toString();

    if (file.toLowerCase().endsWith(".gz")) {
        // Record count is the text between the last '_' and the ".gz" suffix.
        final int underscoreAt = file.lastIndexOf("_");
        final String recordCountText = file.substring(underscoreAt + 1, file.length() - 3);
        this.recnum = Integer.valueOf(recordCountText);
        isGZ = true;

        compressionCodecs = new CompressionCodecFactory(conf);
        final CompressionCodec codec = compressionCodecs.getCodec(p);
        final FileSystem fs = new Path(file).getFileSystem(conf);
        final FSDataInputStream fileIn = fs.open(p);
        in = new LineReader(codec.createInputStream(fileIn), conf);

        // The first line holds one byte code per field, separated by '\01'.
        final Text header = new Text();
        in.readLine(header);
        final StringTokenizer tokens = new StringTokenizer(header.toString(), String.valueOf('\01'));
        for (int k = 0; tokens.hasMoreTokens(); k++) {
            final byte typeCode = Byte.valueOf(tokens.nextToken());
            fieldtypes.put(k, new IRecord.IFType(typeCode, k));
        }

        maxLineLength = Integer.MAX_VALUE;
        currentrec = 0;
    } else {
        ifdf = new IFormatDataFile(conf);
        ifdf.open(file);

        final ISegmentIndex isi = ifdf.segIndex();
        // Stays 0 unless a usable segment range is found below.
        this.recnum = 0;
        if (isi.getSegnum() != 0) {
            final long offset = split.getOffset(id);
            final long len = split.getLength(id);
            final int[] segids = isi.getsigidsinoffsetrange(offset, (int) len);
            System.out.println("fsplit:\toffset:  " + offset + "  len:  " + len + "  segids[0]:  " + segids[0]
                    + "  segids[1]:  " + segids[1]);

            final boolean usableRange = segids[0] >= 0 && segids[0] < isi.getSegnum()
                    && segids[1] <= isi.getSegnum() && segids[1] > segids[0];
            if (usableRange) {
                final int line = isi.getILineIndex(segids[0]).beginline();
                this.beginline = line;
                ifdf.seek(line);
                for (int seg = segids[0]; seg < segids[1]; seg++) {
                    this.recnum += isi.getILineIndex(seg).recnum();
                }
            }
        }
    }
}