List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream
/**
 * Creates a {@link CompressionInputStream} on top of the supplied raw input stream.
 *
 * @param in the underlying stream to read from (presumably carrying compressed bytes)
 * @return a compression input stream wrapping {@code in}
 * @throws IOException if the stream cannot be created
 */
CompressionInputStream createInputStream(InputStream in) throws IOException;
From source file:org.mitre.bio.mapred.io.FastaRecordReader.java
License:Open Source License
public FastaRecordReader(FileSplit split, JobConf job) throws IOException { this.pushBackString = null; this.pushBackSize = 0; this.maxLineLength = job.getInt("io.file.buffer.size", // mapred.linereader.maxlength Integer.MAX_VALUE);/* w w w . j a v a 2s . c o m*/ this.start = split.getStart(); this.end = this.start + split.getLength(); final Path file = split.getPath(); this.compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = this.compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { this.in = new LineReader(codec.createInputStream(fileIn), job); this.end = Long.MAX_VALUE; } else { /** * From LineRecordReader, what is this doing? */ if (this.start != 0) { LOG.info("Skipping first line in split"); skipFirstLine = true; --this.start; fileIn.seek(this.start); } this.in = new LineReader(fileIn, job); } if (skipFirstLine) { /** * Skipping first line to re-established "start". */ this.start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:org.mrgeo.hdfs.ingest.HdfsImageIngestDataProvider.java
License:Apache License
@Override public InputStream openImage() throws IOException { Path path = new Path(getResourceName()); final FileSystem fs = HadoopFileUtils.getFileSystem(conf, path); if (fs.exists(path)) { final InputStream stream = fs.open(path, 131072); // give open a 128K buffer Configuration localConf = HadoopUtils.createConfiguration(); // see if were compressed final CompressionCodecFactory factory = new CompressionCodecFactory(localConf); final CompressionCodec codec = factory.getCodec(path); if (codec != null) { return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream)); }/*from ww w .j a v a 2s.c o m*/ return stream; } throw new FileNotFoundException("File not found: " + path.toUri().toString()); }
From source file:org.mrgeo.image.geotools.GeotoolsRasterUtils.java
License:Apache License
private static InputStream openImageStream(String name) throws IOException { Path path = new Path(name); final FileSystem fs = HadoopFileUtils.getFileSystem(path); if (fs.exists(path)) { final InputStream stream = fs.open(path, 131072); // give open a 128K buffer Configuration localConf = HadoopUtils.createConfiguration(); // see if were compressed final CompressionCodecFactory factory = new CompressionCodecFactory(localConf); final CompressionCodec codec = factory.getCodec(path); if (codec != null) { return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream)); }/* www.j av a 2 s. co m*/ return stream; } throw new FileNotFoundException("File not found: " + path.toUri().toString()); }
From source file:org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
/**
 * Creates a record reader over one split of a (possibly compressed) data file.
 *
 * <p>When a compression codec matches the file name, the stream is read from its beginning
 * and the end bound becomes {@code Long.MAX_VALUE}. Otherwise, for a split that does not
 * begin at offset 0, the reader backs up one byte, seeks there, and discards the first line
 * it reads so that {@code start} is advanced past it.
 *
 * <p>If any step after opening the file fails, the opened stream is closed before the
 * exception propagates.
 *
 * @param job   configuration used to resolve the file system, codecs and line-length limit
 * @param split the file split to read
 * @throws IOException if the file cannot be opened, positioned or read
 */
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file; seeking happens below, only for uncompressed input.
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean ready = false;
    try {
        boolean skipFirstLine = false;
        if (codec != null) {
            // Compressed input: no seek is performed and the end bound is removed.
            lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                // Step back one byte so the line skipped below ends at or after the
                // nominal start of this split.
                skipFirstLine = true;
                --start;
                fileIn.seek(start);
            }
            lineReader = new EscapedLineReader(fileIn, job);
        }
        if (skipFirstLine) {
            // Discard the partial line and advance start by the bytes consumed.
            start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
        ready = true;
    } finally {
        if (!ready) {
            // Construction failed: nobody else holds the stream, so release it here.
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Ignored so that the original failure is the one reported.
            }
        }
    }
}
From source file:org.rassee.omniture.hadoop.mapreduce.OmnitureDataFileRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE); this.start = split.getStart(); this.end = start + split.getLength(); final Path file = split.getPath(); this.compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // Open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE;/* w w w . j av a 2 s.c om*/ } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } lineReader = new EscapedLineReader(fileIn, job); } if (skipFirstLine) { start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:org.springframework.data.hadoop.fs.HdfsResource.java
License:Apache License
@Override public InputStream getInputStream() throws IOException { if (exists) { InputStream stream = fs.open(path); if (codecsFactory != null) { CompressionCodec codec = codecsFactory.getCodec(path); if (codec != null) { // the pool is not used since the returned inputstream needs to be decorated // to return the decompressor on close which can mask the actual stream // it's also unclear whether the pool is actually useful or not // Decompressor decompressor = CodecPool.getDecompressor(codec); // stream = (decompressor != null ? codec.createInputStream(stream, decompressor) : codec.createInputStream(stream)); stream = codec.createInputStream(stream); }/*from w w w . ja v a 2 s .c om*/ } return stream; } throw new IOException("Cannot open stream for " + getDescription()); }
From source file:org.terrier.structures.indexing.singlepass.hadoop.FileCollectionRecordReader.java
License:Mozilla Public License
/** Opens a collection on the next file. */ @Override// w ww. ja va 2 s .c o m protected Collection openCollectionSplit(int index) throws IOException { if (index >= ((CombineFileSplit) split.getSplit()).getNumPaths()) { //no more splits left to process return null; } Path file = ((CombineFileSplit) split.getSplit()).getPath(index); logger.info("Opening " + file); long offset = 0;//TODO populate from split? FileSystem fs = file.getFileSystem(config); //WT2G collection has incorrectly named extensions. Terrier can deal with this, //Hadoop cant CompressionCodec codec = compressionCodecs.getCodec(new Path(file.toString().replaceAll("\\.GZ$", ".gz"))); length = fs.getFileStatus(file).getLen(); FSDataInputStream _input = fs.open(file); //TODO: we could use utility.Files here if //no codec was found InputStream internalInputStream = null; start = offset; if (codec != null) { start = 0; inputStream = new CountingInputStream(_input); internalInputStream = codec.createInputStream(inputStream); } else { if (start != 0) //TODO: start is always zero? { --start; _input.seek(start); } internalInputStream = inputStream = new CountingInputStream(_input, start); } Collection rtr = CollectionFactory.loadCollection( ApplicationSetup.getProperty("trec.collection.class", "TRECCollection"), new Class[] { InputStream.class }, new Object[] { internalInputStream }); if (rtr == null) { throw new IOException("Collection did not load properly"); } return rtr; }
From source file:org.utils.TarballReader.java
License:Apache License
@Override public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException { try {//www . j av a2 s. c om pos = 0; end = Long.MAX_VALUE; key = new TarballEntry(); value = new Text(); FileSplit split = (FileSplit) isplit; Path file = split.getPath(); tarball = file.getName(); Configuration conf = context.getConfiguration(); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); CompressionCodec codec = compressionCodecs.getCodec(file); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(split.getPath()); in = new TarInputStream(codec.createInputStream(fileIn)); } catch (IOException ex) { Logger.getLogger(TarballReader.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:redpoll.examples.sogou.SogouRecordReader.java
License:Apache License
/**
 * Creates a record reader over one split of a (possibly compressed) corpus file.
 *
 * <p>When a compression codec matches the file name, the stream is read from its beginning
 * and the end bound becomes {@code Long.MAX_VALUE}; otherwise the file is positioned at the
 * split start when that start is non-zero.
 *
 * <p>If wrapping or seeking fails after the file has been opened, the opened stream is
 * closed before the exception propagates.
 *
 * @param job   configuration used to resolve the file system and codecs
 * @param split the file split to read
 * @throws IOException if the file cannot be opened or positioned
 */
public SogouRecordReader(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file; seeking happens below, only for uncompressed input.
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean ready = false;
    try {
        if (codec != null) {
            // Compressed input: no seek is performed and the end bound is removed.
            in = new SogouCorpusReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                fileIn.seek(start);
            }
            in = new SogouCorpusReader(fileIn, job);
        }
        this.pos = start;
        ready = true;
    } finally {
        if (!ready) {
            // Construction failed: nobody else holds the stream, so release it here.
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Ignored so that the original failure is the one reported.
            }
        }
    }
}
From source file:StorageEngineClient.FormatStorageIRecordReader.java
License:Open Source License
/**
 * Creates a reader for the file at position {@code idx} of a combined split.
 *
 * <p>Files whose lower-cased name ends in ".gz" are read as text through a codec stream:
 * the record count is parsed from the name segment between the last '_' and the ".gz"
 * suffix, and the first line is split on the {@code \01} character into byte-valued field
 * types. Every other file is opened as an {@code IFormatDataFile}; the segment ids covering
 * the split's offset range select the first line to seek to and the record count.
 *
 * @param split  the combined split holding the file
 * @param conf   configuration used to resolve the file system and codecs
 * @param report progress reporter (not used in this constructor)
 * @param idx    position of the file within {@code split}
 * @throws IOException if the file cannot be opened or read
 */
public FormatStorageIRecordReader(CombineFileSplit split, Configuration conf, Reporter report, Integer idx)
        throws IOException {
    final int id = idx.intValue();
    this.conf = conf;
    final Path p = split.getPath(id);
    file = p.toString();

    if (file.toLowerCase().endsWith(".gz")) {
        // The record count is encoded in the file name: <prefix>_<recnum>.gz
        final int underscoreAt = file.lastIndexOf("_");
        final String countText = file.substring(underscoreAt + 1, file.length() - 3);
        this.recnum = Integer.valueOf(countText);
        isGZ = true;

        compressionCodecs = new CompressionCodecFactory(conf);
        // NOTE(review): the codec is dereferenced without a null check; the name test above
        // is case-insensitive, so confirm a codec is always found (e.g. for ".GZ").
        final CompressionCodec codec = compressionCodecs.getCodec(p);
        final FileSystem fs = new Path(file).getFileSystem(conf);
        final FSDataInputStream fileIn = fs.open(p);
        in = new LineReader(codec.createInputStream(fileIn), conf);

        // The first line lists one byte-valued type code per field, separated by \01.
        final Text header = new Text();
        in.readLine(header);
        final StringTokenizer tokens = new StringTokenizer(header.toString(), String.valueOf('\01'));
        for (int k = 0; tokens.hasMoreTokens(); k++) {
            final String token = tokens.nextToken();
            byte b = Byte.valueOf(token);
            IRecord.IFType type = new IRecord.IFType(b, k);
            fieldtypes.put(k, type);
        }

        maxLineLength = Integer.MAX_VALUE;
        currentrec = 0;
    } else {
        ifdf = new IFormatDataFile(conf);
        ifdf.open(file);
        final ISegmentIndex isi = ifdf.segIndex();

        if (isi.getSegnum() != 0) {
            final long offset = split.getOffset(id);
            final long len = split.getLength(id);
            final int[] segids = isi.getsigidsinoffsetrange(offset, (int) len);
            System.out.println("fsplit:\toffset: " + offset + " len: " + len + " segids[0]: " + segids[0]
                    + " segids[1]: " + segids[1]);

            final boolean rangeIsValid = segids[0] >= 0
                    && segids[0] < isi.getSegnum()
                    && segids[1] <= isi.getSegnum()
                    && segids[1] > segids[0];
            if (rangeIsValid) {
                // Start at the first line of the first segment and count the records
                // of every segment in [segids[0], segids[1]).
                final int line = isi.getILineIndex(segids[0]).beginline();
                this.beginline = line;
                ifdf.seek(line);
                this.recnum = 0;
                for (int seg = segids[0]; seg < segids[1]; seg++) {
                    this.recnum += isi.getILineIndex(seg).recnum();
                }
            } else {
                this.recnum = 0;
            }
        } else {
            this.recnum = 0;
        }
    }
}