List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream
CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException;
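For orientation, here is a minimal, self-contained sketch of the usual call pattern (not taken from any of the source files below; the class name, the command-line path argument, and the copy-to-stdout loop are illustrative only):

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CreateInputStreamSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]); // e.g. a ".gz" file passed on the command line
        FileSystem fs = file.getFileSystem(conf);

        // Resolve the codec from the file suffix; getCodec() returns null
        // for files with no known compression extension.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        if (codec == null) {
            throw new IOException("No codec found for " + file);
        }

        // Borrow a pooled decompressor and always return it when done.
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(file), decompressor)) {
            byte[] buf = new byte[4096];
            int n;
            while ((n = in.read(buf)) > 0) {
                System.out.write(buf, 0, n);
            }
        } finally {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

The examples below all center on this call, typically layering split handling (via SplittableCompressionCodec) or record reading on top of it.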
From source file:mapreduce.CustomTemporalLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // Open the file and seek to the start of the split.
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except in the last split) read one extra line in
    // the next() method.
    Text text = new Text();
    String str = null;
    int prevTime = -1;
    int currentTime = -1;
    if (start != 0) {
        start += in.readLine(text, 0, maxBytesToConsume(start));
        start += in.readLine(text, maxLineLength, maxBytesToConsume(start));
        str = text.toString();
        currentTime = Integer.parseInt(str.split(",")[1]);
        prevTime = currentTime;
        text = new Text();
        int offset = 0;
        while ((offset = in.readLine(text, maxLineLength, maxBytesToConsume(start))) >= 0) {
            start += offset;
            str = text.toString();
            currentTime = Integer.parseInt(str.split(",")[1]);
            if (currentTime != prevTime) {
                useRecordReadInInitialize = true;
                key = new LongWritable(start - offset);
                value = text;
                break;
            } else {
                prevTime = currentTime;
                text = new Text();
            }
        }
    }
    this.pos = start;
}
From source file:org.apache.ben.FileCleaningRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // Open the file and seek to the start of the split.
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(cIn, job);
            } else {
                in = new QuotationLineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new QuotationLineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new QuotationLineReader(fileIn, job);
        } else {
            in = new QuotationLineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except in the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java
License:Apache License
/**
 * Extracts a text file.
 *
 * @param file
 * @param start
 * @param length
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
    LOG.info("Extracting text file");
    long end = start + length;
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream filestream = fs.open(file);
    CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
    LineReader filereader;
    Seekable fileseeker = filestream;

    // Hadoop 1.0 does not have support for custom record delimiters and thus
    // we are supporting only the default one.
    // We might add another "else if" case for SplittableCompressionCodec once
    // we drop support for Hadoop 1.0.
    if (codec == null) {
        filestream.seek(start);
        filereader = new LineReader(filestream);
    } else {
        filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
        fileseeker = filestream;
    }
    if (start != 0) {
        // Always throw away the first record because
        // one extra line was read in the previous split.
        start += filereader.readLine(new Text(), 0);
    }
    int size;
    LOG.info("Start position: " + String.valueOf(start));
    long next = start;
    while (next <= end) {
        Text line = new Text();
        size = filereader.readLine(line, Integer.MAX_VALUE);
        if (size == 0) {
            break;
        }
        if (codec == null) {
            next += size;
        } else {
            next = fileseeker.getPos();
        }
        rowRead++;
        dataWriter.writeStringRecord(line.toString());
    }
    LOG.info("Extracting ended on position: " + fileseeker.getPos());
    filestream.close();
}
From source file:org.apache.sqoop.connector.hdfs.TestLoader.java
License:Apache License
private void verifyOutput(FileSystem fs, Path file) throws IOException {
    Configuration conf = new Configuration();
    FSDataInputStream fsin = fs.open(file);
    CompressionCodec codec;

    switch (outputFormat) {
    case TEXT_FILE:
        codec = (new CompressionCodecFactory(conf)).getCodec(file);

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;
        case DEFAULT:
            if (org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            } else {
                Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Deflate") != -1);
            }
            break;
        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        InputStreamReader in;
        if (codec == null) {
            in = new InputStreamReader(fsin);
        } else {
            in = new InputStreamReader(codec.createInputStream(fsin, codec.createDecompressor()));
        }
        BufferedReader textReader = new BufferedReader(in);

        for (int i = 1; i <= NUMBER_OF_ROWS_PER_FILE; ++i) {
            Assert.assertEquals(i + "," + (double) i + ",'" + i + "'", textReader.readLine());
        }
        break;

    case SEQUENCE_FILE:
        SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, file, conf);
        codec = sequenceReader.getCompressionCodec();

        // Verify compression
        switch (compression) {
        case BZIP2:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
            break;
        case DEFAULT:
            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
            break;
        case NONE:
        default:
            Assert.assertNull(codec);
            break;
        }

        Text line = new Text();
        int index = 1;
        while (sequenceReader.next(line)) {
            Assert.assertEquals(index + "," + (double) index + ",'" + index++ + "'", line.toString());
            line = new Text();
        }
        break;
    }
}
From source file:org.apache.tez.runtime.library.common.shuffle.TestShuffleUtils.java
License:Apache License
@Test
public void testInternalErrorTranslation() throws Exception {
    String codecErrorMsg = "codec failure";
    CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
    when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt()))
            .thenThrow(new InternalError(codecErrorMsg));
    Decompressor mockDecoder = mock(Decompressor.class);
    CompressionCodec mockCodec = mock(CompressionCodec.class);
    when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
    when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class)))
            .thenReturn(mockCodecStream);
    byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
    try {
        ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec,
                false, 0, mock(Logger.class), "identifier");
        Assert.fail("shuffle was supposed to throw!");
    } catch (IOException e) {
        Assert.assertTrue(e.getCause() instanceof InternalError);
        Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
    }
}
From source file:org.apache.tez.runtime.library.shuffle.common.ShuffleUtils.java
License:Apache License
@SuppressWarnings("resource")
public static void shuffleToMemory(MemoryFetchedInput fetchedInput, InputStream input, int decompressedLength,
        int compressedLength, CompressionCodec codec, boolean ifileReadAhead, int ifileReadAheadLength,
        Log LOG) throws IOException {
    IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, ifileReadAhead,
            ifileReadAheadLength);
    input = checksumIn;

    // Are map-outputs compressed?
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        decompressor.reset();
        input = codec.createInputStream(input, decompressor);
    }

    // Copy the map-output into an in-memory buffer
    byte[] shuffleData = fetchedInput.getBytes();
    try {
        IOUtils.readFully(input, shuffleData, 0, shuffleData.length);
        // metrics.inputBytes(shuffleData.length);
        LOG.info("Read " + shuffleData.length + " bytes from input for "
                + fetchedInput.getInputAttemptIdentifier());
    } catch (IOException ioe) {
        // Close the streams
        IOUtils.cleanup(LOG, input);
        // Re-throw
        throw ioe;
    }
}
From source file:org.hedera.util.SeekableInputStream.java
License:Apache License
public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    CompressionCodec codec = compressionCodecs.getCodec(path);
    FSDataInputStream din = fs.open(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec;
            SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            return new SeekableInputStream(cin);
        } else {
            // Non-splittable compression input stream:
            // no seeking or offsetting is needed.
            assert start == 0;
            CompressionInputStream cin = codec.createInputStream(din, decompressor);
            return new SeekableInputStream(cin, din);
        }
    } else {
        // Uncompressed input stream:
        // we seek to the start of the split.
        din.seek(start);
        return new SeekableInputStream(din);
    }
}
From source file:org.mrgeo.data.raster.RasterWritable.java
License:Apache License
public static MrGeoRaster toMrGeoRaster(final RasterWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.bytes, 0, writable.getSize());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);
    return toMrGeoRaster(new RasterWritable(baos.toByteArray()));
}
From source file:org.mrgeo.vector.mrsvector.VectorTileWritable.java
License:Apache License
public static VectorTile toMrsVector(final VectorTileWritable writable, final CompressionCodec codec,
        final Decompressor decompressor) throws IOException {
    decompressor.reset();
    final ByteArrayInputStream bis = new ByteArrayInputStream(writable.getBytes(), 0, writable.getLength());
    final CompressionInputStream gis = codec.createInputStream(bis, decompressor);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    IOUtils.copyBytes(gis, baos, 1024 * 1024 * 2, true);
    byte[] data = baos.toByteArray();
    return VectorTile.fromProtobuf(data, 0, data.length);
}
From source file:org.springframework.data.hadoop.store.AbstractStorage.java
License:Apache License
protected synchronized StreamsHolder<InputStream> getInput(Path inputPath) throws IOException {
    if (inputHolder == null) {
        log.info("Creating new InputStream");
        inputHolder = new StreamsHolder<InputStream>();
        final FileSystem fs = basePath.getFileSystem(configuration);
        // TODO: hadoop2 isUriPathAbsolute() ?
        Path p = inputPath.isAbsolute() ? inputPath : new Path(getPath(), inputPath);
        if (!isCompressed()) {
            InputStream input = fs.open(p);
            inputHolder.setStream(input);
        } else {
            Class<?> clazz = ClassUtils.resolveClassName(codecInfo.getCodecClass(),
                    getClass().getClassLoader());
            CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz,
                    getConfiguration());
            Decompressor decompressor = CodecPool.getDecompressor(compressionCodec);
            FSDataInputStream winput = fs.open(p);
            InputStream input = compressionCodec.createInputStream(winput, decompressor);
            inputHolder.setWrappedStream(winput);
            inputHolder.setStream(input);
        }
    }
    return inputHolder;
}