List of usage examples for org.apache.hadoop.io.compress CompressionCodec createDecompressor
Decompressor createDecompressor();
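Before the project-specific examples below, a minimal self-contained sketch of the usual call pattern: resolve a codec from the file name, call createDecompressor(), and wrap the raw stream. This is not taken from any of the projects below; the input path and class name are illustrative only.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CreateDecompressorSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Hypothetical input path; the codec is resolved from the file extension.
    Path input = new Path("/data/input.gz");

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(input);
    if (codec == null) {
      throw new IOException("No codec found for " + input);
    }

    FileSystem fs = input.getFileSystem(conf);
    // createDecompressor() hands back a fresh Decompressor instance;
    // a long-lived program would normally pool and reuse it.
    Decompressor decompressor = codec.createDecompressor();
    try (InputStream in = codec.createInputStream(fs.open(input), decompressor)) {
      // Copy the decompressed bytes to stdout.
      IOUtils.copyBytes(in, System.out, 4096, false);
    }
  }
}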
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
    throws IOException {
  CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
  CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
  if (codec == null)
    return in;
  System.out.println("Getting InputStream : " + codec.getDefaultExtension());
  System.out.println("Getting InputStream : " + codec);
  Decompressor decompressor = codec.createDecompressor();
  in = codec.createInputStream(in, decompressor);
  return in;
}
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be
 *                filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
  Configuration conf = fs.getConf();
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodec(lzoFile);
  if (null == codec) {
    throw new IOException("Could not find codec for file " + lzoFile
        + " - you may need to add the LZO codec to your io.compression.codecs "
        + "configuration in core-site.xml");
  }
  ((Configurable) codec).setConf(conf);

  FSDataInputStream is = null;
  FSDataOutputStream os = null;
  Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
  Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

  // Track whether an exception was thrown or not, so we know to either
  // delete the tmp index file on failure, or rename it to the new index file on success.
  boolean indexingSucceeded = false;
  try {
    is = fs.open(lzoFile);
    os = fs.create(tmpOutputFile);
    LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
    // Solely for reading the header
    codec.createInputStream(is, decompressor);
    int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
    int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

    while (true) {
      // read and ignore, we just want to get to the next int
      int uncompressedBlockSize = is.readInt();
      if (uncompressedBlockSize == 0) {
        break;
      } else if (uncompressedBlockSize < 0) {
        throw new EOFException();
      }

      int compressedBlockSize = is.readInt();
      if (compressedBlockSize <= 0) {
        throw new IOException("Could not read compressed block size");
      }

      // See LzopInputStream.getCompressedData
      boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
      int numChecksumsToSkip = isUncompressedBlock
          ? numDecompressedChecksums
          : numDecompressedChecksums + numCompressedChecksums;
      long pos = is.getPos();
      // write the pos of the block start
      os.writeLong(pos - 8);
      // seek to the start of the next block, skip any checksums
      is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
    }
    // If we're here, indexing was successful.
    indexingSucceeded = true;
  } finally {
    // Close any open streams.
    if (is != null) {
      is.close();
    }
    if (os != null) {
      os.close();
    }

    if (!indexingSucceeded) {
      // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
      fs.delete(tmpOutputFile, false);
    } else {
      // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
      fs.rename(tmpOutputFile, outputFile);
    }
  }
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
  Configuration conf = fs.getConf();
  CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
  CompressionCodec codec = factory.getCodec(lzoFile);
  ((Configurable) codec).setConf(conf);

  InputStream lzoIs = null;
  FSDataOutputStream os = null;
  Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
  Path tmpOutputFile = outputFile.suffix(".tmp");

  try {
    FSDataInputStream is = fs.open(lzoFile);
    os = fs.create(tmpOutputFile);
    LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
    // for reading the header
    lzoIs = codec.createInputStream(is, decompressor);

    int numChecksums = decompressor.getChecksumsCount();
    while (true) {
      // read and ignore, we just want to get to the next int
      int uncompressedBlockSize = is.readInt();
      if (uncompressedBlockSize == 0) {
        break;
      } else if (uncompressedBlockSize < 0) {
        throw new EOFException();
      }

      int compressedBlockSize = is.readInt();
      if (compressedBlockSize <= 0) {
        throw new IOException("Could not read compressed block size");
      }

      long pos = is.getPos();
      // write the pos of the block start
      os.writeLong(pos - 8);
      // seek to the start of the next block, skip any checksums
      is.seek(pos + compressedBlockSize + (4 * numChecksums));
    }
  } finally {
    if (lzoIs != null) {
      lzoIs.close();
    }
    if (os != null) {
      os.close();
    }
  }

  fs.rename(tmpOutputFile, outputFile);
}
From source file:data.intelligence.platform.yarn.etl.io.CodecPool.java
License:Apache License
/**
 * Get a {@link Decompressor} for the given {@link CompressionCodec} from
 * the pool or a new one.
 *
 * @param codec
 *          the <code>CompressionCodec</code> for which to get the
 *          <code>Decompressor</code>
 * @return <code>Decompressor</code> for the given
 *         <code>CompressionCodec</code> from the pool or a new one
 */
public static Decompressor getDecompressor(CompressionCodec codec) {
  Decompressor decompressor = borrow(DECOMPRESSOR_POOL, codec.getDecompressorType());
  if (decompressor == null) {
    decompressor = codec.createDecompressor();
    LOG.info("Got brand-new decompressor");
  } else {
    LOG.debug("Got recycled decompressor");
  }
  return decompressor;
}
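Hadoop itself ships an equivalent pool in org.apache.hadoop.io.compress.CodecPool. A brief sketch of pairing the borrow with the matching return, so a decompressor created by createDecompressor() is recycled rather than leaked; the class name and byte-counting body are illustrative, the CodecPool calls are the real API.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;

public final class PooledDecompressRead {
  // Reads a compressed stream while recycling the decompressor through CodecPool.
  static long countBytes(CompressionCodec codec, InputStream raw) throws IOException {
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    try (CompressionInputStream in = codec.createInputStream(raw, decompressor)) {
      long total = 0;
      byte[] buf = new byte[8192];
      int n;
      while ((n = in.read(buf)) > 0) {
        total += n;
      }
      return total;
    } finally {
      // Return the decompressor so later readers can reuse it instead of
      // calling codec.createDecompressor() again.
      CodecPool.returnDecompressor(decompressor);
    }
  }
}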
From source file:io.airlift.compress.HadoopNative.java
License:Apache License
private static void requireNativeZlib() {
  Configuration conf = new Configuration();
  if (!ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new RuntimeException("native zlib is not loaded");
  }

  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodecByClassName(GzipCodec.class.getName());
  if (codec == null) {
    throw new RuntimeException("failed to load GzipCodec");
  }
  org.apache.hadoop.io.compress.Decompressor decompressor = codec.createDecompressor();
  if (!(decompressor instanceof ZlibDecompressor)) {
    throw new RuntimeException("wrong gzip decompressor: " + decompressor.getClass().getName());
  }
}
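A note on the final check above: when the native zlib bindings are unavailable, GzipCodec.createDecompressor() generally falls back to the pure-Java BuiltInGzipDecompressor, so the instanceof ZlibDecompressor test is what actually confirms that decompression will take the native path.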
From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java
License:Apache License
/**
 * Extracts Text file
 * @param file
 * @param start
 * @param length
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
  LOG.info("Extracting text file");
  long end = start + length;
  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream filestream = fs.open(file);
  CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
  LineReader filereader;
  Seekable fileseeker = filestream;

  // Hadoop 1.0 does not have support for custom record delimiter and thus we
  // are supporting only default one.
  // We might add another "else if" case for SplittableCompressionCodec once
  // we drop support for Hadoop 1.0.
  if (codec == null) {
    filestream.seek(start);
    filereader = new LineReader(filestream);
  } else {
    filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
    fileseeker = filestream;
  }

  if (start != 0) {
    // always throw away first record because
    // one extra line is read in previous split
    start += filereader.readLine(new Text(), 0);
  }

  int size;
  LOG.info("Start position: " + String.valueOf(start));
  long next = start;
  while (next <= end) {
    Text line = new Text();
    size = filereader.readLine(line, Integer.MAX_VALUE);
    if (size == 0) {
      break;
    }
    if (codec == null) {
      next += size;
    } else {
      next = fileseeker.getPos();
    }
    rowRead++;
    dataWriter.writeStringRecord(line.toString());
  }
  LOG.info("Extracting ended on position: " + fileseeker.getPos());
  filestream.close();
}
From source file:org.apache.sqoop.connector.hdfs.TestLoader.java
License:Apache License
private void verifyOutput(FileSystem fs, Path file) throws IOException {
  Configuration conf = new Configuration();
  FSDataInputStream fsin = fs.open(file);
  CompressionCodec codec;

  switch (outputFormat) {
  case TEXT_FILE:
    codec = (new CompressionCodecFactory(conf)).getCodec(file);

    // Verify compression
    switch (compression) {
    case BZIP2:
      Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
      break;

    case DEFAULT:
      if (org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
        Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
      } else {
        Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Deflate") != -1);
      }
      break;

    case NONE:
    default:
      Assert.assertNull(codec);
      break;
    }

    InputStreamReader in;
    if (codec == null) {
      in = new InputStreamReader(fsin);
    } else {
      in = new InputStreamReader(codec.createInputStream(fsin, codec.createDecompressor()));
    }
    BufferedReader textReader = new BufferedReader(in);

    for (int i = 1; i <= NUMBER_OF_ROWS_PER_FILE; ++i) {
      Assert.assertEquals(i + "," + (double) i + ",'" + i + "'", textReader.readLine());
    }
    break;

  case SEQUENCE_FILE:
    SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, file, conf);
    codec = sequenceReader.getCompressionCodec();

    // Verify compression
    switch (compression) {
    case BZIP2:
      Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
      break;

    case DEFAULT:
      Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
      break;

    case NONE:
    default:
      Assert.assertNull(codec);
      break;
    }

    Text line = new Text();
    int index = 1;
    while (sequenceReader.next(line)) {
      Assert.assertEquals(index + "," + (double) index + ",'" + index++ + "'", line.toString());
      line = new Text();
    }
    break;
  }
}
From source file:org.apache.tajo.storage.compress.CodecPool.java
License:Apache License
/**
 * Get a {@link Decompressor} for the given {@link CompressionCodec} from the
 * pool or a new one.
 *
 * @param codec
 *          the <code>CompressionCodec</code> for which to get the
 *          <code>Decompressor</code>
 * @return <code>Decompressor</code> for the given
 *         <code>CompressionCodec</code> from the pool or a new one
 */
public static Decompressor getDecompressor(CompressionCodec codec) {
  Decompressor decompressor = borrow(DECOMPRESSOR_POOL, codec.getDecompressorType());
  if (decompressor == null) {
    decompressor = codec.createDecompressor();
    LOG.info("Got brand-new decompressor [" + codec.getDefaultExtension() + "]");
  } else {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Got recycled decompressor");
    }
  }
  return decompressor;
}
From source file:org.apache.tez.runtime.library.common.shuffle.TestShuffleUtils.java
License:Apache License
@Test
public void testInternalErrorTranslation() throws Exception {
  String codecErrorMsg = "codec failure";
  CompressionInputStream mockCodecStream = mock(CompressionInputStream.class);
  when(mockCodecStream.read(any(byte[].class), anyInt(), anyInt()))
      .thenThrow(new InternalError(codecErrorMsg));
  Decompressor mockDecoder = mock(Decompressor.class);
  CompressionCodec mockCodec = mock(CompressionCodec.class);
  when(mockCodec.createDecompressor()).thenReturn(mockDecoder);
  when(mockCodec.createInputStream(any(InputStream.class), any(Decompressor.class)))
      .thenReturn(mockCodecStream);
  byte[] header = new byte[] { (byte) 'T', (byte) 'I', (byte) 'F', (byte) 1 };
  try {
    ShuffleUtils.shuffleToMemory(new byte[1024], new ByteArrayInputStream(header), 1024, 128, mockCodec,
        false, 0, mock(Logger.class), "identifier");
    Assert.fail("shuffle was supposed to throw!");
  } catch (IOException e) {
    Assert.assertTrue(e.getCause() instanceof InternalError);
    Assert.assertTrue(e.getMessage().contains(codecErrorMsg));
  }
}
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
  maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);

  file = split.getPath();
  start = split.getStart();
  end = start + split.getLength();

  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream fileIn = fs.open(file);

  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
  CompressionCodec codec = codecFactory.getCodec(file);

  // if our codec is splittable, we can (tentatively) say that
  // we too are splittable.
  //
  // if we get a bgzfenhancedcodec, the codec might not actually
  // be splittable. however, if we get a non-splittable gz file,
  // several things happen:
  //
  // 1. the input format will detect this, and will not split the
  //    file
  // 2. the bgzfenhancedcodec will check the underlying data type
  //    (BGZF vs GZIP) at input stream creation time, and will
  //    apply the appropriate codec.
  //
  // if we get an unsplittable codec, really all that we do differently
  // is skip the positioning check, since we know that we're at the
  // start of the file and can get to reading immediately
  isSplittable = (codec instanceof SplittableCompressionCodec);

  if (codec == null) {
    // no codec. Uncompressed file.
    int bytesToSkip = positionAtFirstRecord(fileIn, null);
    inputStream = fileIn;
    inputStream.skip(bytesToSkip);
    lineReader = new LineReader(inputStream);
  } else if (isSplittable) {
    // file is compressed, but uses a splittable codec
    isCompressed = true;
    int bytesToSkip = positionAtFirstRecord(fileIn, codec);

    // apparent fun finding: if you don't seek back to 0,
    // SplittableCompressionCodec.createInputStream will seek in the stream
    // to a start position, and funny things happen..
    fileIn.seek(0);
    inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(),
        start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
    inputStream.skip(bytesToSkip);
    lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
  } else {
    // unsplittable compressed file
    // expect a single split, first record at offset 0
    isCompressed = true;
    inputStream = codec.createInputStream(fileIn);
    end = Long.MAX_VALUE; // read until the end of the file
    lineReader = new LineReader(inputStream);
  }
}