List of usage examples for org.apache.hadoop.io.compress.GzipCodec
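Before the collected examples, a minimal round-trip sketch of the core API: compress a line through GzipCodec and read it back. This is an illustrative sketch, not taken from any of the projects below; it assumes only a default Configuration and an in-memory buffer.

import java.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipCodecRoundTrip {
    public static void main(String[] args) throws IOException {
        // Instantiate through ReflectionUtils so the codec's Configuration is set.
        Configuration conf = new Configuration();
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // Compress a line of text into an in-memory buffer.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (OutputStream out = codec.createOutputStream(buffer)) {
            out.write("hello gzip\n".getBytes("UTF-8"));
        }

        // Decompress and read the line back.
        InputStream in = codec.createInputStream(new ByteArrayInputStream(buffer.toByteArray()));
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        System.out.println(reader.readLine()); // prints: hello gzip
        reader.close();
    }
}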
From source file:com.cloudera.flume.handlers.hdfs.TestEscapedCustomOutputDfs.java
License:Apache License
/**
 * Test to write a few log lines, compress them using gzip, write to disk,
 * read back the compressed file, and verify the written lines. This test
 * alone doesn't exercise GzipCodec with its native libs; java.library.path
 * must contain the path to the Hadoop native libs for that to happen.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testGZipCodec() throws IOException, InterruptedException {
    checkOutputFormat("syslog", new SyslogEntryFormat(), "GzipCodec", new GzipCodec());
}
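The native-library caveat in the comment above can be checked at runtime. A small sketch, assuming Hadoop's NativeCodeLoader and ZlibFactory helpers (the zlib implementation backs GzipCodec), that reports whether the native gzip path is actually available:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.NativeCodeLoader;

public class NativeGzipCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Both are true only when java.library.path contains the Hadoop native libs;
        // otherwise GzipCodec falls back to java.util.zip.
        System.out.println("native hadoop loaded: " + NativeCodeLoader.isNativeCodeLoaded());
        System.out.println("native zlib loaded:   " + ZlibFactory.isNativeZlibLoaded(conf));
    }
}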
From source file:com.cloudera.sqoop.io.TestSplittableBufferedWriter.java
License:Apache License
public void testNonSplittingGzipFile() throws IOException {
    SplittingOutputStream os = new SplittingOutputStream(getConf(), getWritePath(),
        "nonsplit-", 0, new GzipCodec());
    SplittableBufferedWriter w = new SplittableBufferedWriter(os, true);
    try {
        w.allowSplit();
        w.write("This is a string!");
        w.newLine();
        w.write("This is another string!");
        w.allowSplit();
    } finally {
        w.close();
    }

    // Ensure we made exactly one file.
    Path writePath = new Path(getWritePath(), "nonsplit-00000.gz");
    Path badPath = new Path(getWritePath(), "nonsplit-00001.gz");
    verifyFileExists(writePath);
    verifyFileDoesNotExist(badPath); // Ensure we didn't make a second file.

    // Now ensure all the data got there.
    String[] expectedLines = { "This is a string!", "This is another string!", };
    verifyFileContents(
        new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "nonsplit-00000.gz"))),
        expectedLines);
}
From source file:com.cloudera.sqoop.io.TestSplittableBufferedWriter.java
License:Apache License
public void testSplittingGzipFile() throws IOException {
    SplittingOutputStream os = new SplittingOutputStream(getConf(), getWritePath(),
        "splitz-", 3, new GzipCodec());
    SplittableBufferedWriter w = new SplittableBufferedWriter(os, true);
    try {
        w.write("This is a string!");
        w.newLine();
        w.write("This is another string!");
    } finally {
        w.close();
    }

    // Ensure we made exactly two files.
    Path writePath = new Path(getWritePath(), "splitz-00000.gz");
    Path writePath2 = new Path(getWritePath(), "splitz-00001.gz");
    Path badPath = new Path(getWritePath(), "splitz-00002.gz");
    verifyFileExists(writePath);
    verifyFileExists(writePath2);
    verifyFileDoesNotExist(badPath); // Ensure we didn't make three files.

    // Now ensure all the data got there.
    String[] expectedLines0 = { "This is a string!", };
    verifyFileContents(
        new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "splitz-00000.gz"))),
        expectedLines0);
    String[] expectedLines1 = { "This is another string!", };
    verifyFileContents(
        new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "splitz-00001.gz"))),
        expectedLines1);
}
From source file:com.cloudera.sqoop.TestCompression.java
License:Apache License
public void runSequenceFileCompressionTest(CompressionCodec codec, int expectedNum) throws Exception {
    String[] columns = HsqldbTestServer.getFieldNames();
    ClassLoader prevClassLoader = null;
    SequenceFile.Reader reader = null;
    String[] argv = getArgv(true, columns, codec, "--as-sequencefile");
    runImport(argv);
    try {
        SqoopOptions opts = new ImportTool().parseArguments(
            getArgv(false, columns, codec, "--as-sequencefile"), null, null, true);
        CompilationManager compileMgr = new CompilationManager(opts);
        String jarFileName = compileMgr.getJarFilename();
        LOG.debug("Got jar from import job: " + jarFileName);
        prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, getTableName());
        reader = SeqFileReader.getSeqFileReader(getDataFilePath().toString());
        if (codec == null) {
            codec = new GzipCodec();
        }
        assertTrue("Block compressed", reader.isBlockCompressed());
        assertEquals(codec.getClass(), reader.getCompressionCodec().getClass());

        // Here we can actually instantiate (k, v) pairs.
        Configuration conf = new Configuration();
        Object key = ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Object val = ReflectionUtils.newInstance(reader.getValueClass(), conf);

        // The values are two ints separated by a ',' character. Since the
        // record class is generated and loaded dynamically, we don't link
        // against it or call its methods; we just count the records and check
        // that the expected number made it from the db into the file.
        int numLines = 0;
        while (reader.next(key) != null) {
            reader.getCurrentValue(val);
            numLines++;
        }
        assertEquals(expectedNum, numLines);
    } finally {
        IOUtils.closeStream(reader);
        if (null != prevClassLoader) {
            ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
        }
    }
}
From source file:com.cloudera.sqoop.TestCompression.java
License:Apache License
public void runTextCompressionTest(CompressionCodec codec, int expectedNum) throws IOException {
    String[] columns = HsqldbTestServer.getFieldNames();
    String[] argv = getArgv(true, columns, codec, "--as-textfile");
    runImport(argv);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);

    if (codec == null) {
        codec = new GzipCodec();
    }
    ReflectionUtils.setConf(codec, getConf());
    Path p = new Path(getDataFilePath().toString() + codec.getDefaultExtension());
    InputStream is = codec.createInputStream(fs.open(p));
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
        String ln = r.readLine();
        if (ln == null) {
            break;
        }
        numLines++;
    }
    r.close();
    assertEquals(expectedNum, numLines);
}
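The test above covers the read path (codec.createInputStream over an opened file). The complementary write path looks like this; a sketch assuming a local FileSystem, with the /tmp/demo path purely illustrative. It borrows a pooled compressor the way Hadoop's output formats do:

import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // Borrow a compressor from the pool and return it when done.
        Compressor compressor = CodecPool.getCompressor(codec);
        Path p = new Path("/tmp/demo" + codec.getDefaultExtension()); // "/tmp/demo.gz"
        try (OutputStream out = codec.createOutputStream(fs.create(p), compressor)) {
            out.write("one line of text\n".getBytes("UTF-8"));
        } finally {
            CodecPool.returnCompressor(compressor);
        }
    }
}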
From source file:com.dataartisans.flink.cascading.runtime.spilling.SpillingTupleCollectionFactory.java
License:Apache License
public void initialize(FlowProcess<? extends Configuration> flowProcess) {
    this.spillThreshold = SpillableTupleList.getThreshold(flowProcess, 10000);
    this.codec = new GzipCodec();
    this.tupleSerialization = new TupleSerialization(flowProcess);
}
From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java
License:Apache License
void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString()
        + " fileEncodeing=" + fileEncodeing + " " + split.getStart() + ":" + split.getLength());

    // Open the file and seek to the start of the split.
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // Wrap .tar.gz files in a TarInputStream before line reading.
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // Take pos from the compressed stream.
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except in the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
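The instanceof check above matters because GzipCodec is not splittable: a .gz file must be consumed as a single split, whereas a codec implementing SplittableCompressionCodec (BZip2Codec is the stock example) can adjust split boundaries. A small sketch illustrating the distinction:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.ReflectionUtils;

public class SplittabilityCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodec gzip = ReflectionUtils.newInstance(GzipCodec.class, conf);
        CompressionCodec bzip2 = ReflectionUtils.newInstance(BZip2Codec.class, conf);
        // GzipCodec does not implement SplittableCompressionCodec, so record
        // readers must process a .gz input in one split; BZip2Codec does.
        System.out.println("gzip splittable:  " + (gzip instanceof SplittableCompressionCodec));  // false
        System.out.println("bzip2 splittable: " + (bzip2 instanceof SplittableCompressionCodec)); // true
    }
}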
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java
License:Apache License
private static Option withCompression(Logger logger) {
    String prop = System.getProperty(HoplogConfig.COMPRESSION);
    if (prop != null) {
        CompressionCodec codec;
        if (prop.equalsIgnoreCase("SNAPPY")) {
            codec = new SnappyCodec();
        } else if (prop.equalsIgnoreCase("LZ4")) {
            codec = new Lz4Codec();
        } else if (prop.equals("GZ")) {
            codec = new GzipCodec();
        } else {
            throw new IllegalStateException("Unsupported codec: " + prop);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("{}Using compression codec " + codec, logPrefix);
        }
        return SequenceFile.Writer.compression(CompressionType.BLOCK, codec);
    }
    return SequenceFile.Writer.compression(CompressionType.NONE, null);
}
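An alternative to the hand-rolled name-to-codec mapping above is to let Hadoop resolve the codec. A sketch assuming CompressionCodecFactory's getCodecByName lookup (present in Hadoop 2.x-era releases), which matches codec class names and short aliases case-insensitively:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecByName {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // Resolves an alias such as "gzip" to the registered codec instance.
        CompressionCodec codec = factory.getCodecByName("gzip");
        System.out.println(codec.getClass().getName()); // org.apache.hadoop.io.compress.GzipCodec
    }
}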
From source file:com.pinterest.secor.common.FileRegistryTest.java
License:Apache License
private void createCompressedWriter() throws IOException {
    PowerMockito.mockStatic(FileUtil.class);
    PowerMockito.mockStatic(FileSystem.class);
    FileSystem fs = Mockito.mock(FileSystem.class);
    Mockito.when(FileSystem.get(Mockito.any(Configuration.class))).thenReturn(fs);

    PowerMockito.mockStatic(SequenceFile.class);
    Path fsPath = new Path(PATH_GZ);
    SequenceFile.Writer writer = Mockito.mock(SequenceFile.Writer.class);
    Mockito.when(SequenceFile.createWriter(Mockito.eq(fs), Mockito.any(Configuration.class),
        Mockito.eq(fsPath), Mockito.eq(LongWritable.class), Mockito.eq(BytesWritable.class),
        Mockito.eq(SequenceFile.CompressionType.BLOCK), Mockito.any(GzipCodec.class)))
        .thenReturn(writer);
    Mockito.when(writer.getLength()).thenReturn(123L);

    SequenceFile.Writer createdWriter = mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());
    assertTrue(createdWriter == writer);
}
From source file:com.pinterest.secor.common.FileRegistryTest.java
License:Apache License
public void testGetOrCreateWriterCompressed() throws Exception {
    createCompressedWriter();
    mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());

    // Verify that each method has been called exactly once (the default).
    PowerMockito.verifyStatic();
    FileSystem.get(Mockito.any(Configuration.class));
    PowerMockito.verifyStatic();
    FileUtil.delete(PATH_GZ);
    PowerMockito.verifyStatic();
    FileUtil.delete(CRC_PATH);

    Path fsPath = new Path(PATH_GZ);
    PowerMockito.verifyStatic();
    SequenceFile.createWriter(Mockito.any(FileSystem.class), Mockito.any(Configuration.class),
        Mockito.eq(fsPath), Mockito.eq(LongWritable.class), Mockito.eq(BytesWritable.class),
        Mockito.eq(SequenceFile.CompressionType.BLOCK), Mockito.any(GzipCodec.class));

    TopicPartition topicPartition = new TopicPartition("some_topic", 0);
    Collection<TopicPartition> topicPartitions = mRegistry.getTopicPartitions();
    assertEquals(1, topicPartitions.size());
    assertTrue(topicPartitions.contains(topicPartition));

    Collection<LogFilePath> logFilePaths = mRegistry.getPaths(topicPartition);
    assertEquals(1, logFilePaths.size());
    assertTrue(logFilePaths.contains(mLogFilePath));
}