List of usage examples for org.apache.hadoop.io.compress.GzipCodec
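Before the collected examples, a minimal round-trip sketch of the core API: compress a line through GzipCodec and read it back. This is an illustrative sketch, not taken from any of the projects below; it assumes only a default Configuration and an in-memory buffer.

import java.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipCodecRoundTrip {
    public static void main(String[] args) throws IOException {
        // Instantiate through ReflectionUtils so the codec's Configuration is set.
        Configuration conf = new Configuration();
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // Compress a line of text into an in-memory buffer.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (OutputStream out = codec.createOutputStream(buffer)) {
            out.write("hello gzip\n".getBytes("UTF-8"));
        }

        // Decompress and read the line back.
        InputStream in = codec.createInputStream(new ByteArrayInputStream(buffer.toByteArray()));
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        System.out.println(reader.readLine()); // prints: hello gzip
        reader.close();
    }
}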
From source file:com.cloudera.flume.handlers.hdfs.TestEscapedCustomOutputDfs.java
License:Apache License
/**
 * Test to write a few log lines, compress them using gzip, write to disk,
 * read back the compressed file, and verify the written lines. This test
 * alone doesn't exercise GzipCodec with its native libs; java.library.path
 * must contain the path to the Hadoop native libs for that to happen.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testGZipCodec() throws IOException, InterruptedException {
    checkOutputFormat("syslog", new SyslogEntryFormat(), "GzipCodec", new GzipCodec());
}
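The native-library caveat in the comment above can be checked at runtime. A small sketch, assuming Hadoop's NativeCodeLoader and ZlibFactory helpers (the zlib implementation backs GzipCodec), that reports whether the native gzip path is actually available:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.NativeCodeLoader;

public class NativeGzipCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Both are true only when java.library.path contains the Hadoop native libs;
        // otherwise GzipCodec falls back to java.util.zip.
        System.out.println("native hadoop loaded: " + NativeCodeLoader.isNativeCodeLoaded());
        System.out.println("native zlib loaded:   " + ZlibFactory.isNativeZlibLoaded(conf));
    }
}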
From source file:com.cloudera.sqoop.io.TestSplittableBufferedWriter.java
License:Apache License
public void testNonSplittingGzipFile() throws IOException {
    SplittingOutputStream os = new SplittingOutputStream(getConf(), getWritePath(),
        "nonsplit-", 0, new GzipCodec());
    SplittableBufferedWriter w = new SplittableBufferedWriter(os, true);
    try {
        w.allowSplit();
        w.write("This is a string!");
        w.newLine();
        w.write("This is another string!");
        w.allowSplit();
    } finally {
        w.close();
    }

    // Ensure we made exactly one file.
    Path writePath = new Path(getWritePath(), "nonsplit-00000.gz");
    Path badPath = new Path(getWritePath(), "nonsplit-00001.gz");
    verifyFileExists(writePath);
    verifyFileDoesNotExist(badPath); // Ensure we didn't make a second file.

    // Now ensure all the data got there.
    String[] expectedLines = { "This is a string!", "This is another string!", };
    verifyFileContents(
        new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "nonsplit-00000.gz"))),
        expectedLines);
}
From source file:com.cloudera.sqoop.io.TestSplittableBufferedWriter.java
License:Apache License
public void testSplittingGzipFile() throws IOException {
    SplittingOutputStream os = new SplittingOutputStream(getConf(), getWritePath(),
        "splitz-", 3, new GzipCodec());
    SplittableBufferedWriter w = new SplittableBufferedWriter(os, true);
    try {
        w.write("This is a string!");
        w.newLine();
        w.write("This is another string!");
    } finally {
        w.close();
    }

    // Ensure we made exactly two files.
    Path writePath = new Path(getWritePath(), "splitz-00000.gz");
    Path writePath2 = new Path(getWritePath(), "splitz-00001.gz");
    Path badPath = new Path(getWritePath(), "splitz-00002.gz");
    verifyFileExists(writePath);
    verifyFileExists(writePath2);
    verifyFileDoesNotExist(badPath); // Ensure we didn't make three files.

    // Now ensure all the data got there.
    String[] expectedLines0 = { "This is a string!", };
    verifyFileContents(
        new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "splitz-00000.gz"))),
        expectedLines0);
    String[] expectedLines1 = { "This is another string!", };
    verifyFileContents(
        new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "splitz-00001.gz"))),
        expectedLines1);
}
From source file:com.cloudera.sqoop.TestCompression.java
License:Apache License
public void runSequenceFileCompressionTest(CompressionCodec codec, int expectedNum) throws Exception {
    String[] columns = HsqldbTestServer.getFieldNames();
    ClassLoader prevClassLoader = null;
    SequenceFile.Reader reader = null;
    String[] argv = getArgv(true, columns, codec, "--as-sequencefile");
    runImport(argv);
    try {
        SqoopOptions opts = new ImportTool().parseArguments(
            getArgv(false, columns, codec, "--as-sequencefile"), null, null, true);
        CompilationManager compileMgr = new CompilationManager(opts);
        String jarFileName = compileMgr.getJarFilename();
        LOG.debug("Got jar from import job: " + jarFileName);
        prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, getTableName());
        reader = SeqFileReader.getSeqFileReader(getDataFilePath().toString());
        if (codec == null) {
            codec = new GzipCodec();
        }
        assertTrue("Block compressed", reader.isBlockCompressed());
        assertEquals(codec.getClass(), reader.getCompressionCodec().getClass());

        // Here we can actually instantiate (k, v) pairs.
        Configuration conf = new Configuration();
        Object key = ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Object val = ReflectionUtils.newInstance(reader.getValueClass(), conf);

        // The values are two ints separated by a ',' character. Since the
        // record class is generated and loaded dynamically, we don't link
        // against it or call its methods; we just count the records and check
        // that the expected number made it from the db into the file.
        int numLines = 0;
        while (reader.next(key) != null) {
            reader.getCurrentValue(val);
            numLines++;
        }
        assertEquals(expectedNum, numLines);
    } finally {
        IOUtils.closeStream(reader);
        if (null != prevClassLoader) {
            ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
        }
    }
}
From source file:com.cloudera.sqoop.TestCompression.java
License:Apache License
public void runTextCompressionTest(CompressionCodec codec, int expectedNum) throws IOException {
    String[] columns = HsqldbTestServer.getFieldNames();
    String[] argv = getArgv(true, columns, codec, "--as-textfile");
    runImport(argv);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);

    if (codec == null) {
        codec = new GzipCodec();
    }
    ReflectionUtils.setConf(codec, getConf());
    Path p = new Path(getDataFilePath().toString() + codec.getDefaultExtension());
    InputStream is = codec.createInputStream(fs.open(p));
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
        String ln = r.readLine();
        if (ln == null) {
            break;
        }
        numLines++;
    }
    r.close();
    assertEquals(expectedNum, numLines);
}
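The test above covers the read path (codec.createInputStream over an opened file). The complementary write path looks like this; a sketch assuming a local FileSystem, with the /tmp/demo path purely illustrative. It borrows a pooled compressor the way Hadoop's output formats do:

import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // Borrow a compressor from the pool and return it when done.
        Compressor compressor = CodecPool.getCompressor(codec);
        Path p = new Path("/tmp/demo" + codec.getDefaultExtension()); // "/tmp/demo.gz"
        try (OutputStream out = codec.createOutputStream(fs.create(p), compressor)) {
            out.write("one line of text\n".getBytes("UTF-8"));
        } finally {
            CodecPool.returnCompressor(compressor);
        }
    }
}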
From source file:com.dataartisans.flink.cascading.runtime.spilling.SpillingTupleCollectionFactory.java
License:Apache License
public void initialize(FlowProcess<? extends Configuration> flowProcess) {
    this.spillThreshold = SpillableTupleList.getThreshold(flowProcess, 10000);
    this.codec = new GzipCodec();
    this.tupleSerialization = new TupleSerialization(flowProcess);
}
From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java
License:Apache License
void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString()
        + " fileEncodeing=" + fileEncodeing + " " + split.getStart() + ":" + split.getLength());

    // Open the file and seek to the start of the split.
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // Wrap .tar.gz files in a TarInputStream before line reading.
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // Take pos from the compressed stream.
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except in the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
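The instanceof check above matters because GzipCodec is not splittable: a .gz file must be consumed as a single split, whereas a codec implementing SplittableCompressionCodec (BZip2Codec is the stock example) can adjust split boundaries. A small sketch illustrating the distinction:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.ReflectionUtils;

public class SplittabilityCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodec gzip = ReflectionUtils.newInstance(GzipCodec.class, conf);
        CompressionCodec bzip2 = ReflectionUtils.newInstance(BZip2Codec.class, conf);
        // GzipCodec does not implement SplittableCompressionCodec, so record
        // readers must process a .gz input in one split; BZip2Codec does.
        System.out.println("gzip splittable:  " + (gzip instanceof SplittableCompressionCodec));  // false
        System.out.println("bzip2 splittable: " + (bzip2 instanceof SplittableCompressionCodec)); // true
    }
}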
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java
License:Apache License
private static Option withCompression(Logger logger) {
    String prop = System.getProperty(HoplogConfig.COMPRESSION);
    if (prop != null) {
        CompressionCodec codec;
        if (prop.equalsIgnoreCase("SNAPPY")) {
            codec = new SnappyCodec();
        } else if (prop.equalsIgnoreCase("LZ4")) {
            codec = new Lz4Codec();
        } else if (prop.equals("GZ")) {
            codec = new GzipCodec();
        } else {
            throw new IllegalStateException("Unsupported codec: " + prop);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("{}Using compression codec " + codec, logPrefix);
        }
        return SequenceFile.Writer.compression(CompressionType.BLOCK, codec);
    }
    return SequenceFile.Writer.compression(CompressionType.NONE, null);
}
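An alternative to the hand-rolled name-to-codec mapping above is to let Hadoop resolve the codec. A sketch assuming CompressionCodecFactory's getCodecByName lookup (present in Hadoop 2.x-era releases), which matches codec class names and short aliases case-insensitively:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecByName {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // Resolves an alias such as "gzip" to the registered codec instance.
        CompressionCodec codec = factory.getCodecByName("gzip");
        System.out.println(codec.getClass().getName()); // org.apache.hadoop.io.compress.GzipCodec
    }
}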
From source file:com.pinterest.secor.common.FileRegistryTest.java
License:Apache License
private void createCompressedWriter() throws IOException {
    PowerMockito.mockStatic(FileUtil.class);
    PowerMockito.mockStatic(FileSystem.class);
    FileSystem fs = Mockito.mock(FileSystem.class);
    Mockito.when(FileSystem.get(Mockito.any(Configuration.class))).thenReturn(fs);

    PowerMockito.mockStatic(SequenceFile.class);
    Path fsPath = new Path(PATH_GZ);
    SequenceFile.Writer writer = Mockito.mock(SequenceFile.Writer.class);
    Mockito.when(SequenceFile.createWriter(Mockito.eq(fs), Mockito.any(Configuration.class),
        Mockito.eq(fsPath), Mockito.eq(LongWritable.class), Mockito.eq(BytesWritable.class),
        Mockito.eq(SequenceFile.CompressionType.BLOCK), Mockito.any(GzipCodec.class)))
        .thenReturn(writer);
    Mockito.when(writer.getLength()).thenReturn(123L);

    SequenceFile.Writer createdWriter = mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());
    assertTrue(createdWriter == writer);
}
From source file:com.pinterest.secor.common.FileRegistryTest.java
License:Apache License
public void testGetOrCreateWriterCompressed() throws Exception {
    createCompressedWriter();
    mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());

    // Verify that each method has been called exactly once (the default).
    PowerMockito.verifyStatic();
    FileSystem.get(Mockito.any(Configuration.class));
    PowerMockito.verifyStatic();
    FileUtil.delete(PATH_GZ);
    PowerMockito.verifyStatic();
    FileUtil.delete(CRC_PATH);

    Path fsPath = new Path(PATH_GZ);
    PowerMockito.verifyStatic();
    SequenceFile.createWriter(Mockito.any(FileSystem.class), Mockito.any(Configuration.class),
        Mockito.eq(fsPath), Mockito.eq(LongWritable.class), Mockito.eq(BytesWritable.class),
        Mockito.eq(SequenceFile.CompressionType.BLOCK), Mockito.any(GzipCodec.class));

    TopicPartition topicPartition = new TopicPartition("some_topic", 0);
    Collection<TopicPartition> topicPartitions = mRegistry.getTopicPartitions();
    assertEquals(1, topicPartitions.size());
    assertTrue(topicPartitions.contains(topicPartition));

    Collection<LogFilePath> logFilePaths = mRegistry.getPaths(topicPartition);
    assertEquals(1, logFilePaths.size());
    assertTrue(logFilePaths.contains(mLogFilePath));
}