Example usage for org.apache.hadoop.io.compress GzipCodec GzipCodec

Introduction

On this page you can find example usages of the org.apache.hadoop.io.compress.GzipCodec constructor, GzipCodec().

Prototype

GzipCodec()
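
A minimal sketch (not taken from the examples below) of how the no-argument constructor is typically used: construct the codec, hand it a Configuration, and wrap an output stream. The class name GzipCodecSketch and the /tmp output path are illustrative assumptions only.

import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;

public class GzipCodecSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // GzipCodec is Configurable; hand it a Configuration before use so it
        // can choose between native zlib and the built-in pure-Java gzip support.
        GzipCodec codec = new GzipCodec();
        codec.setConf(conf);

        // Illustrative output location only; ".gz" comes from getDefaultExtension().
        FileSystem fs = FileSystem.getLocal(conf);
        Path out = new Path("/tmp/gzipcodec-example" + codec.getDefaultExtension());

        // Wrap the raw file stream in a gzip-compressing stream and write a line.
        CompressionOutputStream compressed = codec.createOutputStream(fs.create(out, true));
        try (Writer writer = new OutputStreamWriter(compressed, "UTF-8")) {
            writer.write("This is a string!\n");
        }
    }
}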

Usage

From source file: com.cloudera.flume.handlers.hdfs.TestEscapedCustomOutputDfs.java

License: Apache License

/**
 * Test to write a few log lines, compress using gzip, write to disk, read
 * back the compressed file, and verify the written lines. This test alone
 * doesn't exercise GzipCodec with its native libs; java.library.path must
 * contain the path to the Hadoop native libs for that to happen.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testGZipCodec() throws IOException, InterruptedException {
    checkOutputFormat("syslog", new SyslogEntryFormat(), "GzipCodec", new GzipCodec());
}
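
The Javadoc above notes that native-library coverage depends on java.library.path. As a hedged aside (not part of the Flume test), a small probe like the following shows whether Hadoop's native zlib would actually back GzipCodec; the class name NativeGzipProbe is an illustrative assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.NativeCodeLoader;

public class NativeGzipProbe {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // True only if libhadoop was found on java.library.path.
        boolean nativeHadoop = NativeCodeLoader.isNativeCodeLoaded();
        // True only if native zlib is usable, i.e. GzipCodec would delegate to it.
        boolean nativeZlib = ZlibFactory.isNativeZlibLoaded(conf);
        System.out.println("native hadoop loaded: " + nativeHadoop
                + ", native zlib for GzipCodec: " + nativeZlib);
    }
}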

From source file: com.cloudera.sqoop.io.TestSplittableBufferedWriter.java

License: Apache License

public void testNonSplittingGzipFile() throws IOException {
    SplittingOutputStream os = new SplittingOutputStream(getConf(), getWritePath(), "nonsplit-", 0,
            new GzipCodec());
    SplittableBufferedWriter w = new SplittableBufferedWriter(os, true);
    try {
        w.allowSplit();
        w.write("This is a string!");
        w.newLine();
        w.write("This is another string!");
        w.allowSplit();
    } finally {
        w.close();
    }

    // Ensure we made exactly one file.
    Path writePath = new Path(getWritePath(), "nonsplit-00000.gz");
    Path badPath = new Path(getWritePath(), "nonsplit-00001.gz");
    verifyFileExists(writePath);
    verifyFileDoesNotExist(badPath); // Ensure we didn't make a second file.

    // Now ensure all the data got there.
    String[] expectedLines = { "This is a string!", "This is another string!", };
    verifyFileContents(new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "nonsplit-00000.gz"))),
            expectedLines);
}

From source file: com.cloudera.sqoop.io.TestSplittableBufferedWriter.java

License: Apache License

public void testSplittingGzipFile() throws IOException {
    SplittingOutputStream os = new SplittingOutputStream(getConf(), getWritePath(), "splitz-", 3,
            new GzipCodec());
    SplittableBufferedWriter w = new SplittableBufferedWriter(os, true);
    try {
        w.write("This is a string!");
        w.newLine();
        w.write("This is another string!");
    } finally {
        w.close();
    }

    // Ensure we made exactly two files.
    Path writePath = new Path(getWritePath(), "splitz-00000.gz");
    Path writePath2 = new Path(getWritePath(), "splitz-00001.gz");
    Path badPath = new Path(getWritePath(), "splitz-00002.gz");
    verifyFileExists(writePath);
    verifyFileExists(writePath2);
    verifyFileDoesNotExist(badPath); // Ensure we didn't make three files.

    // Now ensure all the data got there.
    String[] expectedLines0 = { "This is a string!", };
    verifyFileContents(new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "splitz-00000.gz"))),
            expectedLines0);

    String[] expectedLines1 = { "This is another string!", };
    verifyFileContents(new GZIPInputStream(new FileInputStream(new File(getWriteDir(), "splitz-00001.gz"))),
            expectedLines1);
}

From source file: com.cloudera.sqoop.TestCompression.java

License: Apache License

public void runSequenceFileCompressionTest(CompressionCodec codec, int expectedNum) throws Exception {

    String[] columns = HsqldbTestServer.getFieldNames();
    ClassLoader prevClassLoader = null;
    SequenceFile.Reader reader = null;

    String[] argv = getArgv(true, columns, codec, "--as-sequencefile");
    runImport(argv);
    try {
        SqoopOptions opts = new ImportTool().parseArguments(getArgv(false, columns, codec, "--as-sequencefile"),
                null, null, true);

        CompilationManager compileMgr = new CompilationManager(opts);
        String jarFileName = compileMgr.getJarFilename();
        LOG.debug("Got jar from import job: " + jarFileName);

        prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, getTableName());

        reader = SeqFileReader.getSeqFileReader(getDataFilePath().toString());

        if (codec == null) {
            codec = new GzipCodec();
        }
        assertTrue("Block compressed", reader.isBlockCompressed());
        assertEquals(codec.getClass(), reader.getCompressionCodec().getClass());

        // here we can actually instantiate (k, v) pairs.
        Configuration conf = new Configuration();
        Object key = ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Object val = ReflectionUtils.newInstance(reader.getValueClass(), conf);

        // We know that these values are two ints separated by a ',' character.
        // Since this is all dynamic, though, we don't want to actually link
        // against the generated class and use its methods. Instead we simply
        // count the records and check the total against the expected number,
        // to verify that we got all the results from the db into the file.

        // Count every record in the file.
        int numLines = 0;
        while (reader.next(key) != null) {
            reader.getCurrentValue(val);
            numLines++;
        }

        assertEquals(expectedNum, numLines);
    } finally {
        IOUtils.closeStream(reader);

        if (null != prevClassLoader) {
            ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
        }
    }
}

From source file: com.cloudera.sqoop.TestCompression.java

License: Apache License

public void runTextCompressionTest(CompressionCodec codec, int expectedNum) throws IOException {

    String[] columns = HsqldbTestServer.getFieldNames();
    String[] argv = getArgv(true, columns, codec, "--as-textfile");
    runImport(argv);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);

    if (codec == null) {
        codec = new GzipCodec();
    }
    ReflectionUtils.setConf(codec, getConf());
    Path p = new Path(getDataFilePath().toString() + codec.getDefaultExtension());
    InputStream is = codec.createInputStream(fs.open(p));
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
        String ln = r.readLine();
        if (ln == null) {
            break;
        }
        numLines++;
    }
    r.close();
    assertEquals(expectedNum, numLines);
}

From source file: com.dataartisans.flink.cascading.runtime.spilling.SpillingTupleCollectionFactory.java

License: Apache License

public void initialize(FlowProcess<? extends Configuration> flowProcess) {
    this.spillThreshold = SpillableTupleList.getThreshold(flowProcess, 10000);
    this.codec = new GzipCodec();
    this.tupleSerialization = new TupleSerialization(flowProcess);
}

From source file: com.ery.hadoop.mrddx.file.LineRecordReader.java

License: Apache License

void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing="
            + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // For .tar.gz files, wrap the decompressed split stream (cIn) in a
            // TarInputStream before handing it to the LineReader.
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java

License: Apache License

private static Option withCompression(Logger logger) {
    String prop = System.getProperty(HoplogConfig.COMPRESSION);
    if (prop != null) {
        CompressionCodec codec;
        if (prop.equalsIgnoreCase("SNAPPY")) {
            codec = new SnappyCodec();
        } else if (prop.equalsIgnoreCase("LZ4")) {
            codec = new Lz4Codec();
        } else if (prop.equals("GZ")) {
            codec = new GzipCodec();
        } else {
            throw new IllegalStateException("Unsupported codec: " + prop);
        }
        if (logger.isDebugEnabled())
            logger.debug("{}Using compression codec " + codec, logPrefix);
        return SequenceFile.Writer.compression(CompressionType.BLOCK, codec);
    }
    return SequenceFile.Writer.compression(CompressionType.NONE, null);
}

From source file: com.pinterest.secor.common.FileRegistryTest.java

License: Apache License

private void createCompressedWriter() throws IOException {
    PowerMockito.mockStatic(FileUtil.class);

    PowerMockito.mockStatic(FileSystem.class);
    FileSystem fs = Mockito.mock(FileSystem.class);
    Mockito.when(FileSystem.get(Mockito.any(Configuration.class))).thenReturn(fs);

    PowerMockito.mockStatic(SequenceFile.class);
    Path fsPath = new Path(PATH_GZ);
    SequenceFile.Writer writer = Mockito.mock(SequenceFile.Writer.class);
    Mockito.when(SequenceFile.createWriter(Mockito.eq(fs), Mockito.any(Configuration.class), Mockito.eq(fsPath),
            Mockito.eq(LongWritable.class), Mockito.eq(BytesWritable.class),
            Mockito.eq(SequenceFile.CompressionType.BLOCK), Mockito.any(GzipCodec.class))).thenReturn(writer);

    Mockito.when(writer.getLength()).thenReturn(123L);

    SequenceFile.Writer createdWriter = mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());
    assertTrue(createdWriter == writer);
}

From source file: com.pinterest.secor.common.FileRegistryTest.java

License: Apache License

public void testGetOrCreateWriterCompressed() throws Exception {
    createCompressedWriter();

    mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());

    // Verify that the method has been called exactly once (the default).
    PowerMockito.verifyStatic();
    FileSystem.get(Mockito.any(Configuration.class));

    PowerMockito.verifyStatic();
    FileUtil.delete(PATH_GZ);
    PowerMockito.verifyStatic();
    FileUtil.delete(CRC_PATH);

    Path fsPath = new Path(PATH_GZ);
    PowerMockito.verifyStatic();
    SequenceFile.createWriter(Mockito.any(FileSystem.class), Mockito.any(Configuration.class),
            Mockito.eq(fsPath), Mockito.eq(LongWritable.class), Mockito.eq(BytesWritable.class),
            Mockito.eq(SequenceFile.CompressionType.BLOCK), Mockito.any(GzipCodec.class));

    TopicPartition topicPartition = new TopicPartition("some_topic", 0);
    Collection<TopicPartition> topicPartitions = mRegistry.getTopicPartitions();
    assertEquals(1, topicPartitions.size());
    assertTrue(topicPartitions.contains(topicPartition));

    Collection<LogFilePath> logFilePaths = mRegistry.getPaths(topicPartition);
    assertEquals(1, logFilePaths.size());
    assertTrue(logFilePaths.contains(mLogFilePath));
}