Example usage for org.apache.hadoop.io.compress CompressionCodecFactory getCodec

Introduction

On this page you can find example usages of org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec(Path).

Prototype

public CompressionCodec getCodec(Path file) 

Documentation

Find the relevant compression codec for the given file based on its filename suffix.
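Before the collected examples, here is a minimal sketch of the typical read-side pattern (the class name, helper name, and path below are illustrative assumptions, not taken from any example on this page): the factory maps the filename suffix to a codec, and a null result means no registered codec matched, so the file should be read uncompressed.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class GetCodecSketch {
    // Opens a file for reading, decompressing it if its suffix names a codec.
    public static InputStream openPossiblyCompressed(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        // getCodec resolves by filename suffix: ".gz", ".deflate", ".bz2", and so on.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        if (codec == null) {
            return fs.open(file); // no matching suffix: treat as uncompressed
        }
        return codec.createInputStream(fs.open(file));
    }
}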

Usage

From source file:org.apache.tajo.engine.query.TestInsertQuery.java

License:Apache License

@Test
public final void testInsertOverwriteLocationWithCompression() throws Exception {
    if (!testingCluster.isHCatalogStoreRunning()) {
        ResultSet res = executeQuery();
        res.close();
        FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
        Path path = new Path("/tajo-data/testInsertOverwriteLocationWithCompression");
        assertTrue(fs.exists(path));
        assertEquals(1, fs.listStatus(path).length);

        CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
        for (FileStatus file : fs.listStatus(path)) {
            CompressionCodec codec = factory.getCodec(file.getPath());
            assertTrue(codec instanceof DeflateCodec);
        }
    }
}

From source file:org.apache.tajo.engine.query.TestInsertQuery.java

License:Apache License

@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
    ResultSet res = executeString("insert overwrite into location "
            + "'/tajo-data/testInsertOverwritePathWithNonFromQuery' "
            + "USING csv WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "select 1::INT4, 2.1::FLOAT4, 'test'");

    res.close();
    FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
    Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
    assertTrue(fs.exists(path));
    assertEquals(1, fs.listStatus(path).length);

    CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
    FileStatus file = fs.listStatus(path)[0];
    CompressionCodec codec = factory.getCodec(file.getPath());
    assertTrue(codec instanceof DeflateCodec);

    BufferedReader reader = new BufferedReader(
            new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))));

    try {
        String line = reader.readLine();
        assertNotNull(line);

        String[] tokens = line.split("\\|");

        assertEquals(3, tokens.length);
        assertEquals("1", tokens[0]);
        assertEquals("2.1", tokens[1]);
        assertEquals("test", tokens[2]);
    } finally {
        reader.close();
    }
}

From source file:org.apache.tajo.engine.query.TestTablePartitions.java

License:Apache License

@Test
public final void testColumnPartitionedTableByOneColumnsWithCompression() throws Exception {
    String tableName = CatalogUtil.normalizeIdentifier("testColumnPartitionedTableByOneColumnsWithCompression");
    ResultSet res = executeString("create table " + tableName + " (col2 int4, col3 float8) USING csv "
            + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "PARTITION BY column(col1 int4)");
    res.close();
    assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

    res = executeString(
            "insert overwrite into " + tableName + " select l_partkey, l_quantity, l_orderkey from lineitem");
    res.close();
    TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
    if (!testingCluster.isHCatalogStoreRunning()) {
        assertEquals(5, desc.getStats().getNumRows().intValue());
    }

    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(new Path(desc.getPath())));
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);

    Path path = new Path(desc.getPath());
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));

    for (FileStatus partition : fs.listStatus(path)) {
        assertTrue(fs.isDirectory(partition.getPath()));
        for (FileStatus file : fs.listStatus(partition.getPath())) {
            CompressionCodec codec = factory.getCodec(file.getPath());
            assertTrue(codec instanceof DeflateCodec);
        }
    }
}

From source file:org.apache.tajo.engine.query.TestTablePartitions.java

License:Apache License

@Test
public final void testColumnPartitionedTableByTwoColumnsWithCompression() throws Exception {
    String tableName = CatalogUtil.normalizeIdentifier("testColumnPartitionedTableByTwoColumnsWithCompression");
    ResultSet res = executeString("create table " + tableName + " (col3 float8, col4 text) USING csv "
            + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "PARTITION by column(col1 int4, col2 int4)");
    res.close();

    assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

    res = executeString("insert overwrite into " + tableName
            + " select  l_quantity, l_returnflag, l_orderkey, l_partkey from lineitem");
    res.close();
    TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
    if (!testingCluster.isHCatalogStoreRunning()) {
        assertEquals(5, desc.getStats().getNumRows().intValue());
    }

    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(new Path(desc.getPath())));
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);

    Path path = new Path(desc.getPath());
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3")));

    for (FileStatus partition1 : fs.listStatus(path)) {
        assertTrue(fs.isDirectory(partition1.getPath()));
        for (FileStatus partition2 : fs.listStatus(partition1.getPath())) {
            assertTrue(fs.isDirectory(partition2.getPath()));
            for (FileStatus file : fs.listStatus(partition2.getPath())) {
                CompressionCodec codec = factory.getCodec(file.getPath());
                assertTrue(codec instanceof DeflateCodec);
            }
        }
    }
}

From source file:org.apache.tajo.engine.query.TestTablePartitions.java

License:Apache License

@Test
public final void testColumnPartitionedTableByThreeColumnsWithCompression() throws Exception {
    String tableName = CatalogUtil
            .normalizeIdentifier("testColumnPartitionedTableByThreeColumnsWithCompression");
    ResultSet res = executeString("create table " + tableName + " (col4 text) USING csv "
            + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "partition by column(col1 int4, col2 int4, col3 float8)");
    res.close();

    assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

    res = executeString("insert overwrite into " + tableName
            + " select l_returnflag, l_orderkey, l_partkey, l_quantity from lineitem");
    res.close();
    TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
    if (!testingCluster.isHCatalogStoreRunning()) {
        assertEquals(5, desc.getStats().getNumRows().intValue());
    }

    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(new Path(desc.getPath())));
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);

    Path path = new Path(desc.getPath());
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1/col3=17.0")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2/col3=38.0")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2/col3=45.0")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3/col3=49.0")));

    for (FileStatus partition1 : fs.listStatus(path)) {
        assertTrue(fs.isDirectory(partition1.getPath()));
        for (FileStatus partition2 : fs.listStatus(partition1.getPath())) {
            assertTrue(fs.isDirectory(partition2.getPath()));
            for (FileStatus partition3 : fs.listStatus(partition2.getPath())) {
                assertTrue(fs.isDirectory(partition3.getPath()));
                for (FileStatus file : fs.listStatus(partition3.getPath())) {
                    CompressionCodec codec = factory.getCodec(file.getPath());
                    assertTrue(codec instanceof DeflateCodec);
                }
            }
        }
    }

    res = executeString("select * from " + tableName + " where col2 = 2");

    Map<Double, int[]> resultRows1 = Maps.newHashMap();
    resultRows1.put(45.0d, new int[] { 3, 2 });
    resultRows1.put(38.0d, new int[] { 2, 2 });

    int i = 0;
    while (res.next()) {
        assertEquals(resultRows1.get(res.getDouble(4))[0], res.getInt(2));
        assertEquals(resultRows1.get(res.getDouble(4))[1], res.getInt(3));
        i++;
    }
    res.close();
    assertEquals(2, i);

    Map<Double, int[]> resultRows2 = Maps.newHashMap();
    resultRows2.put(49.0d, new int[] { 3, 3 });
    resultRows2.put(45.0d, new int[] { 3, 2 });
    resultRows2.put(38.0d, new int[] { 2, 2 });

    res = executeString("select * from " + tableName + " where (col1 = 2 or col1 = 3) and col2 >= 2");
    i = 0;
    while (res.next()) {
        assertEquals(resultRows2.get(res.getDouble(4))[0], res.getInt(2));
        assertEquals(resultRows2.get(res.getDouble(4))[1], res.getInt(3));
        i++;
    }

    res.close();
    assertEquals(3, i);
}

From source file:org.apache.tajo.engine.query.TestTablePartitions.java

License:Apache License

@Test
public final void testColumnPartitionedTableNoMatchedPartition() throws Exception {
    String tableName = CatalogUtil.normalizeIdentifier("testColumnPartitionedTableNoMatchedPartition");
    ResultSet res = executeString("create table " + tableName + " (col4 text) USING csv "
            + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "partition by column(col1 int4, col2 int4, col3 float8)");
    res.close();

    assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

    res = executeString("insert overwrite into " + tableName
            + " select l_returnflag , l_orderkey, l_partkey, l_quantity from lineitem");
    res.close();
    TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
    if (!testingCluster.isHCatalogStoreRunning()) {
        assertEquals(5, desc.getStats().getNumRows().intValue());
    }

    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(new Path(desc.getPath())));
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);

    Path path = new Path(desc.getPath());
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1/col3=17.0")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2/col3=38.0")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2/col3=45.0")));
    assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3/col3=49.0")));

    for (FileStatus partition1 : fs.listStatus(path)) {
        assertTrue(fs.isDirectory(partition1.getPath()));
        for (FileStatus partition2 : fs.listStatus(partition1.getPath())) {
            assertTrue(fs.isDirectory(partition2.getPath()));
            for (FileStatus partition3 : fs.listStatus(partition2.getPath())) {
                assertTrue(fs.isDirectory(partition3.getPath()));
                for (FileStatus file : fs.listStatus(partition3.getPath())) {
                    CompressionCodec codec = factory.getCodec(file.getPath());
                    assertTrue(codec instanceof DeflateCodec);
                }
            }
        }
    }

    res = executeString("select * from " + tableName + " where col2 = 9");
    assertFalse(res.next());
    res.close();
}

From source file:org.apache.tajo.storage.http.ExampleHttpJsonLineReader.java

License:Apache License

public ExampleHttpJsonLineReader(Configuration conf, AbstractFileFragment fragment, int bufferSize) {
    this.conf = conf;
    this.fragment = (ExampleHttpFileFragment) fragment;
    this.bufferSize = bufferSize;

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    codec = factory.getCodec(fragment.getPath());
    if (this.codec instanceof SplittableCompressionCodec) {
        // bzip2 does not support multi-thread model
        throw new TajoRuntimeException(new UnsupportedException(codec.getDefaultExtension()));
    }
}

From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java

License:Apache License

public static void decompressFile(final FileSystem fs, final String inFile, final String outFile,
        boolean deletePrevious) throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);

    if (deletePrevious)
        fs.delete(new Path(inFile), true);

}
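A hypothetical call to the helper above (the paths are assumptions). Note that getCodec returns null when no registered codec matches the input suffix, so decompressFile as written expects inFile to carry a recognized compression extension; a plain file would fail with a NullPointerException at the createInputStream call.

FileSystem fs = FileSystem.get(new Configuration());
// "/data/part-r-00000.gz" and its target are assumed paths.
HDFSTools.decompressFile(fs, "/data/part-r-00000.gz", "/data/part-r-00000", true);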

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *   to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);

    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    // if our codec is splittable, we can (tentatively) say that
    // we too are splittable.
    //
    // if we get a bgzfenhancedcodec, the codec might not actually
    // be splittable. however, if we get a non-splittable gz file,
    // several things happen:
    //
    // 1. the input format will detect this, and will not split the
    //    file
    // 2. the bgzfenhancedcodec will check the underlying data type
    //    (BGZF vs GZIP) at input stream creation time, and will
    //    apply the appropriate codec.
    //
    // if we get an unsplittable codec, really all that we do differently
    // is skip the positioning check, since we know that we're at the
    // start of the file and can get to reading immediately
    isSplittable = (codec instanceof SplittableCompressionCodec);

    if (codec == null) {
        // no codec.  Uncompressed file.
        int bytesToSkip = positionAtFirstRecord(fileIn, null);
        inputStream = fileIn;
        inputStream.skip(bytesToSkip);
        lineReader = new LineReader(inputStream);
    } else if (isSplittable) {
        // file is compressed, but uses a splittable codec
        isCompressed = true;
        int bytesToSkip = positionAtFirstRecord(fileIn, codec);

        // apparent fun finding: if you don't seek back to 0,
        // SplittableCompressionCodec.createInputStream will seek in the stream
        // to a start position, and funny things happen..
        fileIn.seek(0);
        inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(),
                start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

        inputStream.skip(bytesToSkip);
        lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
    } else {
        // unsplittable compressed file
        // expect a single split, first record at offset 0
        isCompressed = true;
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
        lineReader = new LineReader(inputStream);
    }
}

From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.CNVDepthReport.java

License:Open Source License

public void toReport(BamQualityControlOptions options, FileSystem fs, Configuration conf, String sampleName)
        throws IOException {
    for (int i = 0; i < depths.length; i++) {
        Map<String, WrappedIntArray> sampleDepth = depths[i].laneDepth;
        for (String chrName : depths[i].laneDepth.keySet()) {
            StringBuffer cnvDepthFilePath = new StringBuffer();
            cnvDepthFilePath.append(options.getOutputPath());
            cnvDepthFilePath.append("/");
            cnvDepthFilePath.append("cnvDepth");
            cnvDepthFilePath.append("/");
            cnvDepthFilePath.append(sampleName);
            cnvDepthFilePath.append("-lane");
            cnvDepthFilePath.append(i);
            cnvDepthFilePath.append("/");
            cnvDepthFilePath.append(chrName);
            cnvDepthFilePath.append(".dep.gz");
            Path cnvDepthPath = new Path(cnvDepthFilePath.toString());
            FSDataOutputStream cnvDepthStream = fs.create(cnvDepthPath);
            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(cnvDepthPath);
            CompressionOutputStream compressedOutput = codec.createOutputStream(cnvDepthStream);
            //ChrLaneDepth laneChrDepths = depths[i].laneDepth.get(chrName);
            //Map<Integer, Integer> depthLanePos = laneChrDepths.depth;
            int[] depth = sampleDepth.get(chrName).getArray();
            StringBuilder sb = new StringBuilder();
            for (int j = 0; j < depth.length; j += 2) {
                sb.append(chrName);
                sb.append("\t");
                sb.append(depth[j] + 1);
                sb.append("\t");
                sb.append(depth[j + 1]);
                sb.append("\n");
            }
            compressedOutput.write(sb.toString().getBytes());
            compressedOutput.close();
            cnvDepthStream.close();
        }
    }
}
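This example uses getCodec on the write side: the intended output suffix (".dep.gz" here, whose ".gz" ending resolves to GzipCodec) selects the compressor passed to createOutputStream, mirroring the suffix-based lookup in the read-side examples. A minimal sketch of the same pattern, with an assumed path:

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path out = new Path("/tmp/example.txt.gz"); // assumed path; ".gz" resolves to GzipCodec
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(out);
try (OutputStream os = codec.createOutputStream(fs.create(out))) {
    os.write("chr1\t100\t8\n".getBytes(java.nio.charset.StandardCharsets.UTF_8));
}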