List of usage examples for org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec
public CompressionCodec getCodec(Path file)
From source file:org.apache.tajo.engine.query.TestInsertQuery.java
License:Apache License
/**
 * Executes this test's query via {@code executeQuery()} and then checks that
 * the output directory exists, holds exactly one entry, and that every entry's
 * path is resolved to {@link DeflateCodec} by a {@link CompressionCodecFactory}.
 * Nothing is executed or checked when the HCatalog store is running.
 */
@Test
public final void testInsertOverwriteLocationWithCompression() throws Exception {
  if (testingCluster.isHCatalogStoreRunning()) {
    return;
  }

  ResultSet result = executeQuery();
  result.close();

  FileSystem fileSystem = FileSystem.get(testingCluster.getConfiguration());
  Path outputDir = new Path("/tajo-data/testInsertOverwriteLocationWithCompression");
  assertTrue(fileSystem.exists(outputDir));
  assertEquals(1, fileSystem.listStatus(outputDir).length);

  CompressionCodecFactory codecFactory = new CompressionCodecFactory(testingCluster.getConfiguration());
  for (FileStatus entry : fileSystem.listStatus(outputDir)) {
    // The codec is looked up from the file path alone.
    CompressionCodec resolved = codecFactory.getCodec(entry.getPath());
    assertTrue(resolved instanceof DeflateCodec);
  }
}
From source file:org.apache.tajo.engine.query.TestInsertQuery.java
License:Apache License
@Test public final void testInsertOverwritePathWithNonFromQuery() throws Exception { ResultSet res = executeString("insert overwrite into location " + "'/tajo-data/testInsertOverwritePathWithNonFromQuery' " + "USING csv WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " + "select 1::INT4, 2.1::FLOAT4, 'test'"); res.close();// w w w . j a v a2 s .c o m FileSystem fs = FileSystem.get(testingCluster.getConfiguration()); Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery"); assertTrue(fs.exists(path)); assertEquals(1, fs.listStatus(path).length); CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration()); FileStatus file = fs.listStatus(path)[0]; CompressionCodec codec = factory.getCodec(file.getPath()); assertTrue(codec instanceof DeflateCodec); BufferedReader reader = new BufferedReader( new InputStreamReader(codec.createInputStream(fs.open(file.getPath())))); try { String line = reader.readLine(); assertNotNull(line); String[] tokens = line.split("\\|"); assertEquals(3, tokens.length); assertEquals("1", tokens[0]); assertEquals("2.1", tokens[1]); assertEquals("test", tokens[2]); } finally { reader.close(); } }
From source file:org.apache.tajo.engine.query.TestTablePartitions.java
License:Apache License
/**
 * Creates a CSV table partitioned by one column with DeflateCodec compression,
 * fills it from {@code lineitem}, and checks that the expected partition
 * directories exist and that every file below them resolves to
 * {@link DeflateCodec}.
 */
@Test
public final void testColumnPartitionedTableByOneColumnsWithCompression() throws Exception {
  String tableName = CatalogUtil.normalizeIdentifier("testColumnPartitionedTableByOneColumnsWithCompression");

  ResultSet result = executeString("create table " + tableName + " (col2 int4, col3 float8) USING csv "
      + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
      + "PARTITION BY column(col1 int4)");
  result.close();
  assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

  result = executeString(
      "insert overwrite into " + tableName + " select l_partkey, l_quantity, l_orderkey from lineitem");
  result.close();

  TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
  // The row-count statistic is only checked when the HCatalog store is not running.
  if (!testingCluster.isHCatalogStoreRunning()) {
    assertEquals(5, desc.getStats().getNumRows().intValue());
  }

  FileSystem fileSystem = FileSystem.get(conf);
  assertTrue(fileSystem.exists(new Path(desc.getPath())));
  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);

  Path tablePath = new Path(desc.getPath());
  String[] expectedDirs = { "/col1=1", "/col1=2", "/col1=3" };
  for (String dir : expectedDirs) {
    assertTrue(fileSystem.isDirectory(new Path(tablePath.toUri() + dir)));
  }

  for (FileStatus partitionDir : fileSystem.listStatus(tablePath)) {
    assertTrue(fileSystem.isDirectory(partitionDir.getPath()));
    for (FileStatus dataFile : fileSystem.listStatus(partitionDir.getPath())) {
      CompressionCodec resolved = codecFactory.getCodec(dataFile.getPath());
      assertTrue(resolved instanceof DeflateCodec);
    }
  }
}
From source file:org.apache.tajo.engine.query.TestTablePartitions.java
License:Apache License
/**
 * Creates a CSV table partitioned by two columns with DeflateCodec
 * compression, fills it from {@code lineitem}, and checks that the expected
 * two-level partition directories exist and that every file in the leaf
 * directories resolves to {@link DeflateCodec}.
 */
@Test
public final void testColumnPartitionedTableByTwoColumnsWithCompression() throws Exception {
  String tableName = CatalogUtil.normalizeIdentifier("testColumnPartitionedTableByTwoColumnsWithCompression");

  ResultSet result = executeString("create table " + tableName + " (col3 float8, col4 text) USING csv "
      + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
      + "PARTITION by column(col1 int4, col2 int4)");
  result.close();
  assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

  result = executeString("insert overwrite into " + tableName
      + " select l_quantity, l_returnflag, l_orderkey, l_partkey from lineitem");
  result.close();

  TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
  // The row-count statistic is only checked when the HCatalog store is not running.
  if (!testingCluster.isHCatalogStoreRunning()) {
    assertEquals(5, desc.getStats().getNumRows().intValue());
  }

  FileSystem fileSystem = FileSystem.get(conf);
  assertTrue(fileSystem.exists(new Path(desc.getPath())));
  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);

  Path tablePath = new Path(desc.getPath());
  String[] expectedDirs = {
      "/col1=1",
      "/col1=1/col2=1",
      "/col1=2",
      "/col1=2/col2=2",
      "/col1=3",
      "/col1=3/col2=2",
      "/col1=3/col2=3"
  };
  for (String dir : expectedDirs) {
    assertTrue(fileSystem.isDirectory(new Path(tablePath.toUri() + dir)));
  }

  for (FileStatus level1 : fileSystem.listStatus(tablePath)) {
    assertTrue(fileSystem.isDirectory(level1.getPath()));
    for (FileStatus level2 : fileSystem.listStatus(level1.getPath())) {
      assertTrue(fileSystem.isDirectory(level2.getPath()));
      for (FileStatus dataFile : fileSystem.listStatus(level2.getPath())) {
        CompressionCodec resolved = codecFactory.getCodec(dataFile.getPath());
        assertTrue(resolved instanceof DeflateCodec);
      }
    }
  }
}
From source file:org.apache.tajo.engine.query.TestTablePartitions.java
License:Apache License
/**
 * Creates a CSV table partitioned by three columns with DeflateCodec
 * compression, fills it from {@code lineitem}, and checks that the expected
 * three-level partition directories exist and that every file in the leaf
 * directories resolves to {@link DeflateCodec}. It then runs two filtered
 * selects and compares columns 2 and 3 of each row against the pair expected
 * for that row's column-4 value.
 */
@Test
public final void testColumnPartitionedTableByThreeColumnsWithCompression() throws Exception {
  String tableName = CatalogUtil
      .normalizeIdentifier("testColumnPartitionedTableByThreeColumnsWithCompression");

  ResultSet result = executeString("create table " + tableName + " (col4 text) USING csv "
      + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
      + "partition by column(col1 int4, col2 int4, col3 float8)");
  result.close();
  assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

  result = executeString("insert overwrite into " + tableName
      + " select l_returnflag, l_orderkey, l_partkey, l_quantity from lineitem");
  result.close();

  TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
  // The row-count statistic is only checked when the HCatalog store is not running.
  if (!testingCluster.isHCatalogStoreRunning()) {
    assertEquals(5, desc.getStats().getNumRows().intValue());
  }

  FileSystem fileSystem = FileSystem.get(conf);
  assertTrue(fileSystem.exists(new Path(desc.getPath())));
  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);

  Path tablePath = new Path(desc.getPath());
  String[] expectedDirs = {
      "/col1=1",
      "/col1=1/col2=1",
      "/col1=1/col2=1/col3=17.0",
      "/col1=2",
      "/col1=2/col2=2",
      "/col1=2/col2=2/col3=38.0",
      "/col1=3",
      "/col1=3/col2=2",
      "/col1=3/col2=3",
      "/col1=3/col2=2/col3=45.0",
      "/col1=3/col2=3/col3=49.0"
  };
  for (String dir : expectedDirs) {
    assertTrue(fileSystem.isDirectory(new Path(tablePath.toUri() + dir)));
  }

  for (FileStatus level1 : fileSystem.listStatus(tablePath)) {
    assertTrue(fileSystem.isDirectory(level1.getPath()));
    for (FileStatus level2 : fileSystem.listStatus(level1.getPath())) {
      assertTrue(fileSystem.isDirectory(level2.getPath()));
      for (FileStatus level3 : fileSystem.listStatus(level2.getPath())) {
        assertTrue(fileSystem.isDirectory(level3.getPath()));
        for (FileStatus dataFile : fileSystem.listStatus(level3.getPath())) {
          CompressionCodec resolved = codecFactory.getCodec(dataFile.getPath());
          assertTrue(resolved instanceof DeflateCodec);
        }
      }
    }
  }

  // First query: expected {column 2, column 3} values keyed by the column-4 value.
  result = executeString("select * from " + tableName + " where col2 = 2");
  Map<Double, int[]> expectedForCol2 = Maps.newHashMap();
  expectedForCol2.put(45.0d, new int[] { 3, 2 });
  expectedForCol2.put(38.0d, new int[] { 2, 2 });

  int rowCount = 0;
  while (result.next()) {
    int[] expected = expectedForCol2.get(result.getDouble(4));
    assertEquals(expected[0], result.getInt(2));
    assertEquals(expected[1], result.getInt(3));
    rowCount++;
  }
  result.close();
  assertEquals(2, rowCount);

  // Second query: same check against a wider predicate.
  Map<Double, int[]> expectedForRange = Maps.newHashMap();
  expectedForRange.put(49.0d, new int[] { 3, 3 });
  expectedForRange.put(45.0d, new int[] { 3, 2 });
  expectedForRange.put(38.0d, new int[] { 2, 2 });

  result = executeString("select * from " + tableName + " where (col1 = 2 or col1 = 3) and col2 >= 2");
  rowCount = 0;
  while (result.next()) {
    int[] expected = expectedForRange.get(result.getDouble(4));
    assertEquals(expected[0], result.getInt(2));
    assertEquals(expected[1], result.getInt(3));
    rowCount++;
  }
  result.close();
  assertEquals(3, rowCount);
}
From source file:org.apache.tajo.engine.query.TestTablePartitions.java
License:Apache License
/**
 * Creates a CSV table partitioned by three columns with DeflateCodec
 * compression, fills it from {@code lineitem}, checks the partition directory
 * layout and the codec of every leaf file, and finally asserts that a select
 * filtering on {@code col2 = 9} returns no rows.
 */
@Test
public final void testColumnPartitionedTableNoMatchedPartition() throws Exception {
  String tableName = CatalogUtil.normalizeIdentifier("testColumnPartitionedTableNoMatchedPartition");

  ResultSet result = executeString("create table " + tableName + " (col4 text) USING csv "
      + "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
      + "partition by column(col1 int4, col2 int4, col3 float8)");
  result.close();
  assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

  result = executeString("insert overwrite into " + tableName
      + " select l_returnflag , l_orderkey, l_partkey, l_quantity from lineitem");
  result.close();

  TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
  // The row-count statistic is only checked when the HCatalog store is not running.
  if (!testingCluster.isHCatalogStoreRunning()) {
    assertEquals(5, desc.getStats().getNumRows().intValue());
  }

  FileSystem fileSystem = FileSystem.get(conf);
  assertTrue(fileSystem.exists(new Path(desc.getPath())));
  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);

  Path tablePath = new Path(desc.getPath());
  String[] expectedDirs = {
      "/col1=1",
      "/col1=1/col2=1",
      "/col1=1/col2=1/col3=17.0",
      "/col1=2",
      "/col1=2/col2=2",
      "/col1=2/col2=2/col3=38.0",
      "/col1=3",
      "/col1=3/col2=2",
      "/col1=3/col2=3",
      "/col1=3/col2=2/col3=45.0",
      "/col1=3/col2=3/col3=49.0"
  };
  for (String dir : expectedDirs) {
    assertTrue(fileSystem.isDirectory(new Path(tablePath.toUri() + dir)));
  }

  for (FileStatus level1 : fileSystem.listStatus(tablePath)) {
    assertTrue(fileSystem.isDirectory(level1.getPath()));
    for (FileStatus level2 : fileSystem.listStatus(level1.getPath())) {
      assertTrue(fileSystem.isDirectory(level2.getPath()));
      for (FileStatus level3 : fileSystem.listStatus(level2.getPath())) {
        assertTrue(fileSystem.isDirectory(level3.getPath()));
        for (FileStatus dataFile : fileSystem.listStatus(level3.getPath())) {
          CompressionCodec resolved = codecFactory.getCodec(dataFile.getPath());
          assertTrue(resolved instanceof DeflateCodec);
        }
      }
    }
  }

  // The filter value 9 is expected to match no rows.
  result = executeString("select * from " + tableName + " where col2 = 9");
  assertFalse(result.next());
  result.close();
}
From source file:org.apache.tajo.storage.http.ExampleHttpJsonLineReader.java
License:Apache License
/**
 * Creates a line reader for the given fragment.
 *
 * <p>The compression codec is resolved from the fragment's path; a path that
 * resolves to a {@link SplittableCompressionCodec} is rejected.
 *
 * @param conf       configuration used to build the codec factory
 * @param fragment   fragment to read; must be an {@code ExampleHttpFileFragment}
 * @param bufferSize buffer size stored for later use by this reader
 * @throws TajoRuntimeException wrapping an {@code UnsupportedException} when
 *         the resolved codec is splittable
 */
public ExampleHttpJsonLineReader(Configuration conf, AbstractFileFragment fragment, int bufferSize) {
  this.conf = conf;
  this.fragment = (ExampleHttpFileFragment) fragment;
  this.bufferSize = bufferSize;

  this.codec = new CompressionCodecFactory(conf).getCodec(fragment.getPath());

  final boolean splittable = this.codec instanceof SplittableCompressionCodec;
  if (splittable) {
    // bzip2 does not support multi-thread model
    throw new TajoRuntimeException(new UnsupportedException(this.codec.getDefaultExtension()));
  }
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java
License:Apache License
/**
 * Decompresses {@code inFile} into {@code outFile} on the given file system.
 *
 * <p>The codec is chosen from the input file's name via a
 * {@link CompressionCodecFactory} built on a fresh {@link Configuration}.
 *
 * @param fs             file system holding both files
 * @param inFile         path of the compressed input file
 * @param outFile        path of the decompressed output file to create
 * @param deletePrevious whether to delete {@code inFile} after a successful copy
 * @throws IOException if no codec matches {@code inFile}, or if opening,
 *         copying or deleting fails
 */
public static void decompressFile(final FileSystem fs, final String inFile, final String outFile,
        boolean deletePrevious) throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    // getCodec() yields null when no codec matches the file name; fail with a
    // clear message before any output file is created.
    if (codec == null) {
        throw new IOException("No compression codec found for file: " + inFile);
    }

    InputStream in = null;
    OutputStream out = null;
    try {
        // Keep the raw stream in 'in' first so it is still closed if wrapping it fails.
        in = fs.open(inPath);
        in = codec.createInputStream(in);
        out = fs.create(outPath);
        IOUtils.copyBytes(in, out, 8192);
    } finally {
        // Runs on failure too, so neither stream leaks when the copy throws.
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }

    if (deletePrevious) {
        fs.delete(inPath, true);
    }
}
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Builds a new record reader given a config file and an input split. * * @param conf The Hadoop configuration object. Used for gaining access * to the underlying file system.//from w w w. j a va2 s . co m * @param split The file split to read. */ protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException { maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); // if our codec is splittable, we can (tentatively) say that // we too are splittable. // // if we get a bgzfenhancedcodec, the codec might not actually // be splittable. however, if we get a non-splittable gz file, // several things happen: // // 1. the input format will detect this, and will not split the // file // 2. the bgzfenhancedcodec will check the underlying data type // (BGZF vs GZIP) at input stream creation time, and will // apply the appropriate codec. // // if we get an unsplittable codec, really all that we do differently // is skip the positioning check, since we know that we're at the // start of the file and can get to reading immediately isSplittable = (codec instanceof SplittableCompressionCodec); if (codec == null) { // no codec. Uncompressed file. int bytesToSkip = positionAtFirstRecord(fileIn, null); inputStream = fileIn; inputStream.skip(bytesToSkip); lineReader = new LineReader(inputStream); } else if (isSplittable) { // file is compressed, but uses a splittable codec isCompressed = true; int bytesToSkip = positionAtFirstRecord(fileIn, codec); // apparent fun finding: if you don't seek back to 0, // SplittableCompressionCodec.createInputStream will seek in the stream // to a start position, and funny things happen.. 
fileIn.seek(0); inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(), start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); inputStream.skip(bytesToSkip); lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf); } else { // unsplittable compressed file // expect a single split, first record at offset 0 isCompressed = true; inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file lineReader = new LineReader(inputStream); } }
From source file:org.bgi.flexlab.gaea.tools.bamqualtiycontrol.report.CNVDepthReport.java
License:Open Source License
public void toReport(BamQualityControlOptions options, FileSystem fs, Configuration conf, String sampleName) throws IOException { for (int i = 0; i < depths.length; i++) { Map<String, WrappedIntArray> sampleDepth = depths[i].laneDepth; for (String chrName : depths[i].laneDepth.keySet()) { StringBuffer cnvDepthFilePath = new StringBuffer(); cnvDepthFilePath.append(options.getOutputPath()); cnvDepthFilePath.append("/"); cnvDepthFilePath.append("cnvDepth"); cnvDepthFilePath.append("/"); cnvDepthFilePath.append(sampleName); cnvDepthFilePath.append("-lane"); cnvDepthFilePath.append(i);/*from w w w . j av a 2 s .com*/ cnvDepthFilePath.append("/"); cnvDepthFilePath.append(chrName); cnvDepthFilePath.append(".dep.gz"); Path cnvDepthPath = new Path(cnvDepthFilePath.toString()); FSDataOutputStream cnvDepthStream = fs.create(cnvDepthPath); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(cnvDepthPath); CompressionOutputStream compressedOutput = codec.createOutputStream(cnvDepthStream); //ChrLaneDepth laneChrDepths = depths[i].laneDepth.get(chrName); //Map<Integer, Integer> depthLanePos = laneChrDepths.depth; int[] depth = sampleDepth.get(chrName).getArray(); StringBuilder sb = new StringBuilder(); for (int j = 0; j < depth.length; j += 2) { sb.append(chrName); sb.append("\t"); sb.append(depth[j] + 1); sb.append("\t"); sb.append(depth[j + 1]); sb.append("\n"); } compressedOutput.write(sb.toString().getBytes()); compressedOutput.close(); cnvDepthStream.close(); } } }