Usage examples for org.apache.hadoop.io.compress.CompressionCodecFactory#getCodec
public CompressionCodec getCodec(Path file)
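getCodec maps a file name's suffix to the matching registered CompressionCodec (GzipCodec for ".gz", BZip2Codec for ".bz2", and so on) and returns null when no registered codec claims the suffix. A minimal sketch of the pattern the examples below share; the helper name and path are hypothetical, not from any of the sources:

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Minimal sketch (not from the sources below): resolve a codec by the file's
// suffix and wrap the raw stream only when a codec is registered for it.
public static InputStream openMaybeCompressed(Configuration conf, Path file) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(file); // null if no suffix matches
    if (codec == null) {
        return fs.open(file); // uncompressed: read the raw stream
    }
    return codec.createInputStream(fs.open(file)); // e.g. GzipCodec for ".gz"
}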
From source file:brush.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) {
        // no codec: uncompressed file
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else {
        // compressed file: it cannot be split, so the split must start at 0
        if (start != 0) {
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
        }
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java
License:Apache License
/**
 * use case: % hadoop FileDecompressor file.gz
 * @param args
 */
public static void main(String[] args) {
    FileSystem fs = null;
    String uri = args[0];
    Path inputPath = null;
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = null;
    InputStream in = null;
    OutputStream out = null;

    try {
        fs = FileSystem.get(URI.create(uri), conf);
        inputPath = new Path(uri);
        factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }

        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));

        IOUtils.copyBytes(in, out, conf);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
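As a side note on the removeSuffix call above: it simply strips the codec's default extension from the URI to derive the output name. A quick illustration with an assumed path:

// CompressionCodecFactory.removeSuffix strips the extension to name the output:
String out = CompressionCodecFactory.removeSuffix("hdfs://nn/data/file.gz", ".gz");
// out is "hdfs://nn/data/file"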
From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License:Apache License
@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }
    if (start != 0) {
        fileIn.seek(start);
        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file:com.cloudera.sqoop.TestExport.java
License:Apache License
/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 * @param gzip is true if the file should be gzipped.
 */
protected void createTextFile(int fileNum, int numRecords, boolean gzip, ColumnGenerator... extraCols)
        throws IOException {
    int startId = fileNum * numRecords;

    String ext = ".txt";
    if (gzip) {
        ext = ext + ".gz";
    }
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum + ext);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);
    if (gzip) {
        CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
        CompressionCodec codec = ccf.getCodec(filePath);
        os = codec.createOutputStream(os);
    }
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < numRecords; i++) {
        w.write(getRecordLine(startId + i, extraCols));
    }
    w.close();
    os.close();

    if (gzip) {
        verifyCompressedFile(filePath, numRecords);
    }
}
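Note the write-side inversion in this test: getCodec is keyed off the output file name, so giving the part file a ".gz" suffix is what selects GzipCodec for writing. A minimal sketch of that pattern, with a placeholder path:

// Write-side use of getCodec: the ".gz" suffix selects GzipCodec.
Path out = new Path("/tmp/part0.txt.gz"); // hypothetical path
CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
CompressionCodec codec = ccf.getCodec(out); // GzipCodec, chosen by suffix
OutputStream os = codec.createOutputStream(fs.create(out));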
From source file:com.cloudera.sqoop.TestExport.java
License:Apache License
private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException {
    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    InputStream is = fs.open(f);
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(f);
    LOG.info("gzip check codec is " + codec);
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (null == decompressor) {
        LOG.info("Verifying gzip sanity with null decompressor");
    } else {
        LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString());
    }
    is = codec.createInputStream(is, decompressor);
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
        String ln = r.readLine();
        if (ln == null) {
            break;
        }
        numLines++;
    }
    r.close();
    assertEquals("Did not read back correct number of lines", expectedNumLines, numLines);
    LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
}
From source file:com.datascience.hadoop.CsvInputFormat.java
License:Apache License
@Override
public RecordReader<LongWritable, ListWritable<Text>> getRecordReader(InputSplit inputSplit, JobConf conf,
        Reporter reporter) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    InputStream is = fs.open(path);

    // If the input is compressed, load the compression codec.
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        is = codec.createInputStream(is, decompressor);
    }

    return new CsvRecordReader(new InputStreamReader(is, charset), createFormat(conf), split.getLength(),
            conf.getBoolean(STRICT_MODE, true));
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
public static String getCodecNameFromPath(Configuration conf, String path) {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return null;
    } else {
        return codec.getDefaultExtension();
    }
}
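Despite its name, this helper returns the codec's default extension (e.g. ".gz"), not a codec class name, and null for paths whose suffix matches no registered codec. A hypothetical call, with assumed paths:

// Hypothetical usage: the helper yields the extension, not a class name.
String ext = MirrorUtils.getCodecNameFromPath(conf, "/data/part-00000.gz");   // ".gz"
String none = MirrorUtils.getCodecNameFromPath(conf, "/data/part-00000.txt"); // null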
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return in;
    }
    System.out.println("Getting InputStream : " + codec.getDefaultExtension());
    System.out.println("Getting InputStream : " + codec);
    Decompressor decompressor = codec.createDecompressor();
    in = codec.createInputStream(in, decompressor);
    return in;
}
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split it into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be
 *                filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }
        if (os != null) {
            os.close();
        }

        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}
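A sketch of how createIndex might be driven over a directory of .lzo files; the directory path is a placeholder:

// Hypothetical driver: index every .lzo file under a directory so the LZO
// input formats can split them into multiple map tasks.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
for (FileStatus stat : fs.listStatus(new Path("/data/lzo"))) {
    if (stat.getPath().getName().endsWith(".lzo")) {
        LzoIndex.createIndex(fs, stat.getPath()); // writes filename.lzo.index
    }
}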
From source file:com.hadoop.mapreduce.FourMcLineRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = HadoopUtils.getConfiguration(context);
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);
        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}