List of usage examples for org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec
public CompressionCodec getCodec(Path file)
From source file:com.hadoop.mapreduce.LzoLineRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; start = split.getStart();// w w w. jav a 2 s.com end = start + split.getLength(); final Path file = split.getPath(); Configuration job = context.getConfiguration(); FileSystem fs = file.getFileSystem(job); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); if (codec == null) { throw new IOException("No codec for file " + file + " not found, cannot run"); } // open the file and seek to the start of the split fileIn = fs.open(split.getPath()); // creates input stream and also reads the file header in = new LineReader(codec.createInputStream(fileIn), job); if (start != 0) { fileIn.seek(start); // read and ignore the first line in.readLine(new Text()); start = fileIn.getPos(); } this.pos = start; }
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/** * Index an lzo file to allow the input format to split them into separate map * jobs.//from w w w . j ava2s.com * * @param fs * File system that contains the file. * @param lzoFile * the lzo file to index. * @throws IOException */ public static void createIndex(FileSystem fs, Path lzoFile) throws IOException { Configuration conf = fs.getConf(); CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf()); CompressionCodec codec = factory.getCodec(lzoFile); ((Configurable) codec).setConf(conf); InputStream lzoIs = null; FSDataOutputStream os = null; Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX); Path tmpOutputFile = outputFile.suffix(".tmp"); try { FSDataInputStream is = fs.open(lzoFile); os = fs.create(tmpOutputFile); LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor(); // for reading the header lzoIs = codec.createInputStream(is, decompressor); int numChecksums = decompressor.getChecksumsCount(); while (true) { // read and ignore, we just want to get to the next int int uncompressedBlockSize = is.readInt(); if (uncompressedBlockSize == 0) { break; } else if (uncompressedBlockSize < 0) { throw new EOFException(); } int compressedBlockSize = is.readInt(); if (compressedBlockSize <= 0) { throw new IOException("Could not read compressed block size"); } long pos = is.getPos(); // write the pos of the block start os.writeLong(pos - 8); // seek to the start of the next block, skip any checksums is.seek(pos + compressedBlockSize + (4 * numChecksums)); } } finally { if (lzoIs != null) { lzoIs.close(); } if (os != null) { os.close(); } } fs.rename(tmpOutputFile, outputFile); }
From source file:com.inmobi.conduit.CompressedFileReaderTest.java
License:Apache License
private void uncompress(String fileName) throws Exception { Configuration conf = new Configuration(); FileSystem fs;//from w w w . j a va2 s . co m fs = FileSystem.getLocal(conf); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(new Path(fileName)); if (codec == null) { System.out.println("cant find codec"); System.exit(1); } LOG.info("Using compression codec [" + codec.toString() + "]"); CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName))); OutputStream out = null; try { String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension()); out = fs.create(new Path(outputURI + "-uncompressed")); org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf); } finally { org.apache.hadoop.io.IOUtils.closeStream(out); IOUtils.closeStream(is); } }
From source file:com.jeffy.hdfs.compression.FileDecompressor.java
License:Apache License
/** * @param args// w w w .j a v a 2s .co m * * @throws IOException */ public static void main(String[] args) throws IOException { //?? Configuration conf = new Configuration(); // ? CompressionCodecFactory factory = new CompressionCodecFactory(conf); for (String uri : args) { FileSystem fs = FileSystem.get(URI.create(uri), conf); Path inputPath = new Path(uri); // ??????io.compression.codecs CompressionCodec codec = factory.getCodec(inputPath); // ?? if (codec == null) { System.err.println("No codec found for " + uri); continue; } String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); try (InputStream in = codec.createInputStream(fs.open(inputPath)); OutputStream out = fs.create(new Path(outputUri))) { IOUtils.copyBytes(in, out, conf); } } }
From source file:com.matthewrathbone.hadoop.MRTester.java
License:Apache License
public List<String> collectStrings(Path location) throws Exception { CompressionCodecFactory factory = new CompressionCodecFactory(conf); FileStatus[] items = fileSystem.listStatus(location); if (items == null) return new ArrayList<String>(); List<String> results = new ArrayList<String>(); for (FileStatus item : items) { if (item.getPath().getName().startsWith("_")) { continue; }//ww w. j a va 2 s. co m CompressionCodec codec = factory.getCodec(item.getPath()); InputStream stream = null; // check if we have a compression codec if (codec != null) { stream = codec.createInputStream(fileSystem.open(item.getPath())); } else { stream = fileSystem.open(item.getPath()); } StringWriter writer = new StringWriter(); IOUtils.copy(stream, writer, "UTF-8"); String raw = writer.toString(); String[] resulting = raw.split("\n"); for (String str : raw.split("\n")) { results.add(str); } } return results; }
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/** * Check file type in hdfs./*from ww w . j av a2s . c o m*/ * * @param fs * handle of {@link FileSystem} * * @param path * hdfs {@link Path} * * @param conf * {@link Configuration} * * @return {@link HdfsFileType} TXT, TXT_COMP, SEQ * */ public static HdfsFileType checkFileType(FileSystem fs, Path path, Configuration conf) throws IOException { FSDataInputStream is = null; try { is = fs.open(path); /* file is empty, use TXT readerup */ if (0 == is.available()) { return HdfsFileType.TXT; } switch (is.readShort()) { case 0x5345: if (is.readByte() == 'Q') { // TODO: add RCFile return HdfsFileType.SEQ; } default: is.seek(0); CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = compressionCodecFactory.getCodec(path); if (null == codec) return HdfsFileType.TXT; else { return HdfsFileType.COMP_TXT; } } } catch (IOException e) { throw e; } finally { if (null != is) { try { is.close(); } catch (Exception ex) { } } } }
From source file:com.yahoo.glimmer.util.MapReducePartInputStreamEnumeration.java
License:Open Source License
/**
 * Collects the file statuses this enumeration will iterate over.
 *
 * <p>If {@code srcPath} is a directory, its children matching
 * {@code part-?-?????} (plus the codec's default extension when a codec matches
 * {@code srcPath}) are globbed. Otherwise the single status of {@code srcPath} is used.
 *
 * @param fileSystem file system holding {@code srcPath}
 * @param srcPath    a directory of part files, or a single file
 * @throws IOException if the status lookup or the glob fails
 */
public MapReducePartInputStreamEnumeration(FileSystem fileSystem, Path srcPath) throws IOException {
    this.fileSystem = fileSystem;
    codecIfAny = new CompressionCodecFactory(fileSystem.getConf()).getCodec(srcPath);

    final FileStatus srcFileStatus = fileSystem.getFileStatus(srcPath);
    if (!srcFileStatus.isDirectory()) {
        partFileStatuses = new FileStatus[] { srcFileStatus };
    } else {
        final String extension = (codecIfAny != null) ? codecIfAny.getDefaultExtension() : "";
        final Path partPathGlob = new Path(srcPath, "part-?-?????" + extension);
        // Per the original author's note, the glob returns FileStatus objects sorted by filename.
        partFileStatuses = fileSystem.globStatus(partPathGlob);
    }
}
From source file:com.yahoo.glimmer.util.MergeSortTool.java
License:Open Source License
/**
 * Merges already-sorted source files into a single output file.
 *
 * <p>The input codec is chosen from the name of the first source path and applied to
 * every source; the output codec is chosen from the name of {@code outputPath}. The
 * actual merge is delegated to {@code ReadersWriterMergeSort.mergeSort}.
 *
 * <p>All streams opened here are closed before returning, including when opening a
 * source or the merge itself fails.
 *
 * @param fs                      file system holding the sources and the output
 * @param sourcePaths             sorted inputs; must not be empty
 * @param outputPath              file to create
 * @param compressionCodecFactory factory used to look up codecs by file name
 * @return the count reported by {@code ReadersWriterMergeSort.mergeSort}
 * @throws IOException if any stream cannot be opened, read, written or closed
 */
public static int mergeSort(FileSystem fs, List<Path> sourcePaths, Path outputPath,
        CompressionCodecFactory compressionCodecFactory) throws IOException {
    assert sourcePaths.size() > 0 : "No source paths given.";

    LOG.info("Sorted merge into " + outputPath.toString());
    OutputStream outputStream = fs.create(outputPath);
    List<BufferedReader> readers = new ArrayList<BufferedReader>();
    int count;
    boolean merged = false;
    try {
        CompressionCodec inputCompressionCodec = compressionCodecFactory.getCodec(sourcePaths.get(0));
        if (inputCompressionCodec != null) {
            LOG.info("Input compression codec " + inputCompressionCodec.getClass().getName());
        }

        CompressionCodec outputCompressionCodec = compressionCodecFactory.getCodec(outputPath);
        if (outputCompressionCodec != null) {
            LOG.info("Output compression codec " + outputCompressionCodec.getClass().getName());
            outputStream = outputCompressionCodec.createOutputStream(outputStream);
        }

        OutputStreamWriter writer = new OutputStreamWriter(outputStream);
        for (Path partPath : sourcePaths) {
            LOG.info("\tAdding source " + partPath.toString());
            readers.add(openMergeReader(fs, partPath, inputCompressionCodec));
        }

        count = ReadersWriterMergeSort.mergeSort(readers, writer);

        // Closing the writer flushes it and closes the underlying output stream.
        writer.close();
        merged = true;
    } finally {
        if (!merged) {
            // Failure path: release everything, but let the original exception propagate.
            try {
                outputStream.close();
            } catch (IOException ignored) {
                // Secondary failure while cleaning up; the primary exception matters.
            }
            try {
                closeMergeReaders(readers);
            } catch (IOException ignored) {
                // Secondary failure while cleaning up; the primary exception matters.
            }
        }
    }

    // Success path: a failure to close a reader is still reported to the caller.
    closeMergeReaders(readers);

    LOG.info("Processed " + count + " lines into " + outputPath.toString());
    return count;
}

/** Opens one source as a buffered reader, closing the raw stream if wrapping it fails. */
private static BufferedReader openMergeReader(FileSystem fs, Path partPath, CompressionCodec codecIfAny)
        throws IOException {
    InputStream inputStream = fs.open(partPath);
    boolean wrapped = false;
    try {
        if (codecIfAny != null) {
            inputStream = codecIfAny.createInputStream(inputStream);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        wrapped = true;
        return reader;
    } finally {
        if (!wrapped) {
            try {
                inputStream.close();
            } catch (IOException ignored) {
                // Secondary failure while cleaning up; the primary exception matters.
            }
        }
    }
}

/** Closes every reader, empties the list, then rethrows the first close failure, if any. */
private static void closeMergeReaders(List<BufferedReader> readers) throws IOException {
    IOException firstFailure = null;
    for (BufferedReader reader : readers) {
        try {
            reader.close();
        } catch (IOException e) {
            if (firstFailure == null) {
                firstFailure = e;
            }
        }
    }
    readers.clear();
    if (firstFailure != null) {
        throw firstFailure;
    }
}
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Decompresses the file named by the first command-line argument.
 *
 * <p>The codec is chosen from the file name and the output is written to the same URI
 * with the codec's default extension removed. If no codec matches, a message is
 * printed to standard error and the JVM exits with status 1.
 *
 * @param args {@code args[0]} is the URI of the compressed file
 * @throws Exception if the file system cannot be obtained or the copy fails
 */
public static void main(String[] args) throws Exception {
    final String uri = args[0];
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.get(URI.create(uri), conf);

    final Path inputPath = new Path(uri);
    final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    final String outputUri =
            CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        // Input first, then output, matching the original close order.
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Writes {@code records} lines to {@code input}, one per record index.
 *
 * <p>Each line is produced by {@code line(n, recordLength)} and followed by
 * {@code "\n"}. When a codec matches the file name, the output is compressed with it.
 *
 * @param input        file to create (overwritten if present)
 * @param records      number of lines to write
 * @param recordLength passed through to {@code line(...)} for each record
 * @throws IOException if the file cannot be created or written
 */
private void createFile(Path input, int records, int recordLength) throws IOException {
    final long size = 4096;
    OutputStream out = fs.create(input, true, 4096, (short) 1, size);

    final CompressionCodec codec =
            new CompressionCodecFactory(new Configuration()).getCodec(input);
    if (codec != null) {
        out = codec.createOutputStream(out);
    }

    final Writer writer = new OutputStreamWriter(out);
    try {
        int n = 0;
        while (n < records) {
            writer.write(line(n, recordLength));
            writer.write("\n");
            n++;
        }
    } finally {
        writer.close();
    }
}