List of usage examples for org.apache.hadoop.io.compress CompressionCodecFactory CompressionCodecFactory
public CompressionCodecFactory(Configuration conf)
From source file:StreamWikiDumpInputFormat.java
License:Apache License
/**
 * Configures this input format for a job by creating the compression codec
 * factory from the supplied job configuration.
 *
 * @param conf job configuration passed to the {@code CompressionCodecFactory} constructor
 */
public void configure(JobConf conf) {
    compressionCodecs = new CompressionCodecFactory(conf);
}
From source file:be.ugent.intec.halvade.uploader.HalvadeUploader.java
License:Open Source License
public void parseArguments(String[] args) throws ParseException { createOptions();//w w w. j a v a2 s . c o m CommandLineParser parser = new GnuParser(); CommandLine line = parser.parse(options, args); manifest = line.getOptionValue("1"); if (!manifest.endsWith(".manifest")) { file1 = manifest; manifest = null; } outputDir = line.getOptionValue("O"); if (!outputDir.endsWith("/")) outputDir += "/"; if (line.hasOption("2")) file2 = line.getOptionValue("2"); if (line.hasOption("profile")) profile = line.getOptionValue("profile"); if (line.hasOption("t")) mthreads = Integer.parseInt(line.getOptionValue("t")); if (line.hasOption("i")) isInterleaved = true; if (line.hasOption("sse")) SSE = true; if (line.hasOption("snappy")) { CompressionCodecFactory codecFactory = new CompressionCodecFactory(getConf()); codec = codecFactory.getCodecByClassName("org.apache.hadoop.io.compress.SnappyCodec"); } if (line.hasOption("lz4")) { CompressionCodecFactory codecFactory = new CompressionCodecFactory(getConf()); codec = codecFactory.getCodecByClassName("org.apache.hadoop.io.compress.Lz4Codec"); } if (codec != null) Logger.DEBUG("Hadoop encryption: " + codec.getDefaultExtension().substring(1)); if (line.hasOption("size")) bestFileSize = Integer.parseInt(line.getOptionValue("size")) * 1024 * 1024; }
From source file:brush.FastqRecordReader.java
License:Apache License
/** * Builds a new record reader given a config file and an input split. * * @param conf The Hadoop configuration object. Used for gaining access * to the underlying file system./* w w w . ja v a 2 s . c o m*/ * @param split The file split to read. */ protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException { file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) { // no codec. Uncompressed file. positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) { throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); } inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); }
From source file:ca.sparkera.adapters.mapreduce.MainframeVBRecordReader.java
License:Apache License
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException { start = splitStart;//from w w w . java 2 s . c o m end = start + splitLength; LOG.info("Start of the split:" + start + "-End of split:" + end); LOG.debug("VLR initialize started: start pos:" + start + "endpos:" + end); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null != codec) { isCompressedInput = true; decompressor = CodecPool.getDecompressor(codec); CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor); filePosition = cIn; inputStream = cIn; LOG.info("Compressed input; cannot compute number of records in the split"); } else { fileIn.seek(start); filePosition = fileIn; inputStream = fileIn; numBytesRemainingInSplit = splitLength; LOG.info("Variable length input; cannot compute number of records in the split"); } this.pos = start; }
From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java
License:Apache License
/**
 * Decompresses a file, choosing the codec from the file's extension, and
 * writes the result next to it with that extension removed.
 *
 * <p>use case: % hadoop FileDecompressor file.gz
 *
 * <p>Exits with status 1 when no argument is given or no codec matches the
 * file name. An {@link IOException} during the copy is printed to standard
 * error.
 *
 * @param args command-line arguments; {@code args[0]} is the URI of the
 *     compressed input file
 */
public static void main(String[] args) {
    if (args == null || args.length < 1) {
        System.err.println("Usage: FileDecompressor <uri>");
        System.exit(1);
    }

    String uri = args[0];
    Configuration conf = new Configuration();
    InputStream rawIn = null;
    InputStream in = null;
    OutputStream out = null;

    try {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }

        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

        // Keep a handle on the raw stream so it is closed even if wrapping it fails.
        rawIn = fs.open(inputPath);
        in = codec.createInputStream(rawIn);
        out = fs.create(new Path(outputUri));

        IOUtils.copyBytes(in, out, conf);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
        IOUtils.closeStream(rawIn);
    }
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE); start = split.getStart();// w ww . j a v a2 s .c o m end = start + split.getLength(); final Path file = split.getPath(); //ADD by qiujw key?? key = new Text(file.getName()); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); if (null == this.recordDelimiterBytes) { in = new LineReader(cIn, job); } else { in = new LineReader(cIn, job, this.recordDelimiterBytes); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { if (null == this.recordDelimiterBytes) { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); } filePosition = fileIn; } } else { fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameTextInputFormat.java
License:Apache License
@Override protected boolean isSplitable(JobContext context, Path file) { final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file); if (null == codec) { return true; }//from w ww . ja va 2s. c o m return codec instanceof SplittableCompressionCodec; }
From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE); start = split.getStart();/* w w w . j a v a2 s .c o m*/ end = start + split.getLength(); final Path file = split.getPath(); //ADD by qiujw key? key = new Text(file.toString()); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); if (null == this.recordDelimiterBytes) { in = new LineReader(cIn, job); } else { in = new LineReader(cIn, job, this.recordDelimiterBytes); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { if (null == this.recordDelimiterBytes) { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); } filePosition = fileIn; } } else { fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
From source file:co.cask.hydrator.plugin.batch.CopybookInputFormat.java
License:Apache License
/**
 * Reports whether the input may be divided into several splits.
 *
 * <p>The codec is looked up for the path stored in the configuration under
 * {@code COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH}, not for the {@code file}
 * argument.
 *
 * @param context job context supplying the configuration
 * @param file not consulted by this implementation
 * @return {@code true} when no compression codec matches the configured path,
 *     or when the matching codec is a {@code SplittableCompressionCodec}
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    final Configuration conf = context.getConfiguration();
    final Path dataPath = new Path(conf.get(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    final CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    final CompressionCodec codec = codecFactory.getCodec(dataPath);
    return codec == null || codec instanceof SplittableCompressionCodec;
}
From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java
License:Apache License
/** * //from www . j a va 2s. com * @param delimiter * @param column * * */ @Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF); this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new LineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } in = new LineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }