List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream
// Prototype of CompressionCodec.createInputStream: takes an InputStream, returns a
// CompressionInputStream, and may throw IOException.
CompressionInputStream createInputStream(InputStream in) throws IOException;
From source file:brush.FastqRecordReader.java
License:Apache License
/** * Builds a new record reader given a config file and an input split. * * @param conf The Hadoop configuration object. Used for gaining access * to the underlying file system./*from w w w . j a v a2s . c om*/ * @param split The file split to read. */ protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException { file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) { // no codec. Uncompressed file. positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) { throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); } inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); }
From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java
License:Apache License
/** * use case: % hadoop FileDecompressor file.gz * @param args/*from w w w .j a va2 s.co m*/ */ public static void main(String[] args) { FileSystem fs = null; String uri = args[0]; Path inputPath = null; Configuration conf = new Configuration(); CompressionCodecFactory factory = null; InputStream in = null; OutputStream out = null; try { fs = FileSystem.get(URI.create(uri), conf); inputPath = new Path(uri); factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(inputPath); if (codec == null) { System.err.println("No codec found for " + uri); System.exit(1); } String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); in = codec.createInputStream(fs.open(inputPath)); out = fs.create(new Path(outputUri)); IOUtils.copyBytes(in, out, conf); } catch (IOException e) { e.printStackTrace(); } finally { IOUtils.closeStream(in); IOUtils.closeStream(out); } }
From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java
License:Apache License
/** * //from w w w .ja va 2s . co m * @param delimiter * @param column * * */ @Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF); this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new LineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } in = new LineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java
License:Apache License
protected void initialize(Configuration job, FileSplit split) throws IOException { this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();//from www . j a v a 2 s . c o m end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; String rowDelim = job.get("textinputformat.record.delimiter", null); if (codec != null) { if (rowDelim != null) { byte[] hexcode = SortConfig.getHexDelimiter(rowDelim); in = new DelimitedLineReader(codec.createInputStream(fileIn), job, (hexcode != null) ? hexcode : rowDelim.getBytes()); } else { in = new DelimitedLineReader(codec.createInputStream(fileIn), job); } end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } if (rowDelim != null) { byte[] hexcode = SortConfig.getHexDelimiter(rowDelim); in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes()); } else { in = new DelimitedLineReader(fileIn, job); } } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License:Apache License
@Override protected void initialize(Configuration job, FileSplit split) throws IOException { start = split.getStart();/* w w w.j a v a2 s .c o m*/ end = start + split.getLength(); final Path file = split.getPath(); FileSystem fs = file.getFileSystem(job); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); if (codec == null) { throw new IOException("No codec for file " + file + " not found, cannot run"); } // open the file and seek to the start of the split fileIn = fs.open(split.getPath()); // creates input stream and also reads the file header String rowDelim = job.get("textinputformat.record.delimiter", null); if (rowDelim != null) { byte[] hexcode = SortConfig.getHexDelimiter(rowDelim); in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes()); } else { in = new DelimitedLineReader(codec.createInputStream(fileIn), job); } if (start != 0) { fileIn.seek(start); // read and ignore the first line in.readLine(new Text()); start = fileIn.getPos(); } this.pos = start; }
From source file:com.asakusafw.runtime.io.text.directio.AbstractTextStreamFormat.java
License:Apache License
private InputStream decorate(InputStream stream, long offset, long splitSize) throws IOException { InputSplitter splitter = getInputSplitter(); if (splitter != null) { assert getCompressionCodecClass() == null; return splitter.trim(stream, offset, splitSize != -1L ? splitSize : Long.MAX_VALUE); }// ww w . j a v a 2s . c om Class<? extends CompressionCodec> codecClass = getCompressionCodecClass(); if (codecClass != null) { CompressionCodec codec = ReflectionUtils.newInstance(codecClass, getConf()); return codec.createInputStream(stream); } return stream; }
From source file:com.cloudera.flume.handlers.hdfs.TestEscapedCustomOutputDfs.java
License:Apache License
void checkOutputFormat(String format, OutputFormat of, String codecName, CompressionCodec codec) throws IOException, InterruptedException { // set the output format. FlumeConfiguration conf = FlumeConfiguration.get(); conf.set(FlumeConfiguration.COLLECTOR_OUTPUT_FORMAT, format); conf.set(FlumeConfiguration.COLLECTOR_DFS_COMPRESS_CODEC, codecName); // build a sink that outputs to that format. File f = FileUtil.mktempdir(); SinkBuilder builder = EscapedCustomDfsSink.builder(); EventSink snk = builder.create(new Context(), "file:///" + f.getPath() + "/sub-%{service}"); Event e = new EventImpl("this is a test message".getBytes()); Attributes.setString(e, "service", "foo"); snk.open();//www .j a v a 2 s . c o m snk.append(e); snk.close(); ByteArrayOutputStream exWriter = new ByteArrayOutputStream(); of.format(exWriter, e); exWriter.close(); String expected = new String(exWriter.toByteArray()); // check the output to make sure it is what we expected. // handle compression codec / extensions when checking. String ext = ""; // file extension if (codec != null) { ext = codec.getDefaultExtension(); } InputStream in = new FileInputStream(f.getPath() + "/sub-foo" + ext); if (codec != null) { in = codec.createInputStream(in); } byte[] buf = new byte[1]; StringBuilder output = new StringBuilder(); // read the file while ((in.read(buf)) > 0) { output.append(new String(buf)); } in.close(); // Must close for windows to delete assertEquals(expected, output.toString()); // This doesn't get deleted in windows but the core test succeeds assertTrue("temp folder successfully deleted", FileUtil.rmr(f)); }
From source file:com.cloudera.sqoop.TestCompression.java
License:Apache License
public void runTextCompressionTest(CompressionCodec codec, int expectedNum) throws IOException { String[] columns = HsqldbTestServer.getFieldNames(); String[] argv = getArgv(true, columns, codec, "--as-textfile"); runImport(argv);// w w w. j a v a 2 s. c o m Configuration conf = new Configuration(); if (!BaseSqoopTestCase.isOnPhysicalCluster()) { conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS); } FileSystem fs = FileSystem.get(conf); if (codec == null) { codec = new GzipCodec(); } ReflectionUtils.setConf(codec, getConf()); Path p = new Path(getDataFilePath().toString() + codec.getDefaultExtension()); InputStream is = codec.createInputStream(fs.open(p)); BufferedReader r = new BufferedReader(new InputStreamReader(is)); int numLines = 0; while (true) { String ln = r.readLine(); if (ln == null) { break; } numLines++; } r.close(); assertEquals(expectedNum, numLines); }
From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java
License:Open Source License
/**
 * Creates a line reader over one split of a compressed file.
 *
 * <p>Logs the split's start, length, locations and end, looks up the codec for
 * the file, and opens the file through the codec's input stream. For a split
 * that does not start at offset 0, the underlying file is seeked to the split
 * start, one line is read and discarded, and {@code start} is set to the file
 * position after that read.
 *
 * @param conf the configuration used to resolve the file system and codec
 * @param split the file split to read
 * @throws IOException if no codec matches the file, or the file cannot be opened or read
 */
public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    // The int casts log the truncated values of the long offsets.
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    // Check for a missing codec before touching it, so the caller gets the
    // intended IOException instead of a NullPointerException from the log line.
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));
        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}
From source file:com.facebook.presto.hadoop.TestHadoopNative.java
License:Apache License
/**
 * Decompresses a byte array with the given codec.
 *
 * @param codec the codec whose input stream decodes the data
 * @param input the compressed bytes
 * @return the decompressed bytes
 * @throws IOException if the codec stream cannot be created or read
 */
private static byte[] decompress(CompressionCodec codec, byte[] input) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (InputStream in = codec.createInputStream(new ByteArrayInputStream(input))) {
        // Read in chunks rather than one byte per call.
        byte[] buffer = new byte[4096];
        int n;
        while ((n = in.read(buffer)) != -1) {
            bytes.write(buffer, 0, n);
        }
    }
    return bytes.toByteArray();
}