List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
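A minimal usage sketch before the full examples below (the input path is a placeholder, not taken from any of the source files; assumes the codec is registered and can be inferred from the file extension via CompressionCodecFactory):

Configuration conf = new Configuration();
Path path = new Path("/tmp/data.txt.gz"); // hypothetical compressed input file
FileSystem fs = path.getFileSystem(conf);
// Infer the codec from the file extension (e.g. .gz -> GzipCodec).
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
if (codec == null) {
    throw new IOException("No codec found for " + path);
}
// Wrap the raw stream; reads now yield decompressed bytes.
try (InputStream in = codec.createInputStream(fs.open(path))) {
    org.apache.hadoop.io.IOUtils.copyBytes(in, System.out, conf, false);
}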
From source file:com.hadoop.mapred.DeprecatedLzoLineRecordReader.java
License:Open Source License
DeprecatedLzoLineRecordReader(Configuration conf, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);

    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}
From source file:com.hadoop.mapreduce.FourMcLineRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = HadoopUtils.getConfiguration(context);
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:com.hadoop.mapreduce.LzoLineRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = context.getConfiguration();
    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec found for file " + file + ", cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java
License:Apache License
/**
 * Verifies that the work dir has the expected output.
 */
private void verifyOutput(String dir, String crushOutMask, Format inFmt, Format outFmt,
        CompressionCodec codec, String... fileNames) throws IOException {

    /*
     * Read format table
     *
     *              \ out format
     *    in format \   seq    | text
     *    -----------------------------
     *    seq        | Custom  | ascii |
     *    -----------------------------
     *    text       | Text    | ascii |
     *    -----------------------------
     */

    if (Format.TEXT == outFmt) {
        /*
         * TextInputFormat will produce keys that are byte offsets and values that are the line. This is
         * not actually what we want. We want to preserve the actual keys and values in the files, just
         * like SequenceFileInputFormat. So, either way, the keys and values should be the text
         * representations of what went in.
         */
        BufferedReader reader;
        Path crushOut;

        if (null == codec) {
            Path path = new Path(dir + "/" + crushOutMask);
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask + " was not found in " + path);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(new InputStreamReader(getFileSystem().open(crushOut)));
        } else {
            Path path = new Path(dir + "/" + crushOutMask + codec.getDefaultExtension());
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(
                    new InputStreamReader(codec.createInputStream(getFileSystem().open(crushOut))));
        }

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) {
                String expectedLine = String.format("%d\t%d", key, value);
                assertThat(expectedLine, expected.add(expectedLine), is(true));

                String actualLine = reader.readLine();
                assertThat(actualLine, actual.add(actualLine), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.readLine(), nullValue());
        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) {
        /*
         * Record reader will produce keys that are custom writables and values that are custom writables.
         */
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();
        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        CustomWritable key = new CustomWritable();
        CustomWritable value = new CustomWritable();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));
        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) {
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();
        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isCompressed(), is(true));
        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        Text key = new Text();
        Text value = new Text();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));
        reader.close();

        assertThat(actual, equalTo(expected));

    } else {
        fail();
    }
}
From source file:com.inmobi.conduit.CompressedFileReaderTest.java
License:Apache License
private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs;
    fs = FileSystem.getLocal(conf);
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("can't find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);
    }
}
From source file:com.inmobi.conduit.distcp.tools.mapred.RetriableFileCopyCommand.java
License:Apache License
private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream, int bufferSize,
        Mapper.Context context, Map<Long, Long> received) throws IOException {
    Path source = sourceFileStatus.getPath();
    ThrottledInputStream inStream = null;
    final CompressionCodec codec = compressionCodecs.getCodec(source);
    InputStream compressedIn = null;
    OutputStream compressedOut = null;
    BufferedReader reader = null;
    long numberOfLinesRead = 0;

    try {
        inStream = getInputStream(source, HadoopCompat.getTaskConfiguration(context));
        compressedIn = codec.createInputStream(inStream);
        compressedOut = codec.createOutputStream(outStream);
        reader = new BufferedReader(new InputStreamReader(compressedIn));

        byte[] bytesRead = readLine(reader);
        while (bytesRead != null) {
            numberOfLinesRead++;
            compressedOut.write(bytesRead);
            compressedOut.write("\n".getBytes());
            updateContextStatus(inStream.getTotalBytesRead(), context, sourceFileStatus,
                    numberOfLinesRead);
            if (received != null) {
                byte[] decodedMsg = Base64.decodeBase64(bytesRead);
                incrementReceived(decodedMsg, received);
            }
            bytesRead = readLine(reader);
        }

        HadoopCompat.incrementCounter(HadoopCompat.getCounter(context, CopyMapper.Counter.SLEEP_TIME_MS),
                inStream.getTotalSleepTime());
        LOG.info("STATS: " + inStream);
    } finally {
        IOUtils.cleanup(LOG, inStream, reader, compressedIn);
        try {
            if (compressedOut != null)
                compressedOut.close();
            outStream.close();
        } catch (IOException exception) {
            LOG.error("Could not close output-stream. ", exception);
            throw exception;
        }
    }

    return inStream.getTotalBytesRead();
}
From source file:com.jeffy.hdfs.compression.FileDecompressor.java
License:Apache License
/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    // Load the Hadoop configuration.
    Configuration conf = new Configuration();
    // The factory maps file extensions to the codecs registered via io.compression.codecs.
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    for (String uri : args) {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        // Infer the codec from the file name extension.
        CompressionCodec codec = factory.getCodec(inputPath);
        // Skip files whose extension matches no known codec.
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            continue;
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
                OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }
}
From source file:com.knewton.mrtool.io.JsonRecordReader.java
License:Apache License
/**
 * Gets the line reader to be used for the file. A <code>LineReader</code> can read a file line by line.
 * This separate method helps with testing too.
 *
 * @param fileSplit the split whose file should be read
 * @param conf the job configuration
 * @return a line reader positioned at the start of the split
 * @throws IOException
 */
protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException {
    final Path file = fileSplit.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(fileSplit.getPath());
    seekableIn = fileIn;

    boolean skipFirstLine = false;
    LineReader lineReader;
    if (codec != null) {
        lineReader = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        // If the start is not the beginning of the file then skip the first line to get the
        // next complete json record. The previous json record will be read by the record reader
        // that got assigned the previous InputSplit.
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new LineReader(fileIn, conf);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    return lineReader;
}
From source file:com.linkedin.cubert.block.BlockUtils.java
License:Open Source License
@SuppressWarnings("unchecked") public static Block loadBlock(BlockProperties props, IndexEntry indexEntry, Configuration conf, JsonNode json, BlockSerializationType serializationType, boolean isInMemoryBlock) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException { Block block;//from w w w . j a va 2s .c o m if (indexEntry == null) { if (emptyForMissing) return new EmptyBlock(props); throw new IOException(String.format("Index entry is null")); } // populate props props.setBlockId(indexEntry.getBlockId()); props.setNumRecords(indexEntry.getNumRecords()); // Open the file and seek to the offset for this block Path file = new Path(indexEntry.getFile()); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE); fsin.seek(indexEntry.getOffset()); // Gather information needed to read this block Class<Tuple> valueClass = (Class<Tuple>) TupleFactory.getInstance().newTuple().getClass(); CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file); // Load the block now if (isInMemoryBlock) { print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId()); ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry); if (byteBuffer == null) { int read = 0; byte[] data = new byte[(int) indexEntry.getLength()]; while (read != data.length) { read += fsin.read(data, read, data.length - read); } fsin.close(); byteBuffer = ByteBuffer.wrap(data); inMemoryBlockCache.put(indexEntry, byteBuffer); } else { print.f("REUSED FROM CACHE!!"); byteBuffer.rewind(); } block = new RubixMemoryBlock(props, conf, byteBuffer, valueClass, codec, serializationType); block.configure(json); return block; } else { print.f("STREAMING the block %d", indexEntry.getBlockId()); InputStream in = new BlockInputStream(fsin, indexEntry.getLength()); if (codec != null) { in = codec.createInputStream(in); } block = new CubertBlock(props, new BlockIterator<Tuple>(conf, in, valueClass, serializationType, props.getSchema())); block.configure(json); print.f("Loaded block id=%d from file=%s offset=%d length=%d", indexEntry.getBlockId(), file.toString(), indexEntry.getOffset(), indexEntry.getLength()); return block; } }
From source file:com.linkedin.cubert.io.rubix.RubixMemoryBlock.java
License:Open Source License
public RubixMemoryBlock(BlockProperties props, Configuration conf, ByteBuffer byteBuffer,
        Class<Tuple> valueClass, CompressionCodec codec, BlockSerializationType serializationType)
        throws IOException {
    this.props = props;
    this.serializationType = serializationType;
    this.byteBuffer = byteBuffer;
    this.tupleCreator = new RubixTupleCreator();
    this.inputStream = new ByteBufferBackedInputStream(byteBuffer);

    switch (serializationType) {
    case DEFAULT:
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        deserializer = serializationFactory.getDeserializer(valueClass);
        break;
    case COMPACT:
        deserializer = new CompactDeserializer<Tuple>(props.getSchema());
        break;
    }

    if (codec == null) {
        deserializer.open(inputStream);
    } else {
        deserializer.open(codec.createInputStream(inputStream));
    }

    this.mark = this.currentTuplePosition = 0;
}