List of usage examples for org.apache.hadoop.io.compress.SplittableCompressionCodec#createInputStream
SplitCompressionInputStream createInputStream(InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException;
From source file:nl.basjes.hadoop.io.compress.TestSplittableCodecSeams.java
License:Apache License
/** * This test checks if reading the file in a splitted way results * in the same lines as reading the file as a single 'split'. *//* w w w . j a v a 2 s . c o m*/ private void validateSplitSeams(final Configuration conf, final FileSystem fs, final Path filename, final Class<? extends SplittableCompressionCodec> codecClass, final long splitSize, final long recordsInFile, final long lastSplitSizeLimit) throws IOException { // To make the test predictable conf.setInt("io.file.buffer.size", BUFFER_SIZE); final FileStatus infile = fs.getFileStatus(filename); final long inputLength = infile.getLen(); if (inputLength > Integer.MAX_VALUE) { fail("Bad test file length."); } LOG.info("Input is " + inputLength + " bytes. " + "making a split every " + splitSize + " bytes."); if (inputLength <= splitSize) { fail("The compressed test file is too small to do any useful testing."); } final SplittableCompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf); /* * The validation is done as follows: * 1) We open the entire file as a single split as the reference * 2) We create a sequence of splits and validate each line with the * reference split. * The lines from these two must match 100%. 
*/ final Text refLine = new Text(); final Decompressor refDcmp = CodecPool.getDecompressor(codec); assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", refDcmp); final SplitCompressionInputStream refStream = codec.createInputStream(fs.open(infile.getPath()), refDcmp, 0, inputLength, SplittableCompressionCodec.READ_MODE.BYBLOCK); final LineReader refReader = new LineReader(refStream, conf); final Text line = new Text(); final Decompressor dcmp = CodecPool.getDecompressor(codec); assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", refDcmp); try { long start = 0; long end = splitSize; int splitCount = 0; long refLineNumber = 0; long splitLineNumber; while (end <= inputLength) { splitLineNumber = 0; ++splitCount; LOG.debug("-------------------------------------------------------"); dcmp.reset(); // Reset the Decompressor for reuse with the new stream final SplitCompressionInputStream splitStream = codec.createInputStream(fs.open(infile.getPath()), dcmp, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); final long adjustedStart = splitStream.getAdjustedStart(); final long adjustedEnd = splitStream.getAdjustedEnd(); if (LOG.isDebugEnabled()) { LOG.debug("Doing split " + splitCount + " on range " + " (" + start + "-" + end + ")" + " adjusted to (" + adjustedStart + "-" + adjustedEnd + ")"); } final LineReader lreader = new LineReader(splitStream, conf); if (start != 0) { // Not the first split so we discard the first (incomplete) line. 
int readChars = lreader.readLine(line); if (LOG.isTraceEnabled()) { LOG.trace("DISCARD LINE " + 0 + " in split " + splitCount + " pos=" + splitStream.getPos() + " length=" + readChars + ": \"" + line + "\""); } } // Now read until the end of this split while (nextKeyValue(splitStream, lreader, adjustedEnd, line)) { ++splitLineNumber; // Get the reference value if (!nextKeyValue(refStream, refReader, inputLength, refLine)) { LOG.error(String.format("S>%05d: %s", splitLineNumber, line)); fail("Split goes beyond the end of the reference with line number " + splitLineNumber); } ++refLineNumber; if (LOG.isDebugEnabled() && refLineNumber > (recordsInFile - 10)) { LOG.debug(String.format("R<%05d: %s", refLineNumber, refLine)); LOG.debug(String.format("S>%05d: %s", splitLineNumber, line)); } assertEquals("Line must be same in reference and in split at line " + refLineNumber, refLine, line); if (LOG.isTraceEnabled()) { LOG.trace("LINE " + splitLineNumber + " in split " + splitCount + " (" + refLineNumber + ") pos=" + splitStream.getPos() + " length=" + line.getLength() + ": \"" + line + "\""); } } // We just read through the entire split LOG.debug("Checked split " + splitCount + " (" + adjustedStart + "-" + adjustedEnd + ") " + "containing " + splitLineNumber + " lines."); if (end == inputLength) { LOG.info("====================> Finished the last split <===================="); break; // We've reached the end of the last split } // Determine start and end for the next split start = end; if ((end + lastSplitSizeLimit) > inputLength) { end = inputLength; LOG.info("====================> Starting the last split (" + start + " - " + end + ") <===================="); } else { end += splitSize; LOG.info("====================> Starting the next split (" + start + " - " + end + ") <===================="); } } if (nextKeyValue(refStream, refReader, inputLength, refLine)) { ++refLineNumber; LOG.error(String.format("R<%05d: %s", refLineNumber, refLine)); fail("The reference is at least 
one line longer than the last split ( " + "splitSize=" + splitSize + ", " + "inputLength= " + inputLength + ", " + "split start=" + start + ", " + "split end=" + end + ", " + "line=" + refLineNumber + ")"); } LOG.info("Verified " + refLineNumber + " lines in " + splitCount + " splits."); } finally { CodecPool.returnDecompressor(dcmp); CodecPool.returnDecompressor(refDcmp); } }
From source file:org.hedera.util.SeekableInputStream.java
License:Apache License
public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs, CompressionCodecFactory compressionCodecs) throws IOException { CompressionCodec codec = compressionCodecs.getCodec(path); FSDataInputStream din = fs.open(path); if (codec != null) { Decompressor decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec; SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); return new SeekableInputStream(cin); } else {//from ww w.j a v a 2 s.c om // non-splittable compression input stream // no seeking or offsetting is needed assert start == 0; CompressionInputStream cin = codec.createInputStream(din, decompressor); return new SeekableInputStream(cin, din); } } else { // non compression input stream // we seek to the start of the split din.seek(start); return new SeekableInputStream(din); } }