List of usage examples for org.apache.hadoop.io.compress CodecPool getDecompressor
public static Decompressor getDecompressor(CompressionCodec codec)
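Before the per-project examples, here is a minimal sketch of the acquire/use/return pattern that the examples below all follow: borrow a Decompressor from the pool, use it to wrap the raw input stream, and hand it back when done. The class name CodecPoolExample and the readCompressed helper are illustrative placeholders, not part of any source file listed below.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolExample {
    public static void readCompressed(Configuration conf, Path path) throws IOException {
        // Resolve the codec from the file extension (e.g., .gz, .bz2)
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            throw new IOException("No compression codec found for " + path);
        }
        FileSystem fs = path.getFileSystem(conf);
        // Borrow a pooled decompressor instead of allocating a new one
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(path), decompressor)) {
            // ... consume the decompressed stream here ...
        } finally {
            // Always return the decompressor so the pool can reuse it
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

CodecPool caches (possibly native) decompressor instances for reuse, so returning each borrowed instance in a finally block is what keeps the pool effective. In the examples below, the standalone methods return the decompressor in a finally block, while the record readers hold it for the lifetime of the reader.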
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads a list of file paths, one path per line, from the given input split.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fis, decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip the BOM (Byte Order Mark). Text only supports UTF-8, so we
            // only need to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the
            // start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found the UTF-8 BOM, strip it
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }
        paths.add(nextLine.toString());
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }
    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speed up processing if it exists.
 * This function opens the specified file and reads the first eight bytes,
 * which include the R-tree signature. If the signature matches the
 * R-tree signature, true is returned. Otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file in
 * that directory is tested.
 * @param fs
 * @param path
 * @return
 * @throws IOException
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;
    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., the first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);

    // Check if the file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        synchronized (compressionCodecs) { // CodecPool is not thread-safe
            decompressor = CodecPool.getDecompressor(codec);
        }
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    // Read the signature (a single read() is assumed to fill the buffer)
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature);
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java
License:Open Source License
/**
 * Initialize from a path and file range
 * @param job
 * @param s
 * @param l
 * @param p
 * @throws IOException
 */
public SpatialRecordReader(Configuration job, long s, long l, Path p) throws IOException {
    this.start = s;
    this.end = s + l;
    this.path = p;
    LOG.info("Open a SpatialRecordReader to file: " + p + "[" + s + "," + (s + l) + ")");
    this.fs = this.path.getFileSystem(job);
    this.directIn = fs.open(this.path);
    this.blockSize = fs.getFileStatus(this.path).getBlockSize();
    this.cellMbr = new Rectangle();

    codec = new CompressionCodecFactory(job).getCodec(this.path);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = codec.createInputStream(directIn, decompressor);
            filePosition = directIn;
        }
    } else {
        directIn.seek(start);
        in = directIn;
        filePosition = directIn;
    }
    this.pos = start;
    this.maxShapesInOneRead = job.getInt(SpatialSite.MaxShapesInOneRead, 1000000);
    this.maxBytesInOneRead = job.getInt(SpatialSite.MaxBytesInOneRead, 32 * 1024 * 1024);

    initializeReader();
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    LOG.info("Open a SpatialRecordReader to split: " + split);
    FileSplit fsplit = (FileSplit) split;
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = new CompressionCodecFactory(conf).getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input position
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new DataInputStream(cIn);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take pos from the compressed stream as we adjusted both start
            // and end to match the compressed file
            filePosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = new DataInputStream(cIn);
            filePosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        filePosition = directIn;
    }
    byte[] signature = new byte[8];
    in.readFully(signature);
    if (!Arrays.equals(signature, SpatialSite.RTreeFileMarkerB)) {
        throw new RuntimeException("Incorrect signature for RTree");
    }

    this.stockShape = (V) OperationsParams.getShape(conf, "shape");

    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply to all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) split;
    if (compressionCodecFactory == null)
        compressionCodecFactory = new CompressionCodecFactory(conf);
    LOG.info("Open a SpatialRecordReader to split: " + split);
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = compressionCodecFactory.getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input position
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take pos from the compressed stream as we adjusted both start
            // and end to match the compressed file
            progressPosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = cIn;
            progressPosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        progressPosition = directIn;
    }
    this.stockShape = (V) OperationsParams.getShape(conf, "shape");
    this.tempLine = new Text();
    this.lineReader = new LineReader(in);
    bytesRead = 0;

    if (this.start != 0) {
        // Skip until the first end-of-line is reached
        bytesRead += lineReader.readLine(tempLine);
    }
    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply to all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.filename = path.getName();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }

    this.value = new ShapeIterator<V>();
    value.setShape(stockShape);
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Sample a specific number of lines from a given file
 * @param file
 * @param conf
 * @param count
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());

        // Open the file and read the sample
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        int sampledLines = 0;
        if (codec != null) {
            // Special handling for compressed files as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec, can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and the end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
            } else {
                // Non-splittable input, need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
            }
        } else {
            long pos = 0; // Current position in file
            // Generate random offsets and keep them sorted for IO efficiency
            Random rand = new Random(seed);
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
            Arrays.sort(sampleOffsets);
            // Sample the generated numbers
            Text line = new Text2();
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until end of line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
        return sampledLines;
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Sample text lines from the given split with the given sampling ratio
 * @param file
 * @param conf
 * @param ratio
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    int sampledLines;
    Text line = new Text2();
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        if (codec != null) {
            // Special handling for compressed files as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec, can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and the end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                // Skip the first line if needed
                if (file.getStart() > 0)
                    start += readUntilEOL(cIn, line);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            } else {
                // Non-splittable input, need to start from the beginning.
                // No need to skip the first line because we actually read the
                // file from the beginning.
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            }
        } else {
            // Not a compressed file. Apply a more efficient, though
            // approximate, solution.
            // Open the file and read the sample
            long pos = 0; // Current position in file
            if (file.getStart() > 0) {
                pos += in.skip(file.getStart());
                pos += readUntilEOL(in, line);
            }
            // Initialize the random variable which is used for sampling
            Random rand = new Random(seed);
            sampledLines = 0;

            // Read the first 10 lines to estimate the average record size
            long end = file.getStart() + file.getLength();
            for (int i = 0; i < 10 && pos < end; i++) {
                line.clear();
                pos += readUntilEOL(in, line);
                if (rand.nextFloat() < ratio) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }

            int averageLineSize = (int) ((pos - file.getStart()) / 10);
            int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
            Arrays.sort(sampleOffsets);
            // Sample the generated numbers
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until end of line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
    return sampledLines;
}
From source file:example.TestLineRecordReader.java
License:Apache License
@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // read all the data through the reader, then close it twice
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    //noinspection StatementWithEmptyBody
    while (reader.nextKeyValue())
        ;
    reader.close();
    reader.close();

    // If the double close had returned the reader's decompressor to the pool
    // twice, the pool could hand out the same instance more than once and the
    // set would hold fewer than 10 distinct decompressors.
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
        decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
}
From source file:format.OverlapLengthRecordReader.java
License:Apache License
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;

    long partialRecordLength = start % recordLength;
    long numBytesToSkip = 0;
    /* This check is not necessary here since we read one entire split */
    /*
    if (partialRecordLength != 0) {
        numBytesToSkip = recordLength - partialRecordLength;
    }
    */

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        numRecordsRemainingInSplit = Long.MAX_VALUE;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        long splitSize = end - start - numBytesToSkip;
        /* This remains to be observed, since we are assuming recordLength = splitSize */
        // numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
        numRecordsRemainingInSplit = 1;
        if (numRecordsRemainingInSplit < 0) {
            numRecordsRemainingInSplit = 0;
        }
        LOG.info("Expecting " + numRecordsRemainingInSplit + " records each with a length of " + recordLength
                + " bytes in the split with an effective size of " + splitSize + " bytes");
    }
    if (numBytesToSkip != 0) {
        start += inputStream.skip(numBytesToSkip);
    }
    this.pos = start;
}
From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitFastqLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}