List of usage examples for org.apache.hadoop.io.compress CodecPool returnDecompressor
public static void returnDecompressor(Decompressor decompressor)
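Every example below follows the same lifecycle: borrow a pooled Decompressor with CodecPool.getDecompressor(codec), wrap the raw input stream with it, and hand the instance back with returnDecompressor, usually from a finally block so the pool is repaid even when reading fails. A minimal sketch of that pattern (the class and method names here are illustrative, not taken from any of the projects below):

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolSketch {
    /** Opens a possibly compressed file and releases the pooled decompressor when done. */
    public static void readCompressed(Configuration conf, Path file) throws IOException {
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        Decompressor decompressor = null;
        InputStream in = null;
        try {
            FileSystem fs = file.getFileSystem(conf);
            in = fs.open(file);
            if (codec != null) {
                // Borrow a pooled instance instead of allocating a new one
                decompressor = CodecPool.getDecompressor(codec);
                in = codec.createInputStream(in, decompressor);
            }
            // ... consume the stream ...
        } finally {
            if (in != null) {
                in.close();
            }
            if (decompressor != null) {
                // Repay the pool so the instance can be reused by other readers
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}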
From source file: edu.umn.cs.spatialHadoop.core.SpatialSite.java
License: Open Source License

/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speed up the processing if it exists.
 * This function opens the specified file and reads the first eight bytes,
 * which include the R-tree signature. If the signature matches the
 * R-tree signature, true is returned. Otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file in that
 * directory is tested.
 * @param fs
 * @param path
 * @return
 * @throws IOException
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;
    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., the first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);
    // Check if the file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        synchronized (compressionCodecs) { // CodecPool is not thread-safe
            decompressor = CodecPool.getDecompressor(codec);
        }
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature);
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3.java
License: Open Source License

@Override
public void close() throws IOException {
    try {
        in.close();
        in = null;
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file: edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License

/**
 * Sample a specific number of lines from a given file split.
 * @param file
 * @param conf
 * @param count
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());

        // Open the file and read the sample
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        int sampledLines = 0;

        if (codec != null) {
            // Special handling for a compressed file as we cannot compute the actual
            // size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);

            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec; can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and the end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
            } else {
                // Non-splittable input; need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
            }
        } else {
            long pos = 0; // Current position in file
            // Generate random offsets and keep them sorted for I/O efficiency
            Random rand = new Random(seed);
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample the generated offsets
            Text line = new Text2();
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until the end of the current line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }

        return sampledLines;
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
}
From source file: edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License

/**
 * Sample text lines from the given split with the given sampling ratio.
 * @param file
 * @param conf
 * @param ratio
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    int sampledLines;
    Text line = new Text2();
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());

        if (codec != null) {
            // Special handling for a compressed file as we cannot compute the actual
            // size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);

            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec; can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and the end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                // Skip the first line if needed
                if (file.getStart() > 0)
                    start += readUntilEOL(cIn, line);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            } else {
                // Non-splittable input; need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                // No need to skip the first line because we actually read the file from
                // the beginning
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            }
        } else {
            // Not a compressed file. Apply a more efficient, though approximate, solution.
            // Open the file and read the sample
            long pos = 0; // Current position in file
            if (file.getStart() > 0) {
                pos += in.skip(file.getStart());
                pos += readUntilEOL(in, line);
            }

            // Initialize the random variable which is used for sampling
            Random rand = new Random(seed);
            sampledLines = 0;

            // Read the first ten lines to estimate the average record size
            long end = file.getStart() + file.getLength();
            for (int i = 0; i < 10 && pos < end; i++) {
                line.clear();
                pos += readUntilEOL(in, line);
                if (rand.nextFloat() < ratio) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }

            int averageLineSize = (int) ((pos - file.getStart()) / 10);
            int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample the generated offsets
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until the end of the current line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
    return sampledLines;
}
From source file: hadoop.inputsplit.FastaLineRecordReader.java
License: Apache License

public synchronized void close() throws IOException {
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
        if (Constant.DEBUG_MODE)
            System.out.println("Number of errors: " + numErrors);
    }
}
From source file: mr.MyFileRecordReader2.java
License: Apache License

public synchronized void close() throws IOException {
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
            decompressor = null;
        }
    }
}
From source file: nl.basjes.hadoop.io.compress.TestSplittableCodecSeams.java
License: Apache License

/**
 * This test checks if reading the file in splits results
 * in the same lines as reading the file as a single 'split'.
 */
private void validateSplitSeams(final Configuration conf, final FileSystem fs, final Path filename,
        final Class<? extends SplittableCompressionCodec> codecClass, final long splitSize,
        final long recordsInFile, final long lastSplitSizeLimit) throws IOException {
    // To make the test predictable
    conf.setInt("io.file.buffer.size", BUFFER_SIZE);

    final FileStatus infile = fs.getFileStatus(filename);
    final long inputLength = infile.getLen();

    if (inputLength > Integer.MAX_VALUE) {
        fail("Bad test file length.");
    }

    LOG.info("Input is " + inputLength + " bytes. " + "making a split every " + splitSize + " bytes.");

    if (inputLength <= splitSize) {
        fail("The compressed test file is too small to do any useful testing.");
    }

    final SplittableCompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    /*
     * The validation is done as follows:
     * 1) We open the entire file as a single split as the reference.
     * 2) We create a sequence of splits and validate each line with the
     *    reference split.
     * The lines from these two must match 100%.
     */

    final Text refLine = new Text();
    final Decompressor refDcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", refDcmp);

    final SplitCompressionInputStream refStream = codec.createInputStream(fs.open(infile.getPath()), refDcmp,
            0, inputLength, SplittableCompressionCodec.READ_MODE.BYBLOCK);
    final LineReader refReader = new LineReader(refStream, conf);

    final Text line = new Text();
    final Decompressor dcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \"" + codec.getClass().getName() + "\"", dcmp);

    try {
        long start = 0;
        long end = splitSize;

        int splitCount = 0;
        long refLineNumber = 0;
        long splitLineNumber;

        while (end <= inputLength) {
            splitLineNumber = 0;
            ++splitCount;
            LOG.debug("-------------------------------------------------------");
            dcmp.reset(); // Reset the Decompressor for reuse with the new stream

            final SplitCompressionInputStream splitStream = codec.createInputStream(fs.open(infile.getPath()),
                    dcmp, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

            final long adjustedStart = splitStream.getAdjustedStart();
            final long adjustedEnd = splitStream.getAdjustedEnd();

            if (LOG.isDebugEnabled()) {
                LOG.debug("Doing split " + splitCount + " on range " + " (" + start + "-" + end + ")"
                        + " adjusted to (" + adjustedStart + "-" + adjustedEnd + ")");
            }

            final LineReader lreader = new LineReader(splitStream, conf);

            if (start != 0) {
                // Not the first split, so we discard the first (incomplete) line.
                int readChars = lreader.readLine(line);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("DISCARD LINE " + 0 + " in split " + splitCount + " pos=" + splitStream.getPos()
                            + " length=" + readChars + ": \"" + line + "\"");
                }
            }

            // Now read until the end of this split
            while (nextKeyValue(splitStream, lreader, adjustedEnd, line)) {
                ++splitLineNumber;

                // Get the reference value
                if (!nextKeyValue(refStream, refReader, inputLength, refLine)) {
                    LOG.error(String.format("S>%05d: %s", splitLineNumber, line));
                    fail("Split goes beyond the end of the reference with line number " + splitLineNumber);
                }
                ++refLineNumber;

                if (LOG.isDebugEnabled() && refLineNumber > (recordsInFile - 10)) {
                    LOG.debug(String.format("R<%05d: %s", refLineNumber, refLine));
                    LOG.debug(String.format("S>%05d: %s", splitLineNumber, line));
                }

                assertEquals("Line must be same in reference and in split at line " + refLineNumber,
                        refLine, line);

                if (LOG.isTraceEnabled()) {
                    LOG.trace("LINE " + splitLineNumber + " in split " + splitCount + " (" + refLineNumber
                            + ") pos=" + splitStream.getPos() + " length=" + line.getLength() + ": \"" + line
                            + "\"");
                }
            }

            // We just read through the entire split
            LOG.debug("Checked split " + splitCount + " (" + adjustedStart + "-" + adjustedEnd + ") "
                    + "containing " + splitLineNumber + " lines.");

            if (end == inputLength) {
                LOG.info("====================> Finished the last split <====================");
                break; // We've reached the end of the last split
            }

            // Determine start and end for the next split
            start = end;

            if ((end + lastSplitSizeLimit) > inputLength) {
                end = inputLength;
                LOG.info("====================> Starting the last split (" + start + " - " + end
                        + ") <====================");
            } else {
                end += splitSize;
                LOG.info("====================> Starting the next split (" + start + " - " + end
                        + ") <====================");
            }
        }

        if (nextKeyValue(refStream, refReader, inputLength, refLine)) {
            ++refLineNumber;
            LOG.error(String.format("R<%05d: %s", refLineNumber, refLine));
            fail("The reference is at least one line longer than the last split (" + "splitSize=" + splitSize
                    + ", " + "inputLength=" + inputLength + ", " + "split start=" + start + ", "
                    + "split end=" + end + ", " + "line=" + refLineNumber + ")");
        }

        LOG.info("Verified " + refLineNumber + " lines in " + splitCount + " splits.");
    } finally {
        CodecPool.returnDecompressor(dcmp);
        CodecPool.returnDecompressor(refDcmp);
    }
}
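The test above also shows that one pooled Decompressor can serve many streams: dcmp.reset() is called before wrapping it around each new SplitCompressionInputStream, and the instance is returned to the pool exactly once in the finally block. A condensed sketch of that reuse pattern (readSplits and the ranges parameter are invented for illustration):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class SplitReuseSketch {
    /** Reads several split ranges of one file with a single pooled decompressor. */
    static void readSplits(FileSystem fs, Path path, SplittableCompressionCodec codec, long[][] ranges)
            throws IOException {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try {
            for (long[] range : ranges) {
                decompressor.reset(); // required before reusing the instance on a new stream
                SplitCompressionInputStream in = codec.createInputStream(fs.open(path), decompressor,
                        range[0], range[1], SplittableCompressionCodec.READ_MODE.BYBLOCK);
                try {
                    // ... read records between in.getAdjustedStart() and in.getAdjustedEnd() ...
                } finally {
                    in.close();
                }
            }
        } finally {
            // One return per get, no matter how many streams the instance served
            CodecPool.returnDecompressor(decompressor);
        }
    }
}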
From source file: nl.surfsara.warcutils.WarcRecordReader.java
License: Apache License

@Override
public synchronized void close() throws IOException {
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file: org.apache.hawq.pxf.plugins.hdfs.ChunkRecordReader.java
License: Apache License

/**
 * Closes the input stream.
 */
@Override
public synchronized void close() throws IOException {
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file: org.apache.jena.grande.mapreduce.io.TripleRecordReader.java
License: Apache License

@Override
public void close() throws IOException {
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}