List of usage examples for org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec
public CompressionCodec getCodec(Path file)
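A minimal sketch of the typical pattern first (the path name is illustrative): CompressionCodecFactory resolves a codec from the file-name suffix, e.g. .gz maps to GzipCodec, and getCodec returns null when no registered suffix matches, meaning the file can be read as-is.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class GetCodecExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/data/input.txt.gz"); // illustrative path
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the codec from the file suffix (.gz -> GzipCodec)
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        // A null codec means no registered suffix matched: read the file as-is
        try (InputStream in = (codec == null)
                ? fs.open(path)
                : codec.createInputStream(fs.open(path))) {
            // ... consume the (decompressed) stream ...
        }
    }
}

Every example below follows this null check, differing mainly in how the compressed input is wired into split handling, sampling, or buffered reading.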
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads file paths, one per line, into a list.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fis, decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip the BOM (Byte Order Mark). Text only supports UTF-8, so we
            // only need to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the
            // start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found a UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }
        paths.add(nextLine.toString());
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file:edu.uci.ics.hyracks.imru.file.HDFSUtils.java
License:Apache License
/**
 * Opens a file in HDFS for reading, performing automatic
 * decompression as necessary.
 *
 * @param dfs The HDFS file system object.
 * @param conf The HDFS configuration.
 * @param path The path to the file.
 * @return An InputStream for reading the file.
 * @throws IOException
 */
public static InputStream open(FileSystem dfs, Configuration conf, Path path) throws IOException {
    FSDataInputStream fin = dfs.open(path);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(path);
    if (codec != null) {
        return codec.createInputStream(fin);
    } else {
        return fin;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Samples a specific number of lines from a given file split.
 * @param file the split to sample from
 * @param conf the job configuration
 * @param count the number of lines to sample
 * @param seed the random seed
 * @param output collector for the sampled lines
 * @return the number of sampled lines
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());

        // Open the file and read the sample
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        int sampledLines = 0;

        if (codec != null) {
            // Special handling for compressed files, as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec: we can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
            } else {
                // Non-splittable input: we need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
            }
        } else {
            long pos = 0; // Current position in the file
            // Generate random offsets and keep them sorted for I/O efficiency
            Random rand = new Random(seed);
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample at the generated offsets
            Text line = new Text2();
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip to the end of the current (likely partial) line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
        return sampledLines;
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Samples text lines from the given split with the given sampling ratio.
 * @param file the split to sample from
 * @param conf the job configuration
 * @param ratio the fraction of lines to sample
 * @param seed the random seed
 * @param output collector for the sampled lines
 * @return the number of sampled lines
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    int sampledLines;
    Text line = new Text2();
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());

        if (codec != null) {
            // Special handling for compressed files, as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec: we can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                // Skip the first (likely partial) line if needed
                if (file.getStart() > 0)
                    start += readUntilEOL(cIn, line);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            } else {
                // Non-splittable input: we read the file from the beginning,
                // so there is no partial first line to skip
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            }
        } else {
            // Not a compressed file. Apply a more efficient, though
            // approximate, offset-based solution.
            long pos = 0; // Current position in the file
            if (file.getStart() > 0) {
                pos += in.skip(file.getStart());
                pos += readUntilEOL(in, line);
            }
            // Initialize the random variable used for sampling
            Random rand = new Random(seed);
            sampledLines = 0;

            // Read the first 10 lines to estimate the average record size
            long end = file.getStart() + file.getLength();
            for (int i = 0; i < 10 && pos < end; i++) {
                line.clear();
                pos += readUntilEOL(in, line);
                if (rand.nextFloat() < ratio) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
            int averageLineSize = (int) ((pos - file.getStart()) / 10);
            int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample at the generated offsets
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip to the end of the current (likely partial) line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
    return sampledLines;
}
From source file:format.OverlapRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = context.getConfiguration();
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // Open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // Creates the input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);
        // Read and ignore the first (partial) line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file:fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils.java
License:LGPL
/**
 * Copies bytes from an InputStream to a path, compressing the output with
 * the codec inferred from the destination path.
 * @param is the InputStream to read from
 * @param destPath destination path
 * @param conf Configuration object
 * @return true if the copy succeeded
 * @throws IOException in case of an I/O problem
 */
public static boolean copyAndCompressInputStreamToPath(final InputStream is, final Path destPath,
        final Configuration conf) throws IOException {

    if (is == null) {
        throw new NullPointerException("The input stream is null");
    }
    if (destPath == null) {
        throw new NullPointerException("The destination path is null");
    }
    if (conf == null) {
        throw new NullPointerException("The configuration object is null");
    }

    final FileSystem fs = FileSystem.get(destPath.toUri(), conf);
    final CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = factory.getCodec(destPath);
    if (codec == null) {
        throw new IOException("No codec found for: " + destPath);
    }

    final OutputStream os = codec.createOutputStream(fs.create(destPath));
    FileUtils.copy(is, os);

    return true;
}
From source file:gobblin.source.extractor.hadoop.HadoopFsHelper.java
License:Apache License
/**
 * Returns an {@link InputStream} for the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}
From source file:hdfsIO.fileInteractions.java
public List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // Ignore marker files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream;
        // Check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        stream.close();
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
From source file:hivemall.utils.hadoop.HadoopUtils.java
License:Open Source License
public static BufferedReader getBufferedReader(File file, MapredContext context) throws IOException {
    URI fileuri = file.toURI();
    Path path = new Path(fileuri);

    Configuration conf = context.getJobConf();
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(path);

    if (codec == null) {
        // No codec matched: read the file as plain text
        return new BufferedReader(new FileReader(file));
    } else {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        FileInputStream fis = new FileInputStream(file);
        CompressionInputStream cis = codec.createInputStream(fis, decompressor);
        // Pass the decompressor along so it can be returned to the pool later
        BufferedReader br = new BufferedReaderExt(new InputStreamReader(cis), decompressor);
        return br;
    }
}
From source file:io.aos.hdfs.FileDecompressor.java
License:Apache License
public static void main(String... args) throws Exception {
    String uri = args[0];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    // Derive the output name by stripping the codec's suffix (e.g. file.gz -> file)
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}