List of usage examples for org.apache.hadoop.io.compress CodecPool getDecompressor
public static Decompressor getDecompressor(CompressionCodec codec)
From source file:org.springframework.data.hadoop.store.input.AbstractDataStreamReader.java
License:Apache License
/** * Gets the input.//from w w w . jav a 2 s. c om * * @param inputPath the input path * @return the input * @throws IOException Signals that an I/O exception has occurred. */ protected StreamsHolder<InputStream> getInput(Path inputPath) throws IOException { log.info("Creating new InputStream"); StreamsHolder<InputStream> holder = new StreamsHolder<InputStream>(); final FileSystem fs = getPath().getFileSystem(getConfiguration()); Path p = inputPath.isAbsolute() ? inputPath : new Path(getPath(), inputPath); if (!fs.exists(p)) { throw new StoreException("Path " + p + " does not exist"); } if (!isCompressed()) { InputStream input = fs.open(p); holder.setStream(input); } else { // TODO: will isCompressed() really guard for npe against getCodec() Class<?> clazz = ClassUtils.resolveClassName(getCodec().getCodecClass(), getClass().getClassLoader()); CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz, getConfiguration()); Decompressor decompressor = CodecPool.getDecompressor(compressionCodec); FSDataInputStream winput = fs.open(p); InputStream input = compressionCodec.createInputStream(winput, decompressor); holder.setWrappedStream(winput); holder.setStream(input); } return holder; }
From source file:org.zuinnote.hadoop.bitcoin.format.AbstractBitcoinRecordReader.java
License:Apache License
/** * Creates an Abstract Record Reader for Bitcoin blocks * @param split Split to use (assumed to be a file split) * @param job Configuration:// w w w. ja v a 2 s . c o m * io.file.buffer.size: Size of in-memory specified in the given Configuration. If io.file.buffer.size is not specified the default buffersize (maximum size of a bitcoin block) will be used. The configuration hadoopcryptoledger.bitcoinblockinputformat.filter.magic allows specifying the magic identifier of the block. The magic is a comma-separated list of Hex-values (e.g. F9BEB4D9,FABFB5DA,0B110907,0B110907). The default magic is always F9BEB4D9. One needs to specify at least one magic, otherwise it will be difficult to find blocks in splits. Furthermore, one may specify hadoopcryptoledger.bitcoinblockinputformat.maxblocksize, which defines the maximum size a bitcoin block may have. By default it is 1M). If you want to experiment with performance using DirectByteBuffer instead of HeapByteBuffer you can use "hadoopcryptoledeger.bitcoinblockinputformat.usedirectbuffer" (default: false). Note that it might have some unwanted consequences such as circumwenting Yarn memory management. The option is experimental and might be removed in future versions. * @param reporter Reporter * * * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop * @throws org.zuinnote.hadoop.bitcoin.format.exception.HadoopCryptoLedgerConfigurationException in case of an invalid HadoopCryptoLedger-specific configuration of the inputformat * @throws org.zuinnote.hadoop.bitcoin.format.exception.BitcoinBlockReadException in case the Bitcoin data contains invalid blocks (e.g. 
magic might be different) * */ public AbstractBitcoinRecordReader(FileSplit split, JobConf job, Reporter reporter) throws IOException, HadoopCryptoLedgerConfigurationException, BitcoinBlockReadException { // parse configuration this.conf = job; this.maxSizeBitcoinBlock = conf.getInt(this.CONF_MAXBLOCKSIZE, this.DEFAULT_MAXSIZE_BITCOINBLOCK); this.bufferSize = conf.getInt(this.CONF_BUFFERSIZE, this.DEFAULT_BUFFERSIZE); this.specificMagic = conf.get(this.CONF_FILTERMAGIC); // we need to provide at least if ((this.specificMagic == null) || (this.specificMagic.length() == 0)) this.specificMagic = this.DEFAULT_MAGIC; if ((this.specificMagic != null) && (this.specificMagic.length() > 0)) { this.specificMagicStringArray = specificMagic.split(","); specificMagicByteArray = new byte[specificMagicStringArray.length][4]; // each magic is always 4 byte for (int i = 0; i < specificMagicStringArray.length; i++) { byte[] currentMagicNo = BitcoinUtil.convertHexStringToByteArray(specificMagicStringArray[i]); if (currentMagicNo.length != 4) throw new HadoopCryptoLedgerConfigurationException( "Error: Configuration. Magic number has not a length of 4 bytes. 
Index: " + i); specificMagicByteArray[i] = currentMagicNo; } } this.useDirectBuffer = conf.getBoolean(this.CONF_USEDIRECTBUFFER, this.DEFAULT_USEDIRECTBUFFER); // Initialize start and end of split start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = new CompressionCodecFactory(job).getCodec(file); final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); // open stream if (isCompressedInput()) { // decompress decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer); filePosition = fileIn; } } else { fileIn.seek(start); bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer); filePosition = fileIn; } // initialize reader this.pos = start; // seek to block start (for the case a block overlaps a split) bbr.seekBlockStart(); }
From source file:org.zuinnote.hadoop.office.example.MapReduceExcelInputIntegrationTest.java
License:Apache License
/**
 * Opens a file on the cluster, transparently decompressing it when a codec matches the path.
 * Decompressors taken from the pool are recorded in openDecompressors (to be released on close).
 *
 * @param path file to open
 * @return stream over the (possibly decompressed) file content
 * @throws IOException if the file cannot be opened
 */
private InputStream openFile(Path path) throws IOException {
    CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
    FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
    // check if compressed
    if (codec == null) {
        // uncompressed
        return fileIn;
    } else {
        // compressed
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        this.openDecompressors.add(decompressor); // to be returned later using close
        if (codec instanceof SplittableCompressionCodec) {
            // splittable codecs need the file length to bound the compressed stream
            long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            return cIn;
        } else {
            return codec.createInputStream(fileIn, decompressor);
        }
    }
}
From source file:org.zuinnote.hadoop.office.format.common.HadoopFileReader.java
License:Apache License
public InputStream openFile(Path path) throws IOException { CompressionCodec codec = compressionCodecs.getCodec(path); FSDataInputStream fileIn = fs.open(path); // check if compressed if (codec == null) { // uncompressed LOG.debug("Reading from an uncompressed file \"" + path + "\""); return fileIn; } else { // compressed Decompressor decompressor = CodecPool.getDecompressor(codec); this.openDecompressors.add(decompressor); // to be returned later using close if (codec instanceof SplittableCompressionCodec) { LOG.debug("Reading from a compressed file \"" + path + "\" with splittable compression codec"); long end = fs.getFileStatus(path).getLen(); return ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); } else {/*from w ww . java 2 s . com*/ LOG.debug("Reading from a compressed file \"" + path + "\" with non-splittable compression codec"); return codec.createInputStream(fileIn, decompressor); } } }
From source file:org.zuinnote.hadoop.office.format.mapred.AbstractSpreadSheetDocumentRecordReader.java
License:Apache License
/** * Creates an Abstract Record Reader for tables from various document formats * @param split Split to use (assumed to be a file split) * @param job Configuration:/*from w ww.j av a 2 s .c o m*/ * hadoopoffice.read.mimeType: Mimetype of the document * hadoopoffice.read.locale: Locale of the document (e.g. needed for interpreting spreadsheets) in the BCP47 format (cf. https://tools.ietf.org/html/bcp47). If not specified then default system locale will be used. * hadoopoffice.read.sheets: A ":" separated list of sheets to be read. If not specified then all sheets will be read one after the other * hadoopoffice.read.linkedworkbooks: true if linkedworkbooks should be fetched. They must be in the same folder as the main workbook. Linked Workbooks will be processed together with the main workbook on one node and thus it should be avoided to have a lot of linked workbooks. It does only read the linked workbooks that are directly linked to the main workbook. Default: false * hadoopoffice.read.ignoremissinglinkedworkbooks: true if missing linked workbooks should be ignored. Default: false * hadoopoffice.read.security.crypt.password: if set then hadoopoffice will try to decrypt the file * hadoopoffice.read.security.crypt.linkedworkbooks.*: if set then hadoopoffice will try to decrypt all the linked workbooks where a password has been specified. If no password is specified then it is assumed that the linked workbook is not encrypted. Example: Property key for file "linkedworkbook1.xlsx" is "hadoopoffice.read.security.crypt.linkedworkbooks.linkedworkbook1.xslx". Value is the password. You must not include path or protocol information in the filename * hadoopoffice.read.filter.metadata: filters documents according to metadata. For example, hadoopoffice.read.filter.metadata.author will filter by author and the filter defined as value. Filtering is done by the parser and it is recommended that it supports regular expression for filtering, but this is up to the parser! 
 * @param reporter Reporter
 *
 * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
 * @throws org.zuinnote.hadoop.office.format.common.parser.FormatNotUnderstoodException in case the document has an invalid format
 *
 */
public AbstractSpreadSheetDocumentRecordReader(FileSplit split, JobConf job, Reporter reporter)
        throws IOException, FormatNotUnderstoodException, GeneralSecurityException {
    // parse configuration
    this.conf = job;
    this.reporter = reporter;
    this.reporter.setStatus("Initialize Configuration");
    this.hocr = new HadoopOfficeReadConfiguration(this.conf);
    // Initialize start and end of split
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.hocr.setFileName(file.getName());
    this.readKeyStore(job);
    this.readTrustStore(job);
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    FSDataInputStream fileIn = file.getFileSystem(job).open(file); // open stream
    if (isCompressedInput()) {
        // decompress using a pooled decompressor
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec");
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            officeReader = new OfficeReader(cIn, this.hocr);
            // the codec may move the effective split boundaries
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec");
            officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr);
            filePosition = fileIn;
        }
    } else {
        LOG.debug("Reading from an uncompressed file \"" + file + "\"");
        fileIn.seek(start);
        officeReader = new OfficeReader(fileIn, this.hocr);
        filePosition = fileIn;
    }
    this.reporter.setStatus("Parsing document");
    // initialize reader
    this.officeReader.parse();
    // read linked workbooks
    this.reporter.setStatus("Reading linked documents");
    if (this.hocr.getReadLinkedWorkbooks()) {
        // get current path
        Path currentPath = split.getPath();
        Path parentPath = currentPath.getParent();
        if (!"".equals(this.hocr.getLinkedWorkbookLocation())) {
            // use a custom location for linked workbooks
            parentPath = new Path(this.hocr.getLinkedWorkbookLocation());
        }
        // read linked workbook filenames
        List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks();
        this.currentHFR = new HadoopFileReader(job);
        for (String listItem : linkedWorkbookList) {
            LOG.info("Adding linked workbook \"" + listItem + "\"");
            // linked workbooks are expected in parentPath; strip any path from the reference
            String sanitizedListItem = new Path(listItem).getName();
            // read file from hadoop file
            Path currentFile = new Path(parentPath, sanitizedListItem);
            InputStream currentIn = this.currentHFR.openFile(currentFile);
            this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn,
                    this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem));
        }
    }
}
From source file:org.zuinnote.hadoop.office.format.mapreduce.AbstractSpreadSheetDocumentRecordReader.java
License:Apache License
/** * Initializes reader//w w w . j a v a 2s. c o m * @param split Split to use (assumed to be a file split) * @param context context of the job * * * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop * @throws java.lang.InterruptedException in case of thread interruption * */ @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { try { FileSplit fSplit = (FileSplit) split; // Initialize start and end of split start = fSplit.getStart(); end = start + fSplit.getLength(); final Path file = fSplit.getPath(); codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file); this.hocr.setFileName(file.getName()); this.readKeyStore(context.getConfiguration()); this.readTrustStore(context.getConfiguration()); FSDataInputStream fileIn = file.getFileSystem(conf).open(file); // open stream if (isCompressedInput()) { // decompress decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec"); final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); officeReader = new OfficeReader(cIn, this.hocr); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec"); officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr); filePosition = fileIn; } } else { LOG.debug("Reading from an uncompressed file \"" + file + "\""); fileIn.seek(start); officeReader = new OfficeReader(fileIn, this.hocr); filePosition = fileIn; } // initialize reader this.officeReader.parse(); // read linked workbooks if 
(this.hocr.getReadLinkedWorkbooks()) { // get current path Path currentPath = fSplit.getPath(); Path parentPath = currentPath.getParent(); if (!"".equals(this.hocr.getLinkedWorkbookLocation())) { // use a custom location for linked workbooks parentPath = new Path(this.hocr.getLinkedWorkbookLocation()); } // read linked workbook filenames List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks(); LOG.debug(linkedWorkbookList.size()); this.currentHFR = new HadoopFileReader(context.getConfiguration()); for (String listItem : linkedWorkbookList) { LOG.info("Adding linked workbook \"" + listItem + "\""); String sanitizedListItem = new Path(listItem).getName(); // read file from hadoop file Path currentFile = new Path(parentPath, sanitizedListItem); InputStream currentIn = this.currentHFR.openFile(currentFile); this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn, this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem)); } } } catch (FormatNotUnderstoodException fnue) { LOG.error(fnue); this.close(); throw new InterruptedException(); } }
From source file:skewtune.mapreduce.lib.input.MapOutputInputStream.java
License:Apache License
MapOutputInputStream(Configuration conf, TaskID reduceId, Counter inputCounter, SecretKey jobTokenSecret, List<MapOutputSplit> splits) throws IOException { if (conf.getBoolean(JobContext.MAP_OUTPUT_COMPRESS, false)) { Class<? extends CompressionCodec> codecClass = getMapOutputCompressorClass(conf, DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, conf); decompressor = CodecPool.getDecompressor(codec); } else {//from w w w. j a v a2 s .co m codec = null; decompressor = null; } this.inputCounter = inputCounter; this.jobTokenSecret = jobTokenSecret; this.reduceTaskId = reduceId; int totalBufSz = conf.getInt("skewtune.map.io.inputbuf", 4 * 1024 * 1024); // 4 MB PACKET_SIZE = conf.getInt("skewtune.map.io.packetsize", 128 * 1024); // 128KB final int numBuf = totalBufSz / PACKET_SIZE; buffers = new ByteBuffer[numBuf]; for (int i = 0; i < numBuf; ++i) { buffers[i] = ByteBuffer.allocate(PACKET_SIZE); } this.splits = splits; this.q = new ArrayBlockingQueue<ByteBuffer>(numBuf - 2); // producer and consumer may keep one buffer at their hands this.fetcher = new Fetcher(conf, reduceId); this.fetcher.start(); progress = new Progress(); progress.addPhases(splits.size()); }