List of usage examples for org.apache.hadoop.io.compress CodecPool getDecompressor
public static Decompressor getDecompressor(CompressionCodec codec)
From source file:org.springframework.data.hadoop.store.input.AbstractDataStreamReader.java
License:Apache License
/** * Gets the input.//from w w w . jav a 2 s. c om * * @param inputPath the input path * @return the input * @throws IOException Signals that an I/O exception has occurred. */ protected StreamsHolder<InputStream> getInput(Path inputPath) throws IOException { log.info("Creating new InputStream"); StreamsHolder<InputStream> holder = new StreamsHolder<InputStream>(); final FileSystem fs = getPath().getFileSystem(getConfiguration()); Path p = inputPath.isAbsolute() ? inputPath : new Path(getPath(), inputPath); if (!fs.exists(p)) { throw new StoreException("Path " + p + " does not exist"); } if (!isCompressed()) { InputStream input = fs.open(p); holder.setStream(input); } else { // TODO: will isCompressed() really guard for npe against getCodec() Class<?> clazz = ClassUtils.resolveClassName(getCodec().getCodecClass(), getClass().getClassLoader()); CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz, getConfiguration()); Decompressor decompressor = CodecPool.getDecompressor(compressionCodec); FSDataInputStream winput = fs.open(p); InputStream input = compressionCodec.createInputStream(winput, decompressor); holder.setWrappedStream(winput); holder.setStream(input); } return holder; }
From source file:org.zuinnote.hadoop.bitcoin.format.AbstractBitcoinRecordReader.java
License:Apache License
/** * Creates an Abstract Record Reader for Bitcoin blocks * @param split Split to use (assumed to be a file split) * @param job Configuration:// w w w. ja v a 2 s . c o m * io.file.buffer.size: Size of in-memory specified in the given Configuration. If io.file.buffer.size is not specified the default buffersize (maximum size of a bitcoin block) will be used. The configuration hadoopcryptoledger.bitcoinblockinputformat.filter.magic allows specifying the magic identifier of the block. The magic is a comma-separated list of Hex-values (e.g. F9BEB4D9,FABFB5DA,0B110907,0B110907). The default magic is always F9BEB4D9. One needs to specify at least one magic, otherwise it will be difficult to find blocks in splits. Furthermore, one may specify hadoopcryptoledger.bitcoinblockinputformat.maxblocksize, which defines the maximum size a bitcoin block may have. By default it is 1M). If you want to experiment with performance using DirectByteBuffer instead of HeapByteBuffer you can use "hadoopcryptoledeger.bitcoinblockinputformat.usedirectbuffer" (default: false). Note that it might have some unwanted consequences such as circumwenting Yarn memory management. The option is experimental and might be removed in future versions. * @param reporter Reporter * * * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop * @throws org.zuinnote.hadoop.bitcoin.format.exception.HadoopCryptoLedgerConfigurationException in case of an invalid HadoopCryptoLedger-specific configuration of the inputformat * @throws org.zuinnote.hadoop.bitcoin.format.exception.BitcoinBlockReadException in case the Bitcoin data contains invalid blocks (e.g. 
magic might be different) * */ public AbstractBitcoinRecordReader(FileSplit split, JobConf job, Reporter reporter) throws IOException, HadoopCryptoLedgerConfigurationException, BitcoinBlockReadException { // parse configuration this.conf = job; this.maxSizeBitcoinBlock = conf.getInt(this.CONF_MAXBLOCKSIZE, this.DEFAULT_MAXSIZE_BITCOINBLOCK); this.bufferSize = conf.getInt(this.CONF_BUFFERSIZE, this.DEFAULT_BUFFERSIZE); this.specificMagic = conf.get(this.CONF_FILTERMAGIC); // we need to provide at least if ((this.specificMagic == null) || (this.specificMagic.length() == 0)) this.specificMagic = this.DEFAULT_MAGIC; if ((this.specificMagic != null) && (this.specificMagic.length() > 0)) { this.specificMagicStringArray = specificMagic.split(","); specificMagicByteArray = new byte[specificMagicStringArray.length][4]; // each magic is always 4 byte for (int i = 0; i < specificMagicStringArray.length; i++) { byte[] currentMagicNo = BitcoinUtil.convertHexStringToByteArray(specificMagicStringArray[i]); if (currentMagicNo.length != 4) throw new HadoopCryptoLedgerConfigurationException( "Error: Configuration. Magic number has not a length of 4 bytes. 
Index: " + i); specificMagicByteArray[i] = currentMagicNo; } } this.useDirectBuffer = conf.getBoolean(this.CONF_USEDIRECTBUFFER, this.DEFAULT_USEDIRECTBUFFER); // Initialize start and end of split start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = new CompressionCodecFactory(job).getCodec(file); final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); // open stream if (isCompressedInput()) { // decompress decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer); filePosition = fileIn; } } else { fileIn.seek(start); bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer); filePosition = fileIn; } // initialize reader this.pos = start; // seek to block start (for the case a block overlaps a split) bbr.seekBlockStart(); }
From source file:org.zuinnote.hadoop.office.example.MapReduceExcelInputIntegrationTest.java
License:Apache License
/**
 * Opens a file on the cluster, transparently decompressing it when a codec matches the path.
 * Decompressors taken from the pool are recorded in openDecompressors (to be released on close).
 *
 * @param path file to open
 * @return stream over the (possibly decompressed) file content
 * @throws IOException if the file cannot be opened
 */
private InputStream openFile(Path path) throws IOException {
    CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
    FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
    // check if compressed
    if (codec == null) {
        // uncompressed
        return fileIn;
    } else {
        // compressed
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        this.openDecompressors.add(decompressor); // to be returned later using close
        if (codec instanceof SplittableCompressionCodec) {
            // splittable codecs need the file length to bound the compressed stream
            long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            return cIn;
        } else {
            return codec.createInputStream(fileIn, decompressor);
        }
    }
}
From source file:org.zuinnote.hadoop.office.format.common.HadoopFileReader.java
License:Apache License
public InputStream openFile(Path path) throws IOException { CompressionCodec codec = compressionCodecs.getCodec(path); FSDataInputStream fileIn = fs.open(path); // check if compressed if (codec == null) { // uncompressed LOG.debug("Reading from an uncompressed file \"" + path + "\""); return fileIn; } else { // compressed Decompressor decompressor = CodecPool.getDecompressor(codec); this.openDecompressors.add(decompressor); // to be returned later using close if (codec instanceof SplittableCompressionCodec) { LOG.debug("Reading from a compressed file \"" + path + "\" with splittable compression codec"); long end = fs.getFileStatus(path).getLen(); return ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); } else {/*from w ww . java 2 s . com*/ LOG.debug("Reading from a compressed file \"" + path + "\" with non-splittable compression codec"); return codec.createInputStream(fileIn, decompressor); } } }
From source file:org.zuinnote.hadoop.office.format.mapred.AbstractSpreadSheetDocumentRecordReader.java
License:Apache License
/** * Creates an Abstract Record Reader for tables from various document formats * @param split Split to use (assumed to be a file split) * @param job Configuration:/*from w ww.j av a 2 s .c o m*/ * hadoopoffice.read.mimeType: Mimetype of the document * hadoopoffice.read.locale: Locale of the document (e.g. needed for interpreting spreadsheets) in the BCP47 format (cf. https://tools.ietf.org/html/bcp47). If not specified then default system locale will be used. * hadoopoffice.read.sheets: A ":" separated list of sheets to be read. If not specified then all sheets will be read one after the other * hadoopoffice.read.linkedworkbooks: true if linkedworkbooks should be fetched. They must be in the same folder as the main workbook. Linked Workbooks will be processed together with the main workbook on one node and thus it should be avoided to have a lot of linked workbooks. It does only read the linked workbooks that are directly linked to the main workbook. Default: false * hadoopoffice.read.ignoremissinglinkedworkbooks: true if missing linked workbooks should be ignored. Default: false * hadoopoffice.read.security.crypt.password: if set then hadoopoffice will try to decrypt the file * hadoopoffice.read.security.crypt.linkedworkbooks.*: if set then hadoopoffice will try to decrypt all the linked workbooks where a password has been specified. If no password is specified then it is assumed that the linked workbook is not encrypted. Example: Property key for file "linkedworkbook1.xlsx" is "hadoopoffice.read.security.crypt.linkedworkbooks.linkedworkbook1.xslx". Value is the password. You must not include path or protocol information in the filename * hadoopoffice.read.filter.metadata: filters documents according to metadata. For example, hadoopoffice.read.filter.metadata.author will filter by author and the filter defined as value. Filtering is done by the parser and it is recommended that it supports regular expression for filtering, but this is up to the parser! 
 * @param reporter Reporter
 *
 * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
 * @throws org.zuinnote.hadoop.office.format.common.parser.FormatNotUnderstoodException in case the document has an invalid format
 *
 */
public AbstractSpreadSheetDocumentRecordReader(FileSplit split, JobConf job, Reporter reporter)
        throws IOException, FormatNotUnderstoodException, GeneralSecurityException {
    // parse configuration
    this.conf = job;
    this.reporter = reporter;
    this.reporter.setStatus("Initialize Configuration");
    this.hocr = new HadoopOfficeReadConfiguration(this.conf);
    // Initialize start and end of split
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.hocr.setFileName(file.getName());
    this.readKeyStore(job);
    this.readTrustStore(job);
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    FSDataInputStream fileIn = file.getFileSystem(job).open(file); // open stream
    if (isCompressedInput()) {
        // decompress using a pooled decompressor
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec");
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            officeReader = new OfficeReader(cIn, this.hocr);
            // the codec may move the effective split boundaries
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec");
            officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr);
            filePosition = fileIn;
        }
    } else {
        LOG.debug("Reading from an uncompressed file \"" + file + "\"");
        fileIn.seek(start);
        officeReader = new OfficeReader(fileIn, this.hocr);
        filePosition = fileIn;
    }
    this.reporter.setStatus("Parsing document");
    // initialize reader
    this.officeReader.parse();
    // read linked workbooks
    this.reporter.setStatus("Reading linked documents");
    if (this.hocr.getReadLinkedWorkbooks()) {
        // get current path
        Path currentPath = split.getPath();
        Path parentPath = currentPath.getParent();
        if (!"".equals(this.hocr.getLinkedWorkbookLocation())) {
            // use a custom location for linked workbooks
            parentPath = new Path(this.hocr.getLinkedWorkbookLocation());
        }
        // read linked workbook filenames
        List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks();
        this.currentHFR = new HadoopFileReader(job);
        for (String listItem : linkedWorkbookList) {
            LOG.info("Adding linked workbook \"" + listItem + "\"");
            // linked workbooks are expected in parentPath; strip any path from the reference
            String sanitizedListItem = new Path(listItem).getName();
            // read file from hadoop file
            Path currentFile = new Path(parentPath, sanitizedListItem);
            InputStream currentIn = this.currentHFR.openFile(currentFile);
            this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn,
                    this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem));
        }
    }
}
From source file:org.zuinnote.hadoop.office.format.mapreduce.AbstractSpreadSheetDocumentRecordReader.java
License:Apache License
/** * Initializes reader//w w w . j a v a 2s. c o m * @param split Split to use (assumed to be a file split) * @param context context of the job * * * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop * @throws java.lang.InterruptedException in case of thread interruption * */ @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { try { FileSplit fSplit = (FileSplit) split; // Initialize start and end of split start = fSplit.getStart(); end = start + fSplit.getLength(); final Path file = fSplit.getPath(); codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file); this.hocr.setFileName(file.getName()); this.readKeyStore(context.getConfiguration()); this.readTrustStore(context.getConfiguration()); FSDataInputStream fileIn = file.getFileSystem(conf).open(file); // open stream if (isCompressedInput()) { // decompress decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec"); final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); officeReader = new OfficeReader(cIn, this.hocr); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec"); officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr); filePosition = fileIn; } } else { LOG.debug("Reading from an uncompressed file \"" + file + "\""); fileIn.seek(start); officeReader = new OfficeReader(fileIn, this.hocr); filePosition = fileIn; } // initialize reader this.officeReader.parse(); // read linked workbooks if 
(this.hocr.getReadLinkedWorkbooks()) { // get current path Path currentPath = fSplit.getPath(); Path parentPath = currentPath.getParent(); if (!"".equals(this.hocr.getLinkedWorkbookLocation())) { // use a custom location for linked workbooks parentPath = new Path(this.hocr.getLinkedWorkbookLocation()); } // read linked workbook filenames List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks(); LOG.debug(linkedWorkbookList.size()); this.currentHFR = new HadoopFileReader(context.getConfiguration()); for (String listItem : linkedWorkbookList) { LOG.info("Adding linked workbook \"" + listItem + "\""); String sanitizedListItem = new Path(listItem).getName(); // read file from hadoop file Path currentFile = new Path(parentPath, sanitizedListItem); InputStream currentIn = this.currentHFR.openFile(currentFile); this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn, this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem)); } } } catch (FormatNotUnderstoodException fnue) { LOG.error(fnue); this.close(); throw new InterruptedException(); } }
From source file:skewtune.mapreduce.lib.input.MapOutputInputStream.java
License:Apache License
MapOutputInputStream(Configuration conf, TaskID reduceId, Counter inputCounter, SecretKey jobTokenSecret, List<MapOutputSplit> splits) throws IOException { if (conf.getBoolean(JobContext.MAP_OUTPUT_COMPRESS, false)) { Class<? extends CompressionCodec> codecClass = getMapOutputCompressorClass(conf, DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, conf); decompressor = CodecPool.getDecompressor(codec); } else {//from w w w. j a v a2 s .co m codec = null; decompressor = null; } this.inputCounter = inputCounter; this.jobTokenSecret = jobTokenSecret; this.reduceTaskId = reduceId; int totalBufSz = conf.getInt("skewtune.map.io.inputbuf", 4 * 1024 * 1024); // 4 MB PACKET_SIZE = conf.getInt("skewtune.map.io.packetsize", 128 * 1024); // 128KB final int numBuf = totalBufSz / PACKET_SIZE; buffers = new ByteBuffer[numBuf]; for (int i = 0; i < numBuf; ++i) { buffers[i] = ByteBuffer.allocate(PACKET_SIZE); } this.splits = splits; this.q = new ArrayBlockingQueue<ByteBuffer>(numBuf - 2); // producer and consumer may keep one buffer at their hands this.fetcher = new Fetcher(conf, reduceId); this.fetcher.start(); progress = new Progress(); progress.addPhases(splits.size()); }