Example usage for org.apache.hadoop.io.compress SplitCompressionInputStream getAdjustedStart

List of usage examples for org.apache.hadoop.io.compress SplitCompressionInputStream getAdjustedStart

Introduction

On this page you can find example usage of org.apache.hadoop.io.compress SplitCompressionInputStream getAdjustedStart.

Prototype

public long getAdjustedStart() 

Document

After calling createInputStream, the values of start or end might change.
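
The "adjusted" values exist because a splittable codec can only start decompressing at a compression-block boundary, so createInputStream may move the requested start and end to the nearest such boundary. A minimal sketch of that round trip is shown below; it is not taken from any of the projects on this page, it assumes a bzip2-compressed file (BZip2Codec implements SplittableCompressionCodec), and the path and offsets are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;

public class AdjustedStartSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);        // e.g. a .bz2 file (illustrative)
        long start = Long.parseLong(args[1]); // requested split start
        long end = Long.parseLong(args[2]);   // requested split end

        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (!(codec instanceof SplittableCompressionCodec)) {
            throw new IllegalArgumentException("codec is not splittable: " + codec);
        }
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        FSDataInputStream fileIn = fs.open(path);
        SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
        // createInputStream may have moved the boundaries, so read the effective range back.
        System.out.println("adjusted start = " + cIn.getAdjustedStart());
        System.out.println("adjusted end   = " + cIn.getAdjustedEnd());
        cIn.close();
        CodecPool.returnDecompressor(decompressor);
    }
}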

Usage

From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        //Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}

From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java

License:Apache License

private static List<Long> getPageBytes(FileSplit split, FileSystem fs,
        CompressionCodecFactory compressionCodecs, Reporter reporter) throws IOException {
    SeekableInputStream in = null;
    try {
        in = SeekableInputStream.getInstance(split, fs, compressionCodecs);
        long start = split.getStart();
        long end = start + split.getLength();
        SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
        if (cin != null) {
            start = cin.getAdjustedStart();
            end = cin.getAdjustedEnd() + 1;
        }
        ByteMatcher matcher = new ByteMatcher(in, in);
        List<Long> ret = new ArrayList<Long>();
        while (true) {
            if (matcher.getPos() >= end || !matcher.readUntilMatch(pageBeginPattern, null, end)) {
                break;
            }
            ret.add(matcher.getReadBytes() - pageBeginPattern.getBytes("UTF-8").length);
            if (matcher.getPos() >= end || !matcher.readUntilMatch(pageEndPattern, null, end)) {
                System.err.println("could not find " + pageEndPattern + ", page over a split?  pos="
                        + matcher.getPos() + " bytes=" + matcher.getReadBytes());
                //ret.add(end);
                break;
            }
            ret.add(matcher.getReadBytes() - pageEndPattern.getBytes("UTF-8").length);
            String report = String.format(
                    "StreamWikiDumpInputFormat: find page %6d start=%d pos=%d end=%d bytes=%d", ret.size(),
                    start, matcher.getPos(), end, matcher.getReadBytes());
            reporter.setStatus(report);
            reporter.incrCounter(WikiDumpCounters.FOUND_PAGES, 1);
            LOG.info(report);
        }
        if (ret.size() % 2 == 0) {
            ret.add(matcher.getReadBytes());
        }
        //System.err.println("getPageBytes " + ret);//!
        return ret;
    } finally {
        if (in != null) {
            in.close();
        }
    }
}

From source file:org.zuinnote.hadoop.bitcoin.format.AbstractBitcoinRecordReader.java

License:Apache License

/**
* Creates an Abstract Record Reader for Bitcoin blocks
* @param split Split to use (assumed to be a file split)
* @param job Configuration:
* io.file.buffer.size: Size of the in-memory buffer specified in the given Configuration. If io.file.buffer.size is not specified, the default buffer size (the maximum size of a Bitcoin block) will be used.
* hadoopcryptoledger.bitcoinblockinputformat.filter.magic: Magic identifier of the block, given as a comma-separated list of hex values (e.g. F9BEB4D9,FABFB5DA,0B110907). The default magic is always F9BEB4D9. At least one magic must be specified, otherwise it will be difficult to find blocks in splits.
* hadoopcryptoledger.bitcoinblockinputformat.maxblocksize: Maximum size a Bitcoin block may have. By default it is 1M.
* hadoopcryptoledeger.bitcoinblockinputformat.usedirectbuffer: Set to true to experiment with DirectByteBuffer instead of HeapByteBuffer for performance (default: false). Note that it might have unwanted consequences such as circumventing Yarn memory management. This option is experimental and might be removed in future versions.
* @param reporter Reporter
*
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws org.zuinnote.hadoop.bitcoin.format.exception.HadoopCryptoLedgerConfigurationException in case of an invalid HadoopCryptoLedger-specific configuration of the inputformat
* @throws org.zuinnote.hadoop.bitcoin.format.exception.BitcoinBlockReadException in case the Bitcoin data contains invalid blocks (e.g. magic might be different)
*
*/
public AbstractBitcoinRecordReader(FileSplit split, JobConf job, Reporter reporter)
        throws IOException, HadoopCryptoLedgerConfigurationException, BitcoinBlockReadException {
    // parse configuration
    this.conf = job;
    this.maxSizeBitcoinBlock = conf.getInt(this.CONF_MAXBLOCKSIZE, this.DEFAULT_MAXSIZE_BITCOINBLOCK);
    this.bufferSize = conf.getInt(this.CONF_BUFFERSIZE, this.DEFAULT_BUFFERSIZE);
    this.specificMagic = conf.get(this.CONF_FILTERMAGIC);
    // we need to provide at least one magic
    if ((this.specificMagic == null) || (this.specificMagic.length() == 0))
        this.specificMagic = this.DEFAULT_MAGIC;
    if ((this.specificMagic != null) && (this.specificMagic.length() > 0)) {
        this.specificMagicStringArray = specificMagic.split(",");
        specificMagicByteArray = new byte[specificMagicStringArray.length][4]; // each magic is always 4 byte
        for (int i = 0; i < specificMagicStringArray.length; i++) {
            byte[] currentMagicNo = BitcoinUtil.convertHexStringToByteArray(specificMagicStringArray[i]);
            if (currentMagicNo.length != 4)
                throw new HadoopCryptoLedgerConfigurationException(
                        "Error: Configuration. Magic number has not a length of 4 bytes. Index: " + i);
            specificMagicByteArray[i] = currentMagicNo;
        }
    }
    this.useDirectBuffer = conf.getBoolean(this.CONF_USEDIRECTBUFFER, this.DEFAULT_USEDIRECTBUFFER);
    // Initialize start and end of split
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = new CompressionCodecFactory(job).getCodec(file);
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    // open stream
    if (isCompressedInput()) { // decompress
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {

            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize,
                    this.specificMagicByteArray, this.useDirectBuffer);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor),
                    this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray,
                    this.useDirectBuffer);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize,
                this.specificMagicByteArray, this.useDirectBuffer);
        filePosition = fileIn;
    }
    // initialize reader
    this.pos = start;
    // seek to block start (for the case a block overlaps a split)
    bbr.seekBlockStart();
}
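
The configuration keys described in the javadoc above are ordinary JobConf entries set before the record reader is constructed. A hedged sketch follows; the key strings come from the javadoc, while the class name and the chosen values are purely illustrative.

import org.apache.hadoop.mapred.JobConf;

public class BitcoinInputConfSketch {
    public static JobConf configure() {
        JobConf job = new JobConf();
        // Magic identifiers to accept, as comma-separated hex values.
        job.set("hadoopcryptoledger.bitcoinblockinputformat.filter.magic", "F9BEB4D9,0B110907");
        // Maximum size (in bytes) a Bitcoin block may have; here roughly 2M instead of the 1M default.
        job.setInt("hadoopcryptoledger.bitcoinblockinputformat.maxblocksize", 2 * 1024 * 1024);
        // Keep heap buffers; the direct-buffer option is experimental (key spelling as in the javadoc).
        job.setBoolean("hadoopcryptoledeger.bitcoinblockinputformat.usedirectbuffer", false);
        return job;
    }
}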

From source file:org.zuinnote.hadoop.office.format.mapred.AbstractSpreadSheetDocumentRecordReader.java

License:Apache License

/**
* Creates an Abstract Record Reader for tables from various document formats
* @param split Split to use (assumed to be a file split)
* @param job Configuration:
* hadoopoffice.read.mimeType: Mimetype of the document
* hadoopoffice.read.locale: Locale of the document (e.g. needed for interpreting spreadsheets) in the BCP47 format (cf. https://tools.ietf.org/html/bcp47). If not specified then default system locale will be used.
* hadoopoffice.read.sheets: A ":" separated list of sheets to be read. If not specified then all sheets will be read one after the other
* hadoopoffice.read.linkedworkbooks: true if linked workbooks should be fetched. They must be in the same folder as the main workbook. Linked workbooks are processed together with the main workbook on one node, so having many linked workbooks should be avoided. Only workbooks that are directly linked to the main workbook are read. Default: false
* hadoopoffice.read.ignoremissinglinkedworkbooks: true if missing linked workbooks should be ignored. Default: false
* hadoopoffice.read.security.crypt.password: if set then hadoopoffice will try to decrypt the file
* hadoopoffice.read.security.crypt.linkedworkbooks.*: if set then hadoopoffice will try to decrypt all the linked workbooks for which a password has been specified. If no password is specified then it is assumed that the linked workbook is not encrypted. Example: the property key for the file "linkedworkbook1.xlsx" is "hadoopoffice.read.security.crypt.linkedworkbooks.linkedworkbook1.xlsx". The value is the password. You must not include path or protocol information in the filename
* hadoopoffice.read.filter.metadata: filters documents according to metadata. For example, hadoopoffice.read.filter.metadata.author will filter by author and the filter defined as value. Filtering is done by the parser and it is recommended that it supports regular expression for filtering, but this is up to the parser!
* @param reporter Reporter
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws org.zuinnote.hadoop.office.format.common.parser.FormatNotUnderstoodException in case the document has an invalid format
*
*/
public AbstractSpreadSheetDocumentRecordReader(FileSplit split, JobConf job, Reporter reporter)
        throws IOException, FormatNotUnderstoodException, GeneralSecurityException {
    // parse configuration
    this.conf = job;
    this.reporter = reporter;
    this.reporter.setStatus("Initialize Configuration");
    this.hocr = new HadoopOfficeReadConfiguration(this.conf);
    // Initialize start and end of split
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.hocr.setFileName(file.getName());
    this.readKeyStore(job);
    this.readTrustStore(job);
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    FSDataInputStream fileIn = file.getFileSystem(job).open(file);
    // open stream
    if (isCompressedInput()) { // decompress
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec");
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            officeReader = new OfficeReader(cIn, this.hocr);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec");
            officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr);
            filePosition = fileIn;
        }
    } else {
        LOG.debug("Reading from an uncompressed file \"" + file + "\"");
        fileIn.seek(start);
        officeReader = new OfficeReader(fileIn, this.hocr);
        filePosition = fileIn;
    }
    this.reporter.setStatus("Parsing document");
    // initialize reader
    this.officeReader.parse();
    // read linked workbooks
    this.reporter.setStatus("Reading linked documents");
    if (this.hocr.getReadLinkedWorkbooks()) {
        // get current path
        Path currentPath = split.getPath();
        Path parentPath = currentPath.getParent();
        if (!"".equals(this.hocr.getLinkedWorkbookLocation())) {
            // use a custom location for linked workbooks
            parentPath = new Path(this.hocr.getLinkedWorkbookLocation());
        }

        // read linked workbook filenames
        List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks();
        this.currentHFR = new HadoopFileReader(job);
        for (String listItem : linkedWorkbookList) {
            LOG.info("Adding linked workbook \"" + listItem + "\"");
            String sanitizedListItem = new Path(listItem).getName();
            // read file from hadoop file
            Path currentFile = new Path(parentPath, sanitizedListItem);
            InputStream currentIn = this.currentHFR.openFile(currentFile);
            this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn,
                    this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem));
        }
    }
}
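
The hadoopoffice.read.* options listed in the javadoc above are likewise plain configuration entries. A hedged sketch of such a setup is given below; the key names come from the javadoc, while the class name and the values are illustrative.

import org.apache.hadoop.mapred.JobConf;

public class OfficeReadConfSketch {
    public static JobConf configure() {
        JobConf job = new JobConf();
        // Mimetype of the documents to read (value shown is the standard xlsx mimetype).
        job.set("hadoopoffice.read.mimeType",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        // Locale used to interpret the spreadsheets, in BCP47 format.
        job.set("hadoopoffice.read.locale", "en-US");
        // Read only these sheets (":"-separated); omit the key to read all sheets.
        job.set("hadoopoffice.read.sheets", "Sheet1:Sheet3");
        // Fetch linked workbooks stored next to the main workbook and tolerate missing ones.
        job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
        job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", true);
        return job;
    }
}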

From source file:org.zuinnote.hadoop.office.format.mapreduce.AbstractSpreadSheetDocumentRecordReader.java

License:Apache License

/**
* Initializes the reader
* @param split Split to use (assumed to be a file split)
* @param context context of the job
*
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws java.lang.InterruptedException in case of thread interruption
*
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        FileSplit fSplit = (FileSplit) split;
        // Initialize start and end of split
        start = fSplit.getStart();
        end = start + fSplit.getLength();
        final Path file = fSplit.getPath();
        codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        this.hocr.setFileName(file.getName());
        this.readKeyStore(context.getConfiguration());
        this.readTrustStore(context.getConfiguration());
        FSDataInputStream fileIn = file.getFileSystem(conf).open(file);
        // open stream
        if (isCompressedInput()) { // decompress
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec");
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
                officeReader = new OfficeReader(cIn, this.hocr);
                start = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn; // take pos from compressed stream
            } else {
                LOG.debug("Reading from a compressed file \"" + file
                        + "\" with non-splittable compression codec");
                officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr);
                filePosition = fileIn;
            }
        } else {
            LOG.debug("Reading from an uncompressed file \"" + file + "\"");
            fileIn.seek(start);
            officeReader = new OfficeReader(fileIn, this.hocr);
            filePosition = fileIn;
        }
        // initialize reader
        this.officeReader.parse();
        // read linked workbooks
        if (this.hocr.getReadLinkedWorkbooks()) {
            // get current path
            Path currentPath = fSplit.getPath();
            Path parentPath = currentPath.getParent();
            if (!"".equals(this.hocr.getLinkedWorkbookLocation())) {
                // use a custom location for linked workbooks
                parentPath = new Path(this.hocr.getLinkedWorkbookLocation());
            }
            // read linked workbook filenames
            List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks();
            LOG.debug(linkedWorkbookList.size());
            this.currentHFR = new HadoopFileReader(context.getConfiguration());
            for (String listItem : linkedWorkbookList) {
                LOG.info("Adding linked workbook \"" + listItem + "\"");
                String sanitizedListItem = new Path(listItem).getName();
                // read file from hadoop file
                Path currentFile = new Path(parentPath, sanitizedListItem);
                InputStream currentIn = this.currentHFR.openFile(currentFile);
                this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn,
                        this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem));
            }
        }
    } catch (FormatNotUnderstoodException fnue) {
        LOG.error(fnue);
        this.close();
        throw new InterruptedException();
    }
}

From source file:wiki.hadoop.mapred.lib.input.StreamWikiDumpInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {

        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in, split.getStart());

            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(
                    path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos()
                            + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes()
                            + " current=" + start + " remaining=" + bytesRemaining + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(
                    pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        //Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
    return splits;
}