List of usage examples for org.apache.hadoop.io.compress CodecPool getDecompressor
public static Decompressor getDecompressor(CompressionCodec codec)
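Before the per-project examples, here is a minimal sketch of the acquire/use/return pattern that the examples below all follow: borrow a Decompressor from the pool, use it to wrap the raw input stream, and hand it back when done. The class name CodecPoolExample and the readCompressed helper are illustrative placeholders, not part of any source file listed below.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolExample {
    public static void readCompressed(Configuration conf, Path path) throws IOException {
        // Resolve the codec from the file extension (e.g., .gz, .bz2)
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            throw new IOException("No compression codec found for " + path);
        }
        FileSystem fs = path.getFileSystem(conf);
        // Borrow a pooled decompressor instead of allocating a new one
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(path), decompressor)) {
            // ... consume the decompressed stream here ...
        } finally {
            // Always return the decompressor so the pool can reuse it
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

CodecPool caches (possibly native) decompressor instances for reuse, so returning each borrowed instance in a finally block is what keeps the pool effective. In the examples below, the standalone methods return the decompressor in a finally block, while the record readers hold it for the lifetime of the reader.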
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads a list of file paths, one path per line, from the given input split.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fis, decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip the BOM (Byte Order Mark). Text only supports UTF-8, so we
            // only need to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the
            // start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found the UTF-8 BOM, strip it
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }
        paths.add(nextLine.toString());
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }
    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speed up processing if it exists.
 * This function opens the specified file and reads the first eight bytes,
 * which include the R-tree signature. If the signature matches the
 * R-tree signature, true is returned. Otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file in
 * that directory is tested.
 * @param fs
 * @param path
 * @return
 * @throws IOException
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;
    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., the first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);

    // Check if the file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        synchronized (compressionCodecs) { // CodecPool is not thread-safe
            decompressor = CodecPool.getDecompressor(codec);
        }
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    // Read the signature (a single read() is assumed to fill the buffer)
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature);
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java
License:Open Source License
/**
 * Initialize from a path and file range
 * @param job
 * @param s
 * @param l
 * @param p
 * @throws IOException
 */
public SpatialRecordReader(Configuration job, long s, long l, Path p) throws IOException {
    this.start = s;
    this.end = s + l;
    this.path = p;
    LOG.info("Open a SpatialRecordReader to file: " + p + "[" + s + "," + (s + l) + ")");
    this.fs = this.path.getFileSystem(job);
    this.directIn = fs.open(this.path);
    this.blockSize = fs.getFileStatus(this.path).getBlockSize();
    this.cellMbr = new Rectangle();

    codec = new CompressionCodecFactory(job).getCodec(this.path);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = codec.createInputStream(directIn, decompressor);
            filePosition = directIn;
        }
    } else {
        directIn.seek(start);
        in = directIn;
        filePosition = directIn;
    }
    this.pos = start;
    this.maxShapesInOneRead = job.getInt(SpatialSite.MaxShapesInOneRead, 1000000);
    this.maxBytesInOneRead = job.getInt(SpatialSite.MaxBytesInOneRead, 32 * 1024 * 1024);

    initializeReader();
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    LOG.info("Open a SpatialRecordReader to split: " + split);
    FileSplit fsplit = (FileSplit) split;
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = new CompressionCodecFactory(conf).getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input position
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new DataInputStream(cIn);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take pos from the compressed stream as we adjusted both start
            // and end to match the compressed file
            filePosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = new DataInputStream(cIn);
            filePosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        filePosition = directIn;
    }
    byte[] signature = new byte[8];
    in.readFully(signature);
    if (!Arrays.equals(signature, SpatialSite.RTreeFileMarkerB)) {
        throw new RuntimeException("Incorrect signature for RTree");
    }

    this.stockShape = (V) OperationsParams.getShape(conf, "shape");

    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply to all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) split;
    if (compressionCodecFactory == null)
        compressionCodecFactory = new CompressionCodecFactory(conf);
    LOG.info("Open a SpatialRecordReader to split: " + split);
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = compressionCodecFactory.getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input position
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take pos from the compressed stream as we adjusted both start
            // and end to match the compressed file
            progressPosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = cIn;
            progressPosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        progressPosition = directIn;
    }
    this.stockShape = (V) OperationsParams.getShape(conf, "shape");
    this.tempLine = new Text();
    this.lineReader = new LineReader(in);
    bytesRead = 0;

    if (this.start != 0) {
        // Skip until the first end-of-line is reached
        bytesRead += lineReader.readLine(tempLine);
    }
    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply to all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.filename = path.getName();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }

    this.value = new ShapeIterator<V>();
    value.setShape(stockShape);
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Sample a specific number of lines from a given file
 * @param file
 * @param conf
 * @param count
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());

        // Open the file and read the sample
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        int sampledLines = 0;
        if (codec != null) {
            // Special handling for compressed files as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec, can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and the end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
            } else {
                // Non-splittable input, need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
            }
        } else {
            long pos = 0; // Current position in file
            // Generate random offsets and keep them sorted for IO efficiency
            Random rand = new Random(seed);
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
            Arrays.sort(sampleOffsets);
            // Sample the generated numbers
            Text line = new Text2();
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until end of line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
        return sampledLines;
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Sample text lines from the given split with the given sampling ratio
 * @param file
 * @param conf
 * @param ratio
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    int sampledLines;
    Text line = new Text2();
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        if (codec != null) {
            // Special handling for compressed files as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec, can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and the end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                // Skip the first line if needed
                if (file.getStart() > 0)
                    start += readUntilEOL(cIn, line);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            } else {
                // Non-splittable input, need to start from the beginning.
                // No need to skip the first line because we actually read the
                // file from the beginning.
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            }
        } else {
            // Not a compressed file. Apply a more efficient, though
            // approximate, solution.
            // Open the file and read the sample
            long pos = 0; // Current position in file
            if (file.getStart() > 0) {
                pos += in.skip(file.getStart());
                pos += readUntilEOL(in, line);
            }
            // Initialize the random variable which is used for sampling
            Random rand = new Random(seed);
            sampledLines = 0;

            // Read the first 10 lines to estimate the average record size
            long end = file.getStart() + file.getLength();
            for (int i = 0; i < 10 && pos < end; i++) {
                line.clear();
                pos += readUntilEOL(in, line);
                if (rand.nextFloat() < ratio) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }

            int averageLineSize = (int) ((pos - file.getStart()) / 10);
            int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
            Arrays.sort(sampleOffsets);
            // Sample the generated numbers
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until end of line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
    return sampledLines;
}
From source file:example.TestLineRecordReader.java
License:Apache License
@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // read all the data through the reader, then close it twice
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    //noinspection StatementWithEmptyBody
    while (reader.nextKeyValue())
        ;
    reader.close();
    reader.close();

    // If the double close had returned the reader's decompressor to the pool
    // twice, the pool could hand out the same instance more than once and the
    // set would hold fewer than 10 distinct decompressors.
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
        decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
}
From source file:format.OverlapLengthRecordReader.java
License:Apache License
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;

    long partialRecordLength = start % recordLength;
    long numBytesToSkip = 0;
    /* This check is not necessary here since we read one entire split */
    /*
    if (partialRecordLength != 0) {
        numBytesToSkip = recordLength - partialRecordLength;
    }
    */

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        numRecordsRemainingInSplit = Long.MAX_VALUE;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        long splitSize = end - start - numBytesToSkip;
        /* This remains to be observed, since we are assuming recordLength = splitSize */
        // numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
        numRecordsRemainingInSplit = 1;
        if (numRecordsRemainingInSplit < 0) {
            numRecordsRemainingInSplit = 0;
        }
        LOG.info("Expecting " + numRecordsRemainingInSplit + " records each with a length of " + recordLength
                + " bytes in the split with an effective size of " + splitSize + " bytes");
    }
    if (numBytesToSkip != 0) {
        start += inputStream.skip(numBytesToSkip);
    }
    this.pos = start;
}
From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitFastqLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}