Example usage for org.apache.hadoop.util LineReader readLine

List of usage examples for org.apache.hadoop.util LineReader readLine

Introduction

On this page you can find example usages of org.apache.hadoop.util.LineReader.readLine.

Prototype

public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException 

Source Link

Document

Read one line from the InputStream into the given Text.

Usage

From source file:com.knewton.mrtool.io.JsonRecordReader.java

License:Apache License

/**
 * Get the line reader to be used for the file. A <code>LineReader</code> can read a file line
 * by line. This separate method helps with testing too.
 * /*from  w  ww.  j  a  va 2 s. c  o m*/
 * @param fileSplit
 * @param conf
 * @return
 * @throws IOException
 */
protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException {
    final Path file = fileSplit.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(fileSplit.getPath());
    seekableIn = fileIn;
    boolean skipFirstLine = false;
    LineReader lineReader;
    if (codec != null) {
        lineReader = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        // if the start is not the beginning of the file then skip the first line to get the
        // next complete json record. The previous json record will be read by the record reader
        // that got assigned the previous InputSplit.
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new LineReader(fileIn, conf);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    return lineReader;
}

From source file:libra.common.hadoop.io.reader.fasta.FastaKmerReader.java

License:Apache License

/**
 * Prepares this reader for the given split.
 *
 * <p>Reads the k-mer size and maximum line length from the configuration, records the split
 * boundaries, and opens the input through a {@code LineReader}:
 * <ul>
 *   <li>Gzip input: the last four bytes before {@code end} are decoded as a little-endian
 *       integer and stored as the uncompressed size (falling back to {@code end} when the
 *       decoded value is negative); {@code end} is then set to {@code Long.MAX_VALUE}.</li>
 *   <li>Other codecs: both {@code end} and the uncompressed size are set to
 *       {@code Long.MAX_VALUE}.</li>
 *   <li>Uncompressed input whose split does not start at offset 0: the first line is skipped,
 *       and the line immediately preceding the new start is inspected. If that line starts
 *       with {@code READ_DELIMITER} the buffer is cleared; otherwise its last k-1 characters
 *       are left in the buffer.</li>
 * </ul>
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context task context supplying the configuration
 * @throws IOException if the file cannot be opened, positioned, or read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    this.kmersize = FastaKmerInputFormat.getKmerSize(conf);
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;

        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            // seek() positions exactly; skip() is allowed to skip fewer bytes than requested,
            // which would make the trailer be read from the wrong offset.
            fileInCheckSize.seek(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            // Mask every byte: Java bytes are signed, so without "& 0xff" any byte >= 0x80
            // sign-extends and overwrites the higher-order bits of the result.
            this.uncompressedSize = ((len[3] & 0xff) << 24) | ((len[2] & 0xff) << 16)
                    | ((len[1] & 0xff) << 8) | (len[0] & 0xff);
            if (this.uncompressedSize < 0) {
                // The decoded value does not fit a signed 32-bit int; fall back to the
                // compressed size.
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }

        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    boolean inTheMiddle = false;
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        if (this.start != 0) {
            // Step back one byte so a line that begins exactly at the split boundary is kept.
            this.start--;
            fileIn.seek(this.start);

            inTheMiddle = true;
        }
        this.in = new LineReader(fileIn, conf);
    }

    this.buffer = new Text();

    if (inTheMiddle) {
        // find new start line (maxLineLength 0: the skipped bytes are counted, not stored)
        this.start += this.in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start));

        // back off: re-read up to 1000 bytes before the new start to find the preceding line.
        // Clamp at 0 so a split starting less than 1000 bytes into the file does not seek to
        // a negative offset.
        final long lookBehindStart = Math.max(0L, this.start - 1000);
        Text tempLine = new Text();
        FSDataInputStream fileIn2 = fs.open(file);
        LineReader in2 = null;
        try {
            fileIn2.seek(lookBehindStart);
            in2 = new LineReader(fileIn2, conf);
            long curpos = lookBehindStart;
            while (curpos < this.start) {
                // The line content is needed below, so it must actually be stored; a
                // maxLineLength of 0 would leave tempLine empty.
                int consumed = in2.readLine(tempLine, Integer.MAX_VALUE,
                        (int) (this.start - curpos));
                if (consumed <= 0) {
                    // End of stream before reaching start; stop rather than loop forever.
                    break;
                }
                curpos += consumed;
            }
        } finally {
            if (in2 != null) {
                in2.close();
            } else {
                fileIn2.close();
            }
        }

        if (tempLine.getLength() > 0 && tempLine.charAt(0) == READ_DELIMITER) {
            // clean start
            this.buffer.clear();
        } else {
            // leave k-1 seq in the buffer (or the whole line when it is shorter than k-1)
            String seq = tempLine.toString().trim();
            int tailStart = Math.max(0, seq.length() - this.kmersize + 1);
            this.buffer.set(seq.substring(tailStart));
        }
    }

    this.pos = this.start;

    this.key = null;
    this.value = null;
}

From source file:nl.basjes.hadoop.io.compress.TestSplittableCodecSeams.java

License:Apache License

/**
 * Mostly copied from LineRecordReader (MapReduce) to pull an example of
 * actual usage into this test./*  w w  w .j av  a 2  s. c om*/
 */
public boolean nextKeyValue(final SplitCompressionInputStream in, final LineReader lr, final long end,
        Text value) throws IOException {
    final int maxLineLength = Integer.MAX_VALUE;
    if (value == null) {
        value = new Text();
    }
    int newSize = 0;
    // We always read one extra line, which lies outside the upper
    // split limit i.e. (end - 1)
    while (in.getPos() <= end) {
        newSize = lr.readLine(value, maxLineLength, maxLineLength);
        if (newSize == 0) {
            break;
        }
        if (newSize < maxLineLength) {
            break;
        }

        // line too long. try again
        LOG.info("Skipped line of size " + newSize + " at pos " + (in.getPos() - newSize));
    }
    if (newSize == 0) {
        value = null;
        return false;
    } else {
        return true;
    }
}