Example usage for org.apache.hadoop.util LineReader readLine

Introduction

This page collects example usages of org.apache.hadoop.util.LineReader#readLine from open-source projects.

Prototype

public int readLine(Text str, int maxLineLength) throws IOException 

Document

Read one line from the InputStream into the given Text, storing at most maxLineLength bytes of it; returns the number of bytes consumed, including the line terminator, or 0 at end of stream.
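
A minimal, self-contained sketch of the call itself may be useful before the project-specific snippets; the file path and the one-megabyte line cap here are illustrative assumptions, not taken from any of the examples below:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class LineReaderExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // hypothetical input path
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            LineReader reader = new LineReader(in, conf);
            Text line = new Text();
            // readLine returns the number of bytes consumed, including the
            // line terminator, and 0 at end of stream; the second argument
            // caps how many bytes are stored into `line`.
            while (reader.readLine(line, 1024 * 1024) > 0) {
                System.out.println(line);
            }
        }
    }
}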

Usage

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // would be (start > 0) if we could assume files begin with valid data
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // line starts with @: read two more lines and verify that the second starts with a +
                //
                // If this isn't the start of a record, we want to backtrack to its end
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

private void setKeySeq(FileSystem fs, Configuration job) { //Set currentKey

    if (Constant.SPLIT2_DEBUG_MODE)
        currentKey = new Text(file.getName() + "." + start);
    else {
        try {
            LineReader reader = new LineReader(fs.open(file), job, recordDelimiterBytes);
            currentKey = new Text();
            reader.readLine(currentKey, maxLineLength);
            reader.close();
            currentKey.set(currentKey.toString().replaceAll(">", ""));
        } catch (Exception e) {
            LOG.error(e.getMessage());
            currentKey = new Text(file.getName());
        }
    }

}
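
Here a single readLine call pulls the first line of the FASTA file (its ">" header) into currentKey, capped at maxLineLength, and the ">" marker is then stripped so the key holds the bare sequence name; if anything fails, the file name is used as the key instead.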

From source file:org.apache.carbondata.processing.csvreaderstep.UnivocityCsvParser.java

License:Apache License

/**
 * Below method will be used to initialize the reader
 *
 * @throws IOException
 */
private void initializeReader() throws IOException {
    // if an input stream is already open, we need to close it first and
    // then open the new stream
    close();
    // get the block offset
    long startOffset = this.csvParserVo.getBlockDetailsList().get(blockCounter).getBlockOffset();
    FileType fileType = FileFactory
            .getFileType(this.csvParserVo.getBlockDetailsList().get(blockCounter).getFilePath());
    // calculate the end offset of the block
    long endOffset = this.csvParserVo.getBlockDetailsList().get(blockCounter).getBlockLength() + startOffset;

    // create an input stream for the block
    DataInputStream dataInputStream = FileFactory.getDataInputStream(
            this.csvParserVo.getBlockDetailsList().get(blockCounter).getFilePath(), fileType, bufferSize,
            startOffset);
    // if the start offset is not 0, read and discard the partial first line, since it belongs to the previous block
    if (startOffset != 0) {
        LineReader lineReader = new LineReader(dataInputStream, 1);
        startOffset += lineReader.readLine(new Text(), 0);
    }
    inputStreamReader = new BufferedReader(
            new InputStreamReader(new CustomDataStream(dataInputStream, endOffset - startOffset)));
}
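
The startOffset != 0 branch above is the standard Hadoop idiom for aligning a split to a record boundary. A minimal sketch of just that idiom, assuming in is an input stream already positioned at an arbitrary mid-file offset startOffset:

if (startOffset != 0) {
    // a 1-byte buffer keeps the temporary reader from consuming bytes
    // beyond the skipped line out of the shared stream
    LineReader lineReader = new LineReader(in, 1);
    // maxLineLength = 0 stores nothing into the Text, but readLine still
    // consumes input through the next newline and returns the byte count,
    // which is exactly how far to advance past the partial first line
    startOffset += lineReader.readLine(new Text(), 0);
}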

From source file:org.apache.sqoop.connector.hdfs.HdfsExtractor.java

License:Apache License

/**
 * Extracts a text file.
 *
 * @param file the file to extract
 * @param start the byte offset at which to start reading
 * @param length the number of bytes to read
 * @throws IOException
 */
@SuppressWarnings("resource")
private void extractTextFile(Path file, long start, long length) throws IOException {
    LOG.info("Extracting text file");
    long end = start + length;
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream filestream = fs.open(file);
    CompressionCodec codec = (new CompressionCodecFactory(conf)).getCodec(file);
    LineReader filereader;
    Seekable fileseeker = filestream;

    // Hadoop 1.0 does not have support for custom record delimiters and
    // thus we are supporting only the default one.
    // We might add another "else if" case for SplittableCompressionCodec
    // once we drop support for Hadoop 1.0.
    if (codec == null) {
        filestream.seek(start);
        filereader = new LineReader(filestream);
    } else {
        filereader = new LineReader(codec.createInputStream(filestream, codec.createDecompressor()), conf);
        fileseeker = filestream;
    }
    if (start != 0) {
        // always throw away first record because
        // one extra line is read in previous split
        start += filereader.readLine(new Text(), 0);
    }
    int size;
    LOG.info("Start position: " + String.valueOf(start));
    long next = start;
    while (next <= end) {
        Text line = new Text();
        size = filereader.readLine(line, Integer.MAX_VALUE);
        if (size == 0) {
            break;
        }
        if (codec == null) {
            next += size;
        } else {
            next = fileseeker.getPos();
        }
        rowRead++;
        dataWriter.writeStringRecord(line.toString());
    }
    LOG.info("Extracting ended on position: " + fileseeker.getPos());
    filestream.close();
}
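
Note the loop condition next <= end rather than next < end: each reader deliberately continues through the record that straddles its split's end, while the reader of the following split discards its own partial first line (the start != 0 branch above). Together the two conventions ensure every line is extracted exactly once.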

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec)
        throws IOException {
    Text buffer = new Text();
    long originalStart = start;

    LineReader reader;
    if (codec == null) {
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        reader = new LineReader(stream);
    } else {
        // Unlike the codec == null case, we don't seek before creating the
        // reader; SplittableCompressionCodec.createInputStream places the
        // stream at the start of the first compression block after our
        // split start.
        //
        // as noted above, we need to be at pos 0 in the stream before
        // calling this
        reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start, end,
                SplittableCompressionCodec.READ_MODE.BYBLOCK));
    }

    int bytesRead = 0;
    do {
        bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
        int bufferLength = buffer.getLength();
        if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
            start += bytesRead;
        } else {

            // line starts with @.  Read two more and verify that it starts
            // with a +:
            //
            // @<readname>
            // <sequence>
            // +[readname]
            //
            // if the second line we read starts with a @, we know that
            // we've read:
            //
            // <qualities> <-- @ is a valid ASCII phred encoding
            // @<readname>
            //
            // and thus, the second read is the delimiter and we can break
            long trackForwardPosition = start + bytesRead;

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') {
                start = trackForwardPosition;
                break;
            } else {
                trackForwardPosition += bytesRead;
            }

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            trackForwardPosition += bytesRead;
            if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                break; // all good!
            } else {
                start = trackForwardPosition;
            }
        }
    } while (bytesRead > 0);

    pos = start;
    start = originalStart;
    stream.seek(start);
    return (int) (pos - originalStart);
}
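
Unlike the brush variant earlier on this page, this version restores both the stream and the start field to their original values and instead returns how far the first record lies past the split start, leaving the caller to apply the offset. It also handles splittable compression codecs, where positioning is delegated to SplittableCompressionCodec.createInputStream rather than done with a raw seek.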