Example usage for org.apache.hadoop.io Text append

Introduction

In this page you can find the example usage for org.apache.hadoop.io Text append.

Prototype

public void append(byte[] utf8, int start, int len)

Source Link

Document

Append a range of bytes to the end of the given text

Usage

From source file:edu.umn.cs.sthadoop.hdfs.KNNJoin.java

License:Open Source License

public static void serializeText(Text t, Vector<Partition> partitions)
        throws IOException, InterruptedException {
    byte[] bytes = ("&" + partitions.get(0).filename).getBytes();
    t.append(bytes, 0, bytes.length);
    for (int i = 1; i < partitions.size(); i++) {
        bytes = ("#" + partitions.get(i).filename).getBytes();
        t.append(bytes, 0, bytes.length);
    }/*w  w w. ja  v a 2  s .  c  o  m*/
}

From source file:fi.tkk.ics.hadoop.bam.LineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line./*from  ww w  .java 2s .  com*/
 *
 * @param str the object to store the given line (without newline)
 * @param maxLineLength the maximum number of bytes to store into str;
 *  the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *  in this call.  This is only a hint, because if the line cross
 *  this threshold, we allow it to happen.  It can overshoot
 *  potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 * found.
 *
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    boolean prevCharCR = false; //true of prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; //starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            if (prevCharCR)
                ++bytesConsumed; //account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { //CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0)
            --readLength; //CR at the end of the buffer
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}

From source file:generated.scala.io.LineReader.java

License:Apache License

/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *///  w  ww .  j a v a  2s.c  o m
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    boolean prevCharCR = false; //true of prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; //starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            if (prevCharCR) {
                ++bytesConsumed; //account for CR from previous read
            }
            bufferLength = fillBuffer(in, buffer, prevCharCR);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { //CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0) {
            --readLength; //CR at the end of the buffer
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:generated.scala.io.LineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter.
 *///from   ww  w. j a  v  a2 s.  c om
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from inputStream, but the head of the stream may be
     *  already captured in the previous buffer, so we have several cases:
     * 
     * 1. The buffer tail does not contain any character sequence which
     *    matches with the head of delimiter. We count it as a 
     *    ambiguous byte count = 0
     *    
     * 2. The buffer tail contains a X number of characters,
     *    that forms a sequence, which matches with the
     *    head of delimiter. We count ambiguous byte count = X
     *    
     *    // ***  eg: A segment of input file is as follows
     *    
     *    " record 1792: I found this bug very interesting and
     *     I have completely read about it. record 1793: This bug
     *     can be solved easily record 1794: This ." 
     *    
     *    delimiter = "record";
     *        
     *    supposing:- String at the end of buffer =
     *    "I found this bug very interesting and I have completely re"
     *    There for next buffer = "ad about it. record 179       ...."           
     *     
     *     The matching characters in the input
     *     buffer tail and delimiter head = "re" 
     *     Therefore, ambiguous byte count = 2 ****   //
     *     
     *     2.1 If the following bytes are the remaining characters of
     *         the delimiter, then we have to capture only up to the starting 
     *         position of delimiter. That means, we need not include the 
     *         ambiguous characters in str.
     *     
     *     2.2 If the following bytes are not the remaining characters of
     *         the delimiter ( as mentioned in the example ), 
     *         then we have to include the ambiguous characters in str. 
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0;
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
            if (bufferLength <= 0) {
                if (ambiguousByteCount > 0) {
                    str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                    bytesConsumed += ambiguousByteCount;
                }
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                bufferPosn--;
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        bytesConsumed += ambiguousByteCount;
        if (appendLength >= 0 && ambiguousByteCount > 0) {
            //appending the ambiguous characters (refer case 2.2)
            str.append(recordDelimiterBytes, 0, ambiguousByteCount);
            ambiguousByteCount = 0;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; //to be consumed in next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java

License:Open Source License

public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    int numRecordsRead = 0;
    Boolean eof = false;/*from  ww  w. j a v  a  2  s.c o  m*/
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>');

    /*
    if we hit the end of file already, then just return 0 bytes processed
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn should be at the start of a fasta record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

    /*
    find the next record start
     */
    eof = false;
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    Text k = new Text();
    Text s = new Text();
    int i = 1; // skip initial record seperator ">"
    int j = 1;
    do {
        k.clear();
        s.clear();
        /*
        first parse the key
         */
        i = j;
        Boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        k.append(recordBlock.getBytes(), i, j - i - 1);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        //LOG.info ("key = " + k.toString());

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            s.append(recordBlock.getBytes(), i, j - i - 1);
            set.put(k.toString(), s.toString().toLowerCase());

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>');

        numRecordsRead++;

        /*
        now skip characters (newline or carige return most likely) till record start
        */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>') {
            j++;
        }

        j++; // skip the ">"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastaLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line./* w w  w . j a v a2  s . c  o m*/
 *
 * @param str the object to store the given line (without newline)
 * @param maxLineLength the maximum number of bytes to store into str;
 *  the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *  in this call.  This is only a hint, because if the line cross
 *  this threshold, we allow it to happen.  It can overshoot
 *  potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 * found.
 *
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    Boolean eof = false;
    int startPosn;
    StringBuilder recordBlock = new StringBuilder(this.bufferSize);

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>');

    /*
    if we hit the end of file already, then just return 0 bytes processed
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn should be at the start of a fasta record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

    /*
    find the next record start:  first scan to end of the line
     */
    eof = false;
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            for (int copyi = startPosn; copyi < startPosn + appendLength; copyi++) {
                recordBlock.append((char) buffer[copyi]);
            }
            //recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
        bufferPosn++;
    } while (buffer[bufferPosn - 1] != CR && buffer[bufferPosn - 1] != LF);

    /*
    find the next record start:  scan till next ">"
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            for (int copyi = startPosn; copyi < startPosn + appendLength; copyi++) {
                recordBlock.append((char) buffer[copyi]);
            }
            //recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>'); // only read one record at a time

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        for (int copyi = startPosn; copyi < startPosn + appendLength; copyi++) {
            recordBlock.append((char) buffer[copyi]);
        }
        //recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    int i = 1; // skip initial record seperator ">"
    int j = 1;
    do {
        key.clear();
        str.clear();
        /*
        first parse the key
         */
        i = j;
        Boolean junkOnLine = false;
        while (j < recordBlock.length()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } //else if (c == ' ' || c == '\t') {
              //  junkOnLine = true;
              //  break;
              //}
        }
        if (j == i) {
            // then we didn't parse out a proper id
            LOG.error("Unable to parse entry: " + recordBlock);
            str.clear();
            key.clear();
            return totalBytesRead;
        }
        key.set(recordBlock.substring(i, j - 1));

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.length() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        //LOG.info ("key = " + k.toString());

        /*
        now skip the newlines
        */
        while (j < recordBlock.length() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence
        */
        StringBuilder sequenceTmp = new StringBuilder(recordBlock.length());
        do {
            i = j;
            while (j < recordBlock.length()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            //byte[] ba = recordBlock.getBytes();
            //if (ba.length <= i || ba.length <= j - i - 1) {
            //    LOG.fatal("hmm... ba.length = " + ba.length + " i = " + i + " j-i-1 = " + (j-i-1));
            //}

            if (j == i) {
                // then we didn't parse out a proper id
                LOG.error("Unable to parse entry: " + recordBlock);
                str.clear();
                key.clear();
                return totalBytesRead;
            }
            for (int copyi = i; copyi < j - 1; copyi++) {
                sequenceTmp.append((char) recordBlock.charAt(copyi));
            }

            while (j < recordBlock.length() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.length() && recordBlock.charAt(j) != '>');
        str.set(sequenceTmp.toString());

        numRecordsRead++;

        /*
        now skip characters (newline or carige return most likely) till record start
        */
        while (j < recordBlock.length() && recordBlock.charAt(j) != '>') {
            j++;
        }

        j++; // skip the ">"

    } while (j < recordBlock.length());

    //        LOG.info("");
    //        LOG.info("object key = " + key);
    byte[] strpacked = SequenceString.sequenceToByteArray(str.toString().toLowerCase());

    str.clear();
    str.append(strpacked, 0, strpacked.length);

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java

License:Open Source License

public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    int numRecordsRead = 0;
    Boolean eof = false;//from  w ww. ja va 2 s  .  com
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, then just return 0 bytes processed
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn should be at the start of a fasta record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the next record start
     */
    eof = false;
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    Text k = new Text();
    Text s = new Text();
    int i = 1; // skip initial record seperator ">"
    int j = 1;
    do {
        k.clear();
        s.clear();
        /*
        first parse the key
         */
        i = j;
        Boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        k.append(recordBlock.getBytes(), i, j - i - 1);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        //LOG.info ("key = " + k.toString());

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            s.append(recordBlock.getBytes(), i, j - i - 1);
            set.put(k.toString(), s.toString().toLowerCase());

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

        numRecordsRead++;

        /*
        now skip characters (newline or carige return most likely) till record start
        */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '@') {
            j++;
        }

        j++; // skip the "@"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line.//from w  w w  .jav a 2 s. c  o m
 *
 * @param str               the object to store the given line (without newline)
 * @param maxLineLength     the maximum number of bytes to store into str;
 *                          the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *                          in this call.  This is only a hint, because if the line cross
 *                          this threshold, we allow it to happen.  It can overshoot
 *                          potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 *         found.
 * @throws java.io.IOException if the underlying stream throws
 */
public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    Boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, then just return 0 bytes processed
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn should be at the start of a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the next record start
     */
    eof = false;
    int numOfNewlines = 0;//Added by lanhin
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
        //Modefied by lanhin
        if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) {
            numOfNewlines++;
        }
        if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') {
            bufferPosn++;
            break;
        }
        bufferPosn++;
    } while (true);//buffer[bufferPosn++] != '@');  // only read one record at a time
    //Modefied by lanhin end

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    int i = 1; // skip initial record seperator "@"
    int j = 1;
    do {
        key.clear();
        str.clear();
        /*
        first parse the key
         */
        i = j;
        Boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        key.append(recordBlock.getBytes(), i, j - i - 1);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        //LOG.info ("key = " + k.toString());

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            str.append(recordBlock.getBytes(), i, j - i - 1);

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

        numRecordsRead++;

        /*
        now skip characters (newline or carige return most likely) till record start
        */
        while (j < recordBlock.getLength()) {
            // && recordBlock.charAt(j) != '@') {  // Modified by lanhin
            /* Should go straight to the end of recordBlock,
               ignore all the left info.  --lanhin*/

            j++;
        }

        j++; // skip the "@"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:InvertedIndex.NLineReader.java

License:Apache License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line./* w  ww. j av a2s  . c  o m*/
 *
 * @param str               the object to store the given line (without newline)
 * @param maxLineLength     the maximum number of bytes to store into str;
 *                          the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *                          in this call.  This is only a hint, because if the line cross
 *                          this threshold, we allow it to happen.  It can overshoot
 *                          potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 * found.
 * @throws IOException if the underlying stream throws
 */
public int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    //boolean prevCharCR = false; //true of prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; //starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            //if (prevCharCR)
            //  ++bytesConsumed; //account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            //if (prevCharCR) { //CR + notLF, we are at notLF
            //newlineLength = 0;
            //break;
            //}
            //prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        //if (prevCharCR && newlineLength == 0)
        // --readLength; //CR at the end of the buffer
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
    //System.err.println(str);
    //System.err.println(bytesConsumed);
    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}

From source file:mr.MyLineReader.java

License:Apache License

private int myReadLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    System.out.println("Enter myReadLine!!!");
    str.clear();//from ww  w  .  j ava  2s  .c o m
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last  time
        bufferPosn = buffer.length;
        int n = -1;
        int readLength = 0;
        while ((n = in.read(buffer)) > 0) {
            readLength += n;
        }
        bytesConsumed += readLength;
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    return (int) bytesConsumed;
}