Example usage for org.apache.hadoop.io Text getLength

Introduction

This page collects example usages of the getLength() method of org.apache.hadoop.io.Text.

Prototype

@Override
public int getLength()

Source Link

Document

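Returns the number of valid bytes in the backing byte array. The array returned by getBytes() may be longer than this, so only its first getLength() bytes hold data.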

Usage

From source file:eu.larkc.RDFPig.io.NTriplesReader.java

License:Apache License

/**
 * Reads input lines until one parses as an N-Triples statement and returns its
 * subject, predicate and object as a three-field tuple.
 *
 * <p>Lines that are shorter than three bytes, do not end in {@code '.'}, have an
 * unrecognised term in the object position, or do not yield exactly three fields
 * are skipped. Anything following the object term is ignored. A literal is closed
 * by the first {@code '"'} that is not directly preceded by a backslash.
 *
 * @return the next parsed triple, or {@code null} once the underlying reader has
 *         no more records
 * @throws IOException as an {@code ExecException} (error code 6018) when reading
 *         from the underlying record reader fails
 */
@Override
public Tuple getNext() throws IOException {

    while (true) {
        mProtoTuple = null;
        try {
            if (!in.nextKeyValue()) {
                return null;
            }
            Text value = null;
            try {
                value = (Text) in.getCurrentValue();

                // The backing array may be longer than the record; only the
                // first len bytes belong to the current line.
                byte[] buf = value.getBytes();
                int len = value.getLength();

                if (len < 3)
                    continue; // Ignore lines with less than 3 bytes

                // Get rid of any trailing whitespace (bounded so that a blank
                // line cannot walk off the front of the array)
                while (len > 0 && Character.isWhitespace(buf[len - 1]))
                    len--;

                // Every statement must be terminated by '.'
                if (len == 0 || buf[len - 1] != '.')
                    continue;
                len--;

                // Get rid of any whitespace between the object and the '.'
                while (len > 0 && Character.isWhitespace(buf[len - 1]))
                    len--;

                // Skip leading whitespace
                int start = 0;
                while (start < len && Character.isWhitespace(buf[start]))
                    start++;

                // Parse subject: a URI ends at '>', anything else at the first
                // whitespace. The scan starts at the first non-blank byte so
                // that indented lines are parsed correctly.
                boolean isURI = start < len && buf[start] == '<';
                for (int i = start; i < len; i++) {
                    if (isURI && buf[i] == '>') {
                        readField(buf, start, i + 1);
                        start = i + 1;
                        break;
                    } else if (Character.isWhitespace(buf[i])) {
                        readField(buf, start, i);
                        start = i + 1;
                        break;
                    }
                }

                while (start < len && Character.isWhitespace(buf[start]))
                    start++;

                // Parse predicate (always URI)
                for (int i = start; i < len; i++) {
                    if (buf[i] == '>') {
                        readField(buf, start, i + 1);
                        start = i + 1;
                        break;
                    }
                }

                while (start < len && Character.isWhitespace(buf[start]))
                    start++;

                // Nothing left for the object term
                if (start >= len)
                    continue;

                // Parse object
                if (buf[start] == '<') { // URI
                    for (int i = start + 1; i < len; i++) {
                        if (buf[i] == '>') {
                            readField(buf, start, i + 1);
                            break;
                        }
                    }
                } else if (buf[start] == '"') { // Literal
                    for (int i = start + 1; i < len; i++) {
                        if (buf[i] == '"' && buf[i - 1] != '\\') {
                            readField(buf, start, i + 1);
                            break;
                        }
                    }
                } else if (buf[start] == '_') { // BNode
                    // The label runs to the first whitespace or to the end of
                    // the statement; read it exactly once.
                    int end = start + 1;
                    while (end < len && !Character.isWhitespace(buf[end]))
                        end++;
                    readField(buf, start, end);
                } else {
                    continue; // invalid term in object position
                }
                // After the first three terms, the rest are ignored

                // mProtoTuple is still null when no field was read at all
                // (assumes readField allocates it on first use - TODO confirm).
                if (mProtoTuple == null || mProtoTuple.size() != 3)
                    continue;

                Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
                mProtoTuple = null;
                return t;
            } catch (Exception e) {
                // Best effort: report the offending line and move on to the next one.
                e.printStackTrace();
                System.err.println("For line: " + value);
                mProtoTuple = null;
            }
        } catch (Exception e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Exercises LineRecordReader on uncompressed input with custom multi-byte record
 * delimiters, checking the length of every record value (via Text.getLength())
 * and the key reported after every read.
 *
 * Three scenarios are covered: a "++" delimiter with a 15-byte first split, data
 * that contains the first byte of the delimiter ("abcd+efgh++...") with a 5-byte
 * first split, and a "|+|" delimiter read with every buffer size from 1 up to the
 * input length.
 *
 * Note: the key and value objects are fetched once per reader and then re-checked
 * after later nextKeyValue() calls, so the assertions rely on the reader updating
 * those objects in place.
 */
@Test
public void testUncompressedInputCustomDelimiterPosValue() throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 10);
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    String inputData = "abcdefghij++kl++mno";
    Path inputFile = createInputFile(conf, inputData);
    String delimiter = "++";
    byte[] recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    int splitLength = 15;
    FileSplit split = new FileSplit(inputFile, 0, splitLength, (String[]) null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    // Get first record: "abcdefghij"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    LongWritable key = reader.getCurrentKey();
    Text value = reader.getCurrentValue();
    assertEquals("Wrong length for record value", 10, value.getLength());
    assertEquals("Wrong position after record read", 0, key.get());
    // Get second record: "kl"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    assertEquals("Wrong length for record value", 2, value.getLength());
    // Key should be 12 right after "abcdefghij++"
    assertEquals("Wrong position after record read", 12, key.get());
    // Get third record: "mno"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    assertEquals("Wrong length for record value", 3, value.getLength());
    // Key should be 16 right after "abcdefghij++kl++"
    assertEquals("Wrong position after record read", 16, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 19 right after "abcdefghij++kl++mno"
    assertEquals("Wrong position after record read", 19, key.get());
    // once the reader is exhausted, a freshly fetched key is expected to be null
    key = reader.getCurrentKey();
    assertNull("Unexpected key returned", key);
    reader.close();
    // Second split: covers the remainder of the file after the first 15 bytes
    split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    // No record is in the second split because the second split dropped
    // the first record, which was already reported by the first split.
    assertFalse("Unexpected record returned", reader.nextKeyValue());
    key = reader.getCurrentKey();
    assertNull("Unexpected key returned", key);
    reader.close();

    // multi char delimiter with starting part of the delimiter in the data
    inputData = "abcd+efgh++ijk++mno";
    inputFile = createInputFile(conf, inputData);
    splitLength = 5;
    split = new FileSplit(inputFile, 0, splitLength, (String[]) null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    // Get first record: "abcd+efgh"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    assertEquals("Wrong position after record read", 0, key.get());
    assertEquals("Wrong length for record value", 9, value.getLength());
    // should have jumped over the delimiter, no record
    assertFalse(reader.nextKeyValue());
    assertEquals("Wrong position after record read", 11, key.get());
    // once the reader is exhausted, a freshly fetched key is expected to be null
    key = reader.getCurrentKey();
    assertNull("Unexpected key returned", key);
    reader.close();
    // next split: check for duplicate or dropped records
    split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get second record: "ijk" first in this split
    assertEquals("Wrong position after record read", 11, key.get());
    assertEquals("Wrong length for record value", 3, value.getLength());
    // Get third record: "mno" second in this split
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    assertEquals("Wrong position after record read", 16, key.get());
    assertEquals("Wrong length for record value", 3, value.getLength());
    // should be at the end of the input
    assertFalse(reader.nextKeyValue());
    assertEquals("Wrong position after record read", 19, key.get());
    reader.close();

    // Third scenario: the delimiter "|+|" starts and ends with a byte ('|') that
    // also occurs inside the records themselves
    inputData = "abcd|efgh|+|ij|kl|+|mno|pqr";
    inputFile = createInputFile(conf, inputData);
    delimiter = "|+|";
    recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    // walking over the buffer and split sizes checks for proper processing
    // of the ambiguous bytes of the delimiter
    for (int bufferSize = 1; bufferSize <= inputData.length(); bufferSize++) {
        // NOTE(review): splitSize is never read inside this loop; the split
        // created below uses bufferSize as its length. Confirm whether
        // splitSize was meant to be passed to FileSplit instead.
        for (int splitSize = 1; splitSize < inputData.length(); splitSize++) {
            // track where we are in the inputdata
            int keyPosition = 0;
            conf.setInt("io.file.buffer.size", bufferSize);
            split = new FileSplit(inputFile, 0, bufferSize, (String[]) null);
            reader = new LineRecordReader(recordDelimiterBytes);
            reader.initialize(split, context);
            // Get the first record: "abcd|efgh" always possible
            assertTrue("Expected record got nothing", reader.nextKeyValue());
            key = reader.getCurrentKey();
            value = reader.getCurrentValue();
            assertTrue("abcd|efgh".equals(value.toString()));
            // Position should be 0 right at the start
            assertEquals("Wrong position after record read", keyPosition, key.get());
            // Position should be 12 right after the first "|+|"
            keyPosition = 12;
            // get the next record: "ij|kl" if the split/buffer allows it
            if (reader.nextKeyValue()) {
                // check the record info: "ij|kl"
                assertTrue("ij|kl".equals(value.toString()));
                assertEquals("Wrong position after record read", keyPosition, key.get());
                // Position should be 20 after the second "|+|"
                keyPosition = 20;
            }
            // get the third record: "mno|pqr" if the split/buffer allows it
            if (reader.nextKeyValue()) {
                // check the record info: "mno|pqr"
                assertTrue("mno|pqr".equals(value.toString()));
                assertEquals("Wrong position after record read", keyPosition, key.get());
                // Position should be the end of the input
                keyPosition = inputData.length();
            }
            assertFalse("Unexpected record returned", reader.nextKeyValue());
            // no more records can be read we should be at the last position
            assertEquals("Wrong position after record read", keyPosition, key.get());
            // once the reader is exhausted, a freshly fetched key is expected to be null
            key = reader.getCurrentKey();
            assertNull("Unexpected key returned", key);
            reader.close();
        }
    }
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Exercises LineRecordReader on uncompressed input with the default line
 * delimiters (the reader is constructed with a null delimiter), checking the
 * length of every record value (via Text.getLength()) and the key reported
 * after every read.
 *
 * Covers "\r\n"-terminated lines read through two consecutive splits, and an
 * input in which a lone "\r" is directly followed by "\r\n".
 *
 * Note: the key and value objects are fetched once per reader and then re-checked
 * after later nextKeyValue() calls, so the assertions rely on the reader updating
 * those objects in place.
 */
@Test
public void testUncompressedInputDefaultDelimiterPosValue() throws Exception {
    Configuration conf = new Configuration();
    String inputData = "1234567890\r\n12\r\n345";
    Path inputFile = createInputFile(conf, inputData);
    conf.setInt("io.file.buffer.size", 10);
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    // First split: bytes [0, 15)
    FileSplit split = new FileSplit(inputFile, 0, 15, (String[]) null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(null);
    reader.initialize(split, context);
    LongWritable key;
    Text value;
    reader.nextKeyValue();
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get first record:"1234567890"
    assertEquals(10, value.getLength());
    assertEquals(0, key.get());
    reader.nextKeyValue();
    // Get second record:"12"
    assertEquals(2, value.getLength());
    // Key should be 12 right after "1234567890\r\n"
    assertEquals(12, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    assertEquals(16, key.get());

    // Second split: the remaining 4 bytes starting at offset 15
    split = new FileSplit(inputFile, 15, 4, (String[]) null);
    reader = new LineRecordReader(null);
    reader.initialize(split, context);
    // The second split dropped the first record "\n"
    reader.nextKeyValue();
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get third record:"345"
    assertEquals(3, value.getLength());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    assertEquals(16, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 19 right after "1234567890\r\n12\r\n345"
    assertEquals(19, key.get());

    // A lone "\r" directly followed by "\r\n": expected to give "123456789"
    // and then an empty record
    inputData = "123456789\r\r\n";
    inputFile = createInputFile(conf, inputData);
    split = new FileSplit(inputFile, 0, 12, (String[]) null);
    reader = new LineRecordReader(null);
    reader.initialize(split, context);
    reader.nextKeyValue();
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get first record:"123456789"
    assertEquals(9, value.getLength());
    assertEquals(0, key.get());
    reader.nextKeyValue();
    // Get second record:""
    assertEquals(0, value.getLength());
    // Key should be 10 right after "123456789\r"
    assertEquals(10, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 12 right after "123456789\r\r\n"
    assertEquals(12, key.get());
}

From source file:fi.tkk.ics.hadoop.bam.SequencedFragment.java

License:Open Source License

/**
 * Converts the base quality scores stored in {@code quality} from one encoding to
 * the other, in place.
 *
 * <p>Only the first {@code quality.getLength()} bytes of the backing array are
 * read and rewritten. Each byte is range-checked against the {@code current}
 * encoding immediately before it is shifted, so when a {@link FormatException}
 * is thrown the bytes before the offending position have already been converted.
 *
 * @param quality quality string to convert; its backing bytes are modified
 * @param current encoding the scores are currently in
 * @param target  encoding to convert the scores to
 * @throws FormatException if a quality score is out of the range allowed by the
 *         {@code current} encoding
 * @throws IllegalArgumentException if {@code current} and {@code target} are the
 *         same, or if the pair is not Illumina-to-Sanger or Sanger-to-Illumina
 */
public static void convertQuality(Text quality, BaseQualityEncoding current, BaseQualityEncoding target) {
    if (current == target)
        throw new IllegalArgumentException(
                "current and target quality encodings are the same (" + current + ")");

    byte[] bytes = quality.getBytes();
    final int len = quality.getLength();
    // Amount by which an encoded byte moves when switching between the two offsets
    final int illuminaSangerDistance = FormatConstants.ILLUMINA_OFFSET - FormatConstants.SANGER_OFFSET;

    if (current == BaseQualityEncoding.Illumina && target == BaseQualityEncoding.Sanger) {
        for (int i = 0; i < len; ++i) {
            if (bytes[i] < FormatConstants.ILLUMINA_OFFSET
                    || bytes[i] > (FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX)) {
                throw new FormatException("base quality score out of range for Illumina Phred+64 format (found "
                        + (bytes[i] - FormatConstants.ILLUMINA_OFFSET) + " but acceptable range is [0,"
                        + FormatConstants.ILLUMINA_MAX + "]).\n"
                        + "Maybe qualities are encoded in Sanger format?\n");
            }
            bytes[i] -= illuminaSangerDistance;
        }
    } else if (current == BaseQualityEncoding.Sanger && target == BaseQualityEncoding.Illumina) {
        for (int i = 0; i < len; ++i) {
            if (bytes[i] < FormatConstants.SANGER_OFFSET
                    || bytes[i] > (FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX)) {
                // Sanger qualities are Phred+33 (the message used to say Phred+64)
                throw new FormatException("base quality score out of range for Sanger Phred+33 format (found "
                        + (bytes[i] - FormatConstants.SANGER_OFFSET) + " but acceptable range is [0,"
                        + FormatConstants.SANGER_MAX + "]).\n"
                        + "Maybe qualities are encoded in Illumina format?\n");
            }
            bytes[i] += illuminaSangerDistance;
        }
    } else
        throw new IllegalArgumentException(
                "unsupported BaseQualityEncoding transformation from " + current + " to " + target);
}

From source file:fi.tkk.ics.hadoop.bam.SequencedFragment.java

License:Open Source License

/**
 * Checks that every quality byte lies inside the range permitted by the given
 * encoding.
 *
 * <p>The permitted range runs from the encoding's offset up to and including the
 * offset plus the encoding's maximum score, both taken from
 * {@code FormatConstants}. Only the first {@code quality.getLength()} bytes are
 * inspected.
 *
 * <p>Original author's note: in theory Sanger uses all of ASCII 33 to 126, a
 * value range of [0,93], but values over 60 are unlikely in practice and more
 * probably come from an Illumina-encoded file mistaken for Sanger, so the Sanger
 * limit is meant to be the same as the Illumina one.
 *
 * @param quality  quality string to check
 * @param encoding encoding the quality string is expected to be in
 * @return the index of the first out-of-range byte, or -1 if all bytes are in range
 * @throws IllegalArgumentException if the encoding is neither Illumina nor Sanger
 */
public static int verifyQuality(Text quality, BaseQualityEncoding encoding) {
    // Work out the inclusive bounds before touching the quality bytes
    final int lowest;
    final int highest;

    if (encoding == BaseQualityEncoding.Sanger) {
        lowest = FormatConstants.SANGER_OFFSET;
        highest = FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX;
    } else if (encoding == BaseQualityEncoding.Illumina) {
        lowest = FormatConstants.ILLUMINA_OFFSET;
        highest = FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX;
    } else {
        throw new IllegalArgumentException("Unsupported base encoding quality " + encoding);
    }

    final byte[] scores = quality.getBytes();
    final int count = quality.getLength();

    // Report the position of the first byte that falls outside [lowest, highest]
    int index = 0;
    while (index < count) {
        final byte score = scores[index];
        final boolean inRange = lowest <= score && score <= highest;
        if (!inRange) {
            return index;
        }
        ++index;
    }
    return -1;
}

From source file:fm.last.darling.hbase.HBaseJSONOutputReader.java

License:Apache License

/**
 * Returns a copy of the text's valid bytes without the first and the last byte.
 *
 * <p>Only the first {@code text.getLength()} bytes of the backing array are
 * considered.
 *
 * @param text value to trim; must be at least two bytes long
 * @return a new array holding the bytes between the first and the last byte
 * @throws IllegalArgumentException if the text is shorter than two bytes, since
 *         there is then no outer pair of bytes to remove
 */
private byte[] trimOuterBytes(Text text) {
    final int length = text.getLength();
    // Fail with a clear message instead of a NegativeArraySizeException
    if (length < 2) {
        throw new IllegalArgumentException(
                "Cannot trim the outer bytes of a value that is only " + length + " byte(s) long");
    }
    byte[] bytes = new byte[length - 2];
    System.arraycopy(text.getBytes(), 1, bytes, 0, bytes.length);
    return bytes;
}

From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.ExpressionRecordWriter.java

License:LGPL

/**
 * Writes one entry as the key bytes, the separator, the value in decimal text
 * (UTF-8) and a newline.
 *
 * <p>The input counter is incremented for every call; a call with a
 * {@code null} value writes nothing and does not touch the written counter.
 *
 * @param key   entry key; only its first {@code getLength()} bytes are written
 * @param value entry value, or {@code null} to skip the entry
 */
@Override
public synchronized void write(final Text key, final LongWritable value)
        throws IOException, InterruptedException {

    context.getCounter(COUNTERS_GROUP, INPUT_ENTRIES).increment(1);

    if (value != null) {
        // The backing array of a Text may be longer than its content
        final int keyLength = key.getLength();
        out.write(key.getBytes(), 0, keyLength);
        out.write(separator);

        final String count = value.toString();
        out.write(count.getBytes(StandardCharsets.UTF_8));
        out.write(newline);

        context.getCounter(COUNTERS_GROUP, ENTRIES_WRITTEN).increment(1);
    }
}

From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMRecordWriter.java

License:LGPL

/**
 * Writes the value bytes followed by a newline; the key is not written.
 *
 * <p>The input counter is incremented for every call; a call with a
 * {@code null} value writes nothing and does not touch the written counter.
 *
 * @param key   unused
 * @param value record to write, or {@code null} to skip; only its first
 *              {@code getLength()} bytes are written
 */
@Override
public synchronized void write(final Text key, final Text value) throws IOException, InterruptedException {

    context.getCounter(COUNTERS_GROUP, INPUT_ENTRIES).increment(1);

    if (value != null) {
        // The backing array of a Text may be longer than its content
        final int recordLength = value.getLength();
        out.write(value.getBytes(), 0, recordLength);
        out.write(newline);

        context.getCounter(COUNTERS_GROUP, ENTRIES_WRITTEN).increment(1);
    }
}

From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java

License:Open Source License

/**
 * Reads a block of FASTA records from the input and stores each record in
 * {@code set} as (first word of the header line, lower-cased sequence).
 *
 * <p>The method first skips forward to the next {@code '>'}. It then collects
 * bytes until it meets a {@code '>'} after more than {@code maxBytesToConsume}
 * bytes have been counted, or until the input ends. The collected block is then
 * split into records: the key is the header text up to the first blank, tab or
 * line break, and all sequence lines up to the next {@code '>'} are concatenated.
 *
 * @param key               unused
 * @param set               receives one entry per record found in the block
 * @param maxLineLength     unused
 * @param maxBytesToConsume number of bytes after which the block is closed at the
 *                          next record start
 * @return the number of bytes consumed, including bytes skipped before the first
 *         record start; the bytes skipped so far if the input ended before any
 *         {@code '>'} was found
 * @throws IOException if reading from the underlying stream fails
 */
public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // The refilled buffer is consumed from index 0, so the counting
            // origin has to move with it; otherwise the bytes skipped in the new
            // buffer are measured from a position in the old one.
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>');

    /*
    if we hit the end of file already, there is no record to process
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn is one past the start of a fasta record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

    /*
    find the record start at which this block ends
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads; it is not
    modified below, so its bytes and length can be read once
     */
    final byte[] blockBytes = recordBlock.getBytes();
    final int blockLength = recordBlock.getLength();

    Text k = new Text();
    Text s = new Text();
    int i;
    int j = 1; // skip initial record separator ">"
    do {
        k.clear();
        s.clear();
        /*
        first parse the key: it ends at the first blank, tab or line break, or at
        the end of the block when none of those follows
         */
        i = j;
        boolean junkOnLine = false;
        int keyEnd = -1;
        while (j < blockLength) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                keyEnd = j - 1;
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                keyEnd = j - 1;
                break;
            }
        }
        if (keyEnd < 0)
            keyEnd = j; // ran off the block: keep the last byte as well
        k.append(blockBytes, i, keyEnd - i);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < blockLength && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        /*
        now skip the newlines
        */
        while (j < blockLength && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence, one line at a time, until the next record start
        */
        do {
            i = j;
            int lineEnd = -1;
            while (j < blockLength) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    lineEnd = j - 1;
                    break;
                }
            }
            if (lineEnd < 0)
                lineEnd = j; // last line without a line break: keep its last byte
            s.append(blockBytes, i, lineEnd - i);
            // Locale.ROOT keeps the lower-casing independent of the default locale
            set.put(k.toString(), s.toString().toLowerCase(java.util.Locale.ROOT));

            while (j < blockLength && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < blockLength && recordBlock.charAt(j) != '>');

        /*
        the loop above stops at the end of the block or on the next ">"; step over it
        */
        j++;

    } while (j < blockLength);

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java

License:Open Source License

/**
 * Reads a block of FASTQ records from the input and stores each record in
 * {@code set} as (first word of the header line, lower-cased sequence). The
 * quality lines are skipped.
 *
 * <p>The method first skips forward to the next {@code '@'}. It then collects
 * bytes until it meets an {@code '@'} once at least {@code maxBytesToConsume}
 * bytes have been counted, or until the input ends. The collected block is then
 * split into records: the key is the header text up to the first blank, tab or
 * line break, the sequence lines are concatenated up to the line starting with
 * {@code '+'}, and everything up to the next {@code '@'} is skipped.
 *
 * <p>NOTE(review): {@code '@'} can presumably also occur inside a quality line,
 * in which case these scans would stop there - confirm against the expected
 * input.
 *
 * @param key               unused
 * @param set               receives one entry per record found in the block
 * @param maxLineLength     unused
 * @param maxBytesToConsume number of bytes after which the block is closed at the
 *                          next record start
 * @return the number of bytes consumed, including bytes skipped before the first
 *         record start; the bytes skipped so far if the input ended before any
 *         {@code '@'} was found
 * @throws IOException if reading from the underlying stream fails
 */
public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // The refilled buffer is consumed from index 0, so the counting
            // origin has to move with it; otherwise the bytes skipped in the new
            // buffer are measured from a position in the old one.
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, there is no record to process
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn is one past the start of a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the record start at which this block ends
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads; it is not
    modified below, so its bytes and length can be read once
     */
    final byte[] blockBytes = recordBlock.getBytes();
    final int blockLength = recordBlock.getLength();

    Text k = new Text();
    Text s = new Text();
    int i;
    int j = 1; // skip initial record separator "@"
    do {
        k.clear();
        s.clear();
        /*
        first parse the key: it ends at the first blank, tab or line break, or at
        the end of the block when none of those follows
         */
        i = j;
        boolean junkOnLine = false;
        int keyEnd = -1;
        while (j < blockLength) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                keyEnd = j - 1;
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                keyEnd = j - 1;
                break;
            }
        }
        if (keyEnd < 0)
            keyEnd = j; // ran off the block: keep the last byte as well
        k.append(blockBytes, i, keyEnd - i);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < blockLength && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        /*
        now skip the newlines
        */
        while (j < blockLength && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence, one line at a time, until the "+" separator line
        */
        do {
            i = j;
            int lineEnd = -1;
            while (j < blockLength) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    lineEnd = j - 1;
                    break;
                }
            }
            if (lineEnd < 0)
                lineEnd = j; // last line without a line break: keep its last byte
            s.append(blockBytes, i, lineEnd - i);
            // Locale.ROOT keeps the lower-casing independent of the default locale
            set.put(k.toString(), s.toString().toLowerCase(java.util.Locale.ROOT));

            while (j < blockLength && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < blockLength && recordBlock.charAt(j) != '+');

        /*
        now skip the separator and quality lines till the next record start
        */
        while (j < blockLength && recordBlock.charAt(j) != '@') {
            j++;
        }

        j++; // skip the "@"

    } while (j < blockLength);

    return totalBytesRead;
}