Example usage for org.apache.hadoop.io Text append

List of usage examples for org.apache.hadoop.io Text append

Introduction

On this page you can find example usages of the append method of org.apache.hadoop.io.Text.

Prototype

public void append(byte[] utf8, int start, int len) 

Source Link

Document

Append a range of bytes to the end of the given text

Usage

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Reads one line from the underlying line reader and appends it to
 * {@code dest}, followed by the first byte of {@code newline}.
 *
 * When {@code eofOk} is true and the read returns zero bytes, only the
 * newline byte is appended and 0 is returned.
 *
 * @param dest Text record the line is appended to.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if the read returned a negative count, or
 *    if it returned zero bytes while eofOk was false.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    final Text line = new Text();
    final int readLimit = (int) Math.min(maxLineLength, end - start);
    final int bytesRead = lineReader.readLine(line, readLimit);

    final boolean compressedSplit = isSplittable && isCompressed;

    // A read has one of three outcomes:
    //
    //   1. data was read
    //   2. no data, at an acceptable eof/end-of-split
    //   3. no data, at an unacceptable eof/end-of-split
    //
    // Outcomes 1 and 2 look the same whether or not the input is split or
    // compressed. Outcome 3 is a plain error (EOFException) for unsplit or
    // uncompressed input.
    //
    // For split compressed input, outcome 3 needs one retry: the first read
    // past the last compression block of the split yields no bytes (the
    // BZip2Codec and BGZFCodec report -2 here; any value <= 0 is accepted).
    // Without noticing this we would keep reading to the end of the file, but
    // the current multiline record still has to be completed. So the reader
    // is reset, endOfCompressedSplit marks this as the last record of the
    // split, and the read is repeated. lastReadWasZeroBytes is set on that
    // first empty read and cleared by any successful read, so a second empty
    // read in a row falls through to the EOF handling below.
    if (compressedSplit && bytesRead <= 0 && !eofOk && !lastReadWasZeroBytes) {
        // clear the reader state so reading can continue
        ((ResettableCompressedSplitLineReader) lineReader).reset();

        // stop further records from being read and catch back-to-back
        // failed reads
        lastReadWasZeroBytes = true;
        endOfCompressedSplit = true;

        // redo the read
        return appendLineInto(dest, eofOk);
    }

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) {
        throw new EOFException();
    }
    lastReadWasZeroBytes = false;

    dest.append(line.getBytes(), 0, line.getLength());
    dest.append(newline, 0, 1);

    // Split compressed streams report their own position; otherwise the
    // position advances by the number of bytes just read.
    if (compressedSplit) {
        pos = ((SplitCompressionInputStream) inputStream).getPos();
    } else {
        pos += bytesRead;
    }

    return bytesRead;
}

From source file:org.cloudata.core.common.util.CloudataLineReader.java

License:Apache License

/**
 * Read from the InputStream into the given Text.
 * /*from   w  w w. ja  v a2s.  c  om*/
 * @param str
 *          the object to store the given line
 * @return the number of bytes read including the newline
 * @throws IOException
 *           if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    str.clear();
    boolean hadFinalNewline = false;
    boolean hadFinalReturn = false;
    boolean hitEndOfFile = false;
    int startPosn = bufferPosn;
    outerLoop: while (true) {
        if (bufferPosn >= bufferLength) {
            if (!backfill()) {
                hitEndOfFile = true;
                break;
            }
        }
        startPosn = bufferPosn;
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            switch (buffer[bufferPosn]) {
            case '\n':
                hadFinalNewline = true;
                bufferPosn += 1;
                break outerLoop;
            case '\r':
                if (hadFinalReturn) {
                    // leave this \n in the stream, so we'll get it next time
                    break outerLoop;
                }
                hadFinalReturn = true;
                break;
            default:
                if (hadFinalReturn) {
                    break outerLoop;
                }
            }
        }
        int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
        if (length >= 0) {
            str.append(buffer, startPosn, length);
        }
    }
    int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
    if (!hitEndOfFile) {
        int length = bufferPosn - startPosn - newlineLength;
        if (length > 0) {
            str.append(buffer, startPosn, length);
        }
    }
    return str.getLength() + newlineLength;
}

From source file:org.gestore.hadoop.LongRecordReader.java

License:Apache License

/**
 * Gets one complete entry and appends it to {@code value}.
 *
 * Lines are discarded until one fully matches {@code matcherStart}; that
 * line and every following line are appended to {@code value}, each followed
 * by a line feed, until a line fully matches {@code matcherStop}. The
 * stop line is not appended: it is kept in {@code lastLine} and becomes the
 * first candidate line of the next call.
 *
 * @param matcherStart pattern a line must match to begin an entry
 * @param matcherStop pattern a line must match to end an entry
 * @return the number of bytes consumed for this entry (discarded lines
 *         included, the retained stop line excluded); 0 if the input ran
 *         out before a start line was found
 * @throws IOException if the underlying reader fails
 */
private int getEntry(Pattern matcherStart, Pattern matcherStop) throws IOException {
    // A single line-feed byte. Building this via ByteBuffer.putChar('\n')
    // produces the two bytes 0x00 0x0A, which put a NUL byte in front of
    // every newline appended to the UTF-8 Text.
    final byte[] newLineBytes = { '\n' };

    Text tempLine = new Text();
    int totalRead = 0;
    int newRead;

    // Discard lines before start record match, save first line that matches regex
    while (true) {
        if (lastLine.getLength() <= 0) {
            newRead = in.readLine(tempLine, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        } else {
            // Reuse the line held back by the previous call instead of reading.
            tempLine = lastLine;
            newRead = lastLine.getLength();
            lastLine = new Text();
        }
        if (newRead == 0) {
            return 0;
        }
        totalRead += newRead;
        if (matcherStart.matcher(tempLine.toString()).matches()) {
            tempLine.append(newLineBytes, 0, newLineBytes.length);
            value.append(tempLine.getBytes(), 0, tempLine.getLength());
            break;
        }
    }

    // Save lines until end record match; hold the matching line back
    while (true) {
        newRead = in.readLine(tempLine, maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        if (newRead == 0) {
            return totalRead;
        }
        if (matcherStop.matcher(tempLine.toString()).matches()) {
            // The stop line's bytes are not counted for this entry.
            lastLine = tempLine;
            return totalRead;
        }
        totalRead += newRead;
        tempLine.append(newLineBytes, 0, newLineBytes.length);
        value.append(tempLine.getBytes(), 0, tempLine.getLength());
    }
}

From source file:org.gistic.spatialHadoop.NYCTripData.Trip.java

/**
 * Appends the bytes of {@code attributes} to the supplied text.
 *
 * NOTE(review): if {@code attributes} is a String, {@code getBytes()} uses
 * the platform default charset - confirm that is intended.
 *
 * @param text the text to append to
 * @return the same {@code text} instance, after the append
 */
@Override
public Text toText(Text text) {
    final byte[] encoded = attributes.getBytes();
    text.append(encoded, 0, encoded.length);
    return text;
}

From source file:org.hypertable.hadoop.mapred.HypertableRecordReader.java

License:Open Source License

/**
 * Rebuilds {@code key} from the components set on {@code cell_key}.
 *
 * Each component that is set on the cell key is encoded as UTF-8 into the
 * matching {@code t_*} field. If at least one component was refreshed, the
 * key is cleared and rewritten as: optional timestamp and {@code tab}, row,
 * {@code tab}, column family, and, when the qualifier is non-empty,
 * {@code colon} plus the column qualifier. If nothing was refreshed the key
 * is left untouched.
 *
 * @param key the text that receives the formatted key
 * @param cell_key the cell key supplying the components
 */
private void fill_key(Text key, Key cell_key) {
    // Becomes true as soon as any component is refreshed from cell_key.
    // (Original author's note: not sure whether this flag is necessary.)
    boolean refreshed = false;

    /*
     * !!
     * If the key format changes, the code which invokes fill_key()
     * will need to be adjusted because it uses a hard-coded length
     * of 24 + cell.key.row.length()!
     */

    try {
        if (m_include_timestamps && cell_key.isSetTimestamp()) {
            t_timestamp = Long.toString(cell_key.timestamp).getBytes("UTF-8");
            refreshed = true;
        }
        if (cell_key.isSetRow()) {
            t_row = cell_key.row.getBytes("UTF-8");
            refreshed = true;
        }
        if (cell_key.isSetColumn_family()) {
            t_column_family = cell_key.column_family.getBytes("UTF-8");
            refreshed = true;
        }
        if (cell_key.isSetColumn_qualifier()) {
            t_column_qualifier = cell_key.column_qualifier.getBytes("UTF-8");
            refreshed = true;
        }
    } catch (UnsupportedEncodingException e) {
        // The process is terminated if the UTF-8 encoding is unavailable.
        e.printStackTrace();
        System.exit(-1);
    }

    if (!refreshed) {
        return;
    }

    key.clear();
    if (m_include_timestamps) {
        key.append(t_timestamp, 0, t_timestamp.length);
        key.append(tab, 0, tab.length);
    }
    key.append(t_row, 0, t_row.length);
    key.append(tab, 0, tab.length);
    key.append(t_column_family, 0, t_column_family.length);
    if (t_column_qualifier.length > 0) {
        key.append(colon, 0, colon.length);
        key.append(t_column_qualifier, 0, t_column_qualifier.length);
    }
}

From source file:org.hypertable.hadoop.mapred.HypertableRecordWriter.java

License:Open Source License

/**
 * Write data to HT.
 *
 * The key (with a {@code tab} appended) and the value are concatenated
 * into {@code m_line}, which is then split on tabs into an optional
 * timestamp, the row, the column ("family" or "family:qualifier") and the
 * value. The resulting cell is added to {@code mCellsWriter}.
 *
 * Note that {@code key} itself is modified: the tab separator is appended
 * to it.
 *
 * @param key the cell key text; a tab is appended to it by this call
 * @param value the cell value text
 * @throws IOException if the line is malformed or the cell cannot be
 *         written; the underlying exception is attached as the cause
 */
public void write(Text key, Text value) throws IOException {
    try {
        key.append(tab, 0, tab.length);

        m_line.clear();
        m_line.append(key.getBytes(), 0, key.getLength());
        m_line.append(value.getBytes(), 0, value.getLength());
        int len = m_line.getLength();

        // Count tabs and remember the first one. The search starts at
        // index 1, so a tab at index 0 is not counted.
        int tab_count = 0;
        int tab_pos = 0;
        int found = 0;
        while (found != -1) {
            found = m_line.find(tab_str, found + 1);
            if (found > 0) {
                tab_count++;
                if (tab_count == 1)
                    tab_pos = found;
            }
        }

        // Three or more tabs: the first field is a timestamp.
        // Exactly two tabs: no timestamp. Anything less is malformed.
        boolean has_timestamp;
        if (tab_count >= 3) {
            has_timestamp = true;
        } else if (tab_count == 2) {
            has_timestamp = false;
        } else {
            throw new Exception("incorrect output line format only " + tab_count + " tabs");
        }

        byte[] byte_array = m_line.getBytes();
        int row_offset, row_length;
        int family_offset = 0, family_length = 0;
        int qualifier_offset = 0, qualifier_length = 0;
        int value_offset = 0, value_length = 0;
        long timestamp = SerializedCellsFlag.AUTO_ASSIGN;

        int offset = 0;
        if (has_timestamp) {
            timestamp = Long.parseLong(m_line.decode(byte_array, 0, tab_pos));
            offset = tab_pos + 1;
        }

        // Row: from the current offset up to the next tab.
        row_offset = offset;
        tab_pos = m_line.find(tab_str, offset);
        row_length = tab_pos - row_offset;

        offset = tab_pos + 1;
        family_offset = offset;

        // Column: split family from qualifier at the first ':'.
        tab_pos = m_line.find(tab_str, offset);
        for (int i = family_offset; i < tab_pos; i++) {
            if (byte_array[i] == ':' && qualifier_offset == 0) {
                family_length = i - family_offset;
                qualifier_offset = i + 1;
            }
        }
        // no qualifier
        if (qualifier_offset == 0)
            family_length = tab_pos - family_offset;
        else
            qualifier_length = tab_pos - qualifier_offset;

        // Value: everything after the column's tab.
        offset = tab_pos + 1;
        value_offset = offset;
        value_length = len - value_offset;

        if (!mCellsWriter.add(byte_array, row_offset, row_length, byte_array, family_offset, family_length,
                byte_array, qualifier_offset, qualifier_length, timestamp, byte_array, value_offset,
                value_length, SerializedCellsFlag.FLAG_INSERT)) {
            // The add was rejected: send the buffered cells, clear the
            // writer, enlarge it if this cell needs more than its capacity,
            // then retry once.
            mClient.mutator_set_cells_serialized(mMutator, mCellsWriter.buffer(), false);
            mCellsWriter.clear();
            if ((row_length + family_length + qualifier_length + value_length + 32) > mCellsWriter.capacity())
                mCellsWriter = new SerializedCellsWriter(
                        row_length + family_length + qualifier_length + value_length + 32);
            if (!mCellsWriter.add(byte_array, row_offset, row_length, byte_array, family_offset, family_length,
                    byte_array, qualifier_offset, qualifier_length, timestamp, byte_array, value_offset,
                    value_length, SerializedCellsFlag.FLAG_INSERT))
                throw new IOException("Unable to add cell to SerializedCellsWriter " + "(row='"
                        + new String(byte_array, row_offset, row_length, "UTF-8") + "'");
        }
    } catch (Exception e) {
        log.error(e);
        // Chain the original exception so its type and stack trace are kept.
        throw new IOException("Unable to write cell - " + e.toString(), e);
    }
}

From source file:org.rassee.omniture.hadoop.util.EscapedLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF).  Will ignore any of these termination characters
 * if they are proceeded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str               the object to store the given line (without the newline)
 * @param maxLineLength     the maximum number of bytes to store into str;
 *                          the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in
 *                          this call.  This is only a hint, because if the line crosses this
 *                          threshold, we allow it to happen.  It can overshoot potentially by
 *                          as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 * found/* w  w  w  .  java 2s.  c o m*/
 * @throws IOException
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
    * 1. No newline characters are in the buffer, so we need to copy
    *    everything and read another buffer from the stream.
    * 2. An unambiguously terminated line is in buffer, so we just
    *    copy to str.
    * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
    *    in CR.  In this case we copy everything up to CR to str, but
    *    we also need to see what follows CR: if it's LF, then we
    *    need consume LF as well, so next call to readLine will read
    *    from after that.
    * We use a flag prevCharCR to signal if previous character was CR
    * and, if it happens to be at the end of the buffer, delay
    * consuming it until we have a chance to look at the char that
    * follows.
    */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;

    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            startPos = bufferPos = 0;
            if (prevCharCR)
                ++bytesConsumed; // account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1)
                escaped = (buffer[bufferPos - 2] == escapeChar);
            if (!prevCharCR && bufferPos > 0)
                escaped = (buffer[bufferPos - 1] == escapeChar);

            if (buffer[bufferPos] == LF && !escaped) {
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0)
            --readLength;
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);

    return (int) bytesConsumed;
}

From source file:org.springframework.yarn.batch.item.LineReader.java

License:Apache License

/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * @param str receives the line content, without the terminator
 * @param maxLineLength maximum number of bytes stored into str; bytes
 *        beyond that are consumed but not stored
 * @param maxBytesToConsume reading stops once at least this many bytes
 *        have been consumed, even if no terminator was found
 * @return the number of bytes consumed, terminator included
 * @throws IOException if reading fails, or if more than Integer.MAX_VALUE
 *         bytes were consumed before a newline
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * Data comes from "in", but the head of the stream may already be
     * held in "buffer", which gives three situations:
     *
     * 1. No newline characters are in the buffer: copy everything and
     *    read another buffer from the stream.
     * 2. An unambiguously terminated line is in the buffer: copy it to
     *    str.
     * 3. The buffer ends in CR, which is ambiguous: copy everything up to
     *    the CR, then check the byte that follows. If it is LF it has to
     *    be consumed as well, so the next call starts after it.
     *
     * pendingCR records that the previous byte was CR; when that CR is the
     * last byte of the buffer, consuming it is delayed until the following
     * byte has been inspected.
     */
    str.clear();
    int storedLength = 0; // mirrors str.getLength() to avoid calling it
    int terminatorLength = 0; // length of the terminating newline
    boolean pendingCR = false; // true if the previous byte was CR
    long consumed = 0;
    do {
        int chunkStart = bufferPosn; // resume where the last pass stopped
        if (bufferPosn >= bufferLength) {
            bufferPosn = 0;
            chunkStart = 0;
            if (pendingCR) {
                consumed++; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        // Scan for the line terminator.
        while (bufferPosn < bufferLength) {
            if (buffer[bufferPosn] == LF) {
                terminatorLength = pendingCR ? 2 : 1;
                bufferPosn++; // the next call continues after the LF
                break;
            }
            if (pendingCR) { // CR followed by a non-LF byte: stop at that byte
                terminatorLength = 1;
                break;
            }
            pendingCR = (buffer[bufferPosn] == CR);
            bufferPosn++;
        }
        int chunkLength = bufferPosn - chunkStart;
        if (pendingCR && terminatorLength == 0) {
            chunkLength--; // CR at the end of the buffer
        }
        consumed += chunkLength;
        // Never store more than maxLineLength bytes in total.
        final int toAppend = Math.min(chunkLength - terminatorLength, maxLineLength - storedLength);
        if (toAppend > 0) {
            str.append(buffer, chunkStart, toAppend);
            storedLength += toAppend;
        }
    } while (terminatorLength == 0 && consumed < maxBytesToConsume);

    if (consumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}

From source file:org.springframework.yarn.batch.item.LineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter.
 *
 * The delimiter is the byte sequence held in {@code recordDelimiterBytes};
 * it is consumed but not stored in {@code str}.
 *
 * @param str receives the record content; cleared first
 * @param maxLineLength maximum number of buffer bytes stored into str
 * @param maxBytesToConsume reading stops once at least this many bytes
 *        have been consumed, even if no delimiter was found
 * @return the number of bytes consumed
 * @throws IOException if reading fails, or if more than Integer.MAX_VALUE
 *         bytes were consumed before a delimiter
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * We're reading data from inputStream, but the head of the stream may
     * be already captured in the previous buffer, so we have several cases:
     *
     * 1. The buffer tail does not contain any character sequence which
     * matches the head of the delimiter. The ambiguous byte count is 0.
     *
     * 2. The buffer tail contains X characters forming a sequence which
     * matches the head of the delimiter. The ambiguous byte count is X.
     *
     * Example: a segment of the input file is as follows
     *
     * " record 1792: I found this bug very interesting and I have
     * completely read about it. record 1793: This bug can be solved easily
     * record 1794: This ."
     *
     * delimiter = "record";
     *
     * Suppose the string at the end of the buffer is
     * "I found this bug very interesting and I have completely re", so the
     * next buffer is "ad about it. record 179       ...."
     *
     * The characters shared by the buffer tail and the delimiter head are
     * "re". Therefore, ambiguous byte count = 2.
     *
     * 2.1 If the following bytes are the remaining characters of the
     * delimiter, then we have to capture only up to the starting position
     * of the delimiter. That means we need not include the ambiguous
     * characters in str.
     *
     * 2.2 If the following bytes are not the remaining characters of the
     * delimiter (as in the example), then we have to include the ambiguous
     * characters in str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0; // number of delimiter bytes matched so far
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                // A pending partial delimiter match is ordinary data at EOF.
                // bytesConsumed and txtLength are not adjusted for it here.
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    // Full delimiter matched; step past its last byte.
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                // Partial match failed: step back one so the loop's ++ looks
                // at this same byte again as a possible delimiter start.
                // NOTE(review): only the current byte is re-examined, not the
                // bytes of the failed partial match, so a delimiter whose
                // prefix repeats (e.g. "aab" in input "aaab") looks like it
                // can be missed - confirm.
                bufferPosn--;
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        // Matched delimiter bytes (full or partial) are not record content.
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            if (ambiguousByteCount > 0) {
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                // appending the ambiguous characters (refer case 2.2)
                bytesConsumed += ambiguousByteCount;
                ambiguousByteCount = 0;
            }
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            // Buffer exhausted in the middle of a possible delimiter: remember
            // how many bytes matched and do not count them yet.
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; // to be consumed in the
                // next pass
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:StorageEngineClient.MyLineReader.java

License:Open Source License

/**
 * Reads one line from the buffered input into {@code str}.
 *
 * An LF always ends the line. Whether a preceding CR counts as part of the
 * terminator, and whether a CR followed by a non-LF byte ends the line on
 * its own, depends on {@code lineendmode} (see the inline comments).
 *
 * @param str receives the line content; cleared first
 * @param maxLineLength maximum number of bytes stored into str
 * @param maxBytesToConsume reading stops once at least this many bytes
 *        have been consumed, even if no terminator was found
 * @return the number of bytes consumed
 * @throws IOException if reading fails, or if more than Integer.MAX_VALUE
 *         bytes were consumed before a newline
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {

    str.clear();
    int storedLength = 0; // bytes appended to str so far
    int terminatorLength = 0; // stays 0 until a terminator is found
    boolean lastWasCR = false;
    long consumed = 0;
    do {
        int chunkStart = bufferPosn;
        if (bufferPosn >= bufferLength) {
            // Buffer exhausted: refill it from the stream.
            bufferPosn = 0;
            chunkStart = 0;
            if (lastWasCR) {
                ++consumed; // count the CR held back on the previous pass
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // nothing more to read
            }
        }
        while (bufferPosn < bufferLength) {
            if (buffer[bufferPosn] == LF) {
                // In mode 2 a CR before the LF is not part of the terminator.
                terminatorLength = (lastWasCR && lineendmode != 2) ? 2 : 1;
                ++bufferPosn;
                break;
            }
            // Only mode 0 lets a CR followed by a non-LF byte end the line.
            if (lastWasCR && lineendmode == 0) {
                terminatorLength = 1;
                break;
            }
            lastWasCR = (buffer[bufferPosn] == CR);
            ++bufferPosn;
        }
        int chunkLength = bufferPosn - chunkStart;
        if (lastWasCR && terminatorLength == 0) {
            --chunkLength; // hold back a CR seen last without a terminator
        }
        consumed += chunkLength;
        // Never store more than maxLineLength bytes in total.
        final int toAppend = Math.min(chunkLength - terminatorLength, maxLineLength - storedLength);
        if (toAppend > 0) {
            str.append(buffer, chunkStart, toAppend);
            storedLength += toAppend;
        }
    } while (terminatorLength == 0 && consumed < maxBytesToConsume);

    if (consumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}