Example usage for org.apache.hadoop.io Text append

Introduction

This page collects example usages of the append method of org.apache.hadoop.io.Text.

Prototype

public void append(byte[] utf8, int start, int len)

Source Link

Document

Appends a range of bytes to the end of this Text.

Usage

From source file:org.apache.asterix.external.input.record.reader.hdfs.HDFSTextLineReader.java

License:Apache License

/**
 * Read one line from the underlying reader into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated
 * line.
 *
 * As a side effect this method advances bufferPosn, may refill buffer
 * (updating bufferLength) and recomputes currentFilePos before returning.
 *
 * @param str
 *            the object to store the given line (without newline);
 *            it is cleared first
 * @param maxLineLength
 *            the maximum number of bytes to store into str;
 *            the rest of the line is silently discarded.
 * @param maxBytesToConsume
 *            the maximum number of bytes to consume
 *            in this call. This is only a hint, because if the line crosses
 *            this threshold, we allow it to happen. It can overshoot
 *            potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 *         found.
 * @throws IOException
 *             if the underlying stream throws, or if more than
 *             Integer.MAX_VALUE bytes were consumed before a newline
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from the reader, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need to consume LF as well, so the next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if the previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks the number of bytes appended to str so far, as an optimization
    int newlineLength = 0; // length of terminating newline (0 while none has been found)
    boolean prevCharCR = false; // true if the previous char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            // Buffer exhausted: restart at index 0 and refill from the reader.
            startPosn = bufferPosn = 0;
            if (prevCharCR)
                ++bytesConsumed; // account for the CR held back from the previous buffer
            bufferLength = reader.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { // CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0)
            --readLength; // CR at the end of the buffer: not counted until the next byte is seen
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        // Never store more than maxLineLength bytes in total; the excess is dropped.
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    // Position of the next unread byte: reader position minus the unread tail of the buffer.
    // NOTE(review): after EOF bufferLength holds the non-positive value returned by
    // reader.read(buffer), so this expression is offset by that value - confirm this is intended.
    currentFilePos = reader.getPos() - bufferLength + bufferPosn;
    return (int) bytesConsumed;
}

From source file:org.apache.ben.QuotationLineReader.java

License:Apache License

/**
 * Read a line terminated by one of CR, LF, or CRLF, treating line
 * terminators that occur inside a quoted section as ordinary data.
 *
 * A QUOTE byte toggles the "inside quotes" state unless the byte
 * immediately before it was ESCAPE. While inside quotes neither LF nor CR
 * ends the line. (Previously only LF was guarded, so a CR or CRLF inside a
 * quoted section split the record and left the quote parity of the
 * following lines inverted.)
 *
 * @param str
 *            receives the line without its terminator; cleared first
 * @param maxLineLength
 *            maximum number of bytes stored into str; the rest of the line
 *            is silently discarded
 * @param maxBytesToConsume
 *            soft limit on the number of bytes consumed by this call; it
 *            is only checked after each buffer has been scanned
 * @return the number of bytes consumed, including the terminator
 * @throws IOException
 *             if the underlying stream throws, or if more than
 *             Integer.MAX_VALUE bytes were consumed before a newline
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need to consume LF as well, so the next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if the previous character was a
     * (line-terminating) CR and, if it happens to be at the end of the
     * buffer, delay consuming it until we have a chance to look at the
     * char that follows.
     */
    str.clear();
    int txtLength = 0; // tracks the number of bytes appended to str so far
    int newlineLength = 0; // length of terminating newline (0 while none has been found)
    boolean prevCharCR = false; // true if the previous char was a CR outside quotes
    boolean insideQuotes = false; // true while an opening QUOTE has not been closed
    boolean previousCharEsc = false; // true if the previous char was ESCAPE
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for the CR held back from the previous buffer
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            final byte current = buffer[bufferPosn];
            if (current == QUOTE && !previousCharEsc) {
                insideQuotes = !insideQuotes;
            }
            if (current == LF && !insideQuotes) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { // CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            // A CR only counts as a line terminator outside a quoted section;
            // inside quotes it is kept as part of the record, like LF.
            prevCharCR = (current == CR) && !insideQuotes;
            previousCharEsc = (current == ESCAPE);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0) {
            --readLength; // CR at the end of the buffer: decided once the next byte is seen
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        // Never store more than maxLineLength bytes in total; the excess is dropped.
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:org.apache.ben.QuotationLineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter (recordDelimiterBytes).
 *
 * @param str
 *            receives the record without the delimiter; cleared first
 * @param maxLineLength
 *            maximum number of bytes copied from the buffer into str; the
 *            rest of the record is silently discarded
 * @param maxBytesToConsume
 *            soft limit on the number of bytes consumed by this call; it
 *            is only checked after each buffer has been scanned
 * @return the number of bytes consumed, including the delimiter
 * @throws IOException
 *             if the underlying stream throws, or if more than
 *             Integer.MAX_VALUE bytes were consumed before a delimiter
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from inputStream, but the head of the stream may be
     * already captured in the previous buffer, so we have several cases:
     *
     * 1. The buffer tail does not contain any character sequence which
     *    matches the head of the delimiter. We count it as
     *    ambiguous byte count = 0
     *
     * 2. The buffer tail contains X characters that form a sequence
     *    matching the head of the delimiter.
     *    We count ambiguous byte count = X
     *
     *    Example: a segment of the input file is as follows
     *
     *    " record 1792: I found this bug very interesting and
     *     I have completely read about it. record 1793: This bug
     *     can be solved easily record 1794: This ."
     *
     *    delimiter = "record";
     *
     *    Suppose the string at the end of the buffer is
     *    "I found this bug very interesting and I have completely re"
     *    Then the next buffer is "ad about it. record 179       ...."
     *
     *    The matching characters in the input
     *    buffer tail and delimiter head = "re"
     *    Therefore, ambiguous byte count = 2
     *
     *    2.1 If the following bytes are the remaining characters of
     *        the delimiter, then we have to capture only up to the starting
     *        position of the delimiter. That means, we need not include the
     *        ambiguous characters in str.
     *
     *    2.2 If the following bytes are not the remaining characters of
     *        the delimiter (as in the example),
     *        then we have to include the ambiguous characters in str.
     */
    str.clear();
    int txtLength = 0; // tracks the number of buffer bytes appended to str
    long bytesConsumed = 0;
    int delPosn = 0; // number of leading delimiter bytes matched so far
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                // EOF: a held-back partial delimiter turned out to be data.
                // It was deducted from bytesConsumed when it was held back,
                // so it has to be counted again here.
                if (ambiguousByteCount > 0) {
                    str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                    bytesConsumed += ambiguousByteCount;
                }
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                // Partial match failed: resume one byte after the position
                // where the failed match began, so that delimiters with a
                // repeated prefix (e.g. "aab" in "aaab") are still found.
                bufferPosn -= delPosn;
                if (bufferPosn < -1) {
                    // The match began in the previous buffer, which is gone;
                    // restart at index 0 of this buffer (after the loop's ++).
                    bufferPosn = -1;
                }
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        // The bytes held back from the previous buffer are now accounted for,
        // either as data (appended below) or as the head of the delimiter.
        bytesConsumed += ambiguousByteCount;
        if (appendLength >= 0 && ambiguousByteCount > 0) {
            // appending the ambiguous characters (refer case 2.2); a negative
            // appendLength means they are still part of a delimiter match
            str.append(recordDelimiterBytes, 0, ambiguousByteCount);
            ambiguousByteCount = 0;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; // to be consumed in next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:org.apache.flume.sink.hdfs.HDFSTextFormatter.java

License:Apache License

/**
 * Serializes the event as the text produced by makeText followed by a
 * single line feed.
 *
 * @param e the event to serialize
 * @return a new array holding exactly the record bytes plus the trailing LF
 */
@Override
public byte[] getBytes(Event e) {
    Text record = makeText(e);
    // Append the LF byte explicitly rather than through "\n".getBytes(),
    // which encodes with the platform default charset.
    record.append(new byte[] { '\n' }, 0, 1);
    // Copy only the first getLength() bytes of the array returned by getBytes().
    byte[] rawBytes = record.getBytes();
    return Arrays.copyOf(rawBytes, record.getLength());
}

From source file:org.apache.gora.accumulo.store.AccumuloStore.java

License:Apache License

/**
 * Pads a key with trailing zero bytes up to the requested length.
 *
 * @param key   the key to pad; never modified
 * @param bytes the minimum length of the result, in bytes
 * @return key itself when it already has at least {@code bytes} bytes,
 *         otherwise a copy of key extended with zero bytes to exactly
 *         {@code bytes} bytes
 */
Text pad(Text key, int bytes) {
    final int currentLength = key.getLength();
    if (currentLength >= bytes) {
        return key;
    }

    // Work on a copy so the caller's key stays untouched, and add all the
    // padding in one append instead of one freshly allocated byte per step.
    final int missing = bytes - currentLength;
    final Text padded = new Text(key);
    padded.append(new byte[missing], 0, missing); // new byte[] is zero-filled
    return padded;
}

From source file:org.apache.jena.tdbloader4.SecondMapper.java

License:Apache License

/**
 * Serializes the terms of the incoming quad, derives an MD5 hex hash over
 * them, and passes one (term, hash + position suffix) pair per term to
 * emit. The reusable Text fields are cleared when the call ends, whether
 * or not it succeeded.
 *
 * @param key     the input key (only logged)
 * @param value   wrapper around the quad to process
 * @param context the context handed to emit
 * @throws TDBLoader4Exception wrapping any exception raised while hashing
 *                             or emitting
 */
@Override
public void map(LongWritable key, QuadWritable value, Context context)
        throws IOException, InterruptedException {
    log.debug("< ({}, {})", key, value);
    final Quad quad = value.getQuad();
    final String subject = Utils.serialize(quad.getSubject());
    final String predicate = Utils.serialize(quad.getPredicate());
    final String object = Utils.serialize(quad.getObject());
    // The graph term is only serialized when the quad carries a non-generated graph.
    final String graph = quad.isDefaultGraphGenerated() ? null : Utils.serialize(quad.getGraph());
    final boolean hasGraph = (graph != null);

    // TODO: reuse hash from TDB NodeTableNative?
    try {
        final MessageDigest md5 = MessageDigest.getInstance("MD5");
        md5.update(subject.getBytes("UTF-8"));
        md5.update(predicate.getBytes("UTF-8"));
        md5.update(object.getBytes("UTF-8"));
        if (hasGraph) {
            md5.update(graph.getBytes("UTF-8"));
        }
        ht.set(new String(Hex.encodeHex(md5.digest())));

        // The three terms were already dereferenced above, so this guard
        // cannot fail at this point; it is kept as written.
        if (subject != null && predicate != null && object != null) {
            st.set(subject);
            pt.set(predicate);
            ot.set(object);

            final Text hashS = hashWithSuffix(S);
            final Text hashP = hashWithSuffix(P);
            final Text hashO = hashWithSuffix(O);

            emit(context, st, hashS);
            emit(context, pt, hashP);
            emit(context, ot, hashO);
        }

        if (hasGraph) {
            gt.set(graph);
            emit(context, gt, hashWithSuffix(G));
            EventManager.send(counters, new Event(Constants.eventQuad, quad));
        } else {
            EventManager.send(counters, new Event(Constants.eventTriple, quad.asTriple()));
        }
    } catch (Exception e) {
        throw new TDBLoader4Exception(e);
    } finally {
        // Reset the shared Text holders.
        st.clear();
        pt.clear();
        ot.clear();
        gt.clear();
        ht.clear();
    }
}

/** Returns a copy of the current hash text (ht) with the given suffix bytes appended. */
private Text hashWithSuffix(byte[] suffix) {
    final Text result = new Text(ht);
    result.append(suffix, 0, suffix.length);
    return result;
}

From source file:org.apache.rya.indexing.KeyParts.java

License:Apache License

/**
 * Append any byte array to a row key./*from ww w.  j av a2s  .  c o  m*/
 * @param bytes append this
 * @param keyText text to append to
 */
private static void appendBytes(final byte[] bytes, final Text keyText) {
    keyText.append(bytes, 0, bytes.length);
}

From source file:org.apache.rya.indexing.KeyParts.java

License:Apache License

/**
* Get a collision unlikely hash string and append to the key,
* so that if two keys have the same value, then they will be the same,
* if two different values that occur at the same time there keys are different.
* If the application uses a very large number of statements at the exact same time,
* the md5 value might be upgraded to for example sha-1 to avoid collisions.
* @param statement/*from   w  w  w  . jav  a  2 s  .c o  m*/
* @param keyText
*/
public static void appendUniqueness(final Statement statement, final Text keyText) {
    keyText.append(HASH_PREFIX, 0, 1); // delimiter
    final Value statementValue = new Value(
            StringUtils.getBytesUtf8(StatementSerializer.writeStatement(statement)));
    final byte[] hashOfValue = Md5Hash.md5Binary(statementValue);
    keyText.append(hashOfValue, 0, hashOfValue.length);
}

From source file:org.apache.tajo.storage.LineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter (recordDelimiterBytes).
 *
 * @param str
 *            receives the record without the delimiter; cleared first
 * @param maxLineLength
 *            maximum number of bytes copied from the buffer into str; the
 *            rest of the record is silently discarded
 * @param maxBytesToConsume
 *            soft limit on the number of bytes consumed by this call; it
 *            is only checked after each buffer has been scanned
 * @return the number of bytes consumed, including the delimiter
 * @throws IOException
 *             if the underlying stream throws, or if more than
 *             Integer.MAX_VALUE bytes were consumed before a delimiter
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from inputStream, but the head of the stream may be
     * already captured in the previous buffer, so we have several cases:
     *
     * 1. The buffer tail does not contain any character sequence which
     *    matches the head of the delimiter. We count it as
     *    ambiguous byte count = 0
     *
     * 2. The buffer tail contains X characters that form a sequence
     *    matching the head of the delimiter.
     *    We count ambiguous byte count = X
     *
     *    Example: a segment of the input file is as follows
     *
     *    " record 1792: I found this bug very interesting and
     *     I have completely read about it. record 1793: This bug
     *     can be solved easily record 1794: This ."
     *
     *    delimiter = "record";
     *
     *    Suppose the string at the end of the buffer is
     *    "I found this bug very interesting and I have completely re"
     *    Then the next buffer is "ad about it. record 179       ...."
     *
     *    The matching characters in the input
     *    buffer tail and delimiter head = "re"
     *    Therefore, ambiguous byte count = 2
     *
     *    2.1 If the following bytes are the remaining characters of
     *        the delimiter, then we have to capture only up to the starting
     *        position of the delimiter. That means, we need not include the
     *        ambiguous characters in str.
     *
     *    2.2 If the following bytes are not the remaining characters of
     *        the delimiter (as in the example),
     *        then we have to include the ambiguous characters in str.
     */
    str.clear();
    int txtLength = 0; // tracks the number of buffer bytes appended to str
    long bytesConsumed = 0;
    int delPosn = 0; // number of leading delimiter bytes matched so far
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
            if (bufferLength <= 0) {
                // EOF: a held-back partial delimiter turned out to be data.
                // It was deducted from bytesConsumed when it was held back,
                // so it has to be counted again here.
                if (ambiguousByteCount > 0) {
                    str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                    bytesConsumed += ambiguousByteCount;
                }
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                // Partial match failed: resume one byte after the position
                // where the failed match began, so that delimiters with a
                // repeated prefix (e.g. "aab" in "aaab") are still found.
                bufferPosn -= delPosn;
                if (bufferPosn < -1) {
                    // The match began in the previous buffer, which is gone;
                    // restart at index 0 of this buffer (after the loop's ++).
                    bufferPosn = -1;
                }
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        // The bytes held back from the previous buffer are now accounted for,
        // either as data (appended below) or as the head of the delimiter.
        bytesConsumed += ambiguousByteCount;
        if (appendLength >= 0 && ambiguousByteCount > 0) {
            // appending the ambiguous characters (refer case 2.2); a negative
            // appendLength means they are still part of a delimiter match
            str.append(recordDelimiterBytes, 0, ambiguousByteCount);
            ambiguousByteCount = 0;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; // to be consumed in next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record: four calls to appendLineInto, one each for
 * the ID, sequence, separator and quality lines.
 *
 * @param readName Text record containing read name. Output parameter;
 *   cleared before the ID line is read into it.
 * @param value Text record containing full record. Output parameter; the
 *   record is appended to it and it is NOT cleared here.
 * @return Returns true if read was successful (did not hit EOF); false if
 *   the end of a compressed split was already reached or no bytes were
 *   read for the ID line.
 *
 * @throws IOException declared by this method (no I/O is performed here
 *   directly; presumably raised by appendLineInto).
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @, including
 *   an empty ID line).
 */
protected final boolean lowLevelFastqRead(final Text readName, final Text value) throws IOException {

    if (endOfCompressedSplit) {
        return false;
    }

    // ID line
    readName.clear();
    long skipped = appendLineInto(readName, true);
    if (skipped == 0) {
        return false; // EOF
    }

    // Check the length before looking at the first byte: only the first
    // getLength() bytes of Text.getBytes() are valid, so on an empty ID line
    // index 0 is either out of bounds or a stale byte left over from an
    // earlier, longer value.
    if (readName.getLength() == 0 || readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }
    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}