Example usage for org.apache.hadoop.io Text append

List of usage examples for org.apache.hadoop.io Text append

Introduction

On this page you can find example usages of the append method of org.apache.hadoop.io.Text.

Prototype

public void append(byte[] utf8, int start, int len) 

Source Link

Document

Append a range of bytes to the end of the given text

Usage

From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.EscapedLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). Any of these termination characters is ignored
 * if it is preceded by the designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str               the object to store the given line (without the newline)
 * @param maxLineLength     the maximum number of bytes to store into str; the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in this call.  This is only a hint, because if the line crosses this threshold, we allow it to happen.  It can overshoot
 *                          potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline found
 * @throws IOException if reading from {@code in} fails, or if more than {@code Integer.MAX_VALUE} bytes
 *                     were consumed before the call finished
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;

    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            // Buffer exhausted: rewind and refill it from the stream.
            startPos = bufferPos = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            // Decide whether the potential terminator is escaped. If the previous byte
            // was CR, the escape character would sit before that CR (two bytes back);
            // otherwise it would sit directly before the current byte (one byte back).
            // NOTE(review): the look-back only inspects the current buffer contents, so
            // after a refill (bufferPos == 0 or 1) an escape character that ended the
            // previous buffer is not seen here - confirm whether that is intended.
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1) {
                escaped = (buffer[bufferPos - 2] == escapeChar);
            }
            if (!prevCharCR && bufferPos > 0) {
                escaped = (buffer[bufferPos - 1] == escapeChar);
            }

            if (buffer[bufferPos] == LF && !escaped) {
                // Unescaped LF ends the line; it counts as CR+LF when a CR preceded it.
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                // The lone CR was the terminator; bufferPos stays on the current byte so
                // it is read as part of the next line.
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0) {
            // The scanned range ended on a CR with no terminator decided yet: do not
            // count that CR here; it is accounted for on the next refill (see above).
            --readLength;
        }
        bytesConsumed += readLength;
        // Copy the scanned bytes minus the terminator, capped so that str never holds
        // more than maxLineLength bytes; anything beyond the cap is dropped.
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);

    // The return type is int, so refuse to report a count that would not fit.
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }

    return (int) bytesConsumed;
}

From source file:cosmos.impl.CosmosImpl.java

License:Apache License

/**
 * Builds a {@link Mutation} from a key laid out as: the bytes of the store's UUID, a single
 * zero byte, then the given suffix.
 *
 * @param id     the store whose {@code uuid()} supplies the leading bytes of the key
 * @param record not used by this method
 * @param suffix the bytes appended after the zero-byte separator
 * @return a new Mutation constructed from the assembled key
 */
protected Mutation getDocumentPrefix(Store id, Record<?> record, byte[] suffix) {
    final byte[] uuidBytes = id.uuid().getBytes();
    final byte[] separator = new byte[] { 0 };

    final Text key = new Text();
    key.append(uuidBytes, 0, uuidBytes.length);
    key.append(separator, 0, separator.length);
    key.append(suffix, 0, suffix.length);

    return new Mutation(key);
}

From source file:cosmos.mapred.AggregatingRecordReader.java

License:Apache License

/**
 * Encodes {@code s} with {@code Text.encode(s, false)} and appends the resulting bytes to {@code t}.
 *
 * @param t the Text to append to
 * @param s the string whose encoded bytes are appended
 * @throws IOException wrapping the {@link CharacterCodingException} if the string cannot be encoded
 */
private void textAppend(Text t, String s) throws IOException {
    final ByteBuffer encoded;
    try {
        encoded = Text.encode(s, false);
    } catch (CharacterCodingException e) {
        throw new IOException(e);
    }
    // Only the first limit() bytes of the backing array are appended.
    t.append(encoded.array(), 0, encoded.limit());
}

From source file:cosmos.mapred.LfLineReader.java

License:Apache License

/**
 * Reads a single LF-terminated line from the input stream into {@code str}. Reaching EOF also
 * ends a line that has no terminator.
 *
 * @param str
 *          receives the line content, without the trailing newline
 * @param maxLineLength
 *          upper bound on the number of bytes stored into str; further bytes of the line are silently dropped
 * @param maxBytesToConsume
 *          soft limit on the number of bytes consumed by this call; the limit is only checked after each
 *          buffer scan, so it can be exceeded by up to one buffer length
 *
 * @return the number of bytes consumed, including the newline if one was found
 *
 * @throws IOException
 *           if the underlying stream throws, or if more than Integer.MAX_VALUE bytes were consumed
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    // Data may already be sitting in buffer from an earlier call. Each pass either finds an LF in
    // the buffered bytes (line complete) or copies everything scanned and refills from the stream.
    str.clear();
    int storedLength = 0; // mirrors str.getLength() without calling it
    int terminatorLength = 0; // becomes 1 once the LF has been seen
    long consumed = 0;

    do {
        int chunkStart = bufferPosn; // resume where the previous scan stopped
        if (bufferPosn >= bufferLength) {
            bufferPosn = 0;
            chunkStart = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }

        // Scan for the newline; bufferPosn always ends up just past the last byte examined.
        while (bufferPosn < bufferLength) {
            if (buffer[bufferPosn++] == LF) {
                terminatorLength = 1;
                break;
            }
        }

        final int chunkLength = bufferPosn - chunkStart;
        consumed += chunkLength;

        // Exclude the terminator and never let str grow beyond maxLineLength.
        final int copyLength = Math.min(chunkLength - terminatorLength, maxLineLength - storedLength);
        if (copyLength > 0) {
            str.append(buffer, chunkStart, copyLength);
            storedLength += copyLength;
        }
    } while (terminatorLength == 0 && consumed < maxBytesToConsume);

    if (consumed > Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJobTest.java

License:Apache License

/**
 * Verifies the byte-level "hard split" of a key of the form {@code <prefix>_<rest>} into the
 * part before the first underscore and the part after it.
 */
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    // getBytes() exposes the backing array, which may be longer than the valid content, so the
    // remaining length must be derived from getLength() rather than from bytes.length.
    outputKey.append(bytes, i + 1, key.getLength() - i - 1);

    // Text stores UTF-8, so decode with that charset instead of the platform default.
    String fileName = new String(bytes, 0, i, java.nio.charset.StandardCharsets.UTF_8);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}

From source file:eastcircle.terasort.TotalOrderPartitioner.java

License:Apache License

/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly./*from   w ww .j  av  a  2  s  .c  o m*/
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}

From source file:edu.cshl.schatz.jnomics.util.TextCutter.java

License:Open Source License

/**
 * Sets the contents of <code>text</code> to the value of the requested cuts
 * (including any intermediate delimiters).
 * <p>
 * Negative cut indices are interpreted as
 * <code>{@link #getCutCount()} + cutIndex</code>. For example, a
 * <code>cutIndex</code> of -1 would return the last cut.
 * <p>
 * If {@code lastIndex < firstIndex} (after converting negative indices
 * to their positive equivalents), then the resulting cut order is reversed.
 * For example, given the input "0 1 2 3 4", <code>getCutRange(4,2)</code>
 * and <code>getCutRange(-1,-3)</code> would both return "4 3 2".
 * 
 * @param text The {@link Text} instance to reset.
 * @param firstIndex The 0-based index of the first desired cut in the range
 *            (inclusive).
 * @param lastIndex The 0-based index of the last desired cut in the range
 *            (inclusive).
 * @return The passed {@link Text} instance text (not a copy).
 * @throws ArrayIndexOutOfBoundsException if either index, after negative
 *             indices have been converted, does not refer to an existing
 *             cut. The check happens before <code>text</code> is modified.
 */
public Text getCutRange(Text text, int firstIndex, int lastIndex) {
    if (modFlag) {
        reinitialize();
    }

    if (firstIndex < 0) {
        firstIndex = cutCount + firstIndex;
    }

    if (lastIndex < 0) {
        lastIndex = cutCount + lastIndex;
    }

    // Validate both ends up front. In the reversed case firstIndex is the larger index, so
    // checking only lastIndex would let an out-of-range firstIndex through, and a still-negative
    // index would otherwise fail only after text had already been partially rewritten.
    if (firstIndex < 0 || firstIndex >= cutCount || lastIndex < 0 || lastIndex >= cutCount) {
        throw new ArrayIndexOutOfBoundsException("Requested cut does not exist (cutCount=" + cutCount + ")");
    }

    int position, length;

    if (firstIndex <= lastIndex) {
        // Forward range: copy one span starting at the first cut. Its length is the sum of the
        // cut lengths plus one byte for each delimiter between consecutive cuts.
        position = cutIndices[firstIndex][0];
        length = lastIndex - firstIndex;

        for (int i = firstIndex; i <= lastIndex; i++) {
            length += cutIndices[i][1];
        }

        text.set(sourceText.getBytes(), position, length);
    } else {
        // Reversed range: emit the cuts one at a time from firstIndex down to lastIndex,
        // re-inserting a delimiter byte between them.
        final byte[] delimBytes = new byte[] { (byte) delimiterChar };

        position = cutIndices[firstIndex][0];
        length = cutIndices[firstIndex][1];
        text.set(sourceText.getBytes(), position, length);

        for (int i = firstIndex - 1; i >= lastIndex; i--) {
            position = cutIndices[i][0];
            length = cutIndices[i][1];

            text.append(delimBytes, 0, 1);
            text.append(sourceText.getBytes(), position, length);
        }
    }

    return text;
}

From source file:edu.isi.mavuno.extract.ChunkExtractor.java

License:Apache License

/**
 * Rebuilds the chunk pairs for the next sentence returned by {@code mSentIter}.
 * <p>
 * The sentence's tokens are grouped into chunks (stored in {@code mChunks}), then
 * {@code addPair} is called for adjacent chunks and for chunk combinations separated by up to
 * {@code mMaxSkipSize} chunks. Finally {@code mChunkPairsIter} is reset to iterate over
 * {@code mChunkPairs}.
 */
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        // first byte of the chunk tag, or 0 when the tag is empty
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();
        // A chunk boundary occurs when the NE tag differs from the previous token's NE tag, or
        // when the NE tag is exactly "O" and the chunk tag starts with 'B' or 'O'.
        // (&& binds tighter than ||, so the condition reads: changed || (isO && (B || O)).)
        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            // flush the tokens collected so far as one chunk, then start a new one
            if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }
        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }

    // generate adjacent (context, pattern) pairs
    // For each pair of neighbouring chunks, two addPair calls are made: one whose middle argument
    // is the left chunk followed by ADJACENT_PATTERN_NAME, and one whose middle argument is
    // ADJACENT_PATTERN_NAME followed by the right chunk.
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }

    // generate non-adjacent (context, pattern) pairs based on chunks
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            // no chunk exists that far to the left of the pattern chunk
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }

            // "or" style: the left chunk is paired with the pattern on its own, with ASTERISK in
            // the third position (skipped when only right contexts are requested)
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                        ContextPatternWritable.ASTERISK);
            }

            // "or" style restricted to left contexts: nothing to emit for the right side
            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }

            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                // no chunk exists that far to the right of the pattern chunk
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }

                // construct (context, pattern) pair
                // NOTE(review): in the "or" branch the arguments do not depend on leftSkip, so the
                // same call is repeated once per valid leftSkip value - confirm that addPair
                // tolerates or de-duplicates these repeats.
                if (mOrContextStyle) {
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                } else {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }

    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}

From source file:edu.isi.mavuno.extract.ChunkExtractor.java

License:Apache License

/**
 * Joins the given terms with {@code MavunoUtils.SPACE_BYTES} into a single new Text.
 * <p>
 * Unless {@code mSurfaceForms} is set, a non-empty result is followed by
 * {@code MavunoUtils.PIPE_BYTES} and, when {@code type} is longer than two bytes, by the bytes
 * of {@code type} from offset 2 onward (presumably dropping a two-byte tag prefix - confirm).
 *
 * @param terms the terms that make up the chunk, in order
 * @param type  the chunk type tag
 * @return a newly allocated Text holding the chunk
 */
private Text createChunk(List<Text> terms, Text type) {
    final Text chunk = new Text();

    boolean first = true;
    for (Text term : terms) {
        // the separator goes between terms only, never before the first or after the last
        if (!first) {
            chunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
        chunk.append(term.getBytes(), 0, term.getLength());
        first = false;
    }

    if (mSurfaceForms || chunk.getLength() == 0) {
        return chunk;
    }

    chunk.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    final int typeLength = type.getLength();
    if (typeLength > 2) {
        chunk.append(type.getBytes(), 2, typeLength - 2);
    }
    return chunk;
}

From source file:edu.isi.mavuno.extract.CooccurExtractor.java

License:Apache License

/**
 * Fills {@code pattern} with {@code len} consecutive terms starting at {@code start}, joined by
 * {@code MavunoUtils.SPACE_BYTES}.
 *
 * @param pattern the Text to overwrite with the joined terms
 * @param terms   the available terms
 * @param start   index of the first term to include
 * @param len     number of terms to include
 * @return {@code false}, leaving {@code pattern} untouched, if {@code start} is negative or the
 *         range runs past the end of {@code terms}; {@code true} otherwise
 */
protected boolean getPattern(Text pattern, Text[] terms, int start, int len) {
    final int end = start + len;
    if (start < 0 || end > terms.length) {
        return false;
    }

    pattern.clear();
    for (int i = start; i < end; i++) {
        // separator precedes every term except the first
        if (i > start) {
            pattern.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
        final Text term = terms[i];
        pattern.append(term.getBytes(), 0, term.getLength());
    }

    return true;
}