Example usages of org.apache.hadoop.io.Text.bytesToCodePoint

Introduction

This page collects example usages of the static method org.apache.hadoop.io.Text.bytesToCodePoint, taken from several open-source projects.

Prototype

public static int bytesToCodePoint(ByteBuffer bytes)

Source Link

Document

Returns the next code point at the current position in the buffer.

Usage

From source file:crunch.MaxTemperature.java

License:Apache License

/**
 * Prints, in hexadecimal, every code point of a sample {@link Text} value.
 *
 * <p>The sample string is written with five UTF-16 escapes, the last two of
 * which form a surrogate pair, so it contains four code points.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    // Restrict the view to the first getLength() bytes of the backing array.
    final ByteBuffer encoded = ByteBuffer.wrap(sample.getBytes(), 0, sample.getLength());

    while (encoded.hasRemaining()) {
        // Each call reads one code point and advances the buffer.
        final int codePoint = Text.bytesToCodePoint(encoded);
        if (codePoint == -1) {
            // Stop at the first -1 result, exactly like the loop condition did.
            break;
        }
        System.out.println(Integer.toHexString(codePoint));
    }
}

From source file:io.aos.hdfs.TextIterator.java

License:Apache License

/**
 * Walks a sample {@link Text} value one code point at a time and prints each
 * code point as a hexadecimal number on its own line.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String... args) {
    Text text = new Text("\u0041\u00DF\u6771\uD801\uDC00");
    byte[] raw = text.getBytes();
    int validLength = text.getLength();

    // Only the first validLength bytes of the array are exposed to the decoder.
    ByteBuffer cursor = ByteBuffer.wrap(raw, 0, validLength);

    // The loop ends when the buffer is exhausted or the decoder returns -1.
    for (int codePoint;
            cursor.hasRemaining() && (codePoint = Text.bytesToCodePoint(cursor)) != -1; ) {
        System.out.println(Integer.toHexString(codePoint));
    }
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

License:Apache License

/**
 * Get the next code point from the ByteBuffer. Moves the position in the
 * ByteBuffer forward to the next code point.
 * @param param the source of bytes/*from  www.j av a2s  .  c o m*/
 * @param defaultValue if there are no bytes left, use this value
 * @return the code point that was found at the front of the buffer.
 */
static int getNextCodepoint(ByteBuffer param, int defaultValue) {
    if (param.remaining() == 0) {
        return defaultValue;
    } else {
        return Text.bytesToCodePoint(param);
    }
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

License:Apache License

/**
 * Mask a string by finding the character category of each character
 * and replacing it with the matching literal.
 * @param source the source column vector
 * @param row the value index/* w  ww.  j  a v a2  s. c o  m*/
 * @param target the target column vector
 */
void maskString(BytesColumnVector source, int row, BytesColumnVector target) {
    int expectedBytes = source.length[row];
    ByteBuffer sourceBytes = ByteBuffer.wrap(source.vector[row], source.start[row], source.length[row]);
    // ensure we have enough space, if the masked data is the same size
    target.ensureValPreallocated(expectedBytes);
    byte[] outputBuffer = target.getValPreallocatedBytes();
    int outputOffset = target.getValPreallocatedStart();
    int outputStart = outputOffset;

    int index = 0;
    while (sourceBytes.remaining() > 0) {
        int cp = Text.bytesToCodePoint(sourceBytes);

        // Find the replacement for the current character.
        int replacement = getReplacement(cp);
        if (replacement == UNMASKED_CHAR || isIndexInUnmaskRange(index, source.length[row])) {
            replacement = cp;
        }

        // increment index
        index++;

        int len = getCodepointLength(replacement);

        // If the translation will overflow the buffer, we need to resize.
        // This will only happen when the masked size is larger than the original.
        if (len + outputOffset > outputBuffer.length) {
            // Revise estimate how much we are going to need now. We are maximally
            // pesamistic here so that we don't have to expand again for this value.
            int currentOutputStart = outputStart;
            int currentOutputLength = outputOffset - currentOutputStart;
            expectedBytes = currentOutputLength + len + sourceBytes.remaining() * 4;

            // Expand the buffer to fit the new estimate
            target.ensureValPreallocated(expectedBytes);

            // Copy over the bytes we've already written for this value and move
            // the pointers to the new output buffer.
            byte[] oldBuffer = outputBuffer;
            outputBuffer = target.getValPreallocatedBytes();
            outputOffset = target.getValPreallocatedStart();
            outputStart = outputOffset;
            System.arraycopy(oldBuffer, currentOutputStart, outputBuffer, outputOffset, currentOutputLength);
            outputOffset += currentOutputLength;
        }

        // finally copy the bytes
        writeCodepoint(replacement, outputBuffer, outputOffset, len);
        outputOffset += len;
    }
    target.setValPreallocated(row, outputOffset - outputStart);
}

From source file:org.mgrover.hive.translate.GenericUDFTranslate.java

License:Apache License

/**
 * Pre-process the from and to strings populate {@link #replacementMap} and {@link #deletionSet}.
 * /*from w  w  w .j av a  2  s .c o  m*/
 * @param from
 *          from string to be used for translation
 * @param to
 *          to string to be used for translation
 */
private void populateMappings(Text from, Text to) {
    replacementMap.clear();
    deletionSet.clear();

    ByteBuffer fromBytes = ByteBuffer.wrap(from.getBytes(), 0, from.getLength());
    ByteBuffer toBytes = ByteBuffer.wrap(to.getBytes(), 0, to.getLength());

    // Traverse through the from string, one code point at a time
    while (fromBytes.hasRemaining()) {
        // This will also move the iterator ahead by one code point
        int fromCodePoint = Text.bytesToCodePoint(fromBytes);
        // If the to string has more code points, make sure to traverse it too
        if (toBytes.hasRemaining()) {
            int toCodePoint = Text.bytesToCodePoint(toBytes);
            // If the code point from from string already has a replacement or is to be deleted, we
            // don't need to do anything, just move on to the next code point
            if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) {
                continue;
            }
            replacementMap.put(fromCodePoint, toCodePoint);
        } else {
            // If the code point from from string already has a replacement or is to be deleted, we
            // don't need to do anything, just move on to the next code point
            if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) {
                continue;
            }
            deletionSet.add(fromCodePoint);
        }
    }
}

From source file:org.mgrover.hive.translate.GenericUDFTranslate.java

License:Apache License

/**
 * Translates the input string based on {@link #replacementMap} and {@link #deletionSet} and
 * returns the translated string./*from w w w  .  j a va 2s.  c om*/
 * 
 * @param input
 *          input string to perform the translation on
 * @return translated string
 */
private String processInput(Text input) {
    StringBuilder resultBuilder = new StringBuilder();
    // Obtain the byte buffer from the input string so we can traverse it code point by code point
    ByteBuffer inputBytes = ByteBuffer.wrap(input.getBytes(), 0, input.getLength());
    // Traverse the byte buffer containing the input string one code point at a time
    while (inputBytes.hasRemaining()) {
        int inputCodePoint = Text.bytesToCodePoint(inputBytes);
        // If the code point exists in deletion set, no need to emit out anything for this code point.
        // Continue on to the next code point
        if (deletionSet.contains(inputCodePoint)) {
            continue;
        }

        Integer replacementCodePoint = replacementMap.get(inputCodePoint);
        // If a replacement exists for this code point, emit out the replacement and append it to the
        // output string. If no such replacement exists, emit out the original input code point
        char[] charArray = Character
                .toChars((replacementCodePoint != null) ? replacementCodePoint : inputCodePoint);
        resultBuilder.append(charArray);
    }
    String resultString = resultBuilder.toString();
    return resultString;
}