Convert UTF8 bytes into UTF16 characters. - Java java.lang

Java examples for java.lang:String UTF

Description

Convert UTF8 bytes into UTF16 characters.

Demo Code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Static helpers that decode UTF-8 byte sequences into UTF-16 {@code char}s.
 *
 * <p>None of the decoders validate their input. Malformed or truncated UTF-8
 * may produce meaningless output or an {@link ArrayIndexOutOfBoundsException}.
 */
public class Main{
    /** First code unit of the UTF-16 high (leading) surrogate range. */
    public static final int UNI_SUR_HIGH_START = 0xD800;
    /** First code unit of the UTF-16 low (trailing) surrogate range. */
    public static final int UNI_SUR_LOW_START = 0xDC00;
    /** Largest code point that fits in a single UTF-16 code unit. */
    private static final long UNI_MAX_BMP = 0x0000FFFF;
    /** Value subtracted from a supplementary code point before it is split into surrogates. */
    private static final int HALF_BASE = 0x0010000;
    /** Number of payload bits carried by each surrogate. */
    private static final long HALF_SHIFT = 10;
    /** Mask selecting the low {@link #HALF_SHIFT} bits (the low-surrogate payload). */
    private static final long HALF_MASK = 0x3FFL;

    /**
     * Convert UTF8 bytes into UTF16 characters.  If offset
     * is non-zero, conversion starts at that starting point
     * in utf8, re-using the results from the previous call
     * up until offset.
     *
     * <p>On return {@code result.result} holds the decoded chars and
     * {@code result.length} the number of valid chars. For every byte
     * decoded, {@code result.offsets} holds the output index of the char
     * produced by that byte if it starts a sequence, or {@code -1} if it is
     * a continuation byte. The entry just past the last decoded byte holds
     * the total output length.
     *
     * @param utf8   source bytes, assumed to be well-formed UTF-8
     * @param offset index in {@code utf8} at which decoding (re)starts
     * @param length number of bytes to decode starting at {@code offset}
     * @param result receives the decoded chars and the byte-to-char offset map;
     *               its arrays are replaced when they are too small
     */
    public static void UTF8toUTF16(final byte[] utf8, final int offset,
            final int length, final UTF16Result result) {

        final int end = offset + length;
        char[] out = result.result;
        // offsets needs one slot per byte plus a trailing slot for the final length.
        if (result.offsets.length <= end) {
            result.offsets = ArrayUtil.grow(result.offsets, end + 1);
        }
        final int[] offsets = result.offsets;

        // If incremental decoding fell in the middle of a
        // single unicode character, rollback to its start:
        int upto = offset;
        while (offsets[upto] == -1) {
            upto--;
        }

        // Resume writing where the (possibly rolled-back) character began.
        int outUpto = offsets[upto];

        // Pre-allocate for worst case 1-for-1
        if (outUpto + length >= out.length) {
            out = result.result = ArrayUtil.grow(out, outUpto + length + 1);
        }

        while (upto < end) {

            final int b = utf8[upto] & 0xff;
            final int ch;

            // The lead byte maps to the output position of its character ...
            offsets[upto++] = outUpto;

            // ... and every continuation byte is marked with -1.
            if (b < 0xc0) {
                // Single-byte (ASCII) sequence.
                assert b < 0x80;
                ch = b;
            } else if (b < 0xe0) {
                // Two-byte sequence.
                ch = ((b & 0x1f) << 6) + (utf8[upto] & 0x3f);
                offsets[upto++] = -1;
            } else if (b < 0xf0) {
                // Three-byte sequence.
                ch = ((b & 0xf) << 12) + ((utf8[upto] & 0x3f) << 6)
                        + (utf8[upto + 1] & 0x3f);
                offsets[upto++] = -1;
                offsets[upto++] = -1;
            } else {
                // Four-byte sequence.
                assert b < 0xf8;
                ch = ((b & 0x7) << 18) + ((utf8[upto] & 0x3f) << 12)
                        + ((utf8[upto + 1] & 0x3f) << 6)
                        + (utf8[upto + 2] & 0x3f);
                offsets[upto++] = -1;
                offsets[upto++] = -1;
                offsets[upto++] = -1;
            }

            outUpto = writeCodePoint(out, outUpto, ch);
        }

        offsets[upto] = outUpto;
        result.length = outUpto;
    }

    /**
     * Interprets the given byte array as UTF-8 and converts to UTF-16. The {@link CharsRef} will be extended if
     * it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 code unit.
     * <p>
     * NOTE: Full characters are read, even if this reads past the length passed (and
     * can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed).
     * Explicit checks for valid UTF-8 are not performed.
     *
     * @param utf8   source bytes, assumed to be well-formed UTF-8
     * @param offset index of the first byte to decode
     * @param length number of bytes to decode
     * @param chars  receives the decoded chars; its {@code offset} is reset to 0,
     *               and its {@code chars} and {@code length} are overwritten
     */
    // TODO: broken if incoming result.offset != 0
    public static void UTF8toUTF16(byte[] utf8, int offset, int length,
            CharsRef chars) {
        int outUpto = chars.offset = 0;
        final char[] out = chars.chars = ArrayUtil
                .grow(chars.chars, length);
        final int limit = offset + length;
        while (offset < limit) {
            final int b = utf8[offset++] & 0xff;
            if (b < 0xc0) {
                // Single-byte (ASCII) sequence.
                assert b < 0x80;
                out[outUpto++] = (char) b;
            } else if (b < 0xe0) {
                // Two-byte sequence.
                out[outUpto++] = (char) (((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
            } else if (b < 0xf0) {
                // Three-byte sequence.
                out[outUpto++] = (char) (((b & 0xf) << 12)
                        + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
                offset += 2;
            } else {
                // Four-byte sequence.
                assert b < 0xf8 : "b=" + b;
                final int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12)
                        + ((utf8[offset + 1] & 0x3f) << 6)
                        + (utf8[offset + 2] & 0x3f);
                offset += 3;
                // Shared helper keeps the BMP boundary test (<= 0xFFFF) identical
                // to the UTF16Result overload.
                outUpto = writeCodePoint(out, outUpto, ch);
            }
        }
        chars.length = outUpto - chars.offset;
    }

    /**
     * Utility method for {@link #UTF8toUTF16(byte[], int, int, CharsRef)}
     * @see #UTF8toUTF16(byte[], int, int, CharsRef)
     */
    public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) {
        UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars);
    }

    /**
     * Writes one code point into {@code out} as UTF-16, starting at {@code pos}.
     * Code points up to {@code 0xFFFF} are written as a single char; larger
     * values are written as a high/low surrogate pair.
     *
     * @param out       destination array; the caller guarantees enough room
     * @param pos       index at which to write the first char
     * @param codePoint the decoded code point
     * @return the index just past the last char written
     */
    private static int writeCodePoint(final char[] out, int pos, final int codePoint) {
        if (codePoint <= UNI_MAX_BMP) {
            // target is a character <= 0xFFFF
            out[pos++] = (char) codePoint;
        } else {
            // target is a character in range 0xFFFF - 0x10FFFF
            final int chHalf = codePoint - HALF_BASE;
            out[pos++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START);
            out[pos++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
        }
        return pos;
    }
}

Related Tutorials