Encode characters from a char[] source, starting at offset and stopping when the character 0xffff is seen.

Description

Demo Code

/**/*from ww w  .ja  v  a  2 s .  com*/
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class Main{
    private static final int SURROGATE_OFFSET = Character.MIN_SUPPLEMENTARY_CODE_POINT
            - (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
    /** Encode characters from a char[] source, starting at
     *  offset and stopping when the character 0xffff is seen.
     *  Returns the number of bytes written to bytesOut. */
    public static void UTF16toUTF8(final char[] source, final int offset,
            UTF8Result result) {

        int upto = 0;
        int i = offset;
        byte[] out = result.result;

        while (true) {

            final int code = (int) source[i++];

            if (upto + 4 > out.length) {
                out = result.result = ArrayUtil.grow(out, upto + 4);
            }
            if (code < 0x80)
                out[upto++] = (byte) code;
            else if (code < 0x800) {
                out[upto++] = (byte) (0xC0 | (code >> 6));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else if (code < 0xD800 || code > 0xDFFF) {
                if (code == 0xffff)
                    // END
                    break;
                out[upto++] = (byte) (0xE0 | (code >> 12));
                out[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else {
                // surrogate pair
                // confirm valid high surrogate
                if (code < 0xDC00 && source[i] != 0xffff) {
                    int utf32 = (int) source[i];
                    // confirm valid low surrogate and write pair
                    if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                        utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
                        i++;
                        out[upto++] = (byte) (0xF0 | (utf32 >> 18));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
                        out[upto++] = (byte) (0x80 | (utf32 & 0x3F));
                        continue;
                    }
                }
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character
                out[upto++] = (byte) 0xEF;
                out[upto++] = (byte) 0xBF;
                out[upto++] = (byte) 0xBD;
            }
        }
        //assert matches(source, offset, i-offset-1, out, upto);
        result.length = upto;
    }
    /** Encode characters from a char[] source, starting at
     *  offset for length chars.  Returns the number of bytes
     *  written to bytesOut. */
    public static void UTF16toUTF8(final char[] source, final int offset,
            final int length, UTF8Result result) {

        int upto = 0;
        int i = offset;
        final int end = offset + length;
        byte[] out = result.result;

        while (i < end) {

            final int code = (int) source[i++];

            if (upto + 4 > out.length) {
                out = result.result = ArrayUtil.grow(out, upto + 4);
            }
            if (code < 0x80)
                out[upto++] = (byte) code;
            else if (code < 0x800) {
                out[upto++] = (byte) (0xC0 | (code >> 6));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else if (code < 0xD800 || code > 0xDFFF) {
                out[upto++] = (byte) (0xE0 | (code >> 12));
                out[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else {
                // surrogate pair
                // confirm valid high surrogate
                if (code < 0xDC00 && i < end && source[i] != 0xffff) {
                    int utf32 = (int) source[i];
                    // confirm valid low surrogate and write pair
                    if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                        utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
                        i++;
                        out[upto++] = (byte) (0xF0 | (utf32 >> 18));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
                        out[upto++] = (byte) (0x80 | (utf32 & 0x3F));
                        continue;
                    }
                }
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character
                out[upto++] = (byte) 0xEF;
                out[upto++] = (byte) 0xBF;
                out[upto++] = (byte) 0xBD;
            }
        }
        //assert matches(source, offset, length, out, upto);
        result.length = upto;
    }
    /** Encode characters from this String, starting at offset
     *  for length characters.  Returns the number of bytes
     *  written to bytesOut. */
    public static void UTF16toUTF8(final String s, final int offset,
            final int length, UTF8Result result) {
        final int end = offset + length;

        byte[] out = result.result;

        int upto = 0;
        for (int i = offset; i < end; i++) {
            final int code = (int) s.charAt(i);

            if (upto + 4 > out.length) {
                out = result.result = ArrayUtil.grow(out, upto + 4);
            }
            if (code < 0x80)
                out[upto++] = (byte) code;
            else if (code < 0x800) {
                out[upto++] = (byte) (0xC0 | (code >> 6));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else if (code < 0xD800 || code > 0xDFFF) {
                out[upto++] = (byte) (0xE0 | (code >> 12));
                out[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else {
                // surrogate pair
                // confirm valid high surrogate
                if (code < 0xDC00 && (i < end - 1)) {
                    int utf32 = (int) s.charAt(i + 1);
                    // confirm valid low surrogate and write pair
                    if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                        utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
                        i++;
                        out[upto++] = (byte) (0xF0 | (utf32 >> 18));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
                        out[upto++] = (byte) (0x80 | (utf32 & 0x3F));
                        continue;
                    }
                }
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character
                out[upto++] = (byte) 0xEF;
                out[upto++] = (byte) 0xBF;
                out[upto++] = (byte) 0xBD;
            }
        }
        //assert matches(s, offset, length, out, upto);
        result.length = upto;
    }
    /** Encode characters from this String, starting at offset
     *  for length characters. After encoding, result.offset will always be 0.
     */
    // TODO: broken if incoming result.offset != 0
    public static void UTF16toUTF8(final CharSequence s, final int offset,
            final int length, BytesRef result) {
        final int end = offset + length;

        byte[] out = result.bytes;
        result.offset = 0;
        // Pre-allocate for worst case 4-for-1
        final int maxLen = length * 4;
        if (out.length < maxLen)
            out = result.bytes = new byte[maxLen];

        int upto = 0;
        for (int i = offset; i < end; i++) {
            final int code = (int) s.charAt(i);

            if (code < 0x80)
                out[upto++] = (byte) code;
            else if (code < 0x800) {
                out[upto++] = (byte) (0xC0 | (code >> 6));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else if (code < 0xD800 || code > 0xDFFF) {
                out[upto++] = (byte) (0xE0 | (code >> 12));
                out[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else {
                // surrogate pair
                // confirm valid high surrogate
                if (code < 0xDC00 && (i < end - 1)) {
                    int utf32 = (int) s.charAt(i + 1);
                    // confirm valid low surrogate and write pair
                    if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                        utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
                        i++;
                        out[upto++] = (byte) (0xF0 | (utf32 >> 18));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
                        out[upto++] = (byte) (0x80 | (utf32 & 0x3F));
                        continue;
                    }
                }
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character
                out[upto++] = (byte) 0xEF;
                out[upto++] = (byte) 0xBF;
                out[upto++] = (byte) 0xBD;
            }
        }
        //assert matches(s, offset, length, out, upto);
        result.length = upto;
    }
    /** Encode characters from a char[] source, starting at
     *  offset for length chars. After encoding, result.offset will always be 0.
     */
    // TODO: broken if incoming result.offset != 0
    public static void UTF16toUTF8(final char[] source, final int offset,
            final int length, BytesRef result) {

        int upto = 0;
        int i = offset;
        final int end = offset + length;
        byte[] out = result.bytes;
        // Pre-allocate for worst case 4-for-1
        final int maxLen = length * 4;
        if (out.length < maxLen)
            out = result.bytes = new byte[maxLen];
        result.offset = 0;

        while (i < end) {

            final int code = (int) source[i++];

            if (code < 0x80)
                out[upto++] = (byte) code;
            else if (code < 0x800) {
                out[upto++] = (byte) (0xC0 | (code >> 6));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else if (code < 0xD800 || code > 0xDFFF) {
                out[upto++] = (byte) (0xE0 | (code >> 12));
                out[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (code & 0x3F));
            } else {
                // surrogate pair
                // confirm valid high surrogate
                if (code < 0xDC00 && i < end) {
                    int utf32 = (int) source[i];
                    // confirm valid low surrogate and write pair
                    if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                        utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
                        i++;
                        out[upto++] = (byte) (0xF0 | (utf32 >> 18));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
                        out[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
                        out[upto++] = (byte) (0x80 | (utf32 & 0x3F));
                        continue;
                    }
                }
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character
                out[upto++] = (byte) 0xEF;
                out[upto++] = (byte) 0xBF;
                out[upto++] = (byte) 0xBD;
            }
        }
        //assert matches(source, offset, length, out, upto);
        result.length = upto;
    }
}
Encode characters from a char[] source, starting at offset and stopping when the character 0xffff is seen. - Java java.lang

Description

Demo Code

Related Tutorials