Encode characters from a char[] source, starting at offset for length chars.

Description

Demo Code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class Main{
    /**
     * Encodes {@code length} UTF-16 chars of {@code source}, starting at
     * {@code offset}, as UTF-8 into {@code result} and returns a hash of the
     * bytes written.
     *
     * <p>Output is always written from index 0 of {@code result.bytes}, and
     * {@code result.offset} is reset to 0 regardless of its incoming value.
     * If {@code result.bytes} is shorter than {@code length * 4}, it is
     * replaced by a new array sized via {@code ArrayUtil.oversize}. On return,
     * {@code result.length} holds the number of bytes written.
     *
     * <p>A high surrogate immediately followed by a low surrogate is encoded
     * as one 4-byte sequence. Any other surrogate (a lone high surrogate, a
     * high surrogate at the end of the range, or a low surrogate with no
     * preceding high surrogate) is replaced by U+FFFD, encoded as
     * {@code EF BF BD}.
     *
     * <p>The hash starts at 0 and is updated as {@code hash = 31 * hash + b}
     * for every output byte {@code b} in order, with {@code b} taken as a
     * signed byte.
     *
     * @param source char array to read from
     * @param offset index of the first char to encode
     * @param length number of chars to encode
     * @param result receives the encoded bytes; its {@code bytes},
     *               {@code offset} and {@code length} fields are overwritten
     * @return the hash of the encoded bytes (0 when nothing is encoded)
     */
    // TODO: broken if incoming result.offset != 0
    public static int UTF16toUTF8WithHash(final char[] source,
            final int offset, final int length, BytesRef result) {
        final int end = offset + length;
        byte[] out = result.bytes;
        // Pre-allocate for worst case 4-for-1.
        // NOTE: computed in int arithmetic, so it overflows for
        // length > Integer.MAX_VALUE / 4.
        final int maxLen = length * 4;
        if (out.length < maxLen) {
            out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
        }
        result.offset = 0;

        int hash = 0;
        int upto = 0;
        int i = offset;
        while (i < end) {
            // Remember where this char's bytes begin so they can be hashed below.
            final int start = upto;
            final char c = source[i++];

            if (c < 0x80) {
                // 1 byte: 0xxxxxxx
                out[upto++] = (byte) c;
            } else if (c < 0x800) {
                // 2 bytes: 110xxxxx 10xxxxxx
                out[upto++] = (byte) (0xC0 | (c >> 6));
                out[upto++] = (byte) (0x80 | (c & 0x3F));
            } else if (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE) {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                out[upto++] = (byte) (0xE0 | (c >> 12));
                out[upto++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (c & 0x3F));
            } else if (Character.isHighSurrogate(c) && i < end
                    && Character.isLowSurrogate(source[i])) {
                // Valid surrogate pair: consume the low surrogate as well and
                // emit 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                final int codePoint = Character.toCodePoint(c, source[i++]);
                out[upto++] = (byte) (0xF0 | (codePoint >> 18));
                out[upto++] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
                out[upto++] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
                out[upto++] = (byte) (0x80 | (codePoint & 0x3F));
            } else {
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character (U+FFFD)
                out[upto++] = (byte) 0xEF;
                out[upto++] = (byte) 0xBF;
                out[upto++] = (byte) 0xBD;
            }

            // Fold the bytes just written into the hash, in output order.
            // out[k] is promoted as a signed byte, so bytes >= 0x80 contribute
            // negative values.
            for (int k = start; k < upto; k++) {
                hash = 31 * hash + out[k];
            }
        }
        //assert matches(source, offset, length, out, upto);
        result.length = upto;
        return hash;
    }
}
Encode characters from a char[] source, starting at offset for length chars. - Java java.lang

Description

Demo Code

Related Tutorials