Example usage for org.apache.lucene.util UnicodeUtil UNI_SUR_HIGH_START

List of usage examples for org.apache.lucene.util UnicodeUtil UNI_SUR_HIGH_START

Introduction

On this page you can find example usage of org.apache.lucene.util UnicodeUtil UNI_SUR_HIGH_START.

Prototype

int UNI_SUR_HIGH_START

To view the source code for org.apache.lucene.util UnicodeUtil UNI_SUR_HIGH_START, use the link below.

Click Source Link

Usage

From source file: org.elasticsearch.common.Unicode.java

License:Apache License

/**
 * Convert UTF8 bytes into UTF16 characters.  If offset
 * is non-zero, conversion starts at that starting point
 * in utf8, re-using the results from the previous call
 * up until offset./*w  w  w .  ja v  a  2 s. c o  m*/
 */
public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length,
        final UTF16Result result) {

    final int end = offset + length;
    char[] out = result.result;
    //        if (result.offsets.length <= end) {
    //            int[] newOffsets = new int[2 * end];
    //            System.arraycopy(result.offsets, 0, newOffsets, 0, result.offsets.length);
    //            result.offsets = newOffsets;
    //        }
    //        final int[] offsets = result.offsets;

    // If incremental decoding fell in the middle of a
    // single unicode character, rollback to its start:
    int upto = offset;
    //        while (offsets[upto] == -1)
    //            upto--;

    int outUpto = 0; // offsets[upto];

    // Pre-allocate for worst case 1-for-1
    if (outUpto + length >= out.length) {
        char[] newOut = new char[2 * (outUpto + length)];
        System.arraycopy(out, 0, newOut, 0, outUpto);
        result.result = out = newOut;
    }

    while (upto < end) {

        final int b = utf8[upto] & 0xff;
        final int ch;

        upto += 1; // CHANGE
        //            offsets[upto++] = outUpto;

        if (b < 0xc0) {
            assert b < 0x80;
            ch = b;
        } else if (b < 0xe0) {
            ch = ((b & 0x1f) << 6) + (utf8[upto] & 0x3f);
            upto += 1; // CHANGE
            //                offsets[upto++] = -1;
        } else if (b < 0xf0) {
            ch = ((b & 0xf) << 12) + ((utf8[upto] & 0x3f) << 6) + (utf8[upto + 1] & 0x3f);
            upto += 2; // CHANGE
            //                offsets[upto++] = -1;
            //                offsets[upto++] = -1;
        } else {
            assert b < 0xf8;
            ch = ((b & 0x7) << 18) + ((utf8[upto] & 0x3f) << 12) + ((utf8[upto + 1] & 0x3f) << 6)
                    + (utf8[upto + 2] & 0x3f);
            upto += 3; // CHANGE
            //                offsets[upto++] = -1;
            //                offsets[upto++] = -1;
            //                offsets[upto++] = -1;
        }

        if (ch <= UNI_MAX_BMP) {
            // target is a character <= 0xFFFF
            out[outUpto++] = (char) ch;
        } else {
            // target is a character in range 0xFFFF - 0x10FFFF
            final int chHalf = ch - HALF_BASE;
            out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UnicodeUtil.UNI_SUR_HIGH_START);
            out[outUpto++] = (char) ((chHalf & HALF_MASK) + UnicodeUtil.UNI_SUR_LOW_START);
        }
    }

    //        offsets[upto] = outUpto;
    result.length = outUpto;
}