Returns the number of code points in this UTF8 sequence. - Java java.lang

Java examples for java.lang:String UTF

Description

Returns the number of code points in this UTF8 sequence.

Demo Code

// from www.java2s.com
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class Main{
    /**
     * Counts the code points encoded in a UTF-8 byte sequence.
     *
     * <p>Only the lead byte of every encoded code point is examined: it
     * decides how many bytes the sequence occupies, and the remaining
     * (continuation) bytes are skipped without being inspected. This is
     * therefore <strong>not</strong> a full UTF-8 validator; the input is
     * expected to be well-formed already.
     *
     * @param utf8 the byte slice to scan; the bytes from {@code utf8.offset}
     *    (inclusive) to {@code utf8.offset + utf8.length} (exclusive) of
     *    {@code utf8.bytes} are read
     * @return the number of code points found in the slice; {@code 0} when the
     *    slice is empty
     * @throws IllegalArgumentException if a byte in lead position is not a valid
     *    lead byte (a continuation byte {@code 10xx xxxx}, or the header of a
     *    5/6 byte sequence {@code 1111 1xxx}), or if the last sequence needs
     *    more bytes than the slice still contains
     */
    public static int codePointCount(BytesRef utf8) {
        final byte[] bytes = utf8.bytes;
        final int limit = utf8.offset + utf8.length;

        int pos = utf8.offset;
        int codePointCount = 0;
        while (pos < limit) {
            // Mask to treat the (signed) Java byte as an unsigned value.
            final int lead = bytes[pos] & 0xFF;

            final int width;
            if (lead < /* 0xxx xxxx */0x80) {
                width = 1;
            } else if (lead < /* 110x xxxx */0xc0) {
                // 10xx xxxx is a continuation byte and can never start a sequence.
                throw new IllegalArgumentException(
                        "Invalid UTF-8 lead byte 0x" + Integer.toHexString(lead)
                                + " at position " + pos
                                + ": continuation byte without a preceding lead byte");
            } else if (lead < /* 111x xxxx */0xe0) {
                width = 2;
            } else if (lead < /* 1111 xxxx */0xf0) {
                width = 3;
            } else if (lead < /* 1111 1xxx */0xf8) {
                width = 4;
            } else {
                // 5 and 6 byte sequences are considered invalid.
                throw new IllegalArgumentException(
                        "Invalid UTF-8 lead byte 0x" + Integer.toHexString(lead)
                                + " at position " + pos
                                + ": 5 and 6 byte sequences are not supported");
            }

            // The sequence must fit entirely inside the slice; only the last
            // one can overrun, which means the content was cut short.
            final int remaining = limit - pos;
            if (width > remaining) {
                throw new IllegalArgumentException(
                        "Truncated UTF-8 sequence at position " + pos
                                + ": lead byte 0x" + Integer.toHexString(lead)
                                + " requires " + width + " bytes but only "
                                + remaining + " remain");
            }

            pos += width;
            codePointCount++;
        }

        return codePointCount;
    }
}

Related Tutorials