org.apache.cassandra.serializers.UTF8Serializer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.cassandra.serializers.UTF8Serializer.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.serializers;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

/**
 * Text serializer bound to the UTF-8 charset. Validation is delegated to the
 * byte-level state machine in {@link UTF8Validator}, which inspects the raw
 * bytes without decoding them into a {@code String}.
 */
public class UTF8Serializer extends AbstractTextSerializer {
    public static final UTF8Serializer instance = new UTF8Serializer();

    private UTF8Serializer() {
        super(StandardCharsets.UTF_8);
    }

    /**
     * Checks that {@code bytes} holds a well-formed UTF-8 byte sequence.
     *
     * @param bytes the value to check; it is only read through a slice, so its
     *              position and limit are left untouched
     * @throws MarshalException if {@code bytes} is null, contains a malformed
     *                          sequence, or ends in the middle of a multi-byte character
     */
    public void validate(ByteBuffer bytes) throws MarshalException {
        if (!UTF8Validator.validate(bytes))
            throw new MarshalException("String didn't validate.");
    }

    /**
     * Table-free state machine that walks a buffer one byte at a time and tracks
     * which continuation byte (if any) is expected next.
     */
    static class UTF8Validator {
        /**
         * What the validator expects next. The suffix names the accepted range of
         * the next byte; a trailing _N is the number of continuation bytes still due.
         */
        enum State {
            START, TWO, TWO_80, THREE_a0bf, THREE_80bf_1, THREE_80bf_2, FOUR_90bf, FOUR_80bf_3,
        }

        /** Returns true when {@code b} has the continuation-byte form 10xxxxxx (0x80-0xbf). */
        private static boolean isContinuation(int b) {
            return (b & 0xc0) == 0x80;
        }

        /**
         * Validates the remaining bytes of {@code buf} as UTF-8.
         *
         * Since we're not converting to java strings, we don't need to worry about
         * converting to surrogates; only the byte structure is checked. The
         * modified-UTF-8 null (0xc0 0x80) is accepted as a special case.
         *
         * NOTE(review): encoded surrogates (0xed 0xa0-0xbf ...) and four-byte
         * sequences above U+10FFFF (0xf4 0x90+ and lead bytes 0xf5-0xf7) are still
         * accepted here, as before; tightening that is a compatibility decision.
         *
         * @param buf buffer to check; it is sliced internally, so the caller's
         *            position and limit are not changed
         * @return true if every byte sequence is well formed and the input does not
         *         end mid-character; false otherwise, including when {@code buf} is null
         */
        static boolean validate(ByteBuffer buf) {
            if (buf == null)
                return false;

            // work on a private view so the caller's position is not consumed.
            buf = buf.slice();
            State state = State.START;
            while (buf.hasRemaining()) {
                // bytes are signed: anything >= 0x80 arrives here as a negative int.
                int b = buf.get();
                switch (state) {
                case START:
                    if (b >= 0) {
                        // ascii, state stays start.
                    } else if ((b >> 5) == -2) {
                        // first byte of 2-byte char, 0xc0-0xdf by bit pattern; only 0xc2-0xdf are legal.
                        if (b == (byte) 0xc0)
                            // special case: modified utf8 null is 0xc080.
                            state = State.TWO_80;
                        else if ((b & 0x1e) == 0)
                            return false; // 0xc1: overlong encoding of an ascii char.
                        else
                            state = State.TWO;
                    } else if ((b >> 4) == -2) {
                        // 3 bytes. first byte will be 0xe0 or 0xe1-0xef. handling of second byte will differ.
                        // so 0xe0,0xa0-0xbf,0x80-0xbf or 0xe1-0xef,0x80-0xbf,0x80-0xbf.
                        if (b == (byte) 0xe0)
                            state = State.THREE_a0bf;
                        else
                            state = State.THREE_80bf_2;
                    } else if ((b >> 3) == -2) {
                        // 4 bytes. this is where the fun starts.
                        if (b == (byte) 0xf0)
                            // 0xf0, 0x90-0xbf, 0x80-0xbf, 0x80-0xbf
                            state = State.FOUR_90bf;
                        else
                            // any other lead byte matching 11110xxx: three plain continuation bytes.
                            state = State.FOUR_80bf_3;
                    } else
                        return false; // malformed: stray continuation byte or 0xf8-0xff.
                    break;
                case TWO:
                    // validate second byte of 2-byte char, 0x80-0xbf
                    if (!isContinuation(b))
                        return false;
                    state = State.START;
                    break;
                case TWO_80:
                    // after 0xc0 only 0x80 is allowed (modified utf8 null).
                    if (b != (byte) 0x80)
                        return false;
                    state = State.START;
                    break;
                case THREE_a0bf:
                    // expecting 0xa0-0xbf: 0x80-0x9f would be an overlong encoding, and
                    // anything outside 0x80-0xbf is not a continuation byte at all.
                    if ((b & 0xe0) != 0xa0)
                        return false;
                    state = State.THREE_80bf_1;
                    break;
                case THREE_80bf_1:
                    // expecting 0x80-0xbf
                    if (!isContinuation(b))
                        return false;
                    state = State.START;
                    break;
                case THREE_80bf_2:
                    // expecting 0x80-bf and then another of the same.
                    if (!isContinuation(b))
                        return false;
                    state = State.THREE_80bf_1;
                    break;
                case FOUR_90bf:
                    // expecting 0x90-bf. 2nd byte of 4byte sequence. after that it should degrade to
                    // 80-bf,80-bf (like 3byte seq). must be a continuation byte AND not 0x80-0x8f
                    // (which would be an overlong encoding).
                    if (!isContinuation(b) || (b & 0x30) == 0)
                        return false;
                    state = State.THREE_80bf_2;
                    break;
                case FOUR_80bf_3:
                    // expecting 0x80-bf 3 times. degenerates to THREE_80bf_2.
                    if (!isContinuation(b))
                        return false;
                    state = State.THREE_80bf_2;
                    break;
                default:
                    return false; // invalid state.
                }
            }
            // if state != start, we've got underflow. that's an error.
            return state == State.START;
        }
    }
}