Reads a string in UTF-8 encoding from a byte array. - Java java.lang

Java examples for java.lang:String UTF

Description

Reads a string in UTF-8 encoding from a byte array.

Demo Code

/* From www.java2s.com
 Written in 2013 by Peter O.
 Any copyright is dedicated to the Public Domain.
 http://creativecommons.org/publicdomain/zero/1.0/
 If you like this, you should donate to Peter O.
 at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
 */
//package com.java2s;

public class Main {
    /** Character appended in place of each malformed UTF-8 sequence (U+FFFD). */
    private static final char REPLACEMENT = '\uFFFD';

    /**
     * Decodes a range of a byte array as UTF-8 and appends the decoded text
     * to a string builder.
     *
     * <p>Only well-formed UTF-8 is accepted: lead bytes outside 0xC2-0xF4,
     * overlong forms, encoded surrogates (U+D800-U+DFFF) and values above
     * U+10FFFF are all treated as invalid. Code points above U+FFFF are
     * appended as a surrogate pair.
     *
     * @param data The byte array holding the UTF-8 data.
     * @param offset Index in {@code data} of the first byte to decode.
     * @param bytesCount Number of bytes to decode, starting at {@code offset}.
     * @param builder Receives the decoded characters. Text decoded before an
     * error is left in the builder even when -1 is returned.
     * @param replace If true, each invalid sequence is replaced with the
     * replacement character (U+FFFD) and decoding continues. If false,
     * decoding stops at the first invalid sequence.
     * @return 0 if the whole range was processed, or -1 if invalid UTF-8 was
     * found and {@code replace} is false.
     * @throws NullPointerException The parameter {@code data} is null or
     * {@code builder} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less
     * than 0 or greater than the length of {@code data}, {@code bytesCount}
     * is less than 0 or greater than the length of {@code data}, or fewer
     * than {@code bytesCount} bytes are available from {@code offset}.
     */
    public static int ReadUtf8FromBytes(byte[] data, int offset,
            int bytesCount, StringBuilder builder, boolean replace) {
        // Argument validation; the order of these checks is part of the
        // observable behavior (it decides which exception wins).
        if (data == null) {
            throw new NullPointerException("data");
        }
        final int available = data.length;
        if (offset < 0) {
            throw new IllegalArgumentException(
                    "offset (" + offset + ") is less than 0");
        }
        if (offset > available) {
            throw new IllegalArgumentException(
                    "offset (" + offset + ") is more than " + available);
        }
        if (bytesCount < 0) {
            throw new IllegalArgumentException(
                    "bytesCount (" + bytesCount + ") is less than 0");
        }
        if (bytesCount > available) {
            throw new IllegalArgumentException(
                    "bytesCount (" + bytesCount + ") is more than " + available);
        }
        if (available - offset < bytesCount) {
            throw new IllegalArgumentException(
                    "data.length minus offset (" + (available - offset)
                            + ") is less than " + bytesCount);
        }
        if (builder == null) {
            throw new NullPointerException("builder");
        }

        final int end = offset + bytesCount;
        int codePoint = 0;   // bits gathered so far for the current sequence
        int remaining = 0;   // continuation bytes still expected
        int minTrail = 0x80; // allowed range for the next continuation byte
        int maxTrail = 0xbf;

        for (int i = offset; i < end; i++) {
            final int octet = data[i] & 0xff;

            if (remaining > 0) {
                // In the middle of a multi-byte sequence.
                if (octet >= minTrail && octet <= maxTrail) {
                    // Only the first continuation byte has a narrowed range.
                    minTrail = 0x80;
                    maxTrail = 0xbf;
                    codePoint = (codePoint << 6) | (octet & 0x3f);
                    --remaining;
                    if (remaining == 0) {
                        // Emits one char, or a surrogate pair above U+FFFF.
                        builder.appendCodePoint(codePoint);
                        codePoint = 0;
                    }
                    continue;
                }
                // Bad continuation byte: abandon the partial sequence.
                codePoint = 0;
                remaining = 0;
                minTrail = 0x80;
                maxTrail = 0xbf;
                if (!replace) {
                    return -1;
                }
                builder.append(REPLACEMENT);
                // Step back so this same byte is examined again as a
                // potential lead byte on the next iteration.
                i--;
                continue;
            }

            // Expecting the first byte of a sequence.
            if (octet < 0x80) {
                builder.append((char) octet);
            } else if (octet < 0xc2 || octet > 0xf4) {
                // Stray continuation byte, overlong lead (0xC0/0xC1), or a
                // lead that could only encode values above U+10FFFF.
                if (!replace) {
                    return -1;
                }
                builder.append(REPLACEMENT);
            } else if (octet <= 0xdf) {
                remaining = 1;
                codePoint = octet & 0x1f;
            } else if (octet <= 0xef) {
                remaining = 2;
                codePoint = octet & 0x0f;
                if (octet == 0xe0) {
                    minTrail = 0xa0; // excludes overlong three-byte forms
                } else if (octet == 0xed) {
                    maxTrail = 0x9f; // excludes encoded surrogates
                }
            } else {
                remaining = 3;
                codePoint = octet & 0x07;
                if (octet == 0xf0) {
                    minTrail = 0x90; // excludes overlong four-byte forms
                } else if (octet == 0xf4) {
                    maxTrail = 0x8f; // excludes values above U+10FFFF
                }
            }
        }

        // The input ended in the middle of a multi-byte sequence.
        if (remaining != 0) {
            if (!replace) {
                return -1;
            }
            builder.append(REPLACEMENT);
        }
        return 0;
    }
}

Related Tutorials