Reads a string in UTF-8 encoding from a data stream in full and returns that string. - Java File Path IO

Java examples for File Path IO:UTF

Description

Reads a string in UTF-8 encoding from a data stream in full and returns that string.

Demo Code

/*
 Written in 2013 by Peter O.
 Any copyright is dedicated to the Public Domain.
 http://creativecommons.org/publicdomain/zero/1.0/
 If you like this, you should donate to Peter O.
 at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
 */
import java.io.*;

public class Main{
    /** Replacement character (U+FFFD) emitted in place of ill-formed input. */
    private static final char REPLACEMENT_CHAR = (char) 0xfffd;

    /**
     * Reads a string in UTF-8 encoding from a data stream in full and returns that
     * string. Replaces invalid encoding with the replacement character (U+FFFD).
     * @param stream A readable data stream.
     * @return The string read.
     * @throws java.io.IOException An I/O error occurred.
     * @throws NullPointerException The parameter {@code stream} is null.
     */
    public static String ReadUtf8ToString(InputStream stream)
            throws java.io.IOException {
        return ReadUtf8ToString(stream, -1, true);
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream and returns that string.
     * If the stream ends before {@code bytesCount} bytes were read, the text
     * decoded up to that point is returned (no exception is thrown for that case).
     * @param stream A readable data stream.
     * @param bytesCount The length, in bytes, of the string. If this is less than
     * 0, this function will read until the end of the stream.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U+FFFD). If false, throws an error as soon as ill-formed
     * UTF-8 (including an encoded surrogate code point) is seen.
     * @return The string read.
     * @throws java.io.IOException An I/O error occurred; or, the string is not
     * valid UTF-8 and {@code replace} is false.
     * @throws NullPointerException The parameter {@code stream} is null.
     */
    public static String ReadUtf8ToString(InputStream stream,
            int bytesCount, boolean replace) throws java.io.IOException {
        StringBuilder builder = new StringBuilder();
        // Decode with the ReadUtf8 method declared in this class.
        int retval = ReadUtf8(stream, bytesCount, builder, replace);
        // Only -1 (ill-formed input with replace == false) is an error here;
        // a result of -2 (premature end of stream) still returns what was decoded.
        if (retval == -1) {
            throw new IOException("Unpaired surrogate code point found.",
                    new java.nio.charset.MalformedInputException(1));
        }
        return builder.toString();
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream, one byte at a time,
     * appending the decoded text to {@code builder}.
     * @param stream A readable data stream.
     * @param bytesCount The length, in bytes, of the string. If this is less than
     * 0, this function will read until the end of the stream.
     * @param builder A string builder object where the resulting string will be
     * stored.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U+FFFD). If false, stops processing when ill-formed UTF-8
     * (including an encoded surrogate code point) is seen.
     * @return 0 if the entire string was read without errors, -1 if the string is
     * not valid UTF-8 and {@code replace} is false, or -2 if the end of the
     * stream was reached before {@code bytesCount} bytes were read
     * (which is only the case if {@code bytesCount} is 0 or greater).
     * @throws java.io.IOException An I/O error occurred.
     * @throws NullPointerException The parameter {@code stream} is null or {@code
     * builder} is null.
     */
    public static int ReadUtf8(InputStream stream, int bytesCount,
            StringBuilder builder, boolean replace)
            throws java.io.IOException {
        if (stream == null) {
            throw new NullPointerException("stream");
        }
        if (builder == null) {
            throw new NullPointerException("builder");
        }
        int cp = 0;           // code point accumulated so far
        int bytesSeen = 0;    // continuation bytes consumed for the current character
        int bytesNeeded = 0;  // continuation bytes the current lead byte requires
        int lower = 0x80;     // inclusive range allowed for the next continuation byte
        int upper = 0xbf;
        int pointer = 0;      // bytes consumed; only tracked when bytesCount >= 0
        while (pointer < bytesCount || bytesCount < 0) {
            int b = stream.read();
            if (b < 0) {
                // End of stream.
                if (bytesNeeded != 0) {
                    // A multi-byte character was cut short.
                    if (!replace) {
                        return -1;
                    }
                    bytesNeeded = 0;
                    builder.append(REPLACEMENT_CHAR);
                }
                // With an explicit length, running out of data early is reported as -2.
                return (bytesCount >= 0) ? -2 : 0;
            }
            if (bytesCount > 0) {
                ++pointer;
            }
            if (bytesNeeded != 0) {
                if (b >= lower && b <= upper) {
                    // Valid continuation byte. Only the first continuation byte
                    // has a narrowed range, so restore the default range now.
                    lower = 0x80;
                    upper = 0xbf;
                    ++bytesSeen;
                    cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen));
                    if (bytesSeen == bytesNeeded) {
                        // Character complete; supplementary code points are
                        // written as a surrogate pair by appendCodePoint.
                        builder.appendCodePoint(cp);
                        cp = 0;
                        bytesSeen = 0;
                        bytesNeeded = 0;
                    }
                    continue;
                }
                // Ill-formed sequence: discard the partial character.
                cp = 0;
                bytesNeeded = 0;
                bytesSeen = 0;
                lower = 0x80;
                upper = 0xbf;
                if (!replace) {
                    return -1;
                }
                builder.append(REPLACEMENT_CHAR);
                // Fall through so the offending byte is re-read as the start
                // of a new character.
            }
            // Classify b as the first byte of a character.
            if (b < 0x80) {
                builder.append((char) b);
            } else if (b >= 0xc2 && b <= 0xdf) {
                bytesNeeded = 1;
                cp = (b - 0xc0) << 6;
            } else if (b >= 0xe0 && b <= 0xef) {
                // 0xe0 must not start an overlong form; 0xed must not encode a surrogate.
                lower = (b == 0xe0) ? 0xa0 : 0x80;
                upper = (b == 0xed) ? 0x9f : 0xbf;
                bytesNeeded = 2;
                cp = (b - 0xe0) << 12;
            } else if (b >= 0xf0 && b <= 0xf4) {
                // 0xf0 must not start an overlong form; 0xf4 must not exceed U+10FFFF.
                lower = (b == 0xf0) ? 0x90 : 0x80;
                upper = (b == 0xf4) ? 0x8f : 0xbf;
                bytesNeeded = 3;
                cp = (b - 0xf0) << 18;
            } else if (replace) {
                // Stray continuation byte or a byte that can never start a character.
                builder.append(REPLACEMENT_CHAR);
            } else {
                return -1;
            }
        }
        // The byte budget ran out in the middle of a multi-byte character.
        if (bytesNeeded != 0) {
            if (!replace) {
                return -1;
            }
            builder.append(REPLACEMENT_CHAR);
        }
        return 0;
    }
}

Related Tutorials