Source code

Java tutorial


Here is the source code for the UTF8 class.


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */



import org.apache.hadoop.util.StringUtils;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** A WritableComparable for strings that uses the UTF8 encoding.
 *
 * <p>Also includes utilities for efficiently reading and writing UTF-8.
 *
 * <p>Note that this decodes UTF-8 but actually encodes CESU-8, a variant of
 * UTF-8: see http://en.wikipedia.org/wiki/CESU-8
 *
 * @deprecated replaced by Text
 */
@InterfaceAudience.LimitedPrivate({ "HDFS", "MapReduce" })
public class UTF8 implements WritableComparable<UTF8> {
    // Logger for this class; set(String) and writeString() use it to warn when a long string is truncated.
    private static final Logger LOG = LoggerFactory.getLogger(UTF8.class);
    // Single input buffer shared by all instances; toString() and toStringChecked() lock on it before use.
    private static final DataInputBuffer IBUF = new DataInputBuffer();

    private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY = new ThreadLocal<DataOutputBuffer>() {
        protected DataOutputBuffer initialValue() {
            return new DataOutputBuffer();

    // Shared zero-length array used as the initial backing store of every instance.
    private static final byte[] EMPTY_BYTES = new byte[0];

    // Backing store for the encoded string; it is only ever grown, so it may be longer than 'length'.
    private byte[] bytes = EMPTY_BYTES;
    // Number of leading bytes of 'bytes' that belong to the current value.
    private int length;

    /** Creates an empty instance: zero length, backed by the shared empty array. */
    public UTF8() {
        // Nothing to do: the field initializers already describe the empty value.
    }

    /**
     * Construct from a given string.
     *
     * @param string the string whose encoded form this instance will hold
     */
    public UTF8(String string) {
        set(string);
    }

    /**
     * Construct as a copy of another UTF8.
     *
     * @param utf8 the instance whose bytes are copied into this one
     */
    public UTF8(UTF8 utf8) {
        set(utf8);
    }

    /** The raw bytes. */
    public byte[] getBytes() {
        return bytes;

    /** The number of bytes in the encoded string. */
    public int getLength() {
        return length;

    /** Set to contain the contents of a string. */
    public void set(String string) {
        if (string.length() > 0xffff / 3) { // maybe too long
            LOG.warn("truncating long string: " + string.length() + " chars, starting with "
                    + string.substring(0, 20));
            string = string.substring(0, 0xffff / 3);

        length = utf8Length(string); // compute length
        if (length > 0xffff) // double-check length
            throw new RuntimeException("string too long!");

        if (bytes == null || length > bytes.length) // grow buffer
            bytes = new byte[length];

        try { // avoid sync'd allocations
            DataOutputBuffer obuf = OBUF_FACTORY.get();
            writeChars(obuf, string, 0, string.length());
            System.arraycopy(obuf.getData(), 0, bytes, 0, length);
        } catch (IOException e) {
            throw new RuntimeException(e);

    /** Set to contain the contents of a string. */
    public void set(UTF8 other) {
        length = other.length;
        if (bytes == null || length > bytes.length) // grow buffer
            bytes = new byte[length];
        System.arraycopy(other.bytes, 0, bytes, 0, length);

    public void readFields(DataInput in) throws IOException {
        length = in.readUnsignedShort();
        if (bytes == null || bytes.length < length)
            bytes = new byte[length];
        in.readFully(bytes, 0, length);

    /** Skips over one UTF8 in the input. */
    public static void skip(DataInput in) throws IOException {
        int length = in.readUnsignedShort();
        WritableUtils.skipFully(in, length);

    public void write(DataOutput out) throws IOException {
        out.write(bytes, 0, length);

    /** Compare two UTF8s. */
    public int compareTo(UTF8 o) {
        return WritableComparator.compareBytes(bytes, 0, length, o.bytes, 0, o.length);

    /** Convert to a String. */
    public String toString() {
        StringBuilder buffer = new StringBuilder(length);
        try {
            synchronized (IBUF) {
                IBUF.reset(bytes, length);
                readChars(IBUF, buffer, length);
        } catch (IOException e) {
            throw new RuntimeException(e);
        return buffer.toString();

    /**
     * Convert to a string, checking for valid UTF8.
     * @return the converted string
     * @throws UTFDataFormatException if the underlying bytes contain invalid
     * UTF8 data.
     */
    public String toStringChecked() throws IOException {
        StringBuilder buffer = new StringBuilder(length);
        synchronized (IBUF) {
            IBUF.reset(bytes, length);
            readChars(IBUF, buffer, length);
        return buffer.toString();

    /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
    public boolean equals(Object o) {
        if (!(o instanceof UTF8))
            return false;
        UTF8 that = (UTF8) o;
        if (this.length != that.length)
            return false;
            return WritableComparator.compareBytes(bytes, 0, length, that.bytes, 0, that.length) == 0;

    public int hashCode() {
        return WritableComparator.hashBytes(bytes, length);

    /** A WritableComparator optimized for UTF8 keys. */
    public static class Comparator extends WritableComparator {
        public Comparator() {

        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            int n1 = readUnsignedShort(b1, s1);
            int n2 = readUnsignedShort(b2, s2);
            return compareBytes(b1, s1 + 2, n1, b2, s2 + 2, n2);

    static { // register this comparator
        WritableComparator.define(UTF8.class, new Comparator());
    }


    /// These are probably not used much anymore, and might be removed...

    /** Convert a string to a UTF-8 encoded byte array.
     * @see String#getBytes(String)
    public static byte[] getBytes(String string) {
        byte[] result = new byte[utf8Length(string)];
        try { // avoid sync'd allocations
            DataOutputBuffer obuf = OBUF_FACTORY.get();
            writeChars(obuf, string, 0, string.length());
            System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
        } catch (IOException e) {
            throw new RuntimeException(e);
        return result;

    /**
     * Convert a UTF-8 encoded byte array back into a string.
     * @throws IOException if the byte array is invalid UTF8
     */
    public static String fromBytes(byte[] bytes) throws IOException {
        DataInputBuffer dbuf = new DataInputBuffer();
        dbuf.reset(bytes, 0, bytes.length);
        StringBuilder buf = new StringBuilder(bytes.length);
        readChars(dbuf, buf, bytes.length);
        return buf.toString();

    /** Read a UTF-8 encoded string.
     * @see DataInput#readUTF()
    public static String readString(DataInput in) throws IOException {
        int bytes = in.readUnsignedShort();
        StringBuilder buffer = new StringBuilder(bytes);
        readChars(in, buffer, bytes);
        return buffer.toString();

    private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
            throws UTFDataFormatException, IOException {
        DataOutputBuffer obuf = OBUF_FACTORY.get();
        obuf.write(in, nBytes);
        byte[] bytes = obuf.getData();
        int i = 0;
        while (i < nBytes) {
            byte b = bytes[i++];
            if ((b & 0x80) == 0) {
                // 0b0xxxxxxx: 1-byte sequence
                buffer.append((char) (b & 0x7F));
            } else if ((b & 0xE0) == 0xC0) {
                if (i >= nBytes) {
                    throw new UTFDataFormatException(
                            "Truncated UTF8 at " + StringUtils.byteToHexString(bytes, i - 1, 1));
                // 0b110xxxxx: 2-byte sequence
                buffer.append((char) (((b & 0x1F) << 6) | (bytes[i++] & 0x3F)));
            } else if ((b & 0xF0) == 0xE0) {
                // 0b1110xxxx: 3-byte sequence
                if (i + 1 >= nBytes) {
                    throw new UTFDataFormatException(
                            "Truncated UTF8 at " + StringUtils.byteToHexString(bytes, i - 1, 2));
                buffer.append((char) (((b & 0x0F) << 12) | ((bytes[i++] & 0x3F) << 6) | (bytes[i++] & 0x3F)));
            } else if ((b & 0xF8) == 0xF0) {
                if (i + 2 >= nBytes) {
                    throw new UTFDataFormatException(
                            "Truncated UTF8 at " + StringUtils.byteToHexString(bytes, i - 1, 3));
                // 0b11110xxx: 4-byte sequence
                int codepoint = ((b & 0x07) << 18) | ((bytes[i++] & 0x3F) << 12) | ((bytes[i++] & 0x3F) << 6)
                        | ((bytes[i++] & 0x3F));
            } else {
                // The UTF8 standard describes 5-byte and 6-byte sequences, but
                // these are no longer allowed as of 2003 (see RFC 3629)

                // Only show the next 6 bytes max in the error code - in case the
                // buffer is large, this will prevent an exceedingly large message.
                int endForError = Math.min(i + 5, nBytes);
                throw new UTFDataFormatException(
                        "Invalid UTF8 at " + StringUtils.byteToHexString(bytes, i - 1, endForError));

    private static char highSurrogate(int codePoint) {
        return (char) ((codePoint >>> 10)
                + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));

    private static char lowSurrogate(int codePoint) {
        return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);

    /** Write a UTF-8 encoded string.
     * @see DataOutput#writeUTF(String)
    public static int writeString(DataOutput out, String s) throws IOException {
        if (s.length() > 0xffff / 3) { // maybe too long
            LOG.warn("truncating long string: " + s.length() + " chars, starting with " + s.substring(0, 20));
            s = s.substring(0, 0xffff / 3);

        int len = utf8Length(s);
        if (len > 0xffff) // double-check length
            throw new IOException("string too long!");

        writeChars(out, s, 0, s.length());
        return len;

    /** Returns the number of bytes required to write this. */
    private static int utf8Length(String string) {
        int stringLength = string.length();
        int utf8Length = 0;
        for (int i = 0; i < stringLength; i++) {
            int c = string.charAt(i);
            if (c <= 0x007F) {
            } else if (c > 0x07FF) {
                utf8Length += 3;
            } else {
                utf8Length += 2;
        return utf8Length;

    private static void writeChars(DataOutput out, String s, int start, int length) throws IOException {
        final int end = start + length;
        for (int i = start; i < end; i++) {
            int code = s.charAt(i);
            if (code <= 0x7F) {
                out.writeByte((byte) code);
            } else if (code <= 0x07FF) {
                out.writeByte((byte) (0xC0 | ((code >> 6) & 0x1F)));
                out.writeByte((byte) (0x80 | code & 0x3F));
            } else {
                out.writeByte((byte) (0xE0 | ((code >> 12) & 0X0F)));
                out.writeByte((byte) (0x80 | ((code >> 6) & 0x3F)));
                out.writeByte((byte) (0x80 | (code & 0x3F)));
