UTF8 String utilities : UTF8 Byte Hex « Development Class « Java






UTF8 String utilities

    

// 
// Copyright 2004-2005 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at 
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 

import java.io.UnsupportedEncodingException;

// 
/**
 * Fast String Utilities.
 * 
 * These string utilities provide both convenience methods and performance
 * improvements over most standard library versions. The main aim of the
 * optimizations is to avoid object creation unless absolutely required.
 * 
 * @author Greg Wilkins (gregw)
 */
public class StringUtil {
  /** Carriage return (octal 015) followed by line feed (octal 012). */
  public static final String CRLF = "\015\012";

  /** The {@code line.separator} system property, or a single line feed when it is unset. */
  public static final String __LINE_SEPARATOR = System.getProperty("line.separator", "\n");

  /**
   * Charset name to use for ISO-8859-1. Taken from the {@code ISO_8859_1}
   * system property when that is set; otherwise {@code "ISO-8859-1"} if the
   * JVM accepts that name, falling back to {@code "ISO8859_1"}.
   */
  public static String __ISO_8859_1;
  static {
    String iso = System.getProperty("ISO_8859_1");
    if (iso != null) {
      __ISO_8859_1 = iso;
    } else {
      try {
        // Probe only: the decoded string is discarded, we just need to know
        // whether this charset name is recognised.
        new String(new byte[] { (byte) 20 }, "ISO-8859-1");
        __ISO_8859_1 = "ISO-8859-1";
      } catch (java.io.UnsupportedEncodingException e) {
        __ISO_8859_1 = "ISO8859_1";
      }
    }
  }

  /** Charset name used for UTF-8 conversions. */
  public final static String __UTF8 = "UTF-8";

  /**
   * Lookup table indexed by ASCII code (0-127). 'A'-'Z' map to 'a'-'z'; every
   * other entry maps to itself.
   */
  private static final char[] lowercases = new char[128];
  static {
    for (int i = 0; i < lowercases.length; i++) {
      lowercases[i] = (i >= 'A' && i <= 'Z') ? (char) (i - 'A' + 'a') : (char) i;
    }
  }

  /**
   * Folds a single character to lower case if it is ASCII; characters above
   * 127 are returned unchanged.
   */
  private static char lower(char c) {
    return c <= 127 ? lowercases[c] : c;
  }

  /* ------------------------------------------------------------ */
  /**
   * fast lower case conversion. Only works on ascii (not unicode)
   * 
   * @param s
   *          the string to convert, may be {@code null}
   * @return a lower case version of s; {@code s} itself when nothing needed
   *         converting (so no object is created), or {@code null} when
   *         {@code s} is {@code null}
   */
  public static String asciiToLowerCase(String s) {
    // The other helpers in this class tolerate null; do the same here
    // instead of failing with a NullPointerException.
    if (s == null) {
      return null;
    }

    char[] converted = null;
    for (int i = s.length(); i-- > 0;) {
      char original = s.charAt(i);
      char folded = lower(original);
      if (folded != original) {
        // Copy lazily: the array is only allocated once a change is needed.
        if (converted == null) {
          converted = s.toCharArray();
        }
        converted[i] = folded;
      }
    }

    return converted == null ? s : new String(converted);
  }

  /* ------------------------------------------------------------ */
  /**
   * Tests whether {@code s} starts with {@code w}, ignoring the case of ASCII
   * letters only.
   * 
   * @param s
   *          the string to inspect, may be {@code null}
   * @param w
   *          the prefix to look for, may be {@code null}
   * @return {@code true} if {@code w} is {@code null} or a case-insensitive
   *         prefix of {@code s}; {@code false} if {@code s} is {@code null}
   *         or shorter than {@code w}
   */
  public static boolean startsWithIgnoreCase(String s, String w) {
    if (w == null) {
      return true;
    }
    if (s == null || s.length() < w.length()) {
      return false;
    }

    for (int i = 0; i < w.length(); i++) {
      if (lower(s.charAt(i)) != lower(w.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /* ------------------------------------------------------------ */
  /**
   * Tests whether {@code s} ends with {@code w}, ignoring the case of ASCII
   * letters only.
   * 
   * @param s
   *          the string to inspect, may be {@code null}
   * @param w
   *          the suffix to look for, may be {@code null}
   * @return {@code true} if {@code w} is {@code null} or a case-insensitive
   *         suffix of {@code s}; {@code false} if {@code s} is {@code null}
   *         or shorter than {@code w}
   */
  public static boolean endsWithIgnoreCase(String s, String w) {
    if (w == null) {
      return true;
    }
    if (s == null) {
      return false;
    }

    // Index in s at which the candidate suffix begins.
    int start = s.length() - w.length();
    if (start < 0) {
      return false;
    }

    // Compare from the last character backwards.
    for (int i = w.length(); i-- > 0;) {
      if (lower(s.charAt(start + i)) != lower(w.charAt(i))) {
        return false;
      }
    }
    return true;
  }


  /* ------------------------------------------------------------ */
  /**
   * Compares a string with a region of a character array, case-sensitively.
   * 
   * @param s
   *          the string to compare; must not be {@code null}
   * @param buf
   *          the array holding the characters to compare against
   * @param offset
   *          index in {@code buf} of the first character of the region
   * @param length
   *          number of characters in the region
   * @return {@code true} if {@code s} has exactly {@code length} characters
   *         and each equals the corresponding character of the region
   */
  public static boolean equals(String s, char[] buf, int offset, int length) {
    if (s.length() != length) {
      return false;
    }
    for (int i = 0; i < length; i++) {
      if (buf[offset + i] != s.charAt(i)) {
        return false;
      }
    }
    return true;
  }

  /* ------------------------------------------------------------ */
  /**
   * Decodes a region of a byte array as UTF-8.
   * 
   * Regions shorter than 32 bytes are decoded with {@link Utf8StringBuffer};
   * longer regions are handed to {@link String#String(byte[], int, int, String)}.
   * NOTE(review): the two decoders are not guaranteed to render malformed
   * input identically - confirm if callers depend on that.
   * 
   * @param b
   *          the bytes to decode
   * @param offset
   *          index of the first byte to decode
   * @param length
   *          number of bytes to decode
   * @return the decoded string, or {@code null} if the JVM reports the UTF-8
   *         charset as unsupported (the stack trace is printed in that case)
   */
  public static String toUTF8String(byte[] b, int offset, int length) {
    try {
      if (length < 32) {
        Utf8StringBuffer buffer = new Utf8StringBuffer(length);
        buffer.append(b, offset, length);
        return buffer.toString();
      }

      return new String(b, offset, length, __UTF8);
    } catch (UnsupportedEncodingException e) {
      // Kept as best-effort: existing callers may rely on the null result.
      e.printStackTrace();
      return null;
    }
  }

  /* ------------------------------------------------------------ */
  /**
   * Decodes a region of a byte array using the named charset.
   * 
   * @param b
   *          the bytes to decode
   * @param offset
   *          index of the first byte to decode
   * @param length
   *          number of bytes to decode
   * @param charset
   *          the charset name; {@code null} or any spelling accepted by
   *          {@link #isUTF8(String)} selects {@link #toUTF8String}
   * @return the decoded string, or {@code null} if the charset is not
   *         supported (the stack trace is printed in that case)
   */
  public static String toString(byte[] b, int offset, int length, String charset) {
    if (charset == null || StringUtil.isUTF8(charset)) {
      return toUTF8String(b, offset, length);
    }

    try {
      return new String(b, offset, length, charset);
    } catch (UnsupportedEncodingException e) {
      // Kept as best-effort: existing callers may rely on the null result.
      e.printStackTrace();
      return null;
    }
  }

  /* ------------------------------------------------------------ */
  /**
   * Tests whether a charset name denotes UTF-8.
   * 
   * @param charset
   *          the charset name to test, may be {@code null}
   * @return {@code true} if {@code charset} equals {@link #__UTF8} ignoring
   *         case
   */
  public static boolean isUTF8(String charset) {
    // The identity check is only a shortcut for callers passing the constant.
    return charset == __UTF8 || __UTF8.equalsIgnoreCase(charset);
  }

  /* ------------------------------------------------------------ */
  /**
   * Removes ISO control characters from a string.
   * 
   * @param name
   *          the string to clean, may be {@code null}
   * @return {@code name} without the characters for which
   *         {@link Character#isISOControl(char)} is true, or {@code null}
   *         when {@code name} is {@code null}
   */
  public static String printable(String name) {
    if (name == null) {
      return null;
    }
    // The buffer never leaves this method, so no synchronisation is needed.
    StringBuilder buf = new StringBuilder(name.length());
    for (int i = 0; i < name.length(); i++) {
      char c = name.charAt(i);
      if (!Character.isISOControl(c)) {
        buf.append(c);
      }
    }
    return buf.toString();
  }

}

// 
// Copyright 2006 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 

/* ------------------------------------------------------------ */
/**
 * UTF-8 StringBuffer.
 * 
 * This class wraps a standard {@link java.lang.StringBuffer} and provides
 * methods to append UTF-8 encoded bytes, which are converted into characters.
 * 
 * This class is stateful: up to 6 calls to {@link #append(byte)} may be
 * needed before a character is appended to the string buffer.
 * 
 * The UTF-8 decoding is done by this class and no additional buffers or Readers
 * are used. The UTF-8 code was inspired by http://javolution.org
 * 
 * Malformed input never throws. Each malformed or interrupted sequence is
 * replaced by a single '?' and remembered, so that {@link #isError()} reports
 * it. Decoding then resumes with the byte that broke the sequence. Overlong
 * encodings and encoded surrogates are not rejected.
 */
class Utf8StringBuffer {
  /** Destination for the decoded characters. */
  StringBuffer _buffer;

  /** Number of continuation bytes still expected for the current sequence. */
  int _more;

  /** Payload bits accumulated so far for the current sequence. */
  int _bits;

  /** Set once any malformed input has been seen; cleared by {@link #reset()}. */
  boolean _errors;

  /** Creates a buffer with the default {@link StringBuffer} capacity. */
  Utf8StringBuffer() {
    _buffer = new StringBuffer();
  }

  /**
   * Creates a buffer with the given initial capacity.
   * 
   * @param capacity
   *          initial capacity, in chars, of the wrapped {@link StringBuffer}
   */
  Utf8StringBuffer(int capacity) {
    _buffer = new StringBuffer(capacity);
  }

  /**
   * Decodes and appends a region of a byte array, one byte at a time.
   * 
   * @param b
   *          the array holding UTF-8 encoded bytes
   * @param offset
   *          index of the first byte to decode
   * @param length
   *          number of bytes to decode
   */
  public void append(byte[] b, int offset, int length) {
    int end = offset + length;
    for (int i = offset; i < end; i++) {
      append(b[i]);
    }
  }

  /**
   * Feeds one UTF-8 encoded byte to the decoder. A character is appended to
   * the buffer as soon as its final byte has been received.
   * 
   * @param b
   *          the next byte of the encoded stream
   */
  public void append(byte b) {
    if (_more > 0) {
      if ((b & 0xc0) == 0x80) {
        // 10xxxxxx: expected continuation byte
        _bits = (_bits << 6) | (b & 0x3f);
        if (--_more == 0) {
          appendDecoded(_bits);
        }
        return;
      }
      // The pending sequence was cut short. Flag it, then fall through so
      // that this byte is still decoded as the start of the next character
      // instead of being thrown away.
      malformed();
    }

    if (b >= 0) {
      // 0xxxxxxx: plain ASCII, including NUL
      _buffer.append((char) b);
    } else if ((b & 0xc0) == 0x80) {
      // 10xxxxxx: continuation byte with no sequence in progress
      malformed();
    } else if ((b & 0xe0) == 0xc0) {
      // 110xxxxx
      _more = 1;
      _bits = b & 0x1f;
    } else if ((b & 0xf0) == 0xe0) {
      // 1110xxxx
      _more = 2;
      _bits = b & 0x0f;
    } else if ((b & 0xf8) == 0xf0) {
      // 11110xxx
      _more = 3;
      _bits = b & 0x07;
    } else if ((b & 0xfc) == 0xf8) {
      // 111110xx
      _more = 4;
      _bits = b & 0x03;
    } else if ((b & 0xfe) == 0xfc) {
      // 1111110x
      _more = 5;
      _bits = b & 0x01;
    } else {
      // 0xFE / 0xFF are not lead bytes of any sequence length
      malformed();
    }
  }

  /**
   * Appends a fully decoded value. Values above the BMP are written as a
   * surrogate pair; values beyond the Unicode range are treated as malformed.
   */
  private void appendDecoded(int codePoint) {
    if (codePoint > Character.MAX_CODE_POINT) {
      malformed();
    } else {
      _buffer.appendCodePoint(codePoint);
    }
  }

  /** Records a decoding error: emits '?' and clears any partial sequence. */
  private void malformed() {
    _buffer.append('?');
    _more = 0;
    _bits = 0;
    _errors = true;
  }

  /**
   * @return the number of chars decoded into the buffer so far
   */
  public int length() {
    return _buffer.length();
  }

  /** Empties the buffer and clears all decoder and error state. */
  public void reset() {
    _buffer.setLength(0);
    _more = 0;
    _bits = 0;
    _errors = false;
  }

  /**
   * @return the wrapped buffer itself, not a copy
   */
  public StringBuffer getStringBuffer() {
    return _buffer;
  }

  /**
   * @return the characters decoded so far
   */
  @Override
  public String toString() {
    return _buffer.toString();
  }

  /* ------------------------------------------------------------ */
  /**
   * @return True if there are non UTF-8 characters or incomplete UTF-8
   *         characters in the buffer.
   */
  public boolean isError() {
    return _errors || _more > 0;
  }
}

   
    
    
    
  








Related examples in the same category

1.Convert file in SJIS to UTF8
2.Return an UTF-8 encoded String
3.Return an UTF-8 encoded String by length
4.Return UTF-8 encoded byte[] representation of a String
5.Encodes octets (using utf-8) into Hex data
6.Decodes values of attributes in the DN encoded in hex into a UTF-8 String.
7.converting between byte arrays and hex encoded strings
8.Convert bytes To Hex
9.Convert hex To Bytes
10.Unicode 2 ASCII
11.Make bytes
12.String converter
13.Show unicode string
14.Normalizer
15.Convert from UTF-8 to Unicode
16.Convert from Unicode to UTF-8
17.Utility methods for handling UTF-8 encoded byte streams.
18.Read Windows Notepad Unicode files
19.UTF Util
20.To UTF8 InputStream
21.Returns {@code true} if the specified character sequence is a valid sequence of UTF-16 char values.
22.URL UTF8 Encoder