Returns the number of additional bytes in a UTF-8 character sequence (not including the first byte).

 
    
/******************************************************************************
* The MIT License
* Copyright (c) 2003 Novell Inc.  www.novell.com
* 
* Permission is hereby granted, free of charge, to any person obtaining  a copy
* of this software and associated documentation files (the Software), to deal
* in the Software without restriction, including  without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
* copies of the Software, and to  permit persons to whom the Software is 
* furnished to do so, subject to the following conditions:
* 
* The above copyright notice and this permission notice shall be included in 
* all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*******************************************************************************/
//
// Novell.Directory.Ldap.Utilclass.Base64.cs
//
// Author:
//   Sunil Kumar (Sunilk@novell.com)
//
// (C) 2003 Novell, Inc (http://www.novell.com)
//

using System;

namespace Novell.Directory.Ldap.Utilclass
{
  
  /// <summary> The Base64 utility class performs base64 encoding and decoding.
  /// 
  /// The Base64 Content-Transfer-Encoding is designed to represent
  /// arbitrary sequences of octets in a form that need not be humanly
  /// readable.  The encoding and decoding algorithms are simple, but the
  /// encoded data are consistently only about 33 percent larger than the
  /// unencoded data.  The base64 encoding algorithm is defined by
  /// RFC 2045.
  /// </summary>
  public class Base64
  {
    /* **************UTF-8 Validation methods and members*******************
    * The following text is taken from draft-yergeau-rfc2279bis-02 and explains
    * UTF-8 encoding:
    *
    * In UTF-8, characters are encoded using sequences of 1 to 6 octets.
    * If the range of character numbers is restricted to U+0000..U+10FFFF
    * (the UTF-16 accessible range), then only sequences of one to four
    * octets will occur.  The only octet of a "sequence" of one has the
    * higher-order bit set to 0, the remaining 7 bits being used to encode
    * the character number.  In a sequence of n octets, n>1, the initial
    * octet has the n higher-order bits set to 1, followed by a bit set to
    * 0.  The remaining bit(s) of that octet contain bits from the number
    * of the character to be encoded.  The following octet(s) all have the
    * higher-order bit set to 1 and the following bit set to 0, leaving 6
    * bits in each to contain bits from the character to be encoded.
    *
    * The table below summarizes the format of these different octet types.
    * The letter x indicates bits available for encoding bits of the
    * character number.
    *
    * <pre>
    * Char. number range  |        UTF-8 octet sequence
    *    (hexadecimal)    |              (binary)
    * --------------------+---------------------------------------------
    * 0000 0000-0000 007F | 0xxxxxxx
    * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
    * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
    * 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    * 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    * 0400 0000-7FFF FFFF | 1111110x 10xxxxxx ... 10xxxxxx
    * </pre>
    */

    /// <summary> Given the first byte in a sequence, getByteCount returns the number of
    /// additional bytes in a UTF-8 character sequence (not including the first
    /// byte).
    /// 
    /// The lead byte is classified purely by its high-order bit pattern, as
    /// laid out in the octet table above; the payload bits are not inspected.
    /// </summary>
    /// <param name="b"> The first byte in a UTF-8 character sequence, as a signed
    /// byte (values 0x80..0xFF arrive as negative numbers).
    /// </param>
    /// <returns> The number of additional bytes (0 to 5) that follow the lead
    /// byte, or -1 if <paramref name="b"/> cannot start a sequence (a
    /// continuation byte 10xxxxxx, or 0xFE / 0xFF).
    /// </returns>
    private static int getByteCount(sbyte b)
    {
      // 0xxxxxxx: single-octet character. A non-negative sbyte has its high
      // bit clear; this must include 0x00 (NUL), which is a valid character.
      if (b >= 0)
      {
        return 0;
      }

      // From here on b is negative. The sbyte is sign-extended to int by the
      // & operator, but every mask only keeps bits within the low octet, so
      // the comparisons see exactly the original 8 bits.
      if ((b & 0xE0) == 0xC0)
      {
        return 1; // 110xxxxx: one additional byte (2 bytes total)
      }
      if ((b & 0xF0) == 0xE0)
      {
        return 2; // 1110xxxx: two additional bytes (3 bytes total)
      }
      if ((b & 0xF8) == 0xF0)
      {
        return 3; // 11110xxx: three additional bytes (4 bytes total)
      }
      if ((b & 0xFC) == 0xF8)
      {
        return 4; // 111110xx: four additional bytes (5 bytes total)
      }
      // 1111110x: the mask must leave the lowest bit free so that both
      // 0xFC and 0xFD are recognised as six-octet lead bytes.
      if ((b & 0xFE) == 0xFC)
      {
        return 5; // five additional bytes (6 bytes total)
      }

      // Continuation byte (10xxxxxx) or 0xFE/0xFF: not a valid lead byte.
      return -1;
    }
  }
}

Related examples in the same category

1.	Write the UTF-16 encoded bytes of the source string
2.	Write the UTF-8 and ASCII encoded byte arrays
3.	Write special symbol to Text file: pi (\u03A0r^2)
4.	Replace utf-16 encoding with utf-8 encoding
5.	Write a string to a file using default encoding
6.	Convert a string from one charset to another charset
7.	Return the value of CharSize and display it.
8.	Returns the value encoded in Big/Little Endian (PPC, XDR) format.

Returns the number of additional bytes in a UTF-8 character sequence (not including the first byte). : Unicode UTF16 « Internationalization I18N « C# / C Sharp

Related examples in the same category