Utility methods for handling UTF-8 encoded byte streams. : UTF8 Byte Hex « Development Class « Java






Utility methods for handling UTF-8 encoded byte streams.

   
/*

   Derby - Class org.apache.derby.iapi.util.UTF8Util

   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.

 */

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UTFDataFormatException;


/**
 * Utility methods for handling UTF-8 encoded byte streams.
 * <p>
 * Note that when the {@code skip} methods mention detection of invalid
 * UTF-8 encodings, it only checks the first byte of a character. For multibyte
 * encodings, the second and third byte are not checked for correctness, just
 * skipped and ignored.
 *
 * @see java.io.DataInput
 */
//@ThreadSafe
public final class UTF8Util {

    /** This class cannot be instantiated. */
    private UTF8Util() {}

    /**
     * Skip until the end-of-stream is reached.
     *
     * @param in byte stream with UTF-8 encoded characters
     * @return The number of characters skipped.
     * @throws IOException if reading from the stream fails
     * @throws UTFDataFormatException if an invalid UTF-8 encoding is detected
     */
    public static final long skipUntilEOF(InputStream in)
            throws IOException {
        // No need to do the skip in a loop, as Reader.read() returning -1
        // means EOF has been reached.
        // Note that a loop should be used if skip is used instead of read.
        return internalSkip(in, Long.MAX_VALUE).charsSkipped();
    }

    /**
     * Skip the requested number of characters from the stream.
     * <p>
     * @param in byte stream with UTF-8 encoded characters
     * @param charsToSkip number of characters to skip
     * @return The number of bytes skipped.
     * @throws EOFException if end-of-stream is reached before the requested
     *      number of characters are skipped
     * @throws IOException if reading from the stream fails
     * @throws UTFDataFormatException if an invalid UTF-8 encoding is detected
     */
    public static final long skipFully(InputStream in, long charsToSkip)
            throws EOFException, IOException {
        SkipCount skipped = internalSkip(in, charsToSkip);
        if (skipped.charsSkipped() != charsToSkip) {
            throw new EOFException("Reached end-of-stream prematurely at " +
                "character/byte position " + skipped.charsSkipped() + "/" +
                skipped.bytesSkipped() + ", trying to skip " + charsToSkip);
        }
        return skipped.bytesSkipped();
    }

    /**
     * Skip characters in the stream.
     * <p>
     * Note that a smaller number than requested might be skipped if the
     * end-of-stream is reached before the specified number of characters has
     * been decoded. It is up to the caller to decide if this is an error
     * or not. For instance, when determining the character length of a stream,
     * <code>Long.MAX_VALUE</code> could be passed as the requested number of
     * characters to skip.
     *
     * @param in byte stream with UTF-8 encoded characters
     * @param charsToSkip the number of characters to skip
     * @return A long array with counts; the characters skipped at position
     *      <code>CHAR_COUNT</code>, the bytes skipped at position
     *      <code>BYTE_COUNT</code>. Note that the number of characters skipped
     *      may be smaller than the requested number.
     * @throws IOException if reading from the stream fails
     * @throws UTFDataFormatException if an invalid UTF-8 encoding is detected
     */
    private static final SkipCount internalSkip(final InputStream in,
                                                final long charsToSkip)
            throws IOException {
        long charsSkipped = 0;
        long bytesSkipped = 0;
        // Decoding routine for modified UTF-8.
        // See java.io.DataInput
        while (charsSkipped < charsToSkip) {
            int c = in.read();
            if (c == -1) {
                break;
            }
            charsSkipped++;
            if ((c & 0x80) == 0x00) { // 8th bit set (top bit)
                // Found char of one byte width.
                bytesSkipped++;
            } else if ((c & 0x60) == 0x40) { // 7th bit set, 6th bit unset
                // Found char of two byte width.
                if (InputStreamUtil.skipPersistent(in, 1L) != 1L) {
                    // No second byte present.
                    throw new UTFDataFormatException(
                        "Second byte in two byte character missing; byte pos " +
                        bytesSkipped + " ; char pos " + charsSkipped);
                }
                bytesSkipped += 2;
            } else if ((c & 0x70) == 0x60) { // 7th and 6th bit set, 5th unset
                // Found char of three byte width.
                int skipped = 0;
                if (c == 0xe0) {
                    // Check for Derby EOF marker.
                    int c1 = in.read();
                    int c2 = in.read();
                    if (c1 == 0x00 && c2 == 0x00) {
                        // Found Derby EOF marker, exit loop.
                        charsSkipped--; // Compensate by subtracting one.
                        break;
                    }
                    // Do some rudimentary error checking.
                    // Allow everything except EOF, which is the same as done in
                    // normal processing (skipPersistent below).
                    if (c1 != -1 && c2 != -1) {
                        skipped = 2;
                    }
                } else {
                    skipped = (int)InputStreamUtil.skipPersistent(in, 2L);
                }
                if (skipped != 2) {
                    // No second or third byte present
                    throw new UTFDataFormatException(
                        "Second or third byte in three byte character " +
                        "missing; byte pos " + bytesSkipped + " ; char pos " +
                        charsSkipped);
                }
                bytesSkipped += 3;
            } else {
                throw new UTFDataFormatException(
                    "Invalid UTF-8 encoding encountered: (decimal) " + c);
            }
        }
        // We don't close the stream, since it might be reused. One example of
        // this is use of Resetable streams.
        return new SkipCount(charsSkipped, bytesSkipped);
    }

    /**
     * Helper class to hold skip counts; one for chars and one for bytes.
     */
    // @Immutable
    private static final class SkipCount {
        /** Number of bytes skipped. */
        private final long byteCount;
        /** Number of characters skipped. */
        private final long charCount; 

        /**
         * Creates a holder for the specified skip counts.
         * 
         * @param byteCount number of bytes
         * @param charCount number of characters
         */
        SkipCount(long charCount, long byteCount) {
            if (byteCount < 0 || charCount < 0) {
                // Don't allow negative counts.
                throw new IllegalArgumentException("charCount/byteCount " +
                        "cannot be negative: " + charCount + "/" + byteCount);
            }
            if (byteCount < charCount) {
                // A char must always be represented by at least one byte.
                throw new IllegalArgumentException("Number of bytes cannot be" +
                        "less than number of chars: " + byteCount + " < " +
                        charCount);
            }
            this.byteCount = byteCount;
            this.charCount = charCount;
        }

        long charsSkipped() {
            return this.charCount;
        }

        long bytesSkipped() {
            return this.byteCount;
        }
    }
} // End class UTF8Util

/*

Derby - Class org.apache.derby.iapi.services.io.InputStreamUtil

Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements.  See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to you under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License.  You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

*/

/**
Utility methods for InputStream that are stand-ins for
a small subset of DataInput methods. This avoids pushing
a DataInputStream just to get this functionality.
*/
final class InputStreamUtil {
 private static final int SKIP_FRAGMENT_SIZE = Integer.MAX_VALUE;

/**
  Read an unsigned byte from an InputStream, throwing an EOFException
  if the end of the input is reached.

  @exception IOException if an I/O error occurs.
  @exception EOFException if the end of the stream is reached

  @see DataInput#readUnsignedByte

*/
public static int readUnsignedByte(InputStream in) throws IOException {
  int b = in.read();
  if (b < 0)
    throw new EOFException();

  return b;
}

/**
  Read a number of bytes into an array.

  @exception IOException if an I/O error occurs.
  @exception EOFException if the end of the stream is reached

  @see DataInput#readFully

*/
public static void readFully(InputStream in, byte b[],
                              int offset,
                              int len) throws IOException
{
  do {
    int bytesRead = in.read(b, offset, len);
    if (bytesRead < 0)
      throw new EOFException();
    len -= bytesRead;
    offset += bytesRead;
  } while (len != 0);
}


/**
  Read a number of bytes into an array.
     Keep reading in a loop until len bytes are read or EOF is reached or
     an exception is thrown. Return the number of bytes read.
     (InputStream.read(byte[],int,int) does not guarantee to read len bytes
      even if it can do so without reaching EOF or raising an exception.)

  @exception IOException if an I/O error occurs.
*/
public static int readLoop(InputStream in,
                             byte b[],
                             int offset,
                             int len)
     throws IOException
{
     int firstOffset = offset;
  do {
    int bytesRead = in.read(b, offset, len);
    if (bytesRead <= 0)
             break;
    len -= bytesRead;
    offset += bytesRead;
  } while (len != 0);
     return offset - firstOffset;
}

 /**
  * Skips until EOF, returns number of bytes skipped.
  * @param is
  *      InputStream to be skipped.
  * @return
  *      number of bytes skipped in fact.
  * @throws IOException
  *      if IOException occurs. It doesn't contain EOFException.
  * @throws NullPointerException
  *      if the param 'is' equals null.
  */
 public static long skipUntilEOF(InputStream is) throws IOException {
     if(is == null)
         throw new NullPointerException();

     long bytes = 0;
     while(true){
         long r = skipPersistent(is, SKIP_FRAGMENT_SIZE);
         bytes += r;
         if(r < SKIP_FRAGMENT_SIZE)
             return bytes;
     }
 }

 /**
  * Skips requested number of bytes,
  * throws EOFException if there is too few bytes in the stream.
  * @param is
  *      InputStream to be skipped.
  * @param skippedBytes
  *      number of bytes to skip. if skippedBytes <= zero, do nothing.
  * @throws EOFException
  *      if EOF meets before requested number of bytes are skipped.
  * @throws IOException
  *      if IOException occurs. It doesn't contain EOFException.
  * @throws NullPointerException
  *      if the param 'is' equals null.
  */
 public static void skipFully(InputStream is, long skippedBytes)
 throws IOException {
     if(is == null)
         throw new NullPointerException();

     if(skippedBytes <= 0)
         return;

     long bytes = skipPersistent(is, skippedBytes);

     if(bytes < skippedBytes)
         throw new EOFException();
 }

 /**
  * Tries harder to skip the requested number of bytes.
  * <p>
  * Note that even if the method fails to skip the requested number of bytes,
  * it will not throw an exception. If this happens, the caller can be sure
  * that end-of-stream has been reached.
  *
  * @param in byte stream
  * @param bytesToSkip the number of bytes to skip
  * @return The number of bytes skipped.
  * @throws IOException if reading from the stream fails
  */
 public static final long skipPersistent(InputStream in, long bytesToSkip)
 throws IOException {
     long skipped = 0;
     while (skipped < bytesToSkip) {
         long skippedNow = in.skip(bytesToSkip - skipped);
         if (skippedNow == 0) {
             if (in.read() == -1) {
                 // EOF, return what we have and leave it up to caller to
                 // decide what to do about it.
                 break;
             } else {
                 skippedNow = 1; // Added to count below.
             }
         }
         skipped += skippedNow;
     }
     return skipped;
 }
}

   
    
    
  








Related examples in the same category

1.Convert file in SJIS to UTF8
2.Return an UTF-8 encoded String
3.Return an UTF-8 encoded String by length
4.UTF8 String utilities
5.Return UTF-8 encoded byte[] representation of a String
6.Encodes octects (using utf-8) into Hex data
7.Decodes values of attributes in the DN encoded in hex into a UTF-8 String.
8.converting between byte arrays and hex encoded strings
9.Convert bytes To Hex
10.Convert hex To Bytes
11.Unicode 2 ASCII
12.Make bytes
13.String converterString converter
14.Show unicode stringShow unicode string
15.Normalizer
16.Convert from UTF-8 to Unicode
17.Convert from Unicode to UTF-8
18.Read Windows Notepad Unicode files
19.UTF Util
20.To UTF8 InputStream
21.Returns {@code true} if the specified character sequence is a valid sequence of UTF-16 char values.
22.URL UTF8 Encoder