Get the Charset for a Byte Array - Java Internationalization

Description

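Detect the charset of a byte array by checking for a byte order mark, an XML encoding declaration, or leading HTML/XMP markers, and otherwise falling back to trial decoding.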
Demo Code


import java.io.*;
import java.nio.*;
import java.nio.charset.*;
import java.text.*;
import java.util.Arrays;
import java.util.Locale;
import java.util.zip.*;

public class Main {
    /** Number of leading bytes inspected for byte order marks and textual markers. */
    private static final int SNIFF_LENGTH = 100;

    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16LE_BOM = {(byte) 0xFF, (byte) 0xFE};

    /**
     * Guesses the character encoding of a byte array.
     *
     * <p>The checks below run in order and the first one that matches decides the result:
     * <ol>
     *   <li>an empty array is reported as UTF-8;</li>
     *   <li>a leading UTF-8, UTF-16BE or UTF-16LE byte order mark;</li>
     *   <li>the {@code encoding} value of a leading {@code <?xml } declaration, when it names
     *       a charset this JVM supports;</li>
     *   <li>a leading {@code <!DOCTYPE HTML} (taken as UTF-8), or a leading
     *       {@code <?xpacket } / {@code <x:xmpmeta } marker readable as UTF-8, UTF-16BE or
     *       UTF-16LE;</li>
     *   <li>trial decoding: UTF-8 when the whole array decodes cleanly, otherwise
     *       ISO-8859-1.</li>
     * </ol>
     * Only the first {@value #SNIFF_LENGTH} bytes are searched for markers; the whole array is
     * then validated against the chosen charset.
     *
     * @param byteArray the bytes to examine; must not be {@code null}
     * @return a {@code CharsetInfo} constructed from the chosen charset, a flag that is
     *         {@code true} only when a byte order mark was found, and a flag that is
     *         {@code true} when the bytes do not decode cleanly in the chosen charset or when
     *         an XML declaration named an encoding that could not be resolved
     */
    public static CharsetInfo getCharsetFor(byte[] byteArray) {
        // Must come before any decoding: there is nothing to sniff in an empty array.
        if (byteArray.length < 1) {
            return new CharsetInfo(StandardCharsets.UTF_8, false, false);
        }
        byte[] testingArray =
                Arrays.copyOf(byteArray, Math.min(byteArray.length, SNIFF_LENGTH));

        // Files with a BOM. UTF-8 may carry one too (Notepad writes it, for instance).
        Charset bomCharset = detectBom(testingArray);
        if (bomCharset != null) {
            return new CharsetInfo(bomCharset, true,
                    !encodingIsCorrect(byteArray, bomCharset.name()));
        }

        String utf8Text = convertToString(testingArray, StandardCharsets.UTF_8);
        String utf16BeText = convertToString(testingArray, StandardCharsets.UTF_16BE);
        String utf16LeText = convertToString(testingArray, StandardCharsets.UTF_16LE);

        // Files whose XML declaration names the encoding.
        String declaredName = "";
        for (String candidate : new String[] {utf8Text, utf16BeText, utf16LeText}) {
            if (candidate.startsWith("<?xml ")) {
                declaredName = extractXmlEncoding(candidate);
                break;
            }
        }
        boolean encodingTypeErr = false;
        if (!declaredName.isEmpty()) {
            Charset declared = null;
            try {
                declared = Charset.forName(declaredName);
            } catch (IllegalArgumentException ignored) {
                // Illegal or unsupported charset name: keep guessing, but remember the error.
                encodingTypeErr = true;
            }
            if (declared != null) {
                return new CharsetInfo(declared, false,
                        !encodingIsCorrect(byteArray, declared.name()));
            }
        }

        // HTML doctype without an XML header (assumed UTF-8) and XMP packets.
        Charset markedCharset = null;
        if (utf8Text.startsWith("<!DOCTYPE HTML") || startsWithXmpMarker(utf8Text)) {
            markedCharset = StandardCharsets.UTF_8;
        } else if (startsWithXmpMarker(utf16BeText)) {
            markedCharset = StandardCharsets.UTF_16BE;
        } else if (startsWithXmpMarker(utf16LeText)) {
            markedCharset = StandardCharsets.UTF_16LE;
        }
        if (markedCharset != null) {
            return new CharsetInfo(markedCharset, false,
                    encodingTypeErr || !encodingIsCorrect(byteArray, markedCharset.name()));
        }

        // Plain text: take UTF-8 if it decodes cleanly.
        if (encodingIsCorrect(byteArray, "UTF-8")) {
            return new CharsetInfo(StandardCharsets.UTF_8, false, encodingTypeErr);
        }
        // ISO-8859-1 assigns a character to every byte value, so a trial decode can never
        // fail; it is the terminal fallback and no further charsets need to be tried.
        return new CharsetInfo(StandardCharsets.ISO_8859_1, false, encodingTypeErr);
    }

    /**
     * Decodes bytes with the given charset and drops the first decoded character when the
     * bytes begin with a UTF-8, UTF-16BE or UTF-16LE byte order mark.
     *
     * <p>The BOM test looks at the raw bytes only, so the first character is dropped even
     * when {@code charset} is not the encoding the BOM belongs to. Arrays too short to hold a
     * BOM are decoded as they are.
     *
     * @param bytes the bytes to decode; must not be {@code null}
     * @param charset the charset to decode with
     * @return the decoded text, without its first character if a BOM was present
     */
    public static String convertToString(byte[] bytes, Charset charset) {
        String ret = new String(bytes, charset);
        if (detectBom(bytes) != null && !ret.isEmpty()) {
            ret = ret.substring(1);
        }
        return ret;
    }

    /**
     * Verifies that a byte array is valid in a given encoding. This method does not keep the
     * decoded text; it only checks whether decoding completes without errors.
     *
     * @param bytes the bytes to check
     * @param charset the name of the charset to decode with
     * @return {@code true} if the bytes decode in this charset without malformed or
     *         unmappable input; {@code false} otherwise, including when either argument is
     *         {@code null} or the charset name is illegal or unsupported
     */
    public static boolean encodingIsCorrect(byte[] bytes, String charset) {
        if (bytes == null) {
            return false;
        }
        try {
            // A fresh decoder reports malformed and unmappable input by throwing.
            Charset.forName(charset).newDecoder().decode(ByteBuffer.wrap(bytes));
            return true;
        } catch (CharacterCodingException | IllegalArgumentException e) {
            return false;
        }
    }

    /** Returns the charset whose byte order mark starts the array, or null if there is none. */
    private static Charset detectBom(byte[] bytes) {
        if (startsWith(bytes, UTF8_BOM)) {
            return StandardCharsets.UTF_8;
        }
        if (startsWith(bytes, UTF16BE_BOM)) {
            return StandardCharsets.UTF_16BE;
        }
        if (startsWith(bytes, UTF16LE_BOM)) {
            return StandardCharsets.UTF_16LE;
        }
        return null;
    }

    /** Tells whether the array is at least as long as the prefix and begins with it. */
    private static boolean startsWith(byte[] bytes, byte[] prefix) {
        if (bytes.length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (bytes[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the text opens with one of the two XMP packet markers. */
    private static boolean startsWithXmpMarker(String text) {
        return text.startsWith("<?xpacket ") || text.startsWith("<x:xmpmeta ");
    }

    /**
     * Pulls the lower-cased, quoted {@code encoding} value out of text that starts with an
     * XML declaration; returns an empty string when no non-empty value is found.
     */
    private static String extractXmlEncoding(String sniffedText) {
        // Locale.ROOT keeps names such as "ISO-8859-1" intact under locales with special
        // casing rules (e.g. Turkish dotless i).
        String declaration = sniffedText.toLowerCase(Locale.ROOT);
        // Stay inside the declaration so a later attribute cannot be mistaken for it.
        int declarationEnd = declaration.indexOf("?>");
        if (declarationEnd >= 0) {
            declaration = declaration.substring(0, declarationEnd);
        }
        int keyword = declaration.indexOf("encoding");
        if (keyword < 0) {
            return "";
        }
        int searchFrom = keyword + "encoding".length();
        int doubleQuote = declaration.indexOf('"', searchFrom);
        int singleQuote = declaration.indexOf('\'', searchFrom);
        int open;
        if (doubleQuote < 0) {
            open = singleQuote;
        } else if (singleQuote < 0) {
            open = doubleQuote;
        } else {
            open = Math.min(doubleQuote, singleQuote);
        }
        if (open < 0) {
            return "";
        }
        // The value must be closed by the same kind of quote that opened it.
        int close = declaration.indexOf(declaration.charAt(open), open + 1);
        if (close <= open + 1) {
            return "";
        }
        return declaration.substring(open + 1, close);
    }
}
Get the Charset for a Byte Array - Java Internationalization

Description

Demo Code

Related Tutorials