Java Charset Create getEncodingOfXml(File file)

Here you can find the source of getEncodingOfXml(File file)

Description

Tries to determine the encoding of an XML file.

License

Apache License

Parameter

Parameter Description
file the XML file whose encoding should be determined

Exception

Parameter Description
IOException if the file cannot be opened or read

Declaration

public static String getEncodingOfXml(File file) throws IOException 

Method Source Code

//package com.java2s;
/**
 *  Copyright 2009 Welocalize, Inc. 
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  
 *  You may obtain a copy of the License at 
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 */

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * File-reading helpers plus a best-effort detector for the encoding of an
 * XML file (declared {@code encoding} pseudo-attribute first, byte-order
 * mark second, UTF-8 as the final fallback).
 */
public class Main {
    static public final String UTF8 = "UTF-8";
    static public final String UTF16LE = "UTF-16LE";
    static public final String UTF16BE = "UTF-16BE";

    /** Number of leading bytes of the file inspected for the XML declaration. */
    private static final int XML_HEAD_SIZE = 150;

    /**
     * Matches the {@code encoding} pseudo-attribute. The XML grammar allows
     * optional whitespace around {@code =} and either quote style, so both
     * {@code encoding="X"} (group 1) and {@code encoding='X'} (group 2) are
     * accepted. Compiled once because the pattern never changes.
     */
    private static final Pattern ENCODING_PATTERN = Pattern
            .compile("encoding\\s*=\\s*(?:\"([^\"]+)\"|'([^']+)')");

    /**
     * Tries to find the encoding of an XML file.
     * <p>
     * The first 150 bytes of the file are decoded with every charset the JVM
     * offers until one of them makes the text {@code "<?xml "} visible. If
     * that text carries an {@code encoding} declaration, the declared name is
     * returned as written in the file; otherwise the byte-order mark (if any)
     * decides. When nothing can be determined, {@code "UTF-8"} is returned.
     *
     * @param file
     *            the XML file to inspect
     * @return the declared or guessed encoding name, never {@code null}
     * @throws IOException
     *             if the file cannot be opened or read
     */
    public static String getEncodingOfXml(File file) throws IOException {
        byte[] head = readFile(file, XML_HEAD_SIZE);

        for (String charsetName : Charset.availableCharsets().keySet()) {
            String text = new String(head, charsetName);

            // Only a charset that makes "<?xml " readable can be trusted to
            // have decoded the rest of the declaration correctly.
            if (text.indexOf("<?xml ") < 0) {
                continue;
            }

            Matcher matcher = ENCODING_PATTERN.matcher(text);
            if (matcher.find()) {
                String doubleQuoted = matcher.group(1);
                return doubleQuoted != null ? doubleQuoted : matcher.group(2);
            }

            // No declared encoding: fall back to the BOM, reusing the bytes
            // already in memory instead of opening the file a second time.
            String guessed = detectBom(head);
            return guessed != null ? guessed : UTF8;
        }

        return UTF8;
    }

    /**
     * Reads the first {@code size} bytes of the given file.
     *
     * @param file
     *            the file to read
     * @param size
     *            the number of bytes to read
     * @return an array of exactly {@code size} bytes; positions beyond the
     *         end of the file are left as zero
     * @throws IOException
     *             if the file cannot be opened or read
     */
    public static byte[] readFile(File file, int size) throws IOException {
        return readFile(new FileInputStream(file), size);
    }

    /**
     * Reads bytes from given input stream with specified length and closes
     * the stream afterwards.
     *
     * @param in
     *            the stream to read from; always closed by this method
     * @param size
     *            the number of bytes to read
     * @return an array of exactly {@code size} bytes; if the stream ends
     *         early the remaining positions are left as zero
     * @throws IOException
     *             if reading or closing the stream fails
     */
    public static byte[] readFile(InputStream in, int size) throws IOException {
        try {
            // Allocated inside the try so the stream is still closed when
            // the allocation itself fails (e.g. negative size).
            byte[] b = new byte[size];
            int filled = 0;
            // A single read() may deliver fewer bytes than requested, so
            // keep reading until the buffer is full or the stream ends.
            while (filled < size) {
                int n = in.read(b, filled, size - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            return b;
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Reads the whole file and decodes it with the platform default charset.
     *
     * @param file
     *            the file to read
     * @return the file content as a string
     * @throws IOException
     *             if the file cannot be opened or read
     */
    public static String readFile(File file) throws IOException {
        FileInputStream in = null;

        try {
            in = new FileInputStream(file);
            return new String(readAll(in));
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Reads the whole file and decodes it with the given encoding.
     *
     * @param file
     *            the file to read
     * @param encoding
     *            the name of the charset used to decode the bytes
     * @return the file content as a string
     * @throws IOException
     *             if the file cannot be read or the encoding is unsupported
     */
    public static String readFile(File file, String encoding) throws IOException {
        return readFile(new FileInputStream(file), encoding);
    }

    /**
     * Reads the given input stream to a string content and closes the
     * stream afterwards.
     *
     * @param in
     *            the stream to read from; always closed by this method
     * @param encoding
     *            the name of the charset used to decode the bytes
     * @return the complete stream content as a string
     * @throws IOException
     *             if reading fails or the encoding is unsupported
     */
    public static String readFile(InputStream in, String encoding) throws IOException {
        try {
            return new String(readAll(in), encoding);
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Try to guess the file encoding from its byte-order mark.
     * <p>
     *
     * Only guesses encodings of "UTF-8", "UTF-16LE" or "UTF-16BE".
     *
     * @param file
     *            The file needed to guess the encoding.
     * @return The encoding, may be null.
     * @throws IOException
     *             if the file cannot be opened or read
     */
    public static String guessEncoding(File file) throws IOException {
        return detectBom(readFile(file, 3));
    }

    /**
     * Maps a leading byte-order mark to an encoding name.
     *
     * @return "UTF-8", "UTF-16LE" or "UTF-16BE", or {@code null} when the
     *         bytes do not start with one of those marks
     */
    private static String detectBom(byte[] b) {
        if (b.length >= 3 && b[0] == (byte) 0xef && b[1] == (byte) 0xbb && b[2] == (byte) 0xbf) {
            return UTF8;
        }
        if (b.length >= 2 && b[0] == (byte) 0xff && b[1] == (byte) 0xfe) {
            return UTF16LE;
        }
        if (b.length >= 2 && b[0] == (byte) 0xfe && b[1] == (byte) 0xff) {
            return UTF16BE;
        }
        return null;
    }

    /**
     * Drains the stream until end-of-stream. This does not rely on
     * {@code available()}, which is only an estimate of the bytes readable
     * without blocking and not the remaining length of the stream. The
     * stream is not closed here; callers own it.
     */
    private static byte[] readAll(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = in.read(chunk)) != -1) {
            out.write(chunk, 0, n);
        }
        return out.toByteArray();
    }
}

Related

  1. getEncoder()
  2. getEncoder(String encoding)
  3. getEncoding(byte[] htmlData)
  4. getEncoding(OutputStreamWriter inWriter)
  5. getEncoding(String text)
  6. getEncodingOption(List options)
  7. getEncodings()
  8. getEncodings()