Java File Read by Charset getFileText(File file, Charset charset)

Here you can find the source of getFileText(File file, Charset charset)

Description

Reads a text file completely, using the specified encoding.

License

Open Source License

Parameter

Parameter Description
file the file to read.
charset the character set to use for the encoding of the file.

Return

the text of the file, or an empty string if an error occurred.

Declaration

public static String getFileText(File file, Charset charset) 

Method Source Code

//package com.java2s;
/*---------------------------------------------------------------
*  Copyright 2005 by the Radiological Society of North America
*
*  This source software is released under the terms of the
*  RSNA Public License (http://mirc.rsna.org/rsnapubliclicense)
*----------------------------------------------------------------*/

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.InputStreamReader;

import java.io.StringWriter;
import java.nio.charset.Charset;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Main {
    /**
     * The UTF-8 charset, used as the default whenever no other encoding can
     * be determined. Kept public and non-final so existing callers that read
     * (or reassign) it continue to compile.
     */
    public static Charset utf8 = Charset.forName("UTF-8");

    /**
     * Matches an XML declaration carrying an encoding pseudo-attribute.
     * Group 1 is the encoding name including its surrounding quotes; both
     * double and single quotes are accepted.
     */
    private static final Pattern XML_ENCODING = Pattern.compile(
            "^\\s*<\\?xml\\s+[^>]*\\s*encoding\\s*=\\s*(\"[^\"]*\"|'[^']*')",
            Pattern.DOTALL | Pattern.MULTILINE);

    /**
     * Matches an html element followed by a meta tag carrying a charset
     * declaration, ignoring letter case. Group 1 is the charset name, either
     * quoted or bare. The bare form stops at quotes, whitespace, ';', '/'
     * and '>' so that the end of the tag is never captured as part of the
     * name.
     */
    private static final Pattern HTML_CHARSET = Pattern.compile(
            "^\\s*<html.*<meta\\s+[^>]*\\s*charset\\s*=\\s*(\"[^\"]*\"|'[^']*'|[^\"'\\s;/>]*)",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.MULTILINE);

    /**
     * Reads a text file completely, trying to obtain the charset from the
     * file itself, and defaulting to UTF-8 if it fails.
     * The file is first decoded as UTF-8; if the decoded text declares a
     * different charset (XML declaration or HTML meta tag), the file is
     * read a second time using that charset.
     * @param file the file to read.
     * @return the text of the file, or an empty string if an error occurred.
     */
    public static String getFileText(File file) {
        String text = getFileText(file, utf8);
        Charset charset = getEncoding(text);
        if (charset.equals(utf8)) {
            return text;
        }
        return getFileText(file, charset);
    }

    /**
     * Reads a text file completely, using the specified encoding, or
     * UTF-8 if the specified encoding is null, malformed or not supported.
     * @param file the file to read.
     * @param encoding the name of the charset to use.
     * @return the text of the file, or an empty string if an error occurred.
     */
    public static String getFileText(File file, String encoding) {
        return getFileText(file, charsetOrUtf8(encoding));
    }

    /**
     * Reads a text file completely, using the specified encoding.
     * @param file the file to read.
     * @param charset the character set to use for the encoding of the file.
     * @return the text of the file, or an empty string if the file or
     * charset is null, the file does not exist, or any error occurred
     * while reading.
     */
    public static String getFileText(File file, Charset charset) {
        if (file == null || charset == null || !file.exists()) {
            return "";
        }
        // The raw stream is declared as its own resource so that it is closed
        // even if constructing the reader chain on top of it fails.
        try (FileInputStream in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {
            StringBuilder text = new StringBuilder();
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
                text.append(buffer, 0, count);
            }
            return text.toString();
        } catch (Exception e) {
            // Documented contract: any failure yields an empty string rather
            // than an exception, so the broad catch is intentional here.
            return "";
        }
    }

    /**
     * Determines the charset declared inside a document's own text.
     * @param text the document text, as decoded with a provisional charset.
     * @return the charset named by an XML encoding declaration or an HTML
     * meta charset declaration, or UTF-8 if neither is found or the
     * declared name is not usable.
     */
    private static Charset getEncoding(String text) {

        //See if this is an xml document with an encoding declaration.
        Matcher xmlMatcher = XML_ENCODING.matcher(text);
        if (xmlMatcher.find()) {
            return getEncoding(xmlMatcher);
        }

        //See if this is an html document with a charset declaration.
        Matcher htmlMatcher = HTML_CHARSET.matcher(text);
        if (htmlMatcher.find()) {
            return getEncoding(htmlMatcher);
        }

        //We don't recognize this document declaration; use UTF-8.
        //Maybe this should actually be ISO-8859-1 since
        //that is the web default encoding, but it is probably
        //better to default to UTF-8 because that will be better
        //for sites in the Far East, and the pain for the Europeans
        //will be minimal.
        return utf8;
    }

    /**
     * Converts the last capture group of a successful match into a charset.
     * @param matcher a matcher on which find() has already succeeded and
     * whose last group holds the (possibly quoted) charset name.
     * @return the named charset, or UTF-8 if the name is missing,
     * malformed or not supported.
     */
    private static Charset getEncoding(Matcher matcher) {
        String name = matcher.group(matcher.groupCount());
        if (name == null) {
            return utf8;
        }
        return charsetOrUtf8(stripQuotes(name).trim());
    }

    /**
     * Removes one leading and one trailing quote character (double or
     * single) from a string, if present.
     */
    private static String stripQuotes(String value) {
        String result = value;
        if (result.startsWith("\"") || result.startsWith("'")) {
            result = result.substring(1);
        }
        if (result.endsWith("\"") || result.endsWith("'")) {
            result = result.substring(0, result.length() - 1);
        }
        return result;
    }

    /**
     * Looks up a charset by name, falling back to UTF-8 when the name is
     * null, syntactically illegal or not supported by this JVM.
     */
    private static Charset charsetOrUtf8(String name) {
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException ex) {
            // Covers IllegalCharsetNameException, UnsupportedCharsetException
            // and the IllegalArgumentException thrown for a null name.
            return utf8;
        }
    }
}

Related

  1. getContent(File file, String charsetName)
  2. getEOL(File file, Charset charset)
  3. getFileContent(File file, String charsetName)
  4. getFileContents(File file, String charset)
  5. getFileEncodingCharset()
  6. getNumberOfNonEmptyLines(File file, Charset charset)
  7. getPatchFileCharset()
  8. getPropertiesVaule(File file, String key, Charset charset)
  9. getString(File file, Charset charset)