get Charset from html body using regex - Android java.lang

Android examples for java.lang:String HTML

Description

Gets the charset declared in an HTML body by using a regular expression.

Demo Code


//package com.java2s;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Main {
    /** Charset reported when the body is {@code null} or declares none. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Matches a content-type declaration such as
     * {@code content="text/html; charset=big5-hkscs">} and captures the
     * charset name in group 1.
     *
     * <p>The match is case-insensitive, tolerates whitespace around
     * {@code =}, and the captured name stops at a quote, slash, whitespace
     * or {@code >} so that an unquoted attribute value does not swallow the
     * markup that follows it. Compiled once because the pattern is constant.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "text/html;\\s*charset\\s*=\\s*([^'/\\s\">]+)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Extracts the charset name declared in an HTML document.
     *
     * <p>Only declarations of the form {@code text/html; charset=NAME}
     * followed (somewhere later) by a closing {@code >} are recognised.
     * The name is returned exactly as written in the document (its case is
     * not normalised).
     *
     * @param body the HTML text to scan; may be {@code null}
     * @return the declared charset name, or {@code "utf-8"} when
     *         {@code body} is {@code null} or contains no declaration
     */
    public static String getCharset(String body) {
        if (body == null) {
            return DEFAULT_CHARSET;
        }

        Matcher m = CHARSET_PATTERN.matcher(body);
        if (!m.find()) {
            return DEFAULT_CHARSET;
        }

        // Group 1 is mandatory in the pattern, so it is never null after a
        // successful find().
        String charset = m.group(1);

        // Historical behaviour kept for compatibility: when the document
        // contains a second declaration, that one overrides the first.
        if (m.find()) {
            charset = m.group(1);
        }

        // The capture excludes regex whitespace already; trim() additionally
        // strips any leading/trailing control characters (<= U+0020).
        return charset.trim();
    }
}

Related Tutorials