A string is deemed to be CJK if its CJK characters make up at least MIN_RATIO of the string or number at least MIN_CHAR. - Java java.lang

Java examples for java.lang:String Unicode

Description

A string is deemed to be CJK if its CJK characters (Han, Hangul, Katakana, or Hiragana) make up at least MIN_RATIO of the string or number at least MIN_CHAR.

Demo Code


//package com.java2s;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Main {
    /**
     * Demo entry point: prints whether a sample ASCII-only string is
     * classified as CJK (expected output: {@code false}).
     */
    public static void main(String[] argv) throws Exception {
        String s = "java2s.com";
        System.out.println(isCJK(s));
    }

    /**
     * Matches one code point whose Unicode script is Han, Hangul, Katakana
     * or Hiragana. Compiled once; {@link Pattern} instances are immutable.
     */
    static final Pattern isCJK_pattern = Pattern
            .compile("[\\p{IsHan}\\p{IsHangul}\\p{IsKatakana}\\p{IsHiragana}]");

    /** Matches one code point that is NOT in any of the four CJK scripts. */
    static final Pattern isNotCJK_pattern = Pattern
            .compile("[^\\p{IsHan}\\p{IsHangul}\\p{IsKatakana}\\p{IsHiragana}]");

    /** Minimum fraction of CJK characters for a string to count as CJK. */
    public static final float MIN_RATIO = (float) 0.15;

    /** Minimum absolute number of CJK characters for a string to count as CJK. */
    public static final int MIN_CHAR = 2;

    /**
     * Decides whether a string should be treated as CJK text.
     *
     * <p>The string is deemed CJK if its CJK characters make up at least
     * {@link #MIN_RATIO} of all characters, or if there are at least
     * {@link #MIN_CHAR} of them. (Primary consideration is whether the string
     * warrants CJK search analysis.)
     *
     * <p>Characters are counted as Unicode code points, not UTF-16 code units,
     * so a supplementary-plane ideograph (stored as a surrogate pair) counts
     * as one character rather than two.
     *
     * @param s the string to inspect; may be {@code null}
     * @return {@code true} if {@code s} meets either threshold; {@code false}
     *         otherwise, including for {@code null} or empty input
     */
    public static Boolean isCJK(String s) {
        if (!hasCJK(s)) {
            return false;
        }
        // Strip everything that is not CJK; what remains is only CJK code points.
        String cjkOnly = isNotCJK_pattern.matcher(s).replaceAll("");
        int cjkCount = cjkOnly.codePointCount(0, cjkOnly.length());
        // hasCJK(s) was true, so s holds at least one code point: no division by zero.
        int totalCount = s.codePointCount(0, s.length());
        float ratio = (float) cjkCount / totalCount;
        return ratio >= MIN_RATIO || cjkCount >= MIN_CHAR;
    }

    /**
     * Unlike {@link #isCJK(String)}, returns {@code true} as soon as any
     * single CJK character (Han, Hangul, Katakana or Hiragana) appears.
     *
     * @param s the string to inspect; may be {@code null}
     * @return {@code true} if {@code s} contains at least one CJK character;
     *         {@code false} otherwise, including for {@code null}
     */
    public static Boolean hasCJK(String s) {
        if (s == null) {
            return false;
        }
        Matcher m = isCJK_pattern.matcher(s);
        return m.find();
    }
}

Related Tutorials