CharsetDetector.java :  » Portal » Open-Portal » com » sun » portal » rproxy » rewriter » util » http » Java Open Source

Java Open Source » Portal » Open Portal 
Open Portal » com » sun » portal » rproxy » rewriter » util » http » CharsetDetector.java
/*
 * Copyright 2001 Sun Microsystems, Inc.  All rights reserved.
 * PROPRIETARY/CONFIDENTIAL.  Use of this product is subject to license terms.
 */
package com.sun.portal.rproxy.rewriter.util.http;

import java.io.*;
import com.sun.portal.log.common.PortalLogger;
import java.util.ArrayList;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

/**
 * Guesses the character encoding of HTTP content using the Mozilla charset
 * detector classes ({@code nsDetector} and friends).
 *
 * <p>The result of the most recent {@link #detectCharset(HTTPData)} call is
 * kept in instance fields and the class performs no synchronization, so an
 * instance should not be shared between threads that detect concurrently.
 */
public class CharsetDetector {
    /** Value reported by {@link #charsetDetectEnabled()}. */
    private static boolean charsetDetect = true;

    /** Default language hint source: the JVM's {@code file.encoding} property. */
    private final String languageHint = System.getProperty("file.encoding");

    /** Set when the current detection run has settled on a charset. */
    private boolean found = false;

    /** Charset chosen by the current detection run, or {@code null} if none. */
    private String encoding;

    // Charset names grouped by the detector language hint they map to.
    private static final ArrayList chinese = new ArrayList();
    private static final ArrayList japanese = new ArrayList();
    private static final ArrayList korean = new ArrayList();
    static {
        chinese.add("GB2312");
        chinese.add("GB18030");
        chinese.add("Big5");
        chinese.add("ISO-2022-CN");
        chinese.add("HZ-GB-2312");
        chinese.add("x-euc-tw");

        korean.add("EUC-KR");
        korean.add("ISO-2022-KR");

        japanese.add("Shift_JIS");
        japanese.add("EUC-JP");
        japanese.add("ISO-2022-JP");
    }

    public CharsetDetector() {
    }

    /**
     * @return {@code true} if charset detection is enabled
     */
    public static boolean charsetDetectEnabled() {
        return charsetDetect;
    }

    /**
     * Returns the detector language hint derived from the JVM's
     * {@code file.encoding} system property.
     *
     * @return one of the {@code nsPSMDetector} language constants
     */
    public int getLanguageHint() {
        return getLanguageHint(languageHint);
    }

    /**
     * Maps a charset name to the detector language hint for its language.
     * Names are compared case-insensitively, since charset names are not
     * case-sensitive.
     *
     * @param languageHint a charset name such as {@code "EUC-JP"}; may be
     *        {@code null}
     * @return {@code nsPSMDetector.CHINESE}, {@code KOREAN} or
     *         {@code JAPANESE} when the name is in the matching list,
     *         otherwise {@code nsPSMDetector.ALL}
     */
    public int getLanguageHint(String languageHint) {
        if (containsIgnoreCase(chinese, languageHint)) {
            return nsPSMDetector.CHINESE;
        }
        if (containsIgnoreCase(korean, languageHint)) {
            return nsPSMDetector.KOREAN;
        }
        if (containsIgnoreCase(japanese, languageHint)) {
            return nsPSMDetector.JAPANESE;
        }
        return nsPSMDetector.ALL;
    }

    /** Returns whether {@code names} holds {@code name}, ignoring case. */
    private static boolean containsIgnoreCase(ArrayList names, String name) {
        if (name == null) {
            return false;
        }
        for (int i = 0; i < names.size(); i++) {
            if (name.equalsIgnoreCase((String) names.get(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Detects the charset of the given content.
     *
     * <p>Content consisting only of ASCII bytes (including empty content) is
     * reported as {@code "ASCII"}. Otherwise the charset announced by the
     * detector is returned, falling back to the first entry of the detector's
     * probable charsets when it made no announcement.
     *
     * @param aHTTPData the data whose content bytes are examined
     * @return the detected charset name, or {@code null} when
     *         {@code aHTTPData} or its content bytes are {@code null}, or
     *         when no charset could be determined
     */
    public String detectCharset(HTTPData aHTTPData) {
        // Start from a clean slate so a reused instance can never hand back
        // the result of an earlier call.
        found = false;
        encoding = null;

        byte[] content = (aHTTPData == null) ? null : aHTTPData.getContentBytes();
        if (content == null) {
            return null;
        }

        // Initialize the detector with the platform-derived language hint.
        nsDetector det = new nsDetector(getLanguageHint());

        // The observer's Notify() is called when a matching charset is found.
        det.Init(new nsICharsetDetectionObserver() {
            public void Notify(String charset) {
                found = true;
                encoding = charset;
            }
        });

        try {
            InputStream in = new ByteArrayInputStream(content);
            byte[] buf = new byte[1024];
            int len;
            boolean done = false;
            boolean isAscii = true;

            while ((len = in.read(buf, 0, buf.length)) != -1) {
                // Keep checking for pure ASCII until a non-ASCII chunk shows up.
                if (isAscii) {
                    isAscii = det.isAscii(buf, len);
                }

                // Feed the detector once the content is known to be non-ASCII.
                if (!isAscii && !done) {
                    done = det.DoIt(buf, len, false);
                }

                // Nothing left to learn from the remaining chunks.
                if (done) {
                    break;
                }
            }
            det.DataEnd();

            if (isAscii) {
                found = true;
                encoding = "ASCII";
            }

            if (!found) {
                // No definite answer: fall back to the detector's best guess.
                // NOTE(review): jchardet appears to return a "nomatch"
                // placeholder here when it has no candidates - confirm and
                // decide whether callers should ever see that value.
                String[] probableEncoding = det.getProbableCharsets();
                if (probableEncoding != null && probableEncoding.length > 0) {
                    encoding = probableEncoding[0];
                }
            }
        } catch (IOException ignored) {
            // Best effort: reading from an in-memory stream is not expected
            // to fail; if it does, return whatever was determined so far.
        }
        return encoding;
    }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.