/*
* Copyright 2001 Sun Microsystems, Inc. All rights reserved.
* PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
*/
package com.sun.portal.rproxy.rewriter.util.http;
import java.io.*;
import com.sun.portal.log.common.PortalLogger;
import java.util.ArrayList;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;
/**
 * Guesses the character encoding of HTTP content using the Mozilla charset
 * detector classes from {@code org.mozilla.intl.chardet}.
 *
 * <p>The result of a detection run is kept in instance fields that are
 * rewritten by every call to {@link #detectCharset(HTTPData)}; the class does
 * no synchronization, so one instance should not be shared between threads
 * that detect concurrently.
 */
public class CharsetDetector {

    /** Number of bytes handed to the detector per iteration. */
    private static final int CHUNK_SIZE = 1024;

    /** Charset name returned when the content consists of ASCII bytes only. */
    private static final String ASCII = "ASCII";

    /** Global switch reported by {@link #charsetDetectEnabled()}. */
    private static final boolean charsetDetect = true;

    // Charset names that select a language-specific detector hint.
    // Raw ArrayList is kept on purpose: the file targets a pre-generics JDK.
    private static final ArrayList chinese = new ArrayList();
    private static final ArrayList japanese = new ArrayList();
    private static final ArrayList korean = new ArrayList();

    static {
        chinese.add("GB2312");
        chinese.add("GB18030");
        chinese.add("Big5");
        chinese.add("ISO-2022-CN");
        chinese.add("HZ-GB-2312");
        chinese.add("x-euc-tw");

        korean.add("EUC-KR");
        korean.add("ISO-2022-KR");

        japanese.add("Shift_JIS");
        japanese.add("EUC-JP");
        japanese.add("ISO-2022-JP");
    }

    /** Default hint source: the JVM's {@code file.encoding} system property. */
    private final String languageHint = System.getProperty("file.encoding");

    /** Set when the current detection run has produced a definite answer. */
    private boolean found = false;

    /** Charset name produced by the current detection run, or null. */
    private String encoding;

    public CharsetDetector() {
    }

    /**
     * Reports whether charset detection is enabled.
     *
     * @return the value of the class-wide detection flag (currently always true)
     */
    public static boolean charsetDetectEnabled() {
        return charsetDetect;
    }

    /**
     * Maps the JVM's {@code file.encoding} value to a detector language hint.
     *
     * @return one of the {@code nsPSMDetector} language constants
     */
    public int getLanguageHint() {
        return getLanguageHint(languageHint);
    }

    /**
     * Maps a charset name to a detector language hint.
     *
     * <p>The name is matched exactly (case-sensitively) against the known
     * Chinese, Korean and Japanese charset names.
     *
     * @param languageHint charset name to classify; may be null
     * @return {@code nsPSMDetector.CHINESE}, {@code KOREAN} or {@code JAPANESE}
     *         for a recognised name, otherwise {@code nsPSMDetector.ALL}
     */
    public int getLanguageHint(String languageHint) {
        if (chinese.contains(languageHint)) {
            return nsPSMDetector.CHINESE;
        }
        if (korean.contains(languageHint)) {
            return nsPSMDetector.KOREAN;
        }
        if (japanese.contains(languageHint)) {
            return nsPSMDetector.JAPANESE;
        }
        return nsPSMDetector.ALL;
    }

    /**
     * Detects the charset of the content bytes carried by {@code aHTTPData}.
     *
     * <p>Content made up of ASCII bytes only (including empty content) yields
     * {@code "ASCII"}. Otherwise the charset reported by the detector's
     * observer is returned, or, if the detector reported nothing, the first
     * of its probable charsets.
     *
     * @param aHTTPData source of the bytes to inspect; its
     *        {@code getContentBytes()} result is read in full
     * @return the detected charset name, or null if the detector neither
     *         reported a charset nor offered any probable charset
     */
    public String detectCharset(HTTPData aHTTPData) {
        // Start every run from a clean slate; otherwise the outcome of an
        // earlier call on this instance would leak into this one.
        found = false;
        encoding = null;

        nsDetector det = new nsDetector(getLanguageHint());

        // Notify() is called by the detector when a matching charset is found.
        det.Init(new nsICharsetDetectionObserver() {
            public void Notify(String charset) {
                found = true;
                encoding = charset;
            }
        });

        // The data is already in memory, so it is read straight from a
        // ByteArrayInputStream: its read(byte[], int, int) declares no
        // IOException, which removes the need for a catch block that could
        // only swallow errors.
        ByteArrayInputStream in =
            new ByteArrayInputStream(aHTTPData.getContentBytes());
        byte[] buf = new byte[CHUNK_SIZE];
        boolean done = false;
        boolean isAscii = true;
        int len;
        while ((len = in.read(buf, 0, buf.length)) != -1) {
            // Keep testing for pure ASCII until the first non-ASCII chunk.
            if (isAscii) {
                isAscii = det.isAscii(buf, len);
            }
            // Feed the detector only non-ASCII data, and only until it is done.
            if (!isAscii && !done) {
                done = det.DoIt(buf, len, false);
            }
        }
        det.DataEnd();

        if (isAscii) {
            found = true;
            encoding = ASCII;
        }

        if (!found) {
            // No definite answer: fall back to the detector's best guess.
            // encoding is null here, so it must not be dereferenced.
            // NOTE(review): the detector may return a placeholder entry when
            // it has no candidates at all - confirm what callers expect.
            String[] probable = det.getProbableCharsets();
            if (probable != null && probable.length > 0) {
                encoding = probable[0];
            }
        }
        return encoding;
    }
}
|