Android Open Source - texthem Universal Detector






From Project

Back to project page texthem.

License

The source code is released under:

GNU General Public License

If you think the Android project texthem listed in this page is inappropriate, such as containing malicious code/tools or violating the copyright, please email info at java2s dot com, thanks.

Java Source Code

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1/*w  ww. j av  a2s.co  m*/
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *          Shy Shalom <shooshX@gmail.com>
 *          Kohei TAKETA <k-tak@void.in> (Java port)
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

package org.mozilla.universalchardet;

import org.mozilla.universalchardet.prober.CharsetProber;
import org.mozilla.universalchardet.prober.MBCSGroupProber;
import org.mozilla.universalchardet.prober.SBCSGroupProber;
import org.mozilla.universalchardet.prober.EscCharsetProber;
import org.mozilla.universalchardet.prober.Latin1Prober;
import org.mozilla.universalchardet.Constants;

public class UniversalDetector
{
    ////////////////////////////////////////////////////////////////
    // constants
    ////////////////////////////////////////////////////////////////
    public static final float SHORTCUT_THRESHOLD = 0.95f;
    public static final float MINIMUM_THRESHOLD = 0.20f;
    

    ////////////////////////////////////////////////////////////////
    // inner types
    ////////////////////////////////////////////////////////////////
    public enum InputState
    {
        PURE_ASCII,
        ESC_ASCII,
        HIGHBYTE
    }
    

    ////////////////////////////////////////////////////////////////
    // fields
    ////////////////////////////////////////////////////////////////
    private InputState  inputState;
    private boolean     done;
    private boolean     start;
    private boolean     gotData;
    private byte        lastChar;
    private String      detectedCharset;

    private CharsetProber[]     probers;
    private CharsetProber       escCharsetProber;
    
    private CharsetListener     listener;

    
    ////////////////////////////////////////////////////////////////
    // methods
    ////////////////////////////////////////////////////////////////
    /**
     * @param listener a listener object that is notified of
     *         the detected encocoding. Can be null.
     */
    public UniversalDetector(CharsetListener listener)
    {
        this.listener = listener;
        this.escCharsetProber = null;
        this.probers = new CharsetProber[3];
        for (int i=0; i<this.probers.length; ++i) {
            this.probers[i] = null;
        }
        
        reset();
    }
    
    public boolean isDone()
    {
        return this.done;
    }
    
    /**
     * @return The detected encoding is returned. If the detector couldn't
     *          determine what encoding was used, null is returned.
     */
    public String getDetectedCharset()
    {
        return this.detectedCharset;
    }
    
    public void setListener(CharsetListener listener)
    {
        this.listener = listener;
    }
    
    public CharsetListener getListener()
    {
        return this.listener;
    }
    
    public void handleData(final byte[] buf, int offset, int length)
    {
        if (this.done) {
            return;
        }
        
        if (length > 0) {
            this.gotData = true;
        }
        
        if (this.start) {
            this.start = false;
            if (length > 3) {
                int b1 = buf[offset] & 0xFF;
                int b2 = buf[offset+1] & 0xFF;
                int b3 = buf[offset+2] & 0xFF;
                int b4 = buf[offset+3] & 0xFF;
                
                switch (b1) {
                case 0xEF:
                    if (b2 == 0xBB && b3 == 0xBF) {
                        this.detectedCharset = Constants.CHARSET_UTF_8;
                    }
                    break;
                case 0xFE:
                    if (b2 == 0xFF && b3 == 0x00 && b4 == 0x00) {
                        this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_3412;
                    } else if (b2 == 0xFF) {
                        this.detectedCharset = Constants.CHARSET_UTF_16BE;
                    }
                    break;
                case 0x00:
                    if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) {
                        this.detectedCharset = Constants.CHARSET_UTF_32BE;
                    } else if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) {
                        this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_2143;
                    }
                    break;
                case 0xFF:
                    if (b2 == 0xFE && b3 == 0x00 && b4 == 0x00) {
                        this.detectedCharset = Constants.CHARSET_UTF_32LE;
                    } else if (b2 == 0xFE) {
                        this.detectedCharset = Constants.CHARSET_UTF_16LE;
                    }
                    break;
                } // swich end
                
                if (this.detectedCharset != null) {
                    this.done = true;
                    return;
                }
            }
        } // if (start) end
        
        int maxPos = offset + length;
        for (int i=offset; i<maxPos; ++i) {
            int c = buf[i] & 0xFF;
            if ((c & 0x80) != 0 && c != 0xA0) {
                if (this.inputState != InputState.HIGHBYTE) {
                    this.inputState = InputState.HIGHBYTE;
                    
                    if (this.escCharsetProber != null) {
                        this.escCharsetProber = null;
                    }
                    
                    if (this.probers[0] == null) {
                        this.probers[0] = new MBCSGroupProber();
                    }
                    if (this.probers[1] == null) {
                        this.probers[1] = new SBCSGroupProber();
                    }
                    if (this.probers[2] == null) {
                        this.probers[2] = new Latin1Prober();
                    }
                }
            } else {
                if (this.inputState == InputState.PURE_ASCII &&
                    (c == 0x1B || (c == 0x7B && this.lastChar == 0x7E))) {
                    this.inputState = InputState.ESC_ASCII;
                }
                this.lastChar = buf[i];
            }
        } // for end
        
        CharsetProber.ProbingState st;
        if (this.inputState == InputState.ESC_ASCII) {
            if (this.escCharsetProber == null) {
                this.escCharsetProber = new EscCharsetProber();
            }
            st = this.escCharsetProber.handleData(buf, offset, length);
            if (st == CharsetProber.ProbingState.FOUND_IT) {
                this.done = true;
                this.detectedCharset = this.escCharsetProber.getCharSetName();
            }
        } else if (this.inputState == InputState.HIGHBYTE) {
            for (int i=0; i<this.probers.length; ++i) {
                st = this.probers[i].handleData(buf, offset, length);
                if (st == CharsetProber.ProbingState.FOUND_IT) {
                    this.done = true;
                    this.detectedCharset = this.probers[i].getCharSetName();
                    return;
                }
            }
        } else { // pure ascii
            // do nothing
        }
    }
    
    public void dataEnd()
    {
        if (!this.gotData) {
            return;
        }
        
        if (this.detectedCharset != null) {
            this.done = true;
            if (this.listener != null) {
                this.listener.report(this.detectedCharset);
            }
            return;
        }
        
        if (this.inputState == InputState.HIGHBYTE) {
            float proberConfidence;
            float maxProberConfidence = 0.0f;
            int maxProber = 0;
            
            for (int i=0; i<this.probers.length; ++i) {
                proberConfidence = this.probers[i].getConfidence();
                if (proberConfidence > maxProberConfidence) {
                    maxProberConfidence = proberConfidence;
                    maxProber = i;
                }
            }
            
            if (maxProberConfidence > MINIMUM_THRESHOLD) {
                this.detectedCharset = this.probers[maxProber].getCharSetName();
                if (this.listener != null) {
                    this.listener.report(this.detectedCharset);
                }
            }
        } else if (this.inputState == InputState.ESC_ASCII) {
            // do nothing
        } else {
            // do nothing
        }
    }
    
    public void reset()
    {
        this.done = false;
        this.start = true;
        this.detectedCharset = null;
        this.gotData = false;
        this.inputState = InputState.PURE_ASCII;
        this.lastChar = 0;
        
        if (this.escCharsetProber != null) {
            this.escCharsetProber.reset();
        }
        
        for (int i=0; i<this.probers.length; ++i) {
            if (this.probers[i] != null) {
                this.probers[i].reset();
            }
        }
    }
    
    
    ////////////////////////////////////////////////////////////////
    // testing
    ////////////////////////////////////////////////////////////////
    public static void main(String[] args) throws Exception
    {
        if (args.length != 1) {
            System.out.println("USAGE: java UniversalDetector filename");
            return;
        }

        UniversalDetector detector = new UniversalDetector(
                new CharsetListener() {
                    public void report(String name)
                    {
                        System.out.println("charset = " + name);
                    }
                }
                );
        
        byte[] buf = new byte[4096];
        java.io.FileInputStream fis = new java.io.FileInputStream(args[0]);
        
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();
    }
}




Java Source Code List

a2.marketingsms.ApplicationTest.java
a2.marketingsms.SelectList.java
a2.marketingsms.Texthem.java
a2.marketingsms.components.ContactImporter.java
a2.marketingsms.components.DialogHandler.java
a2.marketingsms.components.MyProgressDialog.java
a2.marketingsms.components.SMSSender.java
a2.marketingsms.model.Contact.java
a2.marketingsms.model.TemplateText.java
au.com.bytecode.opencsv.CSVIterator.java
au.com.bytecode.opencsv.CSVParserBuilder.java
au.com.bytecode.opencsv.CSVParser.java
au.com.bytecode.opencsv.CSVReaderBuilder.java
au.com.bytecode.opencsv.CSVReader.java
au.com.bytecode.opencsv.CSVWriter.java
au.com.bytecode.opencsv.ResultSetHelperService.java
au.com.bytecode.opencsv.ResultSetHelper.java
com.danielme.blog.demo.listviewcheckbox.CustomArrayAdapter.java
com.danielme.blog.demo.listviewcheckbox.DontPressWhenPressParentCheckBox.java
com.danielme.blog.demo.listviewcheckbox.Row.java
filechooser.FileArrayAdapter.java
filechooser.FileChooser.java
filechooser.Option.java
org.mozilla.universalchardet.CharsetListener.java
org.mozilla.universalchardet.Constants.java
org.mozilla.universalchardet.UniversalDetector.java
org.mozilla.universalchardet.prober.Big5Prober.java
org.mozilla.universalchardet.prober.CharsetProber.java
org.mozilla.universalchardet.prober.EUCJPProber.java
org.mozilla.universalchardet.prober.EUCKRProber.java
org.mozilla.universalchardet.prober.EUCTWProber.java
org.mozilla.universalchardet.prober.EscCharsetProber.java
org.mozilla.universalchardet.prober.GB18030Prober.java
org.mozilla.universalchardet.prober.HebrewProber.java
org.mozilla.universalchardet.prober.Latin1Prober.java
org.mozilla.universalchardet.prober.MBCSGroupProber.java
org.mozilla.universalchardet.prober.SBCSGroupProber.java
org.mozilla.universalchardet.prober.SJISProber.java
org.mozilla.universalchardet.prober.SingleByteCharsetProber.java
org.mozilla.universalchardet.prober.UTF8Prober.java
org.mozilla.universalchardet.prober.contextanalysis.EUCJPContextAnalysis.java
org.mozilla.universalchardet.prober.contextanalysis.JapaneseContextAnalysis.java
org.mozilla.universalchardet.prober.contextanalysis.SJISContextAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.Big5DistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.CharDistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.EUCJPDistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.EUCKRDistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.EUCTWDistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.GB2312DistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.JISDistributionAnalysis.java
org.mozilla.universalchardet.prober.distributionanalysis.SJISDistributionAnalysis.java
org.mozilla.universalchardet.prober.sequence.BulgarianModel.java
org.mozilla.universalchardet.prober.sequence.CyrillicModel.java
org.mozilla.universalchardet.prober.sequence.GreekModel.java
org.mozilla.universalchardet.prober.sequence.HebrewModel.java
org.mozilla.universalchardet.prober.sequence.Ibm855Model.java
org.mozilla.universalchardet.prober.sequence.Ibm866Model.java
org.mozilla.universalchardet.prober.sequence.Koi8rModel.java
org.mozilla.universalchardet.prober.sequence.Latin5BulgarianModel.java
org.mozilla.universalchardet.prober.sequence.Latin5Model.java
org.mozilla.universalchardet.prober.sequence.Latin7Model.java
org.mozilla.universalchardet.prober.sequence.MacCyrillicModel.java
org.mozilla.universalchardet.prober.sequence.SequenceModel.java
org.mozilla.universalchardet.prober.sequence.Win1251BulgarianModel.java
org.mozilla.universalchardet.prober.sequence.Win1251Model.java
org.mozilla.universalchardet.prober.sequence.Win1253Model.java
org.mozilla.universalchardet.prober.statemachine.Big5SMModel.java
org.mozilla.universalchardet.prober.statemachine.CodingStateMachine.java
org.mozilla.universalchardet.prober.statemachine.EUCJPSMModel.java
org.mozilla.universalchardet.prober.statemachine.EUCKRSMModel.java
org.mozilla.universalchardet.prober.statemachine.EUCTWSMModel.java
org.mozilla.universalchardet.prober.statemachine.GB18030SMModel.java
org.mozilla.universalchardet.prober.statemachine.HZSMModel.java
org.mozilla.universalchardet.prober.statemachine.ISO2022CNSMModel.java
org.mozilla.universalchardet.prober.statemachine.ISO2022JPSMModel.java
org.mozilla.universalchardet.prober.statemachine.ISO2022KRSMModel.java
org.mozilla.universalchardet.prober.statemachine.PkgInt.java
org.mozilla.universalchardet.prober.statemachine.SJISSMModel.java
org.mozilla.universalchardet.prober.statemachine.SMModel.java
org.mozilla.universalchardet.prober.statemachine.UCS2BESMModel.java
org.mozilla.universalchardet.prober.statemachine.UTF8SMModel.java