Java tutorial
/*
 * Copyright (C) 2005 Eduardo Jodas Samper
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this
 * program; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
 * Suite 330, Boston, MA 02111-1307 USA
 *
 * author e-mail: eduardj@dev.java.net
 */
package prman.model;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class SpellCheckManager {
    private static final String ENABLED_ATTR = "enabled";
    private static final String PATTERN_ATTR = "pattern";
    private static final String MATCH_CASE_ATTR = "matchCase";
    private static final String MATCHES_TAG = "matches";
    private static final String NO_MATCHES_TAG = "noMatches";
    private static final int MIN_WORD_LENGTH = 3;
    private static final String WORDS_ROOT_TAG = "wordsToIgnore";
    private static final String WORDS_TAG = "words";
    private static final String LOCALE_ATTR = "locale";
    private static final File DIR = new File("spellCheck");

    private HashMap spellCheckers;
    private boolean isEnabled;
    private Pattern[] matchList;
    private Pattern[] nomatchList;
    private HashSet wordsToIgnore;
    private HashMap wordsByLocale;

    public SpellCheckManager() {
        spellCheckers = new HashMap();
        matchList = new Pattern[0];
        nomatchList = new Pattern[0];
        wordsToIgnore = new HashSet();
        wordsByLocale = new HashMap();
    }

    public boolean isEnabled() {
        return isEnabled;
    }

    public void setEnabled(boolean enabled) {
        isEnabled = enabled;
    }

    public Pattern[] getMatchPatterns() {
        return (Pattern[]) matchList.clone();
    }

    public Locale[] getAvailableDictionaries() {
        // File dictio = new File(getClass().getClassLoader().getResource("dictio").getFile());
        Locale[] toReturn;
        if (DIR.exists() && DIR.isDirectory()) {
            File[] files = DIR.listFiles(new FileFilter() {
                public boolean accept(File pathname) {
                    return pathname.isDirectory();
                }
            });
            toReturn = new Locale[files.length];
            for (int iCnt = 0; iCnt < files.length; iCnt++) {
                String name = files[iCnt].getName();
                toReturn[iCnt] = new Locale(name);
            }
        } else {
            toReturn = new Locale[0];
        }
        return toReturn;
    }

    // Looks up a dictionary word that starts with the given fragment, preferring words
    // about three characters longer; returns null when there is no checker for the
    // locale or the fragment is already a complete word.
    public String getBestEnd(String begin, Locale loc) {
        String toReturn = null;
        SpellChecker sc = getSpellChecker(loc);
        try {
            if (sc == null || sc.exist(begin))
                return null;
            IndexSearcher searcher = new IndexSearcher(sc.getSpellIndex());
            int bestLength = begin.length() + 3;
            toReturn = getBestEnd(searcher, new WildcardQuery(new Term(SpellChecker.F_WORD, begin + "????")), bestLength);
            if (toReturn == null) {
                toReturn = getBestEnd(searcher, new WildcardQuery(new Term(SpellChecker.F_WORD, begin + "???")), bestLength);
                if (toReturn == null) {
                    toReturn = getBestEnd(searcher, new WildcardQuery(new Term(SpellChecker.F_WORD, begin + "?????")), bestLength);
                    if (toReturn == null) {
                        toReturn = getBestEnd(searcher, new PrefixQuery(new Term(SpellChecker.F_WORD, begin)), bestLength);
                    }
                }
            }
        } catch (Throwable _t) {
            Logger.getLogger(getClass().getName()).log(Level.WARNING, "Spell checker error", _t);
            toReturn = null;
        }
        return toReturn;
    }

    private String getBestEnd(IndexSearcher searcher, Query query, int bestLength) throws IOException {
        String toReturn = null;
        Hits hits = searcher.search(query);
        int length = hits.length();
        for (int iCnt = 0; (toReturn == null || toReturn.length() != bestLength) && iCnt < length; iCnt++) {
            String word = hits.doc(iCnt).get(SpellChecker.F_WORD);
            if (toReturn == null || Math.abs(toReturn.length() - bestLength) > Math.abs(word.length() - bestLength))
                toReturn = word;
            // else if (!word.startsWith(begin))
            //     System.out.print(word + ", ");
        }
        return toReturn;
    }

    public void setMatchPatterns(Pattern[] list) {
        matchList = (Pattern[]) list.clone();
    }

    public Pattern[] getNoMatchPatterns() {
        return (Pattern[]) nomatchList.clone();
    }

    public void setNoMatchPatterns(Pattern[] list) {
        nomatchList = (Pattern[]) list.clone();
    }

    // A key qualifies for spell checking when it matches every "matches" pattern and
    // none of the "noMatches" patterns.
    public boolean matchesCriteria(String key) {
        boolean toReturn = key != null;
        for (int iCnt = 0; toReturn && iCnt < matchList.length; iCnt++)
            toReturn = matchList[iCnt].matcher(key).matches();
        for (int iCnt = 0; toReturn && iCnt < nomatchList.length; iCnt++)
            toReturn = !nomatchList[iCnt].matcher(key).matches();
        return toReturn;
    }

    public boolean isWordDelimiter(char ch, Locale loc) {
        // In Catalan the middle dot of the geminated l (as in "col·legi") is part of the word.
        if (loc.getLanguage().equals("ca"))
            return ch != '·' && !Character.isLetterOrDigit(ch);
        return !Character.isLetterOrDigit(ch);
    }

    public boolean shouldIgnore(String word, Locale loc) {
        if (wordsToIgnore.contains(word.toLowerCase()))
            return true;
        HashSet byLocale = (HashSet) wordsByLocale.get(loc);
        if (byLocale != null)
            return byLocale.contains(word.toLowerCase(loc));
        return false;
    }

    // A word is accepted when it is in an ignore list, shorter than MIN_WORD_LENGTH,
    // contains a digit, is all uppercase, or exists in the dictionary for the locale.
    public boolean isGood(String word, Locale loc) throws IOException {
        if (shouldIgnore(word, loc))
            return true;
        SpellChecker sp = getSpellChecker(loc);
        int length = word.length();
        if (sp == null || length < MIN_WORD_LENGTH)
            return true;
        boolean allUppercase = true;
        for (int iCnt = 0; iCnt < word.length(); iCnt++) {
            char charz = word.charAt(iCnt);
            if (Character.isDigit(charz))
                return true;
            if (allUppercase && !Character.isUpperCase(charz))
                allUppercase = false;
        }
        if (allUppercase)
            return true;
        return sp.exist(word.toLowerCase(loc));
        // try {
        //     return sp == null || sp.exist(word);
        // } catch (IOException _ioe) {
        //     Logger.getLogger(getClass().getName()).log(Level.WARNING, "Spell checker error", _ioe);
        // }
        // return true;
    }

    public WordTokenizer getWordTokenizer(String text, Locale loc) {
        return new DefaultWordTokenizer(text, loc);
    }

    public String[] suggestSimilar(String word, Locale loc, int max) {
        SpellChecker sp = getSpellChecker(loc);
        String[] toReturn;
        try {
            if (sp != null)
                toReturn = sp.suggestSimilar(word, max);
            else
                toReturn = new String[0];
        } catch (IOException _ioe) {
            Logger.getLogger(getClass().getName()).log(Level.WARNING, "Spell checker error", _ioe);
            toReturn = new String[0];
        }
        return toReturn;
    }

    public void ignoreWord(String word) {
        wordsToIgnore.add(word.toLowerCase());
    }

    public void ignoreWord(String word, Locale loc) {
        HashSet toAdd = (HashSet) wordsByLocale.get(loc);
        if (toAdd == null) {
            toAdd = new HashSet();
            wordsByLocale.put(loc, toAdd);
        }
        toAdd.add(word.toLowerCase());
    }

    public boolean isIndexed(Locale loc) {
        File dir = new File(DIR, loc.toString());
        return dir.exists() && dir.isDirectory() && dir.list().length > 0;
    }

    // Opens the Lucene index for the given locale, if one exists, and caches the spell checker.
    public boolean setup(Locale loc) throws IOException {
        if (isIndexed(loc)) {
            File fdir = new File(DIR, loc.toString());
            FSDirectory dir = FSDirectory.getDirectory(fdir, false);
            SpellChecker sc = new SpellChecker(dir);
            spellCheckers.put(loc, sc);
            return true;
        } else {
            return false;
        }
    }

    public void setupAll() throws IOException {
        Locale[] locs = getAvailableDictionaries();
        for (int iCnt = 0; iCnt < locs.length; iCnt++)
            setup(locs[iCnt]);
    }

    // Builds the spell-check index for a locale from one or more plain-text word lists.
    public void indexDictionary(Locale loc, InputStream[] words, String encoding) throws IOException {
        // unindexDictionary(loc);
        File fdir = new File(DIR, loc.toString());
        fdir.mkdirs();
        // File[] contents = fdir.listFiles();
        // for (int iCnt = 0; iCnt < contents.length; iCnt++) {
        //     contents[iCnt].delete();
        // }
        FSDirectory dir = FSDirectory.getDirectory(fdir, false);
        SpellChecker sc = new SpellChecker(dir);
        for (int iCnt = 0; iCnt < words.length; iCnt++) {
            PlainTextDictionary dictio = new PlainTextDictionary(words[iCnt], encoding);
            sc.indexDictionnary(dictio);
        }
    }

    public void unindexDictionary(Locale locale) throws IOException {
        if (!isIndexed(locale))
            return;
        SpellChecker sc = getSpellChecker(locale);
        if (sc != null) {
            sc.clearIndex();
            Directory dir = sc.getSpellIndex();
            String[] list = dir.list();
            for (int iCnt = 0; iCnt < list.length; iCnt++)
                dir.deleteFile(list[iCnt]);
            dir.close();
            spellCheckers.remove(locale);
        }
        File fdir = new File(DIR, locale.toString());
        // fdir.mkdirs();
        if (fdir.exists() && fdir.isDirectory()) {
            File[] contents = fdir.listFiles();
            for (int iCnt = 0; iCnt < contents.length; iCnt++) {
                contents[iCnt].delete();
            }
            fdir.delete();
        }
    }

    public void tearAllDown() throws IOException {
        for (Iterator it = spellCheckers.keySet().iterator(); it.hasNext();) {
            Object key = it.next();
            ((SpellChecker) spellCheckers.get(key)).clearIndex();
        }
    }

    // Reads the XML ignore list; <words> elements without a locale attribute feed the
    // global list, the rest feed the per-locale lists.
    public void loadWordsToIgnore(InputStream in) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in);
            NodeList list = doc.getElementsByTagName(WORDS_TAG);
            int length = list.getLength();
            for (int iCnt = 0; iCnt < length; iCnt++) {
                Element elem = (Element) list.item(iCnt);
                HashSet toAdd = wordsToIgnore;
                String localeStr = elem.getAttribute(LOCALE_ATTR);
                if (localeStr != null && localeStr.trim().length() > 0) {
                    Locale locale = Util.parseLocale(localeStr);
                    toAdd = (HashSet) wordsByLocale.get(locale);
                    if (toAdd == null) {
                        toAdd = new HashSet();
                        wordsByLocale.put(locale, toAdd);
                    }
                }
                NodeList childs = elem.getChildNodes();
                int clength = childs.getLength();
                for (int jCnt = 0; jCnt < clength; jCnt++) {
                    Node child = childs.item(jCnt);
                    if (child.getNodeType() == Node.TEXT_NODE) {
                        StringTokenizer parser = new StringTokenizer(child.getNodeValue());
                        while (parser.hasMoreTokens()) {
                            toAdd.add(parser.nextToken().toLowerCase());
                        }
                    }
                }
            }
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    public void saveWordsToIgnore(OutputStream out) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            Element root = doc.createElement(WORDS_ROOT_TAG);
            doc.appendChild(root);
            root.appendChild(doc.createTextNode("\n"));
            int size = wordsToIgnore.size();
            if (size > 0) {
                Element words = doc.createElement(WORDS_TAG);
                root.appendChild(words);
                StringBuffer strBuf = new StringBuffer();
                ArrayList list = new ArrayList(wordsToIgnore);
                Collections.sort(list);
                for (int iCnt = 0; iCnt < size; iCnt++) {
                    strBuf.append("\n");
                    strBuf.append(list.get(iCnt));
                    // words.setAttribute(LOCALE_ATTR, matchList[iCnt].pattern());
                }
                strBuf.append("\n");
                words.appendChild(doc.createTextNode(strBuf.toString()));
            }
            for (Iterator it = wordsByLocale.keySet().iterator(); it.hasNext();) {
                Locale locale = (Locale) it.next();
                HashSet wordSet = (HashSet) wordsByLocale.get(locale);
                size = wordSet.size();
                if (size > 0) {
                    Element words = doc.createElement(WORDS_TAG);
                    words.setAttribute(LOCALE_ATTR, locale.toString());
                    root.appendChild(words);
                    StringBuffer strBuf = new StringBuffer();
                    ArrayList list = new ArrayList(wordSet);
                    Collections.sort(list);
                    for (int iCnt = 0; iCnt < size; iCnt++) {
                        strBuf.append("\n");
                        strBuf.append(list.get(iCnt));
                    }
                    strBuf.append("\n");
                    words.appendChild(doc.createTextNode(strBuf.toString()));
                }
            }
            TransformerFactory.newInstance().newTransformer().transform(new DOMSource(doc), new StreamResult(out));
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    public void load(InputStream in) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in);
            String enabled = doc.getDocumentElement().getAttribute(ENABLED_ATTR);
            isEnabled = Boolean.toString(true).equalsIgnoreCase(enabled);
            NodeList maLt = doc.getElementsByTagName(MATCHES_TAG);
            Pattern[] pmatches = new Pattern[maLt.getLength()];
            for (int iCnt = 0; iCnt < pmatches.length; iCnt++) {
                Element pe = (Element) maLt.item(iCnt);
                pmatches[iCnt] = Pattern.compile(pe.getAttribute(PATTERN_ATTR),
                        Boolean.toString(true).equalsIgnoreCase(pe.getAttribute(MATCH_CASE_ATTR))
                                ? 0 : Pattern.CASE_INSENSITIVE);
            }
            matchList = pmatches;
            NodeList nomaLt = doc.getElementsByTagName(NO_MATCHES_TAG);
            Pattern[] pnmatches = new Pattern[nomaLt.getLength()];
            for (int iCnt = 0; iCnt < pnmatches.length; iCnt++) {
                Element pe = (Element) nomaLt.item(iCnt);
                pnmatches[iCnt] = Pattern.compile(pe.getAttribute(PATTERN_ATTR),
                        Boolean.toString(true).equalsIgnoreCase(pe.getAttribute(MATCH_CASE_ATTR))
                                ? 0 : Pattern.CASE_INSENSITIVE);
            }
            nomatchList = pnmatches;
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    public void save(OutputStream out) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            Element root = doc.createElement("spellChecker");
            root.setAttribute(ENABLED_ATTR, Boolean.toString(isEnabled()));
            doc.appendChild(root);
            root.appendChild(doc.createTextNode("\n"));
            for (int iCnt = 0; iCnt < matchList.length; iCnt++) {
                root.appendChild(doc.createTextNode("\t"));
                Element match = doc.createElement(MATCHES_TAG);
                match.setAttribute(PATTERN_ATTR, matchList[iCnt].pattern());
                match.setAttribute(MATCH_CASE_ATTR, Boolean.toString((matchList[iCnt].flags() & Pattern.CASE_INSENSITIVE) == 0));
                root.appendChild(match);
                root.appendChild(doc.createTextNode("\n"));
            }
            for (int iCnt = 0; iCnt < nomatchList.length; iCnt++) {
                root.appendChild(doc.createTextNode("\t"));
                Element match = doc.createElement(NO_MATCHES_TAG);
                match.setAttribute(PATTERN_ATTR, nomatchList[iCnt].pattern());
                match.setAttribute(MATCH_CASE_ATTR, Boolean.toString((nomatchList[iCnt].flags() & Pattern.CASE_INSENSITIVE) == 0));
                root.appendChild(match);
                root.appendChild(doc.createTextNode("\n"));
            }
            TransformerFactory.newInstance().newTransformer().transform(new DOMSource(doc), new StreamResult(out));
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    // Looks up the spell checker for a locale, falling back from language_country_variant
    // to language_country and finally to language only.
    private SpellChecker getSpellChecker(Locale loc) {
        SpellChecker toReturn = (SpellChecker) spellCheckers.get(loc);
        if (toReturn == null && loc.getVariant() != null && loc.getVariant().trim().length() > 0)
            toReturn = (SpellChecker) spellCheckers.get(new Locale(loc.getLanguage(), loc.getCountry()));
        if (toReturn == null && loc.getCountry() != null && loc.getCountry().trim().length() > 0)
            toReturn = (SpellChecker) spellCheckers.get(new Locale(loc.getLanguage()));
        return toReturn;
    }

    // private static Locale parseLocale(String localeName) {
    //     StringTokenizer parser = new StringTokenizer(localeName, "_");
    //     Locale locale;
    //     String language = parser.nextToken();
    //     if (parser.hasMoreTokens()) {
    //         String country = parser.nextToken();
    //         if (parser.hasMoreTokens())
    //             locale = new Locale(language, country, parser.nextToken());
    //         else
    //             locale = new Locale(language, country);
    //     } else
    //         locale = new Locale(language);
    //     return locale;
    // }

    // Splits a text into words using isWordDelimiter(), tracking the position of the
    // word most recently returned by nextWord().
    private class DefaultWordTokenizer implements WordTokenizer {
        private String text;
        private Locale locale;
        private String next;
        private int curPos;
        private int nextWordPos;
        private int wordPos;

        public DefaultWordTokenizer(String text, Locale loc) {
            this.text = text;
            locale = loc;
            nextWord();
        }

        public int getWordPosition() {
            return wordPos;
        }

        public String nextWord() {
            String toReturn = next;
            next = null;
            wordPos = nextWordPos;
            int wordIni = -1;
            for (; next == null && curPos < text.length(); curPos++) {
                boolean delimiter = isWordDelimiter(text.charAt(curPos), locale);
                if (wordIni == -1 && !delimiter)
                    wordIni = curPos;
                else if (wordIni >= 0 && delimiter) {
                    nextWordPos = wordIni;
                    next = text.substring(wordIni, curPos);
                }
            }
            if (next == null && wordIni != -1) {
                nextWordPos = wordIni;
                next = text.substring(wordIni);
            }
            return toReturn;
        }

        public boolean hasMoreWords() {
            return next != null;
        }
    }
}
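The listing above only defines the manager; the WordTokenizer interface and the Util.parseLocale helper live elsewhere in the prman.model package. As a rough illustration of how the class is meant to be driven, the following sketch (not part of the original project) opens every dictionary already indexed under the spellCheck directory, tokenizes a sample sentence, and prints suggestions for each word the checker does not accept. The demo class name and the "en" locale are made up for the example; in real code use a locale returned by getAvailableDictionaries().

package prman.model;

import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;

// Hypothetical demo, not part of prman. It assumes a dictionary has already been built
// under spellCheck/en with indexDictionary(), and that the WordTokenizer interface
// declares hasMoreWords() and nextWord() as implemented by DefaultWordTokenizer above.
public class SpellCheckDemo {
    public static void main(String[] args) throws IOException {
        SpellCheckManager manager = new SpellCheckManager();
        manager.setEnabled(true);
        manager.setupAll(); // open a SpellChecker for every locale directory found under spellCheck/

        Locale locale = new Locale("en"); // example locale; pick one from getAvailableDictionaries()
        WordTokenizer tokenizer = manager.getWordTokenizer("Thiss is a sentense", locale);
        while (tokenizer.hasMoreWords()) {
            String word = tokenizer.nextWord();
            if (!manager.isGood(word, locale)) {
                String[] suggestions = manager.suggestSimilar(word, locale, 5);
                System.out.println(word + " -> " + Arrays.toString(suggestions));
            }
        }
    }
}

What isGood accepts and which suggestions come back depend entirely on the word lists that were indexed beforehand, so the output of this sketch will vary with the contents of the local spellCheck directory.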