Java tutorial
/*
 * Copyright (C) 2005 Eduardo Jodas Samper
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this
 * program; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
 * Suite 330, Boston, MA 02111-1307 USA
 *
 * author e-mail: eduardj@dev.java.net
 */
package prman.model;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class SpellCheckManager {
    private static final String ENABLED_ATTR = "enabled";
    private static final String PATTERN_ATTR = "pattern";
    private static final String MATCH_CASE_ATTR = "matchCase";
    private static final String MATCHES_TAG = "matches";
    private static final String NO_MATCHES_TAG = "noMatches";
    private static final int MIN_WORD_LENGTH = 3;
    private static final String WORDS_ROOT_TAG = "wordsToIgnore";
    private static final String WORDS_TAG = "words";
    private static final String LOCALE_ATTR = "locale";
    private static final File DIR = new File("spellCheck");

    private HashMap spellCheckers;
    private boolean isEnabled;
    private Pattern[] matchList;
    private Pattern[] nomatchList;
    private HashSet wordsToIgnore;
    private HashMap wordsByLocale;

    public SpellCheckManager() {
        spellCheckers = new HashMap();
        matchList = new Pattern[0];
        nomatchList = new Pattern[0];
        wordsToIgnore = new HashSet();
        wordsByLocale = new HashMap();
    }

    public boolean isEnabled() {
        return isEnabled;
    }

    public void setEnabled(boolean enabled) {
        isEnabled = enabled;
    }

    public Pattern[] getMatchPatterns() {
        return (Pattern[]) matchList.clone();
    }

    public Locale[] getAvailableDictionaries() {
        // File dictio = new File(getClass().getClassLoader().getResource("dictio").getFile());
        Locale[] toReturn;
        if (DIR.exists() && DIR.isDirectory()) {
            File[] files = DIR.listFiles(new FileFilter() {
                public boolean accept(File pathname) {
                    return pathname.isDirectory();
                }
            });
            toReturn = new Locale[files.length];
            for (int iCnt = 0; iCnt < files.length; iCnt++) {
                String name = files[iCnt].getName();
                toReturn[iCnt] = new Locale(name);
            }
        } else {
            toReturn = new Locale[0];
        }
        return toReturn;
    }

    // Looks up a dictionary word that starts with the given fragment, preferring words
    // about three characters longer; returns null when there is no checker for the
    // locale or the fragment is already a complete word.
    public String getBestEnd(String begin, Locale loc) {
        String toReturn = null;
        SpellChecker sc = getSpellChecker(loc);
        try {
            if (sc == null || sc.exist(begin))
                return null;
            IndexSearcher searcher = new IndexSearcher(sc.getSpellIndex());
            int bestLength = begin.length() + 3;
            toReturn = getBestEnd(searcher, new WildcardQuery(new Term(SpellChecker.F_WORD, begin + "????")), bestLength);
            if (toReturn == null) {
                toReturn = getBestEnd(searcher, new WildcardQuery(new Term(SpellChecker.F_WORD, begin + "???")), bestLength);
                if (toReturn == null) {
                    toReturn = getBestEnd(searcher, new WildcardQuery(new Term(SpellChecker.F_WORD, begin + "?????")), bestLength);
                    if (toReturn == null) {
                        toReturn = getBestEnd(searcher, new PrefixQuery(new Term(SpellChecker.F_WORD, begin)), bestLength);
                    }
                }
            }
        } catch (Throwable _t) {
            Logger.getLogger(getClass().getName()).log(Level.WARNING, "Spell checker error", _t);
            toReturn = null;
        }
        return toReturn;
    }

    private String getBestEnd(IndexSearcher searcher, Query query, int bestLength) throws IOException {
        String toReturn = null;
        Hits hits = searcher.search(query);
        int length = hits.length();
        for (int iCnt = 0; (toReturn == null || toReturn.length() != bestLength) && iCnt < length; iCnt++) {
            String word = hits.doc(iCnt).get(SpellChecker.F_WORD);
            if (toReturn == null || Math.abs(toReturn.length() - bestLength) > Math.abs(word.length() - bestLength))
                toReturn = word;
            // else if (!word.startsWith(begin))
            //     System.out.print(word + ", ");
        }
        return toReturn;
    }

    public void setMatchPatterns(Pattern[] list) {
        matchList = (Pattern[]) list.clone();
    }

    public Pattern[] getNoMatchPatterns() {
        return (Pattern[]) nomatchList.clone();
    }

    public void setNoMatchPatterns(Pattern[] list) {
        nomatchList = (Pattern[]) list.clone();
    }

    // A key qualifies for spell checking when it matches every "matches" pattern and
    // none of the "noMatches" patterns.
    public boolean matchesCriteria(String key) {
        boolean toReturn = key != null;
        for (int iCnt = 0; toReturn && iCnt < matchList.length; iCnt++)
            toReturn = matchList[iCnt].matcher(key).matches();
        for (int iCnt = 0; toReturn && iCnt < nomatchList.length; iCnt++)
            toReturn = !nomatchList[iCnt].matcher(key).matches();
        return toReturn;
    }

    public boolean isWordDelimiter(char ch, Locale loc) {
        // In Catalan the middle dot of the geminated l (as in "col·legi") is part of the word.
        if (loc.getLanguage().equals("ca"))
            return ch != '·' && !Character.isLetterOrDigit(ch);
        return !Character.isLetterOrDigit(ch);
    }

    public boolean shouldIgnore(String word, Locale loc) {
        if (wordsToIgnore.contains(word.toLowerCase()))
            return true;
        HashSet byLocale = (HashSet) wordsByLocale.get(loc);
        if (byLocale != null)
            return byLocale.contains(word.toLowerCase(loc));
        return false;
    }

    // A word is accepted when it is in an ignore list, shorter than MIN_WORD_LENGTH,
    // contains a digit, is all uppercase, or exists in the dictionary for the locale.
    public boolean isGood(String word, Locale loc) throws IOException {
        if (shouldIgnore(word, loc))
            return true;
        SpellChecker sp = getSpellChecker(loc);
        int length = word.length();
        if (sp == null || length < MIN_WORD_LENGTH)
            return true;
        boolean allUppercase = true;
        for (int iCnt = 0; iCnt < word.length(); iCnt++) {
            char charz = word.charAt(iCnt);
            if (Character.isDigit(charz))
                return true;
            if (allUppercase && !Character.isUpperCase(charz))
                allUppercase = false;
        }
        if (allUppercase)
            return true;
        return sp.exist(word.toLowerCase(loc));
        // try {
        //     return sp == null || sp.exist(word);
        // } catch (IOException _ioe) {
        //     Logger.getLogger(getClass().getName()).log(Level.WARNING, "Spell checker error", _ioe);
        // }
        // return true;
    }

    public WordTokenizer getWordTokenizer(String text, Locale loc) {
        return new DefaultWordTokenizer(text, loc);
    }

    public String[] suggestSimilar(String word, Locale loc, int max) {
        SpellChecker sp = getSpellChecker(loc);
        String[] toReturn;
        try {
            if (sp != null)
                toReturn = sp.suggestSimilar(word, max);
            else
                toReturn = new String[0];
        } catch (IOException _ioe) {
            Logger.getLogger(getClass().getName()).log(Level.WARNING, "Spell checker error", _ioe);
            toReturn = new String[0];
        }
        return toReturn;
    }

    public void ignoreWord(String word) {
        wordsToIgnore.add(word.toLowerCase());
    }

    public void ignoreWord(String word, Locale loc) {
        HashSet toAdd = (HashSet) wordsByLocale.get(loc);
        if (toAdd == null) {
            toAdd = new HashSet();
            wordsByLocale.put(loc, toAdd);
        }
        toAdd.add(word.toLowerCase());
    }

    public boolean isIndexed(Locale loc) {
        File dir = new File(DIR, loc.toString());
        return dir.exists() && dir.isDirectory() && dir.list().length > 0;
    }

    // Opens the Lucene index for the given locale, if one exists, and caches the spell checker.
    public boolean setup(Locale loc) throws IOException {
        if (isIndexed(loc)) {
            File fdir = new File(DIR, loc.toString());
            FSDirectory dir = FSDirectory.getDirectory(fdir, false);
            SpellChecker sc = new SpellChecker(dir);
            spellCheckers.put(loc, sc);
            return true;
        } else {
            return false;
        }
    }

    public void setupAll() throws IOException {
        Locale[] locs = getAvailableDictionaries();
        for (int iCnt = 0; iCnt < locs.length; iCnt++)
            setup(locs[iCnt]);
    }

    // Builds the spell-check index for a locale from one or more plain-text word lists.
    public void indexDictionary(Locale loc, InputStream[] words, String encoding) throws IOException {
        // unindexDictionary(loc);
        File fdir = new File(DIR, loc.toString());
        fdir.mkdirs();
        // File[] contents = fdir.listFiles();
        // for (int iCnt = 0; iCnt < contents.length; iCnt++) {
        //     contents[iCnt].delete();
        // }
        FSDirectory dir = FSDirectory.getDirectory(fdir, false);
        SpellChecker sc = new SpellChecker(dir);
        for (int iCnt = 0; iCnt < words.length; iCnt++) {
            PlainTextDictionary dictio = new PlainTextDictionary(words[iCnt], encoding);
            sc.indexDictionnary(dictio);
        }
    }

    public void unindexDictionary(Locale locale) throws IOException {
        if (!isIndexed(locale))
            return;
        SpellChecker sc = getSpellChecker(locale);
        if (sc != null) {
            sc.clearIndex();
            Directory dir = sc.getSpellIndex();
            String[] list = dir.list();
            for (int iCnt = 0; iCnt < list.length; iCnt++)
                dir.deleteFile(list[iCnt]);
            dir.close();
            spellCheckers.remove(locale);
        }
        File fdir = new File(DIR, locale.toString());
        // fdir.mkdirs();
        if (fdir.exists() && fdir.isDirectory()) {
            File[] contents = fdir.listFiles();
            for (int iCnt = 0; iCnt < contents.length; iCnt++) {
                contents[iCnt].delete();
            }
            fdir.delete();
        }
    }

    public void tearAllDown() throws IOException {
        for (Iterator it = spellCheckers.keySet().iterator(); it.hasNext();) {
            Object key = it.next();
            ((SpellChecker) spellCheckers.get(key)).clearIndex();
        }
    }

    // Reads the XML ignore list; <words> elements without a locale attribute feed the
    // global list, the rest feed the per-locale lists.
    public void loadWordsToIgnore(InputStream in) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in);
            NodeList list = doc.getElementsByTagName(WORDS_TAG);
            int length = list.getLength();
            for (int iCnt = 0; iCnt < length; iCnt++) {
                Element elem = (Element) list.item(iCnt);
                HashSet toAdd = wordsToIgnore;
                String localeStr = elem.getAttribute(LOCALE_ATTR);
                if (localeStr != null && localeStr.trim().length() > 0) {
                    Locale locale = Util.parseLocale(localeStr);
                    toAdd = (HashSet) wordsByLocale.get(locale);
                    if (toAdd == null) {
                        toAdd = new HashSet();
                        wordsByLocale.put(locale, toAdd);
                    }
                }
                NodeList childs = elem.getChildNodes();
                int clength = childs.getLength();
                for (int jCnt = 0; jCnt < clength; jCnt++) {
                    Node child = childs.item(jCnt);
                    if (child.getNodeType() == Node.TEXT_NODE) {
                        StringTokenizer parser = new StringTokenizer(child.getNodeValue());
                        while (parser.hasMoreTokens()) {
                            toAdd.add(parser.nextToken().toLowerCase());
                        }
                    }
                }
            }
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    public void saveWordsToIgnore(OutputStream out) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            Element root = doc.createElement(WORDS_ROOT_TAG);
            doc.appendChild(root);
            root.appendChild(doc.createTextNode("\n"));
            int size = wordsToIgnore.size();
            if (size > 0) {
                Element words = doc.createElement(WORDS_TAG);
                root.appendChild(words);
                StringBuffer strBuf = new StringBuffer();
                ArrayList list = new ArrayList(wordsToIgnore);
                Collections.sort(list);
                for (int iCnt = 0; iCnt < size; iCnt++) {
                    strBuf.append("\n");
                    strBuf.append(list.get(iCnt));
                    // words.setAttribute(LOCALE_ATTR, matchList[iCnt].pattern());
                }
                strBuf.append("\n");
                words.appendChild(doc.createTextNode(strBuf.toString()));
            }
            for (Iterator it = wordsByLocale.keySet().iterator(); it.hasNext();) {
                Locale locale = (Locale) it.next();
                HashSet wordSet = (HashSet) wordsByLocale.get(locale);
                size = wordSet.size();
                if (size > 0) {
                    Element words = doc.createElement(WORDS_TAG);
                    words.setAttribute(LOCALE_ATTR, locale.toString());
                    root.appendChild(words);
                    StringBuffer strBuf = new StringBuffer();
                    ArrayList list = new ArrayList(wordSet);
                    Collections.sort(list);
                    for (int iCnt = 0; iCnt < size; iCnt++) {
                        strBuf.append("\n");
                        strBuf.append(list.get(iCnt));
                    }
                    strBuf.append("\n");
                    words.appendChild(doc.createTextNode(strBuf.toString()));
                }
            }
            TransformerFactory.newInstance().newTransformer().transform(new DOMSource(doc), new StreamResult(out));
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    public void load(InputStream in) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in);
            String enabled = doc.getDocumentElement().getAttribute(ENABLED_ATTR);
            isEnabled = Boolean.toString(true).equalsIgnoreCase(enabled);
            NodeList maLt = doc.getElementsByTagName(MATCHES_TAG);
            Pattern[] pmatches = new Pattern[maLt.getLength()];
            for (int iCnt = 0; iCnt < pmatches.length; iCnt++) {
                Element pe = (Element) maLt.item(iCnt);
                pmatches[iCnt] = Pattern.compile(pe.getAttribute(PATTERN_ATTR),
                        Boolean.toString(true).equalsIgnoreCase(pe.getAttribute(MATCH_CASE_ATTR))
                                ? 0 : Pattern.CASE_INSENSITIVE);
            }
            matchList = pmatches;
            NodeList nomaLt = doc.getElementsByTagName(NO_MATCHES_TAG);
            Pattern[] pnmatches = new Pattern[nomaLt.getLength()];
            for (int iCnt = 0; iCnt < pnmatches.length; iCnt++) {
                Element pe = (Element) nomaLt.item(iCnt);
                pnmatches[iCnt] = Pattern.compile(pe.getAttribute(PATTERN_ATTR),
                        Boolean.toString(true).equalsIgnoreCase(pe.getAttribute(MATCH_CASE_ATTR))
                                ? 0 : Pattern.CASE_INSENSITIVE);
            }
            nomatchList = pnmatches;
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    public void save(OutputStream out) throws IOException {
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            Element root = doc.createElement("spellChecker");
            root.setAttribute(ENABLED_ATTR, Boolean.toString(isEnabled()));
            doc.appendChild(root);
            root.appendChild(doc.createTextNode("\n"));
            for (int iCnt = 0; iCnt < matchList.length; iCnt++) {
                root.appendChild(doc.createTextNode("\t"));
                Element match = doc.createElement(MATCHES_TAG);
                match.setAttribute(PATTERN_ATTR, matchList[iCnt].pattern());
                match.setAttribute(MATCH_CASE_ATTR, Boolean.toString((matchList[iCnt].flags() & Pattern.CASE_INSENSITIVE) == 0));
                root.appendChild(match);
                root.appendChild(doc.createTextNode("\n"));
            }
            for (int iCnt = 0; iCnt < nomatchList.length; iCnt++) {
                root.appendChild(doc.createTextNode("\t"));
                Element match = doc.createElement(NO_MATCHES_TAG);
                match.setAttribute(PATTERN_ATTR, nomatchList[iCnt].pattern());
                match.setAttribute(MATCH_CASE_ATTR, Boolean.toString((nomatchList[iCnt].flags() & Pattern.CASE_INSENSITIVE) == 0));
                root.appendChild(match);
                root.appendChild(doc.createTextNode("\n"));
            }
            TransformerFactory.newInstance().newTransformer().transform(new DOMSource(doc), new StreamResult(out));
        } catch (Exception _e) {
            if (_e instanceof IOException)
                throw (IOException) _e;
            else {
                IOException _ioe = new IOException();
                _ioe.initCause(_e);
                throw _ioe;
            }
        }
    }

    // Looks up the spell checker for a locale, falling back from language_country_variant
    // to language_country and finally to language only.
    private SpellChecker getSpellChecker(Locale loc) {
        SpellChecker toReturn = (SpellChecker) spellCheckers.get(loc);
        if (toReturn == null && loc.getVariant() != null && loc.getVariant().trim().length() > 0)
            toReturn = (SpellChecker) spellCheckers.get(new Locale(loc.getLanguage(), loc.getCountry()));
        if (toReturn == null && loc.getCountry() != null && loc.getCountry().trim().length() > 0)
            toReturn = (SpellChecker) spellCheckers.get(new Locale(loc.getLanguage()));
        return toReturn;
    }

    // private static Locale parseLocale(String localeName) {
    //     StringTokenizer parser = new StringTokenizer(localeName, "_");
    //     Locale locale;
    //     String language = parser.nextToken();
    //     if (parser.hasMoreTokens()) {
    //         String country = parser.nextToken();
    //         if (parser.hasMoreTokens())
    //             locale = new Locale(language, country, parser.nextToken());
    //         else
    //             locale = new Locale(language, country);
    //     } else
    //         locale = new Locale(language);
    //     return locale;
    // }

    // Splits a text into words using isWordDelimiter(), tracking the position of the
    // word most recently returned by nextWord().
    private class DefaultWordTokenizer implements WordTokenizer {
        private String text;
        private Locale locale;
        private String next;
        private int curPos;
        private int nextWordPos;
        private int wordPos;

        public DefaultWordTokenizer(String text, Locale loc) {
            this.text = text;
            locale = loc;
            nextWord();
        }

        public int getWordPosition() {
            return wordPos;
        }

        public String nextWord() {
            String toReturn = next;
            next = null;
            wordPos = nextWordPos;
            int wordIni = -1;
            for (; next == null && curPos < text.length(); curPos++) {
                boolean delimiter = isWordDelimiter(text.charAt(curPos), locale);
                if (wordIni == -1 && !delimiter)
                    wordIni = curPos;
                else if (wordIni >= 0 && delimiter) {
                    nextWordPos = wordIni;
                    next = text.substring(wordIni, curPos);
                }
            }
            if (next == null && wordIni != -1) {
                nextWordPos = wordIni;
                next = text.substring(wordIni);
            }
            return toReturn;
        }

        public boolean hasMoreWords() {
            return next != null;
        }
    }
}
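The listing above only defines the manager; the WordTokenizer interface and the Util.parseLocale helper live elsewhere in the prman.model package. As a rough illustration of how the class is meant to be driven, the following sketch (not part of the original project) opens every dictionary already indexed under the spellCheck directory, tokenizes a sample sentence, and prints suggestions for each word the checker does not accept. The demo class name and the "en" locale are made up for the example; in real code use a locale returned by getAvailableDictionaries().

package prman.model;

import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;

// Hypothetical demo, not part of prman. It assumes a dictionary has already been built
// under spellCheck/en with indexDictionary(), and that the WordTokenizer interface
// declares hasMoreWords() and nextWord() as implemented by DefaultWordTokenizer above.
public class SpellCheckDemo {
    public static void main(String[] args) throws IOException {
        SpellCheckManager manager = new SpellCheckManager();
        manager.setEnabled(true);
        manager.setupAll(); // open a SpellChecker for every locale directory found under spellCheck/

        Locale locale = new Locale("en"); // example locale; pick one from getAvailableDictionaries()
        WordTokenizer tokenizer = manager.getWordTokenizer("Thiss is a sentense", locale);
        while (tokenizer.hasMoreWords()) {
            String word = tokenizer.nextWord();
            if (!manager.isGood(word, locale)) {
                String[] suggestions = manager.suggestSimilar(word, locale, 5);
                System.out.println(word + " -> " + Arrays.toString(suggestions));
            }
        }
    }
}

What isGood accepts and which suggestions come back depend entirely on the word lists that were indexed beforehand, so the output of this sketch will vary with the contents of the local spellCheck directory.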