List of usage examples for the org.apache.lucene.analysis.cz.CzechAnalyzer constructor
public CzechAnalyzer(CharArraySet stopwords)
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>/*from w w w . j ava 2s. c o m*/ * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }
From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java
License:Apache License
/**
 * Registers one analyzer per supported non-English language in
 * {@code languageMap}, keyed by a two-letter code. Every analyzer is
 * constructed with {@code LuceneConstants.version}.
 *
 * @param conf the configuration handed to this step; it is not read here
 * @throws ApplicationFault declared by the signature; nothing in this body
 *         throws it explicitly
 * @throws SystemFault declared by the signature; nothing in this body
 *         throws it explicitly
 */
public void init(Configuration conf) throws ApplicationFault, SystemFault {
    // NOTE(review): "br" and "cz" do not look like ISO 639-1 language codes
    // ("pt" / "cs" would be) -- confirm what callers look up before changing.
    languageMap.put("br", new BrazilianAnalyzer(LuceneConstants.version));
    languageMap.put("cz", new CzechAnalyzer(LuceneConstants.version));

    languageMap.put("nl", new DutchAnalyzer(LuceneConstants.version));
    languageMap.put("fr", new FrenchAnalyzer(LuceneConstants.version));
    languageMap.put("de", new GermanAnalyzer(LuceneConstants.version));

    languageMap.put("el", new GreekAnalyzer(LuceneConstants.version));
    languageMap.put("ru", new RussianAnalyzer(LuceneConstants.version));
    languageMap.put("th", new ThaiAnalyzer(LuceneConstants.version));
}
From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java
License:Open Source License
public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) { this.configuration = configuration; languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true)); languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("en", new AnalyzerWrapper( new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true)); languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true)); }
From source file:org.compass.core.lucene.engine.analyzer.ExtendedAnalyzerBuilderDelegate.java
License:Apache License
public Analyzer buildAnalyzer(String analyzerName, CompassSettings settings, DefaultLuceneAnalyzerFactory analyzerFactory) throws SearchEngineException { String analyzerSetting = settings.getSetting(LuceneEnvironment.Analyzer.TYPE, LuceneEnvironment.Analyzer.CoreTypes.STANDARD); Analyzer analyzer = null;// w w w . java 2s. co m if (LuceneEnvironment.Analyzer.ExtendedTypes.BRAZILIAN.equalsIgnoreCase(analyzerSetting)) { analyzer = new BrazilianAnalyzer( analyzerFactory.parseStopWords(analyzerName, settings, BrazilianAnalyzer.BRAZILIAN_STOP_WORDS)); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.CJK.equalsIgnoreCase(analyzerSetting)) { analyzer = new CJKAnalyzer( analyzerFactory.parseStopWords(analyzerName, settings, CJKAnalyzer.STOP_WORDS)); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.CHINESE.equalsIgnoreCase(analyzerSetting)) { analyzer = new ChineseAnalyzer(); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.CZECH.equalsIgnoreCase(analyzerSetting)) { analyzer = new CzechAnalyzer( analyzerFactory.parseStopWords(analyzerName, settings, CzechAnalyzer.CZECH_STOP_WORDS)); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.GERMAN.equalsIgnoreCase(analyzerSetting)) { analyzer = new GermanAnalyzer( analyzerFactory.parseStopWords(analyzerName, settings, GermanAnalyzer.GERMAN_STOP_WORDS)); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.GREEK.equalsIgnoreCase(analyzerSetting)) { analyzer = new GreekAnalyzer(); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.FRENCH.equalsIgnoreCase(analyzerSetting)) { analyzer = new FrenchAnalyzer( analyzerFactory.parseStopWords(analyzerName, settings, FrenchAnalyzer.FRENCH_STOP_WORDS)); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.DUTCH.equalsIgnoreCase(analyzerSetting)) { analyzer = new DutchAnalyzer( analyzerFactory.parseStopWords(analyzerName, settings, DutchAnalyzer.DUTCH_STOP_WORDS)); } else if (LuceneEnvironment.Analyzer.ExtendedTypes.RUSSIAN.equalsIgnoreCase(analyzerSetting)) { analyzer = new 
RussianAnalyzer(); } return analyzer; }
From source file:org.eclipse.help.internal.search.AnalyzerFactory.java
License:Open Source License
public Analyzer create() { if (locale == null) return null; Version version = Version.LUCENE_35; if ("pt".equals(locale)) //$NON-NLS-1$ return new BrazilianAnalyzer(version); if ("ja".equals(locale)) //$NON-NLS-1$ return new CJKAnalyzer(version); if ("ko".equals(locale)) //$NON-NLS-1$ return new CJKAnalyzer(version); if ("pt".equals(locale)) //$NON-NLS-1$ return new BrazilianAnalyzer(version); if ("cs".equals(locale)) //$NON-NLS-1$ return new CzechAnalyzer(version); if ("de".equals(locale)) //$NON-NLS-1$ return new GermanAnalyzer(version); if ("el".equals(locale)) //$NON-NLS-1$ return new GreekAnalyzer(version); if ("fr".equals(locale)) //$NON-NLS-1$ return new FrenchAnalyzer(version); if ("nl".equals(locale)) //$NON-NLS-1$ return new DutchAnalyzer(version); if ("ru".equals(locale)) //$NON-NLS-1$ return new RussianAnalyzer(version); //unknown language return null;/*from w ww . j a va 2s .c o m*/ }
From source file:perLucene.Server.java
License:Open Source License
/**
 * Replaces the static {@code ha} map with a fresh {@link HashMap} and
 * registers one analyzer per language key, each built against
 * {@code Version.LUCENE_41}.
 */
private static void initAnalyzers() {
    // Single Lucene compatibility version shared by every analyzer below.
    final Version matchVersion = Version.LUCENE_41;

    ha = new HashMap<String, Analyzer>();

    ha.put("ar", new ArabicAnalyzer(matchVersion));
    ha.put("el", new GreekAnalyzer(matchVersion));
    ha.put("bg", new BulgarianAnalyzer(matchVersion));
    // "br" maps to the Brazilian analyzer; "pt" further down maps to the Portuguese one.
    ha.put("br", new BrazilianAnalyzer(matchVersion));
    ha.put("ca", new CatalanAnalyzer(matchVersion));
    ha.put("cz", new CzechAnalyzer(matchVersion));
    ha.put("da", new DanishAnalyzer(matchVersion));
    ha.put("de", new GermanAnalyzer(matchVersion));
    ha.put("en", new EnglishAnalyzer(matchVersion));
    ha.put("es", new SpanishAnalyzer(matchVersion));
    ha.put("eu", new BasqueAnalyzer(matchVersion));
    ha.put("fa", new PersianAnalyzer(matchVersion));
    ha.put("fi", new FinnishAnalyzer(matchVersion));
    ha.put("fr", new FrenchAnalyzer(matchVersion));
    ha.put("ga", new IrishAnalyzer(matchVersion));
    ha.put("gl", new GalicianAnalyzer(matchVersion));
    ha.put("hi", new HindiAnalyzer(matchVersion));
    ha.put("hu", new HungarianAnalyzer(matchVersion));
    ha.put("hy", new ArmenianAnalyzer(matchVersion));
    ha.put("id", new IndonesianAnalyzer(matchVersion));
    ha.put("it", new ItalianAnalyzer(matchVersion));
    ha.put("lv", new LatvianAnalyzer(matchVersion));
    ha.put("nl", new DutchAnalyzer(matchVersion));
    ha.put("no", new NorwegianAnalyzer(matchVersion));
    ha.put("pt", new PortugueseAnalyzer(matchVersion));
    ha.put("ro", new RomanianAnalyzer(matchVersion));
    ha.put("ru", new RussianAnalyzer(matchVersion));
    ha.put("sv", new SwedishAnalyzer(matchVersion));
    ha.put("th", new ThaiAnalyzer(matchVersion));
    ha.put("tr", new TurkishAnalyzer(matchVersion));
    ha.put("cn", new SmartChineseAnalyzer(matchVersion));
}