List of usage examples for the constructor of org.apache.lucene.analysis.en.EnglishMinimalStemFilter
public EnglishMinimalStemFilter(TokenStream input)
From source file:org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new ClassicTokenizer(); TokenStream filter = new LowerCaseFilter(source); if (stopSet != null) { filter = new StopFilter(filter, stopSet); }//www . j av a2s. co m switch (stemFilterType) { case PORTERSTEM_FILTER: filter = new PorterStemFilter(filter); break; case ENGLISHMINIMALSTEM_FILTER: filter = new EnglishMinimalStemFilter(filter); break; default: break; } return new TokenStreamComponents(source, filter); }
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
private TokenStream applyStemmer(StemFilterType stemFilterType) { switch (stemFilterType) { case ENGLISHMINIMALSTEM_FILTER: tokenStream = new EnglishMinimalStemFilter(tokenStream); break;//ww w .j a va 2 s .c o m case PORTERSTEM_FILTER: tokenStream = new PorterStemFilter(tokenStream); break; default: break; } return tokenStream; }
From source file:org.apache.solr.analysis.EnglishMinimalStemFilterFactory.java
License:Apache License
/**
 * Wraps the given stream in an {@link EnglishMinimalStemFilter}.
 *
 * @param input the token stream to wrap
 * @return a new filter reading from {@code input}
 */
public TokenStream create(TokenStream input) {
  final TokenStream stemmed = new EnglishMinimalStemFilter(input);
  return stemmed;
}
From source file:org.edits.LuceneTokenizer.java
License:Open Source License
/**
 * Tokenizes the given text with an {@link EnglishAnalyzer} (no stop words) followed by an
 * {@link EnglishMinimalStemFilter}, and returns one annotation per emitted token.
 *
 * <p>For each token, the annotation's form is the substring of the formatted text covered by
 * the token's offsets, and its lemma is the term produced by the filter chain.
 *
 * <p>The stream is consumed following the Lucene workflow ({@code reset()}, repeated
 * {@code incrementToken()}, {@code end()}, {@code close()}); both the stream and the analyzer
 * are closed even when tokenization fails.
 *
 * @param text the raw input text; it is first normalized through {@code SimpleTokenizer.format}
 * @return the annotations in token order; empty when no token was produced
 * @throws Exception if analysis of the text fails
 */
@Override
public List<Annotation> annotate(String text) throws Exception {
    text = SimpleTokenizer.format(text);
    List<Annotation> out = Lists.newArrayList();

    Analyzer analyser = new EnglishAnalyzer(Version.LUCENE_47, CharArraySet.EMPTY_SET);
    try {
        TokenFilter filter = new EnglishMinimalStemFilter(analyser.tokenStream("text", new StringReader(text)));
        try {
            // Attribute instances are stable for the lifetime of the stream, so fetch them once.
            CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsets = filter.getAttribute(OffsetAttribute.class);

            // reset() is mandatory before the first incrementToken() call.
            filter.reset();
            while (filter.incrementToken()) {
                Annotation annotation = new Annotation();
                annotation.setForm(text.substring(offsets.startOffset(), offsets.endOffset()));
                annotation.setLemma(term.toString());
                out.add(annotation);
            }
            filter.end();
        } finally {
            filter.close();
        }
    } finally {
        analyser.close();
    }

    if (out.isEmpty()) {
        log.debug("Input string is empty");
    }
    return out;
}
From source file:org.elasticsearch.analysis.common.StemmerTokenFilterFactory.java
License:Apache License
@Override public TokenStream create(TokenStream tokenStream) { final Version indexVersion = indexSettings.getIndexVersionCreated(); if ("arabic".equalsIgnoreCase(language)) { return new ArabicStemFilter(tokenStream); } else if ("armenian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new ArmenianStemmer()); } else if ("basque".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new BasqueStemmer()); } else if ("bengali".equalsIgnoreCase(language)) { return new BengaliStemFilter(tokenStream); } else if ("brazilian".equalsIgnoreCase(language)) { return new BrazilianStemFilter(tokenStream); } else if ("bulgarian".equalsIgnoreCase(language)) { return new BulgarianStemFilter(tokenStream); } else if ("catalan".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new CatalanStemmer()); } else if ("czech".equalsIgnoreCase(language)) { return new CzechStemFilter(tokenStream); } else if ("danish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new DanishStemmer()); // Dutch stemmers } else if ("dutch".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new DutchStemmer()); } else if ("dutch_kp".equalsIgnoreCase(language) || "dutchKp".equalsIgnoreCase(language) || "kp".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new KpStemmer()); // English stemmers } else if ("english".equalsIgnoreCase(language)) { return new PorterStemFilter(tokenStream); } else if ("light_english".equalsIgnoreCase(language) || "lightEnglish".equalsIgnoreCase(language) || "kstem".equalsIgnoreCase(language)) { return new KStemFilter(tokenStream); } else if ("lovins".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new LovinsStemmer()); } else if ("porter".equalsIgnoreCase(language)) { return new PorterStemFilter(tokenStream); } else if ("porter2".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new EnglishStemmer()); } else if 
("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) { return new EnglishMinimalStemFilter(tokenStream); } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) { return new EnglishPossessiveFilter(tokenStream); // Finnish stemmers } else if ("finnish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new FinnishStemmer()); } else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) { // leaving this for backward compatibility return new FinnishLightStemFilter(tokenStream); } else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) { return new FinnishLightStemFilter(tokenStream); // French stemmers } else if ("french".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new FrenchStemmer()); } else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) { return new FrenchLightStemFilter(tokenStream); } else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) { return new FrenchMinimalStemFilter(tokenStream); // Galician stemmers } else if ("galician".equalsIgnoreCase(language)) { return new GalicianStemFilter(tokenStream); } else if ("minimal_galician".equalsIgnoreCase(language)) { return new GalicianMinimalStemFilter(tokenStream); // German stemmers } else if ("german".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new GermanStemmer()); } else if ("german2".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new German2Stemmer()); } else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) { return new GermanLightStemFilter(tokenStream); } else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) { return new GermanMinimalStemFilter(tokenStream); } else if 
("greek".equalsIgnoreCase(language)) { return new GreekStemFilter(tokenStream); } else if ("hindi".equalsIgnoreCase(language)) { return new HindiStemFilter(tokenStream); // Hungarian stemmers } else if ("hungarian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new HungarianStemmer()); } else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) { return new HungarianLightStemFilter(tokenStream); } else if ("indonesian".equalsIgnoreCase(language)) { return new IndonesianStemFilter(tokenStream); // Irish stemmer } else if ("irish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new IrishStemmer()); // Italian stemmers } else if ("italian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new ItalianStemmer()); } else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) { return new ItalianLightStemFilter(tokenStream); } else if ("latvian".equalsIgnoreCase(language)) { return new LatvianStemFilter(tokenStream); } else if ("lithuanian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new LithuanianStemmer()); // Norwegian (Bokml) stemmers } else if ("norwegian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new NorwegianStemmer()); } else if ("light_norwegian".equalsIgnoreCase(language) || "lightNorwegian".equalsIgnoreCase(language)) { return new NorwegianLightStemFilter(tokenStream); } else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) { return new NorwegianMinimalStemFilter(tokenStream); // Norwegian (Nynorsk) stemmers } else if ("light_nynorsk".equalsIgnoreCase(language) || "lightNynorsk".equalsIgnoreCase(language)) { return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK); } else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) { return new 
NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK); // Portuguese stemmers } else if ("portuguese".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new PortugueseStemmer()); } else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) { return new PortugueseLightStemFilter(tokenStream); } else if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) { return new PortugueseMinimalStemFilter(tokenStream); } else if ("portuguese_rslp".equalsIgnoreCase(language)) { return new PortugueseStemFilter(tokenStream); } else if ("romanian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new RomanianStemmer()); // Russian stemmers } else if ("russian".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new RussianStemmer()); } else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) { return new RussianLightStemFilter(tokenStream); // Spanish stemmers } else if ("spanish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new SpanishStemmer()); } else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) { return new SpanishLightStemFilter(tokenStream); // Sorani Kurdish stemmer } else if ("sorani".equalsIgnoreCase(language)) { return new SoraniStemFilter(tokenStream); // Swedish stemmers } else if ("swedish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new SwedishStemmer()); } else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) { return new SwedishLightStemFilter(tokenStream); } else if ("turkish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new TurkishStemmer()); }//w ww . ja v a2 s . c o m return new SnowballFilter(tokenStream, language); }
From source file:org.elasticsearch.index.analysis.StemmerTokenFilterFactory.java
License:Apache License
/**
 * Wraps the given token stream in the stemmer configured through {@code language}.
 *
 * <p>The configured name is compared case-insensitively against the known stemmer names and
 * their aliases. A name that matches none of them is passed to
 * {@link SnowballFilter#SnowballFilter(TokenStream, String)} as-is.
 *
 * <p>The {@link PortugueseStemFilter} branch is keyed on {@code "portuguese_rslp"}. It was
 * previously keyed on {@code "portuguese"}, which made it unreachable because the Snowball
 * Portuguese branch earlier in the chain always matched first.
 *
 * @param tokenStream the stream to stem
 * @return a stemming filter reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if ("arabic".equalsIgnoreCase(language)) {
        return new ArabicStemFilter(tokenStream);
    } else if ("armenian".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new ArmenianStemmer());
    } else if ("basque".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new BasqueStemmer());
    } else if ("brazilian".equalsIgnoreCase(language)) {
        return new BrazilianStemFilter(tokenStream);
    } else if ("bulgarian".equalsIgnoreCase(language)) {
        return new BulgarianStemFilter(tokenStream);
    } else if ("catalan".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new CatalanStemmer());
    } else if ("czech".equalsIgnoreCase(language)) {
        return new CzechStemFilter(tokenStream);
    } else if ("danish".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new DanishStemmer());
    } else if ("dutch".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new DutchStemmer());
    } else if ("english".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new EnglishStemmer());
    } else if ("finnish".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new FinnishStemmer());
    } else if ("french".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new FrenchStemmer());
    } else if ("german".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new GermanStemmer());
    } else if ("german2".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new German2Stemmer());
    } else if ("hungarian".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new HungarianStemmer());
    } else if ("italian".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new ItalianStemmer());
    } else if ("kp".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new KpStemmer());
    } else if ("kstem".equalsIgnoreCase(language)) {
        return new KStemFilter(tokenStream);
    } else if ("lovins".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new LovinsStemmer());
    } else if ("latvian".equalsIgnoreCase(language)) {
        return new LatvianStemFilter(tokenStream);
    } else if ("norwegian".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new NorwegianStemmer());
    } else if ("minimal_norwegian".equalsIgnoreCase(language)
            || "minimalNorwegian".equalsIgnoreCase(language)) {
        // Case-insensitive like every other alias in this chain.
        return new NorwegianMinimalStemFilter(tokenStream);
    } else if ("porter".equalsIgnoreCase(language)) {
        return new PorterStemFilter(tokenStream);
    } else if ("porter2".equalsIgnoreCase(language)) {
        // NOTE(review): "porter2" is mapped to the Snowball PorterStemmer, not EnglishStemmer.
        // Left unchanged because switching stemmers would alter the tokens produced; confirm
        // whether this mapping is intentional.
        return new SnowballFilter(tokenStream, new PorterStemmer());
    } else if ("portuguese".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new PortugueseStemmer());
    } else if ("romanian".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new RomanianStemmer());
    } else if ("russian".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new RussianStemmer());
    } else if ("spanish".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new SpanishStemmer());
    } else if ("swedish".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new SwedishStemmer());
    } else if ("turkish".equalsIgnoreCase(language)) {
        return new SnowballFilter(tokenStream, new TurkishStemmer());
    } else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
        return new EnglishMinimalStemFilter(tokenStream);
    } else if ("possessive_english".equalsIgnoreCase(language)
            || "possessiveEnglish".equalsIgnoreCase(language)) {
        return new EnglishPossessiveFilter(version, tokenStream);
    } else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) {
        // leaving this (misspelled) alias for backward compatibility
        return new FinnishLightStemFilter(tokenStream);
    } else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) {
        return new FinnishLightStemFilter(tokenStream);
    } else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) {
        return new FrenchLightStemFilter(tokenStream);
    } else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) {
        return new FrenchMinimalStemFilter(tokenStream);
    } else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
        return new GermanLightStemFilter(tokenStream);
    } else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) {
        return new GermanMinimalStemFilter(tokenStream);
    } else if ("hindi".equalsIgnoreCase(language)) {
        return new HindiStemFilter(tokenStream);
    } else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) {
        return new HungarianLightStemFilter(tokenStream);
    } else if ("indonesian".equalsIgnoreCase(language)) {
        return new IndonesianStemFilter(tokenStream);
    } else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) {
        return new ItalianLightStemFilter(tokenStream);
    } else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) {
        return new PortugueseLightStemFilter(tokenStream);
    } else if ("minimal_portuguese".equalsIgnoreCase(language)
            || "minimalPortuguese".equalsIgnoreCase(language)) {
        return new PortugueseMinimalStemFilter(tokenStream);
    } else if ("portuguese_rslp".equalsIgnoreCase(language)) {
        // Was keyed on "portuguese", which the Snowball branch above always shadowed.
        return new PortugueseStemFilter(tokenStream);
    } else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
        return new RussianLightStemFilter(tokenStream);
    } else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) {
        return new SpanishLightStemFilter(tokenStream);
    } else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) {
        return new SwedishLightStemFilter(tokenStream);
    } else if ("greek".equalsIgnoreCase(language)) {
        return new GreekStemFilter(tokenStream);
    }

    // Unknown name: let SnowballFilter resolve it as a Snowball stemmer name.
    return new SnowballFilter(tokenStream, language);
}
From source file:org.splevo.vpm.analyzer.semantic.lucene.Stemming.java
License:Open Source License
/**
 * Decorates a token stream with the stem filter that corresponds to the chosen stemming option.
 *
 * @param stream
 *            The token stream to decorate.
 * @param stemming
 *            The stemming option to apply.
 * @return The decorated stream, or {@code stream} itself for {@code NONE} and any option not
 *         handled here.
 */
public static TokenStream wrapStemmingFilter(TokenStream stream, Stemming stemming) {
    final TokenStream wrapped;
    switch (stemming) {
    case PLING:
        wrapped = new PlingStemmingFilter(stream);
        break;
    case SSTEMMER:
        wrapped = new EnglishMinimalStemFilter(stream);
        break;
    case KSTEM:
        wrapped = new KStemFilter(stream);
        break;
    case PORTER:
        wrapped = new PorterStemFilter(stream);
        break;
    case SNOWBALL_PORTER:
        wrapped = new SnowballFilter(stream, new EnglishStemmer());
        break;
    case NONE:
    default:
        wrapped = stream;
        break;
    }
    return wrapped;
}