List of usage examples for the org.apache.lucene.analysis.en.KStemFilter constructor
public KStemFilter(TokenStream in)
From source file:EnglishAnalyzerConfigurable.java
License:Apache License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * /*from w ww .j a v a 2 s. com*/ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided * and {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. result = new EnglishPossessiveFilter(matchVersion, result); if (this.doLowerCase) result = new LowerCaseFilter(matchVersion, result); if (this.doStopwordRemoval) result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); if (this.stemmer == StemmerType.PORTER) result = new PorterStemFilter(result); else if (this.stemmer == StemmerType.KSTEM) result = new KStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:analyzers.FormalAnalyzer.java
License:Apache License
/** * Define how tokens are processed.//w ww.ja va2 s . c o m * * @param fieldName required input * @param reader reader for document */ @Override protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) { Tokenizer tokenizer = new StandardTokenizer(reader); TokenStream chain = tokenizer; if (!tokenOpts.disableAllFilters) { // the chain of token filters... chain = new StandardFilter(chain); // discard tokens based on their type attribute chain = new StandardTagFilter(chain, tokenOpts); // convert tokens to lowercase chain = new LowerCaseFilter(chain); // replace accented chars with non-accented ASCII equivalents chain = new ASCIIFoldingFilter(chain); // remove stop words (must come after lowercasing) chain = new StopFilter(chain, stopWordSet); // remove 's chain = new EnglishPossessiveFilter(Version.LATEST, chain); // spelling correction if (!spellingHashtable.isEmpty()) chain = new SpellingCorrectionFilter(chain, spellingHashtable); if (!tokenOpts.disableStemming) { // Krovets stemmer (smarter than the Porter stemmer) chain = new KStemFilter(chain); } } return new Analyzer.TokenStreamComponents(tokenizer, chain); }
From source file:com.nutrisystem.orange.java.text.EnglishTextAnalyzer.java
License:Apache License
/** * Creates a/*from www . java 2 s . c om*/ * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which * tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link KeywordMarkerFilter} if a stem exclusion set is provided * and {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new ClassicTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for // us. result = new EnglishPossessiveFilter(matchVersion, result); result = new ASCIIFoldingFilter(result); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); result = new NumberFilter(result); result = new KStemFilter(result); result = new PorterStemFilter(result); //result = new ShingleFilter(result, 3); return new TokenStreamComponents(source, result); }
From source file:crawler.util.analyzer.TwitterAnalyzer.java
License:Apache License
public TokenStream tokenStream(String fileName, Reader reader) { TokenStream toks = new TwitterLetterDigitTokenizer(Version.LUCENE_30, reader); //keep @ # url and all letter&digit toks = new LengthFilter(true, toks, 3, 255); //token(word) length longer than 3 toks = new LowerCaseFilter(Version.LUCENE_30, toks); //to lower case toks = new StopFilter(Version.LUCENE_30, toks, stopWordList); //stop word reduce toks = new KStemFilter(toks);//stem remianing words toks = new TwitterAttri(toks); //idetify url mention hash and pounctuation toks = new TwitterFilter(true, toks); //elimanate the above types return toks;/*w w w .j a v a2s . co m*/ }
From source file:edu.cmu.lti.f13.hw4.hw4_dateng.EnglishAnalyzerConfigurable.java
License:Apache License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * /*from w w w. j a v a 2 s . co m*/ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided * and {@link PorterStemFilter}. */ @Override public TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. result = new EnglishPossessiveFilter(matchVersion, result); if (this.doLowerCase) result = new LowerCaseFilter(matchVersion, result); if (this.doStopwordRemoval) result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); if (this.stemmer == StemmerType.PORTER) result = new PorterStemFilter(result); else if (this.stemmer == StemmerType.KSTEM) result = new KStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:ie.cmrc.smtx.lucene.analysis.EnglishKeywordAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer source = new StandardTokenizer(reader); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, EnglishAnalyzer.getDefaultStopSet()); filter = new KStemFilter(filter); //filter = new PorterStemFilter(filter); filter = new ASCIIFoldingFilter(filter); filter = new ConcatFilter(filter); return new Analyzer.TokenStreamComponents(source, filter); }
From source file:ie.cmrc.smtx.lucene.analysis.EuropeanAnalyzer.java
License:Apache License
/** * Returns a minimal/light stemming filter suitable for the provided language * @param language Two-letter code of a language * @param input {@code org.apache.lucene.analysis.TokenStream} input to * filter/*from w ww . j a v a 2 s .c o m*/ * @return {@code org.apache.lucene.analysis.TokenStream} that filters the * provided {@code input} */ protected TokenStream getMinimalStemFilter(String language, TokenStream input) { String lang = language; if (lang != null) lang = lang.trim().toLowerCase(); if (SUPPORTED_LANGUAGES.contains(lang)) { if (lang.equals(LANG_EN)) { return new KStemFilter(input); } else if (lang.equals(LANG_FR)) { return new FrenchMinimalStemFilter(input); } else if (lang.equals(LANG_ES)) { return new SpanishLightStemFilter(input); } else if (lang.equals(LANG_PT)) { return new PortugueseMinimalStemFilter(input); } else if (lang.equals(LANG_IT)) { return new ItalianLightStemFilter(input); } else if (lang.equals(LANG_DE)) { return new GermanMinimalStemFilter(input); } else if (lang.equals(LANG_NO)) { return new NorwegianMinimalStemFilter(input); } } return input; }
From source file:ie.cmrc.smtx.lucene.analysis.LanguageBasedAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { //Tokenizer source = new StandardTokenizer(Version.LUCENE_46, reader); //Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_46, reader); Tokenizer source = new ClassicTokenizer(reader); TokenStream filter = new StandardFilter(source); filter = new LowerCaseFilter(filter); filter = new StopFilter(filter, this.getStopWordsSet(language)); filter = new KStemFilter(filter); filter = new ASCIIFoldingFilter(filter); return new TokenStreamComponents(source, filter); }
From source file:org.apache.solr.analysis.KStemFilterFactory.java
License:Apache License
/**
 * Factory hook: wraps the given stream with a {@link KStemFilter}.
 *
 * @param input the upstream token stream
 * @return a KStem-stemming filter over {@code input}
 */
public TokenFilter create(TokenStream input) {
    final KStemFilter stemmer = new KStemFilter(input);
    return stemmer;
}
From source file:org.elasticsearch.analysis.common.KStemTokenFilterFactory.java
License:Apache License
/**
 * Wraps the given stream with Lucene's {@link KStemFilter}.
 *
 * @param tokenStream the upstream token stream
 * @return the stemmed token stream
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final TokenStream stemmed = new KStemFilter(tokenStream);
    return stemmed;
}