List of usage examples for org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter
public SetKeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet)
From source file:EnglishAnalyzerConfigurable.java
License:Apache License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * /* w ww .ja v a 2 s . co m*/ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided * and {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. result = new EnglishPossessiveFilter(matchVersion, result); if (this.doLowerCase) result = new LowerCaseFilter(matchVersion, result); if (this.doStopwordRemoval) result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); if (this.stemmer == StemmerType.PORTER) result = new PorterStemFilter(result); else if (this.stemmer == StemmerType.KSTEM) result = new KStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.doculibre.analyzer.FrenchAccentPlurielAnalyzer.java
License:Apache License
/** * Creates/*from w ww .j av a 2 s . c o m*/ * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link ElisionFilter}, * {@link LowerCaseFilter}, {@link StopFilter}, * {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided, and {@link FrenchLightStemFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { if (matchVersion.onOrAfter(Version.LUCENE_31)) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(matchVersion, result); result = new ApostropheFilter(result); result = new StopFilter(matchVersion, result, stopwords); result = new FrenchFilter(result); if (!excltable.isEmpty()) result = new SetKeywordMarkerFilter(result, excltable); // if (matchVersion.onOrAfter(Version.LUCENE_36)) { // result = new FrenchLightStemFilter(result); // } else { // result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer()); // } return new TokenStreamComponents(source, result); } else { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new StopFilter(matchVersion, result, stopwords); if (!excltable.isEmpty()) result = new SetKeywordMarkerFilter(result, excltable); result = new FrenchStemFilter(result); // Convert to lowercase after stemming! return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)); } }
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java
License:Open Source License
/** * Creates/* w w w. java2s.co m*/ * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}, * {@link ArabicRootExtractorStemFilter}, {@link SetKeywordMarkerFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source; if (getVersion().onOrAfter(Version.LATEST)) { source = new StandardTokenizer(); } else { source = new StandardTokenizer40(); } TokenStream result = new LowerCaseFilter(source); if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) { result = new DecimalDigitFilter(result); } // the order here is important: the stopword list is not normalized! result = new StopFilter(result, stopwords); result = new ArabicRootExtractorStemFilter(result); if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } return new TokenStreamComponents(source, result); }
From source file:com.maktashaf.taymiyyah.repository.lucene.analysis.ar.ArabicCustomizedAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new LowerCaseFilter(matchVersion, source); result = new ArabicTransliterationFilter(result); result = new ArabicDiacriticsFilter(result); result = new StopFilter(matchVersion, result, stopwords); result = new ArabicExtendedNormalizationFilter(result); if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); }// w w w .j ava 2 s . co m result = new ArabicStemFilter(result); result = new ArabicLetterSubstituteFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.maktashaf.taymiyyah.repository.lucene.analysis.ur.UrduAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new LowerCaseFilter(matchVersion, source); result = new UrduTransliterationFilter(result); result = new ArabicDiacriticsFilter(result); result = new StopFilter(matchVersion, result, stopwords); // TODO find more urdu stop words. result = new ArabicExtendedNormalizationFilter(result); if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); }/*w w w .j a v a 2 s .c o m*/ // result = new ArabicStemFilter(result); //TODO Urdu stem Filter. result = new UrduLetterSubstituteFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.netcrest.pado.index.provider.lucene.analyzer.ExtendedEnglishAnalyzer.java
License:Open Source License
/** * Creates a//w ww. ja v a 2s. c o m * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which * tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for // us. if (matchVersion.onOrAfter(Version.LUCENE_31)) result = new EnglishPossessiveFilter(matchVersion, result); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java
License:Open Source License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all * the text in the provided {@link Reader}. * * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an * {@link StandardTokenizer} filtered with {@link StandardFilter}, * {@link EnglishPossessiveFilter}, {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and * {@link PorterStemFilter}.//from w w w. j av a 2 s .c om */ @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer source; source = new StandardTokenizer(); TokenStream result = new StandardFilter(source); // Remove terms that do not contain any alphabetic character result = new NumberFilter(result); // Remove possessives (trailing 's) result = new EnglishPossessiveFilter(result); // Converting to lower case is not necessary as this is done before stemming // result = new LowerCaseFilter(result); // Remove stopwords result = new StopFilter(result, stopwords); // Mark keywords if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } // Stem result = new SnowballFilter(result, new German2Stemmer()); // Alternatives to the SnowballFilter: // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.romeikat.datamessie.core.processing.service.stemming.text.GermanAnalyzer.java
License:Open Source License
/** * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all * the text in the provided {@link Reader}. * * @return/*from w w w. j ava 2 s . c om*/ */ @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new StandardFilter(source); // Remove terms that do not contain any alphabetic character result = new NumberFilter(result); // Converting to lower case is not necessary as this is done before stemming // result = new LowerCaseFilter(result); // Remove stopwords result = new StopFilter(result, stopwords); // Mark keywords if (!exclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, exclusionSet); } // Normalize German special characters result = new KeywordAwareGermanNormalizationFilter(result); // Stem result = new SnowballFilter(result, new German2Stemmer()); // Alternatives to the SnowballFilter: // result = new GermanStemFilter(result); // result = new GermanLightStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:edu.cmu.lti.f13.hw4.hw4_dateng.EnglishAnalyzerConfigurable.java
License:Apache License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * //from www .j a v a 2s . c om * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided * and {@link PorterStemFilter}. */ @Override public TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. result = new EnglishPossessiveFilter(matchVersion, result); if (this.doLowerCase) result = new LowerCaseFilter(matchVersion, result); if (this.doStopwordRemoval) result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); if (this.stemmer == StemmerType.PORTER) result = new PorterStemFilter(result); else if (this.stemmer == StemmerType.KSTEM) result = new KStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:org.elasticsearch.analysis.common.BrazilianStemTokenFilterFactory.java
License:Apache License
/**
 * Wraps the incoming stream so that every token found in {@code exclusions}
 * is flagged as a keyword, which the {@link BrazilianStemFilter} then leaves
 * unstemmed.
 *
 * @param tokenStream the upstream token stream
 * @return the stream with keyword marking followed by Brazilian stemming
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    TokenStream marked = new SetKeywordMarkerFilter(tokenStream, exclusions);
    return new BrazilianStemFilter(marked);
}