List of usage examples for the org.apache.lucene.analysis.core.LowerCaseFilter constructor
public LowerCaseFilter(TokenStream in)
From source file:analyzers.FormalAnalyzer.java
License:Apache License
/** * Define how tokens are processed./*from w ww .j a va2 s .c om*/ * * @param fieldName required input * @param reader reader for document */ @Override protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) { Tokenizer tokenizer = new StandardTokenizer(reader); TokenStream chain = tokenizer; if (!tokenOpts.disableAllFilters) { // the chain of token filters... chain = new StandardFilter(chain); // discard tokens based on their type attribute chain = new StandardTagFilter(chain, tokenOpts); // convert tokens to lowercase chain = new LowerCaseFilter(chain); // replace accented chars with non-accented ASCII equivalents chain = new ASCIIFoldingFilter(chain); // remove stop words (must come after lowercasing) chain = new StopFilter(chain, stopWordSet); // remove 's chain = new EnglishPossessiveFilter(Version.LATEST, chain); // spelling correction if (!spellingHashtable.isEmpty()) chain = new SpellingCorrectionFilter(chain, spellingHashtable); if (!tokenOpts.disableStemming) { // Krovets stemmer (smarter than the Porter stemmer) chain = new KStemFilter(chain); } } return new Analyzer.TokenStreamComponents(tokenizer, chain); }
From source file:at.itbh.bev.index.AddressLineExactMatchAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for exact address-line matching: the whole input
 * is kept as a single token, lowercased, and stripped of all
 * non-alphabetic characters.
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream stream = new LowerCaseFilter(tokenizer);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.nonAlphaCharPattern, "", true);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:at.itbh.bev.index.HouseIdAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for house identifiers: the input is kept as one
 * token, lowercased, normalized via a series of pattern replacements, and
 * finally expanded into edge n-grams (lengths 1 to 4) for prefix matching.
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream stream = new LowerCaseFilter(tokenizer);
    // Normalize the house identifier before n-gramming.
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.replacePattern, "/", true);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    stream = new EdgeNGramTokenFilter(stream, 1, 4);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:at.itbh.bev.index.HouseIdExactMatchAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for exact house-identifier matching: the input is
 * kept as one token, lowercased, and normalized via a series of pattern
 * replacements (no n-gramming, so only exact matches are produced).
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream stream = new LowerCaseFilter(tokenizer);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.replacePattern, "/", true);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:at.itbh.bev.index.PostalCodeAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain for postal codes: the whole input is kept as a
 * single token, lowercased, and expanded into edge n-grams of length 3 to 4
 * so partial postal codes can match.
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream stream = new LowerCaseFilter(tokenizer);
    stream = new EdgeNGramTokenFilter(stream, 3, 4);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:at.itbh.bev.index.TextAnalyzer.java
License:Open Source License
/**
 * Builds the fuzzy text-analysis chain: the input is kept as one token,
 * lowercased, stripped by two pattern replacements, phonetically encoded with
 * the Cologne phonetic algorithm, and expanded into n-grams (lengths 2 to 6).
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream stream = new LowerCaseFilter(tokenizer);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.addressLineStemmingPattern, "", true);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.nonAlphaCharPattern, "", true);
    // Cologne phonetic encoding keeps the original token alongside the code (inject = true).
    stream = new PhoneticFilter(stream, new ColognePhonetic(), true);
    stream = new NGramTokenFilter(stream, 2, 6);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.JapaneseAnalyzer.java
License:Apache License
/**
 * Assembles the Japanese analysis chain: morphological tokenization followed
 * by base-form normalization, part-of-speech stop filtering, CJK width
 * normalization, stop-word removal, katakana stemming, and lowercasing.
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new JapaneseTokenizer(userDict, true, mode);
    TokenStream result = new JapaneseBaseFormFilter(source);
    result = new JapanesePartOfSpeechStopFilter(result, stoptags);
    result = new CJKWidthFilter(result);
    result = new StopFilter(result, stopwords);
    result = new JapaneseKatakanaStemFilter(result);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java
License:Open Source License
/** * Creates//from w w w .ja v a 2s.co m * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}, * {@link ArabicRootExtractorStemFilter}, {@link SetKeywordMarkerFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source; if (getVersion().onOrAfter(Version.LATEST)) { source = new StandardTokenizer(); } else { source = new StandardTokenizer40(); } TokenStream result = new LowerCaseFilter(source); if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) { result = new DecimalDigitFilter(result); } // the order here is important: the stopword list is not normalized! result = new StopFilter(result, stopwords); result = new ArabicRootExtractorStemFilter(result); if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } return new TokenStreamComponents(source, result); }
From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.NgramAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new StandardTokenizer(); ShingleFilter sf = new ShingleFilter( new StopFilter(new LowerCaseFilter(new StandardFilter(tokenizer)), STOP_WORDS_SET)); sf.setMaxShingleSize(gap);/* www.j ava 2 s .c o m*/ // sf.setFillerToken(""); // sf.setOutputUnigrams(false); return new TokenStreamComponents(tokenizer, sf); }
From source file:com.NGramTokenBaseAnalyzer.java
/**
 * Builds the base n-gram analysis chain: standard tokenization, lowercasing,
 * then the subclass-supplied n-gram filter.
 *
 * @param fieldName the field this analyzer is created for (unused here)
 * @param reader reader supplying the document text
 * @return the tokenizer/filter pair for this analyzer
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream stream = new LowerCaseFilter(source);
    stream = filter(stream, this.unigramOutput);
    return new TokenStreamComponents(source, stream);
}