List of usage examples for the org.apache.lucene.analysis.core.StopFilter constructor
public StopFilter(TokenStream in, CharArraySet stopWords)
From source file:MyStandardAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(final String fieldName) { final StandardTokenizer src = new StandardTokenizer(); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new StandardFilter(src); // tok = new LowerCaseFilter(tok); tok = new StopFilter(tok, stopwords); return new TokenStreamComponents(src, tok) { @Override/* w w w . j a va 2s .co m*/ protected void setReader(final Reader reader) throws IOException { src.setMaxTokenLength(MyStandardAnalyzer.this.maxTokenLength); super.setReader(reader); } }; }
From source file:analyzers.FormalAnalyzer.java
License:Apache License
/** * Define how tokens are processed./*from w w w . j a va 2s . c om*/ * * @param fieldName required input * @param reader reader for document */ @Override protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) { Tokenizer tokenizer = new StandardTokenizer(reader); TokenStream chain = tokenizer; if (!tokenOpts.disableAllFilters) { // the chain of token filters... chain = new StandardFilter(chain); // discard tokens based on their type attribute chain = new StandardTagFilter(chain, tokenOpts); // convert tokens to lowercase chain = new LowerCaseFilter(chain); // replace accented chars with non-accented ASCII equivalents chain = new ASCIIFoldingFilter(chain); // remove stop words (must come after lowercasing) chain = new StopFilter(chain, stopWordSet); // remove 's chain = new EnglishPossessiveFilter(Version.LATEST, chain); // spelling correction if (!spellingHashtable.isEmpty()) chain = new SpellingCorrectionFilter(chain, spellingHashtable); if (!tokenOpts.disableStemming) { // Krovets stemmer (smarter than the Porter stemmer) chain = new KStemFilter(chain); } } return new Analyzer.TokenStreamComponents(tokenizer, chain); }
From source file:br.pucminas.ri.jsearch.utils.PorterStemAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain: {@code LowerCaseTokenizer -> StopFilter -> PorterStemFilter}.
 * <p>
 * Bug fixed: the original wrapped the {@code String} argument in a
 * {@link StringReader} and bound it to the tokenizer. But in Lucene 5+ this
 * argument is the FIELD NAME, not the text to analyze — the framework supplies
 * the actual input via {@code TokenStreamComponents#setReader(Reader)}. The
 * original therefore tokenized the field name instead of the document text;
 * the reader creation and {@code setReader} call are removed here.
 *
 * @param fieldName name of the field being analyzed (unused)
 * @return the tokenizer plus its stop-word and Porter-stemming filters
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    LowerCaseTokenizer source = new LowerCaseTokenizer();
    StopFilter stopFilter = new StopFilter(source, stopWords);
    PorterStemFilter stemFilter = new PorterStemFilter(stopFilter);
    return new TokenStreamComponents(source, stemFilter);
}
From source file:cn.tung.javacn.pinyin.SimpleChineseAnalyzer.java
License:Apache License
/**
 * Tokenizes with the HMM Chinese tokenizer, applies Porter stemming, then
 * removes stop words when any are configured.
 */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new HMMChineseTokenizer(reader);
    TokenStream stream = new PorterStemFilter(tokenizer);
    if (!stopWords.isEmpty()) {
        // NOTE(review): stop-word removal runs AFTER stemming, so entries are
        // matched against stemmed token forms — confirm this is intended.
        stream = new StopFilter(stream, stopWords);
    }
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.JapaneseAnalyzer.java
License:Apache License
/**
 * Builds the Japanese analysis chain: JapaneseTokenizer followed by base-form
 * normalization, part-of-speech stop filtering, CJK width folding, stop-word
 * removal, katakana stemming, and finally lower-casing.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, mode);
    TokenStream chain = new JapaneseBaseFormFilter(tokenizer);
    chain = new JapanesePartOfSpeechStopFilter(chain, stoptags);
    chain = new CJKWidthFilter(chain);
    chain = new StopFilter(chain, stopwords);
    chain = new JapaneseKatakanaStemFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java
License:Open Source License
/** * Creates/*from ww w. j a va 2 s .c o m*/ * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}, * {@link ArabicRootExtractorStemFilter}, {@link SetKeywordMarkerFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source; if (getVersion().onOrAfter(Version.LATEST)) { source = new StandardTokenizer(); } else { source = new StandardTokenizer40(); } TokenStream result = new LowerCaseFilter(source); if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) { result = new DecimalDigitFilter(result); } // the order here is important: the stopword list is not normalized! result = new StopFilter(result, stopwords); result = new ArabicRootExtractorStemFilter(result); if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } return new TokenStreamComponents(source, result); }
From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.NgramAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new StandardTokenizer(); ShingleFilter sf = new ShingleFilter( new StopFilter(new LowerCaseFilter(new StandardFilter(tokenizer)), STOP_WORDS_SET)); sf.setMaxShingleSize(gap);//from w w w.j a v a2 s. c om // sf.setFillerToken(""); // sf.setOutputUnigrams(false); return new TokenStreamComponents(tokenizer, sf); }
From source file:com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java
License:Open Source License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all * the text in the provided {@link Reader}. * * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an * {@link StandardTokenizer} filtered with {@link StandardFilter}, * {@link EnglishPossessiveFilter}, {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and * {@link PorterStemFilter}.//from w w w .j a v a 2 s . c o m */ @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer source; source = new StandardTokenizer(); TokenStream result = new StandardFilter(source); // Remove terms that do not contain any alphabetic character result = new NumberFilter(result); // Remove possessives (trailing 's) result = new EnglishPossessiveFilter(result); // Converting to lower case is not necessary as this is done before stemming // result = new LowerCaseFilter(result); // Remove stopwords result = new StopFilter(result, stopwords); // Mark keywords if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } // Stem result = new SnowballFilter(result, new German2Stemmer()); // Alternatives to the SnowballFilter: // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.romeikat.datamessie.core.processing.service.stemming.text.GermanAnalyzer.java
License:Open Source License
/** * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all * the text in the provided {@link Reader}. * * @return// ww w . j a v a 2 s . co m */ @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new StandardFilter(source); // Remove terms that do not contain any alphabetic character result = new NumberFilter(result); // Converting to lower case is not necessary as this is done before stemming // result = new LowerCaseFilter(result); // Remove stopwords result = new StopFilter(result, stopwords); // Mark keywords if (!exclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, exclusionSet); } // Normalize German special characters result = new KeywordAwareGermanNormalizationFilter(result); // Stem result = new SnowballFilter(result, new German2Stemmer()); // Alternatives to the SnowballFilter: // result = new GermanStemFilter(result); // result = new GermanLightStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java
License:Apache License
@Test public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException { try (Tokenizer tok = new WhitespaceTokenizer(); TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false)); TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) { stubAnnotator(TWO);//from w w w.j ava 2s .c om tok.setReader(new StringReader(ONE_TWO)); assertTokenInfos(f, new TokenInfo(TWO, 1)); } }