Example usage for org.apache.lucene.analysis.core StopFilter StopFilter

List of usage examples for org.apache.lucene.analysis.core StopFilter StopFilter

Introduction

On this page you can find example usage of the org.apache.lucene.analysis.core StopFilter constructor.

Prototype

public StopFilter(TokenStream in, CharArraySet stopWords) 

Source Link

Document

Constructs a filter which removes words from the input TokenStream that are named in the Set.

Usage

From source file:MyStandardAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer source = new StandardTokenizer();
    source.setMaxTokenLength(maxTokenLength);
    // Standard post-processing followed by stop-word removal; lowercasing is
    // deliberately left disabled.
    TokenStream stream = new StandardFilter(source);
    // stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);
    return new TokenStreamComponents(source, stream) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            // Re-apply the (possibly updated) limit each time the components are reused.
            source.setMaxTokenLength(MyStandardAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}

From source file:analyzers.FormalAnalyzer.java

License:Apache License

/**
* Define how tokens are processed./*from   w w w .  j  a va  2s  . c  om*/
*
* @param    fieldName    required input
* @param    reader       reader for document
*/
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;

    if (!tokenOpts.disableAllFilters) {
        // the chain of token filters...

        chain = new StandardFilter(chain);

        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);

        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);

        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);

        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);

        // remove 's
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);

        // spelling correction            
        if (!spellingHashtable.isEmpty())
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);

        if (!tokenOpts.disableStemming) {
            // Krovets stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }

    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:br.pucminas.ri.jsearch.utils.PorterStemAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String string) {
    // Lowercase-tokenize the input, drop stop words, then apply Porter stemming.
    final LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
    tokenizer.setReader(new StringReader(string));
    final StopFilter withoutStopWords = new StopFilter(tokenizer, stopWords);
    final PorterStemFilter stemmed = new PorterStemFilter(withoutStopWords);
    return new TokenStreamComponents(tokenizer, stemmed);
}

From source file:cn.tung.javacn.pinyin.SimpleChineseAnalyzer.java

License:Apache License

@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // HMM-based Chinese segmentation followed by Porter stemming.
    final Tokenizer source = new HMMChineseTokenizer(reader);
    TokenStream sink = new PorterStemFilter(source);
    // Only add the stop filter when there is something to remove.
    if (!stopWords.isEmpty()) {
        sink = new StopFilter(sink, stopWords);
    }
    return new TokenStreamComponents(source, sink);
}

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.JapaneseAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new JapaneseTokenizer(userDict, true, mode);
    // Normalize to base forms, drop unwanted parts of speech, fold character
    // widths, remove stop words, stem katakana, and finally lowercase.
    TokenStream sink = new JapaneseBaseFormFilter(source);
    sink = new JapanesePartOfSpeechStopFilter(sink, stoptags);
    sink = new CJKWidthFilter(sink);
    sink = new StopFilter(sink, stopwords);
    sink = new JapaneseKatakanaStemFilter(sink);
    sink = new LowerCaseFilter(sink);
    return new TokenStreamComponents(source, sink);
}

From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java

License:Open Source License

/**
 * Creates/*from ww  w. j  a  va 2 s .c  o m*/
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * built from an {@link StandardTokenizer} filtered with
 * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 * {@link ArabicRootExtractorStemFilter}, {@link SetKeywordMarkerFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source;
    if (getVersion().onOrAfter(Version.LATEST)) {
        source = new StandardTokenizer();
    } else {
        source = new StandardTokenizer40();
    }
    TokenStream result = new LowerCaseFilter(source);
    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
        result = new DecimalDigitFilter(result);
    }
    // the order here is important: the stopword list is not normalized!
    result = new StopFilter(result, stopwords);
    result = new ArabicRootExtractorStemFilter(result);
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

    return new TokenStreamComponents(source, result);
}

From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.NgramAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    // Standard-filter, lowercase and stop-filter the tokens, then combine them
    // into shingles (word n-grams) up to the configured size.
    final TokenStream filtered =
            new StopFilter(new LowerCaseFilter(new StandardFilter(tokenizer)), STOP_WORDS_SET);
    final ShingleFilter shingles = new ShingleFilter(filtered);
    shingles.setMaxShingleSize(gap);
    // shingles.setFillerToken("");
    // shingles.setOutputUnigrams(false);
    return new TokenStreamComponents(tokenizer, shingles);
}

From source file:com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java

License:Open Source License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
 * the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an
 *         {@link StandardTokenizer} filtered with {@link StandardFilter},
 *         {@link EnglishPossessiveFilter}, {@link LowerCaseFilter}, {@link StopFilter} ,
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and
 *         {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source;
    source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);

    // Remove possessives (trailing 's)
    result = new EnglishPossessiveFilter(result);

    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);

    // Remove stopwords
    result = new StopFilter(result, stopwords);

    // Mark keywords so the stemmer leaves them untouched
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

    // Stem with the English Porter stemmer, as this class's Javadoc promises.
    // BUG FIX: the original used new SnowballFilter(result, new German2Stemmer()),
    // a German stemmer copy-pasted from GermanAnalyzer — wrong for English text.
    result = new PorterStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:com.romeikat.datamessie.core.processing.service.stemming.text.GermanAnalyzer.java

License:Open Source License

/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return the tokenizer plus the assembled German filter chain
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Drop terms that do not contain any alphabetic character.
    result = new NumberFilter(result);

    // Converting to lower case is not necessary as this is done before stemming.
    // result = new LowerCaseFilter(result);

    // Drop stopwords.
    result = new StopFilter(result, stopwords);

    // Mark excluded terms as keywords so later stages skip them.
    if (!exclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, exclusionSet);
    }

    // Normalize German special characters.
    result = new KeywordAwareGermanNormalizationFilter(result);

    // Stem with the Snowball German2 stemmer.
    result = new SnowballFilter(result, new German2Stemmer());
    // Alternatives to the SnowballFilter:
    // result = new GermanStemFilter(result);
    // result = new GermanLightStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
    // Stop-filter out ONE so the annotator filter must handle a gap in the
    // underlying stream.
    try (Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenFilter stopped = new StopFilter(tokenizer, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter annotated = new AnnotatorTokenFilter(stopped, annotator)) {
        stubAnnotator(TWO);
        tokenizer.setReader(new StringReader(ONE_TWO));
        // Only TWO should come through, at position increment 1.
        assertTokenInfos(annotated, new TokenInfo(TWO, 1));
    }
}