Example usage for org.apache.lucene.analysis.en KStemFilter KStemFilter

List of usage examples for org.apache.lucene.analysis.en KStemFilter KStemFilter

Introduction

On this page you can find an example of usage for org.apache.lucene.analysis.en KStemFilter KStemFilter.

Prototype

public KStemFilter(TokenStream in) 

Source Link

Usage

From source file:EnglishAnalyzerConfigurable.java

License:Apache License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 * /*from w ww .j  a  v a 2 s. com*/
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter} ,
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided
 *         and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for us.

    result = new EnglishPossessiveFilter(matchVersion, result);

    if (this.doLowerCase)
        result = new LowerCaseFilter(matchVersion, result);

    if (this.doStopwordRemoval)
        result = new StopFilter(matchVersion, result, stopwords);

    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);

    if (this.stemmer == StemmerType.PORTER)
        result = new PorterStemFilter(result);
    else if (this.stemmer == StemmerType.KSTEM)
        result = new KStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:analyzers.FormalAnalyzer.java

License:Apache License

/**
 * Defines how tokens are processed for this analyzer.
 *
 * @param fieldName required input; the field the chain is built for (not used)
 * @param reader    reader for the document text
 * @return the tokenizer plus the configured filter chain
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;

    if (!tokenOpts.disableAllFilters) {
        // The chain of token filters; order matters (e.g. stop-word removal
        // must come after lowercasing).

        chain = new StandardFilter(chain);

        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);

        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);

        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);

        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);

        // remove 's
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);

        // spelling correction, only when a correction table was supplied
        if (!spellingHashtable.isEmpty()) {
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);
        }

        if (!tokenOpts.disableStemming) {
            // Krovetz stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }

    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:com.nutrisystem.orange.java.text.EnglishTextAnalyzer.java

License:Apache License

/**
 * Creates a
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which
 * tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link ClassicTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link ASCIIFoldingFilter}, {@link LowerCaseFilter},
 *         {@link StopFilter}, {@link NumberFilter}, {@link KStemFilter}
 *         and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new ClassicTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for
    // us.
    result = new EnglishPossessiveFilter(matchVersion, result);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new NumberFilter(result);
    // NOTE(review): both the Krovetz and Porter stemmers are applied back to
    // back, so every token is stemmed twice — confirm this double stemming
    // is intentional rather than a leftover from switching stemmers.
    result = new KStemFilter(result);
    result = new PorterStemFilter(result);
    //result = new ShingleFilter(result, 3);
    return new TokenStreamComponents(source, result);
}

From source file:crawler.util.analyzer.TwitterAnalyzer.java

License:Apache License

/**
 * Builds the token stream used to analyze tweets: tokenize, drop short
 * tokens, lowercase, remove stop words, stem, then strip Twitter-specific
 * token types (URLs, mentions, hashtags, punctuation).
 *
 * @param fileName name of the field/file being analyzed (not used)
 * @param reader   reader supplying the tweet text
 * @return the fully configured token stream
 */
public TokenStream tokenStream(String fileName, Reader reader) {
    // Tokenizer keeps @, #, URLs and all letters/digits.
    TokenStream stream = new TwitterLetterDigitTokenizer(Version.LUCENE_30, reader);
    // Keep only tokens whose length lies within [3, 255].
    stream = new LengthFilter(true, stream, 3, 255);
    // Normalize to lower case.
    stream = new LowerCaseFilter(Version.LUCENE_30, stream);
    // Remove stop words.
    stream = new StopFilter(Version.LUCENE_30, stream, stopWordList);
    // Stem the remaining words (Krovetz stemmer).
    stream = new KStemFilter(stream);
    // Identify URL, mention, hashtag and punctuation tokens...
    stream = new TwitterAttri(stream);
    // ...and eliminate those tagged types.
    stream = new TwitterFilter(true, stream);
    return stream;
}

From source file:edu.cmu.lti.f13.hw4.hw4_dateng.EnglishAnalyzerConfigurable.java

License:Apache License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 * /*from w w  w.  j a  v  a  2 s . co m*/
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter} ,
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided
 *         and {@link PorterStemFilter}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for us.

    result = new EnglishPossessiveFilter(matchVersion, result);

    if (this.doLowerCase)
        result = new LowerCaseFilter(matchVersion, result);

    if (this.doStopwordRemoval)
        result = new StopFilter(matchVersion, result, stopwords);

    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);

    if (this.stemmer == StemmerType.PORTER)
        result = new PorterStemFilter(result);
    else if (this.stemmer == StemmerType.KSTEM)
        result = new KStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:ie.cmrc.smtx.lucene.analysis.EnglishKeywordAnalyzer.java

License:Apache License

/**
 * Builds the analysis chain for English keyword fields: standard
 * tokenization, lowercasing, default English stop-word removal, Krovetz
 * stemming, ASCII folding, then concatenation of the remaining tokens.
 *
 * @param fieldName the field the components are built for (not used here)
 * @param reader    reader supplying the text to tokenize
 * @return the tokenizer and its filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, EnglishAnalyzer.getDefaultStopSet());
    stream = new KStemFilter(stream); // Krovetz stemmer (chosen over Porter)
    stream = new ASCIIFoldingFilter(stream);
    stream = new ConcatFilter(stream);
    return new Analyzer.TokenStreamComponents(tokenizer, stream);
}

From source file:ie.cmrc.smtx.lucene.analysis.EuropeanAnalyzer.java

License:Apache License

/**
 * Returns a minimal/light stemming filter suitable for the provided language
 * @param language Two-letter code of a language
 * @param input {@code org.apache.lucene.analysis.TokenStream} input to
 * filter/*from  w ww  .  j a  v a 2  s  .c  o m*/
 * @return {@code org.apache.lucene.analysis.TokenStream} that filters the
 * provided {@code input}
 */
protected TokenStream getMinimalStemFilter(String language, TokenStream input) {
    String lang = language;
    if (lang != null)
        lang = lang.trim().toLowerCase();
    if (SUPPORTED_LANGUAGES.contains(lang)) {
        if (lang.equals(LANG_EN)) {
            return new KStemFilter(input);
        } else if (lang.equals(LANG_FR)) {
            return new FrenchMinimalStemFilter(input);
        } else if (lang.equals(LANG_ES)) {
            return new SpanishLightStemFilter(input);
        } else if (lang.equals(LANG_PT)) {
            return new PortugueseMinimalStemFilter(input);
        } else if (lang.equals(LANG_IT)) {
            return new ItalianLightStemFilter(input);
        } else if (lang.equals(LANG_DE)) {
            return new GermanMinimalStemFilter(input);
        } else if (lang.equals(LANG_NO)) {
            return new NorwegianMinimalStemFilter(input);
        }
    }
    return input;
}

From source file:ie.cmrc.smtx.lucene.analysis.LanguageBasedAnalyzer.java

License:Apache License

/**
 * Builds the per-language analysis chain: classic tokenization,
 * lowercasing, language-specific stop-word removal, Krovetz stemming and
 * ASCII folding.
 *
 * @param fieldName the field the components are built for (not used here)
 * @param reader    reader supplying the text to tokenize
 * @return the tokenizer and its filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // ClassicTokenizer chosen over Standard/Whitespace tokenizers.
    final Tokenizer tokenizer = new ClassicTokenizer(reader);
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, this.getStopWordsSet(language));
    stream = new KStemFilter(stream);
    stream = new ASCIIFoldingFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:org.apache.solr.analysis.KStemFilterFactory.java

License:Apache License

/**
 * Creates a {@link KStemFilter} (Krovetz stemmer) wrapping the given input.
 *
 * @param input the token stream to stem
 * @return a Krovetz-stemming filter over {@code input}
 */
public TokenFilter create(TokenStream input) {
    return new KStemFilter(input);
}

From source file:org.elasticsearch.analysis.common.KStemTokenFilterFactory.java

License:Apache License

/**
 * Wraps the given token stream in a {@link KStemFilter} (Krovetz stemmer).
 *
 * @param tokenStream the stream to filter
 * @return the stemming filter
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return new KStemFilter(tokenStream);
}