Example usage for org.apache.lucene.analysis.miscellaneous SetKeywordMarkerFilter SetKeywordMarkerFilter

List of usage examples for org.apache.lucene.analysis.miscellaneous SetKeywordMarkerFilter SetKeywordMarkerFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.miscellaneous SetKeywordMarkerFilter SetKeywordMarkerFilter.

Prototype

public SetKeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) 

Source Link

Document

Creates a new SetKeywordMarkerFilter that marks the current token as a keyword, via the KeywordAttribute, if the token's term buffer is contained in the given set.

Usage

From source file:EnglishAnalyzerConfigurable.java

License:Apache License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 * /*  w ww .ja  v a 2  s  . co  m*/
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter} ,
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided
 *         and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for us.

    result = new EnglishPossessiveFilter(matchVersion, result);

    if (this.doLowerCase)
        result = new LowerCaseFilter(matchVersion, result);

    if (this.doStopwordRemoval)
        result = new StopFilter(matchVersion, result, stopwords);

    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);

    if (this.stemmer == StemmerType.PORTER)
        result = new PorterStemFilter(result);
    else if (this.stemmer == StemmerType.KSTEM)
        result = new KStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:com.doculibre.analyzer.FrenchAccentPlurielAnalyzer.java

License:Apache License

/**
 * Creates/*from  w  ww .j  av a 2 s  .  c  o  m*/
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 * 
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link ElisionFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
 *         provided, and {@link FrenchLightStemFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(matchVersion, result);
        result = new ApostropheFilter(result);
        result = new StopFilter(matchVersion, result, stopwords);
        result = new FrenchFilter(result);
        if (!excltable.isEmpty())
            result = new SetKeywordMarkerFilter(result, excltable);
        //      if (matchVersion.onOrAfter(Version.LUCENE_36)) {
        //        result = new FrenchLightStemFilter(result);
        //      } else {
        //        result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
        //      }
        return new TokenStreamComponents(source, result);
    } else {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!excltable.isEmpty())
            result = new SetKeywordMarkerFilter(result, excltable);
        result = new FrenchStemFilter(result);
        // Convert to lowercase after stemming!
        return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
    }
}

From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java

License:Open Source License

/**
 * Creates/*  w w  w. java2s.co m*/
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * built from an {@link StandardTokenizer} filtered with
 * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 * {@link ArabicRootExtractorStemFilter}, {@link SetKeywordMarkerFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source;
    if (getVersion().onOrAfter(Version.LATEST)) {
        source = new StandardTokenizer();
    } else {
        source = new StandardTokenizer40();
    }
    TokenStream result = new LowerCaseFilter(source);
    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
        result = new DecimalDigitFilter(result);
    }
    // the order here is important: the stopword list is not normalized!
    result = new StopFilter(result, stopwords);
    result = new ArabicRootExtractorStemFilter(result);
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

    return new TokenStreamComponents(source, result);
}

From source file:com.maktashaf.taymiyyah.repository.lucene.analysis.ar.ArabicCustomizedAnalyzer.java

License:Open Source License

/**
 * Builds the Arabic analysis chain: lower-casing, transliteration,
 * diacritics removal, stopword removal, extended normalization, optional
 * keyword marking for the stem exclusion set, stemming, and letter
 * substitution.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    stream = new ArabicTransliterationFilter(stream);
    stream = new ArabicDiacriticsFilter(stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new ArabicExtendedNormalizationFilter(stream);
    // Protect exclusion-set terms from the stemmer below.
    if (!stemExclusionSet.isEmpty()) {
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    stream = new ArabicStemFilter(stream);
    stream = new ArabicLetterSubstituteFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:com.maktashaf.taymiyyah.repository.lucene.analysis.ur.UrduAnalyzer.java

License:Open Source License

/**
 * Builds the Urdu analysis chain: lower-casing, transliteration,
 * diacritics removal, stopword removal, extended normalization, optional
 * keyword marking, and letter substitution. No stemming yet.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    stream = new UrduTransliterationFilter(stream);
    stream = new ArabicDiacriticsFilter(stream);
    // TODO find more Urdu stop words.
    stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new ArabicExtendedNormalizationFilter(stream);
    // Keyword marking will protect these terms once a stemmer is added.
    if (!stemExclusionSet.isEmpty()) {
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    // TODO add an Urdu stem filter here (ArabicStemFilter was considered
    // but left disabled).
    stream = new UrduLetterSubstituteFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:com.netcrest.pado.index.provider.lucene.analyzer.ExtendedEnglishAnalyzer.java

License:Open Source License

/**
 * Creates a
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which
 * tokenizes all the text in the provided {@link Reader}.
 *
 * @return components built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter} (3.1+),
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
 *         provided, and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // StandardFilter restores the classic token behavior for us.
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
        stream = new EnglishPossessiveFilter(matchVersion, stream);
    }
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    // Marked tokens pass through the Porter stemmer unchanged.
    if (!stemExclusionSet.isEmpty()) {
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    stream = new PorterStemFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java

License:Open Source License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
 * the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an
 *         {@link StandardTokenizer} filtered with {@link StandardFilter},
 *         {@link EnglishPossessiveFilter}, {@link LowerCaseFilter}, {@link StopFilter} ,
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and
 *         {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source;
    source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);

    // Remove possessives (trailing 's)
    result = new EnglishPossessiveFilter(result);

    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);

    // Remove stopwords
    result = new StopFilter(result, stopwords);

    // Mark keywords so the stemmer below leaves them untouched
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

    // Stem with the Porter algorithm. BUGFIX: this English analyzer
    // previously stemmed with `new SnowballFilter(result, new German2Stemmer())`
    // — almost certainly copied from the sibling GermanAnalyzer — which
    // produced wrong stems for English text and contradicted the Javadoc
    // above, which promises PorterStemFilter.
    result = new PorterStemFilter(result);
    // Alternative: result = new SnowballFilter(result, new EnglishStemmer());

    return new TokenStreamComponents(source, result);
}

From source file:com.romeikat.datamessie.core.processing.service.stemming.text.GermanAnalyzer.java

License:Open Source License

/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return/*from  w  w  w.  j  ava 2 s . c  om*/
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);

    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);

    // Remove stopwords
    result = new StopFilter(result, stopwords);

    // Mark keywords
    if (!exclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, exclusionSet);
    }

    // Normalize German special characters
    result = new KeywordAwareGermanNormalizationFilter(result);

    // Stem
    result = new SnowballFilter(result, new German2Stemmer());
    // Alternatives to the SnowballFilter:
    // result = new GermanStemFilter(result);
    // result = new GermanLightStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:edu.cmu.lti.f13.hw4.hw4_dateng.EnglishAnalyzerConfigurable.java

License:Apache License

/**
 * Creates the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return components built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter}, optional
 *         {@link LowerCaseFilter} and {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and the configured stemmer ({@link PorterStemFilter} or KStem).
 */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    // StandardFilter restores the classic token behavior for us.
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    stream = new EnglishPossessiveFilter(matchVersion, stream);

    if (this.doLowerCase) {
        stream = new LowerCaseFilter(matchVersion, stream);
    }
    if (this.doStopwordRemoval) {
        stream = new StopFilter(matchVersion, stream, stopwords);
    }
    // Tokens marked here carry KeywordAttribute and are skipped by the
    // stemmers below.
    if (!stemExclusionSet.isEmpty()) {
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }

    if (this.stemmer == StemmerType.PORTER) {
        stream = new PorterStemFilter(stream);
    } else if (this.stemmer == StemmerType.KSTEM) {
        stream = new KStemFilter(stream);
    }

    return new TokenStreamComponents(tokenizer, stream);
}

From source file:org.elasticsearch.analysis.common.BrazilianStemTokenFilterFactory.java

License:Apache License

@Override
public TokenStream create(TokenStream tokenStream) {
    // Mark exclusion terms as keywords so the Brazilian stemmer skips them.
    final TokenStream marked = new SetKeywordMarkerFilter(tokenStream, exclusions);
    return new BrazilianStemFilter(marked);
}