Example usage for org.apache.lucene.analysis.core LowerCaseFilter LowerCaseFilter

List of usage examples for org.apache.lucene.analysis.core LowerCaseFilter LowerCaseFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.core LowerCaseFilter LowerCaseFilter.

Prototype

public LowerCaseFilter(TokenStream in) 

Source Link

Document

Create a new LowerCaseFilter, that normalizes token text to lower case.

Usage

From source file:analyzers.FormalAnalyzer.java

License:Apache License

/**
 * Defines how tokens are processed for this analyzer.
 *
 * Chain: StandardTokenizer -> StandardFilter -> StandardTagFilter ->
 * LowerCaseFilter -> ASCIIFoldingFilter -> StopFilter ->
 * EnglishPossessiveFilter -> (optional) SpellingCorrectionFilter ->
 * (optional) KStemFilter. All filtering can be disabled via
 * {@code tokenOpts.disableAllFilters}.
 *
 * @param    fieldName    required input
 * @param    reader       reader for document
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;

    if (!tokenOpts.disableAllFilters) {
        // the chain of token filters...

        chain = new StandardFilter(chain);

        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);

        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);

        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);

        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);

        // remove possessive 's
        // NOTE(review): Lucene's EnglishAnalyzer applies EnglishPossessiveFilter
        // before lowercasing/stop removal, and the Version-taking constructor is
        // deprecated — confirm this placement and constructor are intentional.
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);

        // spelling correction (skipped entirely when no corrections are configured)
        if (!spellingHashtable.isEmpty())
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);

        if (!tokenOpts.disableStemming) {
            // Krovets stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }

    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.AddressLineExactMatchAnalyzer.java

License:Open Source License

/**
 * Builds the analysis chain for exact address-line matching: the entire
 * input becomes a single keyword token, which is lower-cased and then
 * rewritten via {@code RegexPatternCollection.nonAlphaCharPattern}
 * (presumably stripping non-alphabetic characters — defined elsewhere).
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.HouseIdAnalyzer.java

License:Open Source License

/**
 * Builds the house-id analysis chain: one keyword token, lower-cased,
 * normalized through three regex rewrites from
 * {@code RegexPatternCollection}, then expanded into 1..4-character
 * edge n-grams for prefix matching.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    chain = new EdgeNGramTokenFilter(chain, 1, 4);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.HouseIdExactMatchAnalyzer.java

License:Open Source License

/**
 * Builds the exact-match house-id chain: identical to the house-id
 * analyzer's normalization (keyword token, lower-case, three regex
 * rewrites) but WITHOUT the edge n-gram expansion, so only the full
 * normalized value is indexed.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.PostalCodeAnalyzer.java

License:Open Source License

/**
 * Builds the postal-code analysis chain: the whole input as a single
 * keyword token, lower-cased, then expanded into 3..4-character edge
 * n-grams so partial postal codes can match.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new EdgeNGramTokenFilter(chain, 3, 4);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:at.itbh.bev.index.TextAnalyzer.java

License:Open Source License

/**
 * Builds the fuzzy text analysis chain: one keyword token, lower-cased,
 * cleaned by two regex rewrites from {@code RegexPatternCollection},
 * encoded phonetically with the Cologne algorithm (original token kept,
 * since inject=true), and finally split into 2..6-character n-grams.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.addressLineStemmingPattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.nonAlphaCharPattern, "", true);
    chain = new PhoneticFilter(chain, new ColognePhonetic(), true);
    chain = new NGramTokenFilter(chain, 2, 6);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.JapaneseAnalyzer.java

License:Apache License

/**
 * Builds the Japanese analysis chain: morphological tokenization
 * (with optional user dictionary), base-form normalization,
 * part-of-speech stop filtering, CJK width normalization, stop-word
 * removal, katakana stemming, and finally lower-casing.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tok = new JapaneseTokenizer(userDict, true, mode);
    TokenStream chain = new JapaneseBaseFormFilter(tok);
    chain = new JapanesePartOfSpeechStopFilter(chain, stoptags);
    chain = new CJKWidthFilter(chain);
    chain = new StopFilter(chain, stopwords);
    chain = new JapaneseKatakanaStemFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(tok, chain);
}

From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java

License:Open Source License

/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * built from an {@link StandardTokenizer} filtered with
 * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 * {@link SetKeywordMarkerFilter}, {@link ArabicRootExtractorStemFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source;
    // NOTE(review): onOrAfter(Version.LATEST) only holds when the analyzer
    // version is exactly the latest — confirm a specific release constant
    // was not intended here.
    if (getVersion().onOrAfter(Version.LATEST)) {
        source = new StandardTokenizer();
    } else {
        source = new StandardTokenizer40();
    }
    TokenStream result = new LowerCaseFilter(source);
    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
        result = new DecimalDigitFilter(result);
    }
    // the order here is important: the stopword list is not normalized!
    result = new StopFilter(result, stopwords);
    // FIX: the keyword marker must run BEFORE the stem filter — applying
    // SetKeywordMarkerFilter after stemming (as the original did) cannot
    // protect any token in stemExclusionSet from being stemmed. This
    // matches the ordering used by Lucene's own ArabicAnalyzer.
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new ArabicRootExtractorStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.NgramAnalyzer.java

License:Open Source License

/**
 * Builds the n-gram analysis chain: standard tokenization and filtering,
 * lower-casing, stop-word removal, then shingles (word n-grams) up to
 * {@code gap} tokens long.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, STOP_WORDS_SET);
    final ShingleFilter shingles = new ShingleFilter(chain);
    shingles.setMaxShingleSize(gap);
    return new TokenStreamComponents(tokenizer, shingles);
}

From source file:com.NGramTokenBaseAnalyzer.java

/**
 * Builds the base n-gram token chain: standard tokenization, lower-casing,
 * then the subclass-supplied {@code filter(...)} stage configured with
 * {@code unigramOutput}.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 * @param reader    source of the document text
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream stream = new LowerCaseFilter(tokenizer);
    stream = filter(stream, this.unigramOutput);
    return new TokenStreamComponents(tokenizer, stream);
}