Example usage for org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter ASCIIFoldingFilter

List of usage examples for org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter ASCIIFoldingFilter

Introduction

On this page you can find example usages of org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter ASCIIFoldingFilter.

Prototype

public ASCIIFoldingFilter(TokenStream input) 

Source Link

Usage

From source file:ModifiedRomanianAnalyzer.java

License:Apache License

/**
 * Builds the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * that tokenizes all the text in the provided {@link Reader}.
 *
 * @return a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link ASCIIFoldingFilter} and a Romanian {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    // Fold accented characters (e.g. Romanian diacritics) to ASCII equivalents.
    stream = new ASCIIFoldingFilter(stream);
    // Stem-exclusion keyword marking is intentionally disabled here:
    //  if(!stemExclusionSet.isEmpty())
    //  stream = new KeywordMarkerFilter(stream, stemExclusionSet);
    stream = new SnowballFilter(stream, new RomanianStemmer());
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:analyzers.FormalAnalyzer.java

License:Apache License

/**
* Define how tokens are processed./*w  w w . ja v a  2  s.  com*/
*
* @param    fieldName    required input
* @param    reader       reader for document
*/
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;

    if (!tokenOpts.disableAllFilters) {
        // the chain of token filters...

        chain = new StandardFilter(chain);

        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);

        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);

        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);

        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);

        // remove 's
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);

        // spelling correction            
        if (!spellingHashtable.isEmpty())
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);

        if (!tokenOpts.disableStemming) {
            // Krovets stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }

    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.EnAnalyzer.java

License:Apache License

/**
 * Builds the tokenizer/filter chain for English text.
 *
 * @param reader source of the text to analyze
 * @return the assembled token stream components
 */
@Override
protected TokenStreamComponents createComponents(Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    final Tokenizer tokenizer = new StandardTokenizer(Config.LUCENE_VERSION, reader);

    //TokenStream chain = new SynonymFilter(tokenizer, synonyms, true);
    TokenStream chain = new StandardFilter(Config.LUCENE_VERSION, tokenizer);
    chain = new LowerCaseFilter(Config.LUCENE_VERSION, chain);
    chain = new TrimFilter(Config.LUCENE_VERSION, chain);
    chain = new ASCIIFoldingFilter(chain);
    if (stopwords == null) {
        logger.info("No stopwordsfile provided, no stopword removal");
    } else {
        chain = new StopFilter(Config.LUCENE_VERSION, chain, stopwords);
    }
    //chain = new LowerCaseFilter(Version.LUCENE_46, chain);
    chain = new EnglishPossessiveFilter(Config.LUCENE_VERSION, chain);
    // Snowball English stemmer used instead of the plain PorterStemFilter.
    //chain = new PorterStemFilter(chain);
    chain = new SnowballFilter(chain, new EnglishStemmer());
    // Shingling was considered but is left disabled:
    //        ShingleFilter sf = new ShingleFilter(chain, 2, 3);
    //        sf.setFillerToken(null);
    //        chain = sf;
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.NlAnalyzer.java

License:Apache License

/**
 * Builds the tokenizer/filter chain for Dutch text.
 *
 * @param reader source of the text to analyze
 * @return the assembled token stream components
 */
@Override
protected TokenStreamComponents createComponents(Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(Config.LUCENE_VERSION, reader);

    TokenStream chain = new StandardFilter(Config.LUCENE_VERSION, tokenizer);
    chain = new LowerCaseFilter(Config.LUCENE_VERSION, chain);
    chain = new TrimFilter(Config.LUCENE_VERSION, chain);
    // Fold accented characters to ASCII equivalents.
    chain = new ASCIIFoldingFilter(chain);
    if (stopwords == null) {
        logger.info("No stopwordsfile provided, no stopword removal");
    } else {
        chain = new StopFilter(Config.LUCENE_VERSION, chain, stopwords);
    }

    // Dutch Snowball stemming.
    chain = new SnowballFilter(chain, new DutchStemmer());

    return new TokenStreamComponents(tokenizer, chain);
}

From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.EnAnalyzer.java

License:Apache License

/**
 * Builds the tokenizer/filter chain for English text, ending with
 * 2- and 3-gram shingles.
 *
 * @param fieldName field being analyzed
 * @param reader    source of the text to analyze
 * @return the assembled token stream components
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    final Tokenizer tokenizer = new StandardTokenizer(Config.LUCENE_VERSION, reader);

    //TokenStream chain = new SynonymFilter(tokenizer, synonyms, true);
    TokenStream chain = new StandardFilter(Config.LUCENE_VERSION, tokenizer);
    chain = new LowerCaseFilter(Config.LUCENE_VERSION, chain);
    chain = new TrimFilter(Config.LUCENE_VERSION, chain);
    chain = new ASCIIFoldingFilter(chain);
    if (stopwords == null) {
        logger.warn("No stopwordsfile provided, no stopword removal");
    } else {
        chain = new StopFilter(Config.LUCENE_VERSION, chain, stopwords);
    }
    //chain = new LowerCaseFilter(Version.LUCENE_46, chain);
    chain = new EnglishPossessiveFilter(Config.LUCENE_VERSION, chain);
    // Snowball English stemmer used instead of the plain PorterStemFilter.
    //chain = new PorterStemFilter(chain);
    chain = new SnowballFilter(chain, new EnglishStemmer());
    // Emit 2- and 3-token shingles without filler tokens for removed stop words.
    final ShingleFilter shingles = new ShingleFilter(chain, 2, 3);
    shingles.setFillerToken(null);
    return new TokenStreamComponents(tokenizer, shingles);
}

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.lucene.analyzers.EnAnalyzer.java

License:Apache License

/**
 * Builds the tokenizer/filter chain for English text.
 *
 * @param fieldName field being analyzed
 * @param reader    source of the text to analyze
 * @return the assembled token stream components
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    final Tokenizer tokenizer = new StandardTokenizer(Config.LUCENE_VERSION, reader);

    //TokenStream chain = new SynonymFilter(tokenizer, synonyms, true);
    TokenStream chain = new StandardFilter(Config.LUCENE_VERSION, tokenizer);
    chain = new LowerCaseFilter(Config.LUCENE_VERSION, chain);
    chain = new TrimFilter(Config.LUCENE_VERSION, chain);
    chain = new ASCIIFoldingFilter(chain);
    if (stopwords == null) {
        logger.info("No stopwordsfile provided, no stopword removal");
    } else {
        chain = new StopFilter(Config.LUCENE_VERSION, chain, stopwords);
    }
    //chain = new LowerCaseFilter(Version.LUCENE_46, chain);
    chain = new EnglishPossessiveFilter(Config.LUCENE_VERSION, chain);
    // Snowball English stemmer used instead of the plain PorterStemFilter.
    //chain = new PorterStemFilter(chain);
    chain = new SnowballFilter(chain, new EnglishStemmer());
    // Shingling was considered but is left disabled:
    //        ShingleFilter sf = new ShingleFilter(chain, 2, 3);
    //        sf.setFillerToken(null);
    //        chain = sf;
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:com.b2international.index.analyzer.ComponentTermAnalyzer.java

License:Apache License

/**
 * Builds the filter chain for the given tokenizer; the chain consists of a
 * single {@link ASCIIFoldingFilter} that maps accented characters to their
 * ASCII equivalents.
 */
private TokenFilter createFilterChain(final Tokenizer source) {
    final TokenFilter asciiFolding = new ASCIIFoldingFilter(source);
    return asciiFolding;
}

From source file:com.nutrisystem.orange.java.text.EnglishTextAnalyzer.java

License:Apache License

/**
 * Creates a/*from w w w. j av  a2  s  .  c o m*/
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which
 * tokenizes all the text in the provided {@link Reader}.
 * 
 * @return A
 *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter} ,
 *         {@link KeywordMarkerFilter} if a stem exclusion set is provided
 *         and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new ClassicTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for
    // us.
    result = new EnglishPossessiveFilter(matchVersion, result);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new NumberFilter(result);
    result = new KStemFilter(result);
    result = new PorterStemFilter(result);
    //result = new ShingleFilter(result, 3);
    return new TokenStreamComponents(source, result);
}

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java

License:Open Source License

/**
 * Builds an ASCII-folded English analysis chain: standard tokenization,
 * ASCII folding, possessive stripping, word-delimiter splitting (alpha parts),
 * lowercasing, default English stop-word removal, and Porter stemming.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new ASCIIFoldingFilter(chain);
    chain = new EnglishPossessiveFilter(chain);
    chain = new WordDelimiterFilter(chain, WordDelimiterFilter.ALPHA, null);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, EnglishAnalyzer.getDefaultStopSet());
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.CharacterShingleAnalyzer.java

License:Open Source License

/**
 * Builds a character-shingle analysis chain: character-shingle tokenization,
 * ASCII folding, lowercasing, then 3-gram token shingles.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new CharacterShingleTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new ASCIIFoldingFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new ShingleFilter(chain, 3);

    // Word-delimiter splitting on digits was considered but left disabled:
    //        chain = new WordDelimiterFilter(chain, WordDelimiterFilter.DIGIT, null);

    return new TokenStreamComponents(tokenizer, chain);
}