Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer


Introduction

On this page you can find example usage of the org.apache.lucene.analysis.standard.StandardTokenizer constructor.

Prototype

public StandardTokenizer() 

Document

Creates a new instance of org.apache.lucene.analysis.standard.StandardTokenizer.
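
Since Lucene 5.0 the tokenizer takes neither a Version nor a Reader argument; the input is supplied afterwards via setReader. A minimal, self-contained sketch of standalone usage (the class name and sample text are made up for illustration):

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Tokenize this sample text."));

        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                       // required before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // one token per line
        }
        tokenizer.end();
        tokenizer.close();
    }
}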

Usage

From source file:MyStandardAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(src);
    // tok = new LowerCaseFilter(tok);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            src.setMaxTokenLength(MyStandardAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
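
For context, a hedged sketch of how an analyzer built this way is typically consumed; it assumes MyStandardAnalyzer has a no-argument constructor, and the field name and text are arbitrary:

// assumes: import org.apache.lucene.analysis.Analyzer;
//          import org.apache.lucene.analysis.TokenStream;
//          import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
Analyzer analyzer = new MyStandardAnalyzer(); // hypothetical no-arg constructor
try (TokenStream stream = analyzer.tokenStream("content", "Some sample text")) {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println(term.toString());
    }
    stream.end();
}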

From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java

License:Open Source License

/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * built from a {@link StandardTokenizer} filtered with
 * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, and
 * {@link ArabicRootExtractorStemFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source;
    if (getVersion().onOrAfter(Version.LATEST)) {
        source = new StandardTokenizer();
    } else {
        source = new StandardTokenizer40();
    }
    TokenStream result = new LowerCaseFilter(source);
    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
        result = new DecimalDigitFilter(result);
    }
    // the order here is important: the stopword list is not normalized!
    result = new StopFilter(result, stopwords);
    // mark stem exclusions as keywords before stemming so the stemmer leaves them intact
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new ArabicRootExtractorStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.NgramAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new StandardTokenizer();
    ShingleFilter sf = new ShingleFilter(
            new StopFilter(new LowerCaseFilter(new StandardFilter(tokenizer)), STOP_WORDS_SET));
    sf.setMaxShingleSize(gap);
    // sf.setFillerToken("");
    // sf.setOutputUnigrams(false);
    return new TokenStreamComponents(tokenizer, sf);
}
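
ShingleFilter emits word n-grams alongside the original unigrams, which is what makes this an n-gram analyzer. A hedged sketch of the default behavior (shingle size 2, unigrams included), independent of the class above:

// assumes the usual imports plus org.apache.lucene.analysis.shingle.ShingleFilter
StandardTokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("please divide this sentence"));
ShingleFilter shingles = new ShingleFilter(tokenizer); // defaults to bigrams plus unigrams
CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
shingles.reset();
while (shingles.incrementToken()) {
    // emits: please, "please divide", divide, "divide this", this, "this sentence", sentence
    System.out.println(term.toString());
}
shingles.end();
shingles.close();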

From source file:com.romeikat.datamessie.core.base.util.TokenizerAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    final TokenStream result = new StandardFilter(source);

    return new TokenStreamComponents(source, result);
}

From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.FullTextIndexingAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Convert to lower case
    result = new LowerCaseFilter(result);

    // Normalize German special characters
    result = new GermanNormalizationFilter(result);

    return new TokenStreamComponents(source, result);
}
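
GermanNormalizationFilter folds German special characters, e.g. ß to ss and ä/ö/ü to a/o/u, so spelling variants index to the same term. A hedged sketch of the same chain in isolation (Lucene 5.x package locations assumed):

// assumes: org.apache.lucene.analysis.core.LowerCaseFilter and
//          org.apache.lucene.analysis.de.GermanNormalizationFilter
StandardTokenizer source = new StandardTokenizer();
source.setReader(new StringReader("Straße Strasse"));
TokenStream stream = new GermanNormalizationFilter(
        new LowerCaseFilter(new StandardFilter(source)));
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    System.out.println(term.toString()); // both tokens come out as "strasse"
}
stream.end();
stream.close();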

From source file:com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java

License:Open Source License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
 * the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
 *         {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link NumberFilter},
 *         {@link EnglishPossessiveFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, and a
 *         {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source;
    source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);

    // Remove possessives (trailing 's)
    result = new EnglishPossessiveFilter(result);

    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);

    // Remove stopwords
    result = new StopFilter(result, stopwords);

    // Mark keywords
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

    // Stem with the English Snowball stemmer
    result = new SnowballFilter(result, new EnglishStemmer());
    // Alternative to the SnowballFilter:
    // result = new PorterStemFilter(result);

    return new TokenStreamComponents(source, result);
}

From source file:com.romeikat.datamessie.core.processing.service.stemming.text.GermanAnalyzer.java

License:Open Source License

/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);

    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);

    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);

    // Remove stopwords
    result = new StopFilter(result, stopwords);

    // Mark keywords
    if (!exclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, exclusionSet);
    }

    // Normalize German special characters
    result = new KeywordAwareGermanNormalizationFilter(result);

    // Stem
    result = new SnowballFilter(result, new German2Stemmer());
    // Alternatives to the SnowballFilter:
    // result = new GermanStemFilter(result);
    // result = new GermanLightStemFilter(result);

    return new TokenStreamComponents(source, result);
}
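
The SetKeywordMarkerFilter step works because Lucene stem filters, including SnowballFilter, skip any token whose KeywordAttribute is set. A hedged sketch with a made-up exclusion term:

// assumes: org.apache.lucene.analysis.util.CharArraySet,
//          org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter,
//          org.apache.lucene.analysis.snowball.SnowballFilter,
//          org.tartarus.snowball.ext.German2Stemmer, java.util.Arrays
CharArraySet exclusions = new CharArraySet(Arrays.asList("häuser"), true);
StandardTokenizer source = new StandardTokenizer();
source.setReader(new StringReader("häuser häusern"));
TokenStream stream = new SnowballFilter(
        new SetKeywordMarkerFilter(source, exclusions), new German2Stemmer());
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    // "häuser" is marked as a keyword and passes through unchanged;
    // "häusern" is reduced by the German2 stemmer
    System.out.println(term.toString());
}
stream.end();
stream.close();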

From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzer.java

License:Open Source License

/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new EnglishPossessiveFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
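
Two of these filters are easy to see in action: EnglishPossessiveFilter strips a trailing 's, and PorterStemFilter reduces words to their stems. A hedged sketch (Lucene 5.x package locations assumed):

// assumes: org.apache.lucene.analysis.en.EnglishPossessiveFilter,
//          org.apache.lucene.analysis.en.PorterStemFilter,
//          org.apache.lucene.analysis.core.LowerCaseFilter
StandardTokenizer source = new StandardTokenizer();
source.setReader(new StringReader("Bill's horses"));
TokenStream stream = new PorterStemFilter(
        new LowerCaseFilter(new EnglishPossessiveFilter(new StandardFilter(source))));
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    System.out.println(term.toString()); // "bill", "hors"
}
stream.end();
stream.close();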

From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzer.java

License:Open Source License

/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ElisionFilter(result, this.elisions);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new FrenchLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}
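
ElisionFilter strips elided French articles such as l' and d' from the front of tokens. A hedged sketch with a hand-built article set (the real class presumably supplies its own list via this.elisions):

// assumes: org.apache.lucene.analysis.util.ElisionFilter,
//          org.apache.lucene.analysis.util.CharArraySet, java.util.Arrays
CharArraySet articles = new CharArraySet(Arrays.asList("l", "d"), true);
StandardTokenizer source = new StandardTokenizer();
source.setReader(new StringReader("l'avion d'abord"));
TokenStream stream = new ElisionFilter(new StandardFilter(source), articles);
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    System.out.println(term.toString()); // "avion", "abord"
}
stream.end();
stream.close();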

From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzer.java

License:Open Source License

/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new GermanNormalizationFilter(result);
    result = new GermanLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}