Example usage for org.apache.lucene.analysis.util ElisionFilter ElisionFilter

List of usage examples for org.apache.lucene.analysis.util ElisionFilter ElisionFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.util ElisionFilter ElisionFilter.

Prototype

public ElisionFilter(TokenStream input, CharArraySet articles) 

Source Link

Document

Constructs an elision filter with a CharArraySet of articles whose elided forms (e.g. l', d') are stripped from tokens

Usage

From source file:com.doculibre.analyzer.FrenchAccentPlurielAnalyzer.java

License:Apache License

/**
 * Creates/*  w  w w  . jav a 2s . c o  m*/
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 * 
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link ElisionFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
 *         provided, and {@link FrenchLightStemFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(matchVersion, result);
        result = new ApostropheFilter(result);
        result = new StopFilter(matchVersion, result, stopwords);
        result = new FrenchFilter(result);
        if (!excltable.isEmpty())
            result = new SetKeywordMarkerFilter(result, excltable);
        //      if (matchVersion.onOrAfter(Version.LUCENE_36)) {
        //        result = new FrenchLightStemFilter(result);
        //      } else {
        //        result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
        //      }
        return new TokenStreamComponents(source, result);
    } else {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!excltable.isEmpty())
            result = new SetKeywordMarkerFilter(result, excltable);
        result = new FrenchStemFilter(result);
        // Convert to lowercase after stemming!
        return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzer.java

License:Open Source License

/**
 * This configuration must match with the configuration used for the index!
 *
 * @param fieldName Document field/*from w  w w .  j  ava2  s .co  m*/
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ElisionFilter(result, this.elisions);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new FrenchLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}

From source file:di.uniba.it.tee2.analyzer.ItalianNoStemAnalyzer.java

/**
 * Creates a/*from w  w w.j a v  a 2 s. c  o m*/
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which
 * tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built
 * from an {@link StandardTokenizer} filtered with null null     {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
*         , {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and
 * {@link ItalianLightStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_32)) {
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    return new TokenStreamComponents(source, result);
}

From source file:fr.paris.lutece.plugins.lucene.service.analyzer.LuteceFrenchAnalyzer.java

License:Open Source License

/**
 * Builds the analysis chain for the given field: StandardTokenizer ->
 * StandardFilter -> ElisionFilter -> StopFilter -> ASCIIFoldingFilter ->
 * SnowballFilter(French) -> LowerCaseFilter.
 *
 * @param fieldName the indexed field name; must not be {@code null}
 * @param reader the character source to tokenize; must not be {@code null}
 * @return the tokenizer/filter pair for this analyzer
 * @throws IllegalArgumentException if {@code fieldName} or {@code reader} is null
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    if (fieldName == null) {
        throw new IllegalArgumentException("fieldName must not be null");
    }

    if (reader == null) {
        throw new IllegalArgumentException("reader must not be null");
    }

    Tokenizer source = new StandardTokenizer(_matchVersion, reader);
    TokenStream filter = new StandardFilter(_matchVersion, source);
    // NOTE(review): _stoptable is passed both as the elision article set and as
    // the stop-word set — confirm this double use is intentional and not a
    // copy/paste slip (an article set is normally distinct from stop words).
    filter = new ElisionFilter(filter, _stoptable);
    filter = new StopFilter(_matchVersion, filter, _stoptable);
    filter = new ASCIIFoldingFilter(filter);
    filter = new SnowballFilter(filter, new FrenchStemmer());
    // Convert to lowercase after stemming!
    filter = new LowerCaseFilter(_matchVersion, filter);

    // The previous implementation returned an anonymous TokenStreamComponents
    // whose setReader override merely delegated to super — a no-op override,
    // so a plain instance is equivalent and simpler.
    return new TokenStreamComponents(source, filter);
}

From source file:io.vertigo.dynamo.plugins.collections.lucene.DefaultAnalyzer.java

License:Apache License

/**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   */*from   w ww  .java  2 s.c o  m*/
   * @return A TokenStream build from a StandardTokenizer filtered with
   *         StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
   */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* initialisation du token */
    final Tokenizer source = new StandardTokenizer();
    //-----
    /* on retire les lisions*/
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* on retire article adjectif */
    filter = new StopFilter(filter, stopWords);
    /* on retire les accents */
    filter = new ASCIIFoldingFilter(filter);
    /* on met en minuscule */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java

License:Apache License

/**
 * Registers the pre-configured (non-customisable) token filters this plugin
 * contributes, keyed by the name exposed to index settings.
 *
 * NOTE(review): the boolean passed to each singleton() appears to mark whether
 * the filter is also applied to multi-term queries — confirm against the
 * PreConfiguredTokenFilter.singleton API before relying on it.
 *
 * @return the list of pre-configured token filter registrations
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(
            PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(
            PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(
            PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                    | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}

From source file:org.elasticsearch.analysis.common.ElisionTokenFilterFactory.java

License:Apache License

/**
 * Wraps the given stream with an {@link ElisionFilter} using this factory's
 * configured article set.
 *
 * @param tokenStream the upstream token stream to filter
 * @return the elision-filtered stream
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return new ElisionFilter(tokenStream, articles);
}

From source file:org.elasticsearch.analysis.hunspell.fr.FrenchHunspellAnalyzer.java

License:Apache License

/**
 * Assembles the French Hunspell analysis chain: StandardTokenizer ->
 * ElisionFilter (default French articles) -> StopFilter ->
 * SetKeywordMarkerFilter (only when a stem exclusion set is configured) ->
 * HunspellStemFilter -> LowerCaseFilter.
 *
 * NOTE(review): elision and stop-word matching run before lowercasing —
 * presumably the configured stop set is already lower-cased; verify.
 */
@Override
protected TokenStreamComponents createComponents(String field) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES);
    chain = new StopFilter(chain, stopwords);
    if (!this.stemExclusionTable.isEmpty()) {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
    }
    chain = new HunspellStemFilter(chain, dictionary);
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:org.silverpeas.core.index.indexing.model.WAAnalyzer.java

License:Open Source License

/**
 * Returns a tokens stream built on top of the given reader.
 */
@Override
protected TokenStreamComponents createComponents(final String s) {
    final Tokenizer tokenizer = new StandardTokenizer();
    // Strip possessive 's and trailing dots from tokens.
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new LowerCaseFilter(stream);
    // Remove common, low-information French terms.
    stream = new StopFilter(stream, FrenchAnalyzer.getDefaultStopSet());
    // Remove [cdjlmnst-qu]' elisions from tokens.
    stream = new ElisionFilter(stream, FrenchAnalyzer.DEFAULT_ARTICLES);
    if (snowballUsed) {
        // Important! Strings handed to the Snowball stemmer must still carry
        // their accents, so accent folding is deliberately deferred until
        // after stemming (the stemmer collapses singular/plural, male/female
        // and conjugated forms).
        stream = new SnowballFilter(stream, stemmer);
    }
    // Finally, fold accents away.
    stream = new ASCIIFoldingFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:phoneticsearch.lucene.DefaultAnalyzer.java

License:Apache License

/**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   */*  w ww  .j  a va  2s. com*/
   * @return A TokenStream build from a StandardTokenizer filtered with
   *         StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
   */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    /* initialisation du token */
    final Tokenizer source = new StandardTokenizer(reader);
    //final Tokenizer source = new NGramTokenizer(reader, 2, 12);
    //---------------------------------------------------------------------
    /* on retire les lisions*/
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* on retire article adjectif */
    filter = new StopFilter(filter, stopWords);
    /* on retire les accents */
    filter = new ASCIIFoldingFilter(filter);
    /* on met en minuscule */
    filter = new LowerCaseFilter(filter);

    if (withFrPhonetic || withMetaphone) {
        //final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList("any")));
        //filter = new BeiderMorseFilter(filter, new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true), languages);
        //filter = new DoubleMetaphoneFilter(filter, 8, true);
        filter = new FrDoubleMetaphoneFilter(filter, 8, true, withFrPhonetic, withMetaphone);
    }
    filter = new PrefixTokenFilter(filter, 6);
    return new TokenStreamComponents(source, filter);
}