List of usage examples for org.apache.lucene.analysis.util.ElisionFilter
public ElisionFilter(TokenStream input, CharArraySet articles)
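Before the project-specific examples below, a minimal, self-contained sketch of the constructor in isolation may help; the article set and input text are illustrative only, not taken from any of the source files (note that on Lucene 7+, CharArraySet lives in org.apache.lucene.analysis, while earlier releases keep it in org.apache.lucene.analysis.util):

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ElisionFilter;

// Tokenize on whitespace so "l'avion" reaches the filter as a single token.
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("l'avion d'essai"));

// Articles to strip before the apostrophe; true = match them case-insensitively.
CharArraySet articles = new CharArraySet(Arrays.asList("l", "d"), true);
TokenStream stream = new ElisionFilter(tokenizer, articles);

CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    System.out.println(term); // prints "avion", then "essai"
}
stream.end();
stream.close();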
From source file:com.doculibre.analyzer.FrenchAccentPlurielAnalyzer.java
License:Apache License
/**
 * Creates the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
 *         {@link ApostropheFilter}, {@link StopFilter}, {@link FrenchFilter},
 *         and {@link SetKeywordMarkerFilter} if a stem exclusion set is provided
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(matchVersion, result);
        result = new ApostropheFilter(result);
        result = new StopFilter(matchVersion, result, stopwords);
        result = new FrenchFilter(result);
        if (!excltable.isEmpty()) {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
        // Stemming is intentionally disabled in this analyzer:
        // if (matchVersion.onOrAfter(Version.LUCENE_36)) {
        //     result = new FrenchLightStemFilter(result);
        // } else {
        //     result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
        // }
        return new TokenStreamComponents(source, result);
    } else {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!excltable.isEmpty()) {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
        result = new FrenchStemFilter(result);
        // Convert to lowercase after stemming!
        return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzer.java
License:Open Source License
/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ElisionFilter(result, this.elisions);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new FrenchLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}
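The warning in that Javadoc is worth making concrete: the same analyzer (or an identically configured instance) has to be supplied both at index time and at query time, otherwise elision is applied on one side only and terms no longer match. A minimal sketch of the pattern, with a hypothetical index path and field name, and error handling omitted:

import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

Analyzer analyzer = new FrenchAnalyzer(); // constructor arguments omitted; see the source file
Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // hypothetical path
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer)); // analyzer at index time
// ... add documents, commit, open a searcher ...
Query query = new QueryParser("content", analyzer).parse("l'analyse"); // same analyzer at query time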
From source file:di.uniba.it.tee2.analyzer.ItalianNoStemAnalyzer.java
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link ElisionFilter} (on Lucene 3.2 or later),
 *         {@link LowerCaseFilter} and {@link StopFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_32)) {
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    return new TokenStreamComponents(source, result);
}
From source file:fr.paris.lutece.plugins.lucene.service.analyzer.LuteceFrenchAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    if (fieldName == null) {
        throw new IllegalArgumentException("fieldName must not be null");
    }
    if (reader == null) {
        throw new IllegalArgumentException("reader must not be null");
    }
    Tokenizer source = new StandardTokenizer(_matchVersion, reader);
    TokenStream filter = new StandardFilter(_matchVersion, source);
    // Note: the same set (_stoptable) serves both as the elision article list and as stopwords
    filter = new ElisionFilter(filter, _stoptable);
    filter = new StopFilter(_matchVersion, filter, _stoptable);
    filter = new ASCIIFoldingFilter(filter);
    filter = new SnowballFilter(filter, new FrenchStemmer());
    // Convert to lowercase after stemming!
    filter = new LowerCaseFilter(_matchVersion, filter);
    return new TokenStreamComponents(source, filter) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            super.setReader(reader);
        }
    };
}
From source file:io.vertigo.dynamo.plugins.collections.lucene.DefaultAnalyzer.java
License:Apache License
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // set up the tokenizer
    final Tokenizer source = new StandardTokenizer();
    //-----
    // strip elisions (l', d', ...)
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    // remove articles, adjectives and other stopwords
    filter = new StopFilter(filter, stopWords);
    // strip accents
    filter = new ASCIIFoldingFilter(filter);
    // lowercase
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
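LuceneConstants.ELISION_ARTICLES is project-specific and not shown in this excerpt. A plausible definition, mirroring the article list behind Lucene's own FrenchAnalyzer.DEFAULT_ARTICLES, might look like this:

// Hypothetical reconstruction; the actual constant lives in the project's LuceneConstants class.
public static final String[] ELISION_ARTICLES =
        { "l", "m", "t", "qu", "n", "s", "j", "d", "c", "jusqu", "quoiqu", "lorsqu", "puisqu" };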
From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java
License:Apache License
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false,
            input -> new EdgeNGramTokenFilter(input,
                    EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false,
            input -> new EdgeNGramTokenFilter(input,
                    EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input,
                    LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false,
            input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
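The entry relevant to this page is the "elision" registration buried in that list. As a reading of the Elasticsearch source (the parameter name may differ across versions), the second argument to PreConfiguredTokenFilter.singleton marks a filter as usable on multi-term queries, and elision is one of the few filters registered with true:

// Elision can safely run on multi-term (wildcard, prefix, fuzzy) queries,
// hence the second argument is true, unlike most of the stemmers above.
filters.add(PreConfiguredTokenFilter.singleton("elision", true,
        input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));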
From source file:org.elasticsearch.analysis.common.ElisionTokenFilterFactory.java
License:Apache License
@Override
public TokenStream create(TokenStream tokenStream) {
    return new ElisionFilter(tokenStream, articles);
}
From source file:org.elasticsearch.analysis.hunspell.fr.FrenchHunspellAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String field) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new ElisionFilter(source, FrenchAnalyzer.DEFAULT_ARTICLES);
    result = new StopFilter(result, stopwords);
    if (!this.stemExclusionTable.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionTable);
    }
    result = new HunspellStemFilter(result, dictionary);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:org.silverpeas.core.index.indexing.model.WAAnalyzer.java
License:Open Source License
/**
 * Returns a token stream built on top of the given reader.
 */
@Override
protected TokenStreamComponents createComponents(final String s) {
    final Tokenizer source = new StandardTokenizer();
    // remove 's and . from tokens
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    // remove uninformative terms
    result = new StopFilter(result, FrenchAnalyzer.getDefaultStopSet());
    // remove [cdjlmnst-qu]' from tokens
    result = new ElisionFilter(result, FrenchAnalyzer.DEFAULT_ARTICLES);
    if (snowballUsed) {
        // Important! Strings given to the Snowball filter must still contain accents,
        // so accents must be removed only after the stemmer has done its job of
        // ignoring singular/plural, masculine/feminine and conjugated forms.
        result = new SnowballFilter(result, stemmer);
    }
    // remove accents
    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:phoneticsearch.lucene.DefaultAnalyzer.java
License:Apache License
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter, LowerCaseFilter and,
 *         optionally, phonetic and prefix filters
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    // set up the tokenizer
    final Tokenizer source = new StandardTokenizer(reader);
    //final Tokenizer source = new NGramTokenizer(reader, 2, 12);
    //---------------------------------------------------------------------
    // strip elisions (l', d', ...)
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    // remove articles, adjectives and other stopwords
    filter = new StopFilter(filter, stopWords);
    // strip accents
    filter = new ASCIIFoldingFilter(filter);
    // lowercase
    filter = new LowerCaseFilter(filter);
    if (withFrPhonetic || withMetaphone) {
        //final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList("any")));
        //filter = new BeiderMorseFilter(filter, new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true), languages);
        //filter = new DoubleMetaphoneFilter(filter, 8, true);
        filter = new FrDoubleMetaphoneFilter(filter, 8, true, withFrPhonetic, withMetaphone);
    }
    filter = new PrefixTokenFilter(filter, 6);
    return new TokenStreamComponents(source, filter);
}
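Whichever of the analyzers above is used, its elision behavior can be spot-checked by dumping the tokens it emits for a sample string. A small, self-contained helper sketch (the analyzer, field name and input text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static void dumpTokens(Analyzer analyzer, String text) throws IOException {
    // "content" is an arbitrary field name; the analyzers above ignore it anyway
    try (TokenStream ts = analyzer.tokenStream("content", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term);
        }
        ts.end();
    }
}

// e.g. dumpTokens(new DefaultAnalyzer(...), "l'homme qu'on attend")
// should print tokens with the leading "l'" and "qu'" stripped.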