Usage examples for org.apache.lucene.analysis.CharArraySet.EMPTY_SET

CharArraySet.EMPTY_SET is an immutable, shared empty CharArraySet. The examples below show it used as an explicit "no entries" default, most often for stop-word and stem-exclusion sets, in place of null.
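Before the project-specific examples, a minimal stand-alone sketch of what EMPTY_SET buys you: passing it as a stop-word set disables stop-word removal without resorting to null. (StandardAnalyzer and the literal field name and text here are illustrative choices, not taken from the examples below.)

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EmptySetDemo {
    public static void main(String[] args) throws Exception {
        // An analyzer built with the shared immutable empty set keeps every token.
        Analyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
        try (TokenStream ts = analyzer.tokenStream("field", "the quick brown fox")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // prints "the" too: nothing is filtered out
            }
            ts.end();
        }
    }
}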
From source file:org.elasticsearch.analysis.common.CatalanAnalyzerProvider.java
License:Apache License
CatalanAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new CatalanAnalyzer(
            Analysis.parseStopWords(env, settings, CatalanAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
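This constructor pattern repeats in the Czech, Danish, Dutch, and English providers below: EMPTY_SET is the fallback when the settings define no stem-exclusion words. Stripped of the Elasticsearch plumbing, the equivalent plain-Lucene construction looks roughly like this (a sketch, not the provider's exact code):

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;

// Default stop words, and explicitly no stem exclusions (rather than a null set):
CatalanAnalyzer analyzer = new CatalanAnalyzer(
        CatalanAnalyzer.getDefaultStopSet(),
        CharArraySet.EMPTY_SET);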
From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java
License:Apache License
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false,
            input -> new EdgeNGramTokenFilter(input,
                    EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false,
            input -> new EdgeNGramTokenFilter(input,
                    EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input,
                    LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false,
            input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                            | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                            | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    return filters;
}
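In this plugin the EMPTY_SET usage is the pre-configured common_grams filter: with an empty common-words set, CommonGramsFilter emits no bigrams and simply passes tokens through, so the pre-configured form is only a neutral baseline until words are supplied. A minimal sketch of the same construction (the helper method name is illustrative):

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;

static TokenStream commonGramsPassThrough(TokenStream input) {
    // No common words configured: the filter degenerates to a pass-through.
    return new CommonGramsFilter(input, CharArraySet.EMPTY_SET);
}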
From source file:org.elasticsearch.analysis.common.CzechAnalyzerProvider.java
License:Apache License
CzechAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new CzechAnalyzer(
            Analysis.parseStopWords(env, settings, CzechAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
From source file:org.elasticsearch.analysis.common.DanishAnalyzerProvider.java
License:Apache License
DanishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DanishAnalyzer(
            Analysis.parseStopWords(env, settings, DanishAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
From source file:org.elasticsearch.analysis.common.DutchAnalyzerProvider.java
License:Apache License
DutchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DutchAnalyzer(
            Analysis.parseStopWords(env, settings, DutchAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
From source file:org.elasticsearch.analysis.common.DutchStemTokenFilterFactory.java
License:Apache License
DutchStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}
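A stem-exclusion set like this one is typically applied by putting a keyword-marker stage in front of the stemmer, so that excluded terms pass through unstemmed. Roughly, as the usual Lucene idiom (a sketch under that assumption, not this factory's exact create() body):

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.tartarus.snowball.ext.DutchStemmer;

static TokenStream stemWithExclusions(TokenStream input, CharArraySet exclusions) {
    // Terms in the exclusion set are flagged as keywords ...
    TokenStream ts = new SetKeywordMarkerFilter(input, exclusions);
    // ... and the Snowball stemmer leaves keyword-flagged terms untouched.
    return new SnowballFilter(ts, new DutchStemmer());
}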
From source file:org.elasticsearch.analysis.common.EnglishAnalyzerProvider.java
License:Apache License
EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new EnglishAnalyzer(
            Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
From source file:org.elasticsearch.analysis.common.FingerprintAnalyzerTests.java
License:Apache License
public void testFingerprint() throws Exception {
    Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO", new String[] { "bar baz foo" });
}
From source file:org.elasticsearch.analysis.common.FingerprintAnalyzerTests.java
License:Apache License
public void testReusableTokenStream() throws Exception {
    Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO", new String[] { "bar baz foo" });
    assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc", new String[] { "123.2 abc xyz" });
}
From source file:org.elasticsearch.analysis.common.FingerprintAnalyzerTests.java
License:Apache License
public void testAsciifolding() throws Exception {
    Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    assertAnalyzesTo(a, "gödel escher bach", new String[] { "bach escher godel" });
    assertAnalyzesTo(a, "gödel godel escher bach", new String[] { "bach escher godel" });
}
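In all three tests, EMPTY_SET again means "no stop words": the fingerprint is built from every input token. Judging from the expected outputs, the other two arguments are the separator placed between the lowercased, deduplicated, sorted tokens and a cap on the fingerprint's length (argument roles inferred from these calls; the constructor belongs to Elasticsearch's analysis-common module, not to Lucene):

// Same construction as the tests, with the inferred role of each argument spelled out:
Analyzer a = new FingerprintAnalyzer(
        CharArraySet.EMPTY_SET, // stop words: none, so every token contributes
        ' ',                    // separator between the sorted, deduplicated tokens
        255);                   // maximum length of the emitted fingerprint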