Example usage for org.apache.lucene.analysis CharArraySet EMPTY_SET

List of usage examples for org.apache.lucene.analysis CharArraySet EMPTY_SET

Introduction

In this page you can find the example usage for org.apache.lucene.analysis CharArraySet EMPTY_SET.

Prototype

CharArraySet EMPTY_SET

To view the source code for org.apache.lucene.analysis CharArraySet EMPTY_SET, click the Source Link below.

Click Source Link

Document

An empty CharArraySet.

Usage

From source file:org.elasticsearch.analysis.common.CatalanAnalyzerProvider.java

License:Apache License

CatalanAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new CatalanAnalyzer(Analysis.parseStopWords(env, settings, CatalanAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);//  w  w w .j  a  v a2  s  . c  om
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java

License:Apache License

/**
 * Registers the pre-configured ("built-in") token filters shipped with this plugin.
 * Each entry maps a well-known filter name to a factory producing the underlying
 * Lucene TokenFilter with fixed default settings.
 *
 * NOTE(review): the second argument to PreConfiguredTokenFilter.singleton presumably
 * marks whether the filter also applies to multi-term queries — confirm against
 * PreConfiguredTokenFilter's declaration.
 *
 * @return the list of pre-configured token filters provided by this plugin
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    // common_grams is registered with an empty common-word set by default.
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(
            PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(
            PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /*
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(
            PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    // word_delimiter splits on case changes, numerics etc.; the trailing null is the
    // protected-words set (none by default).
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                    | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}

From source file:org.elasticsearch.analysis.common.CzechAnalyzerProvider.java

License:Apache License

CzechAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new CzechAnalyzer(Analysis.parseStopWords(env, settings, CzechAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);//ww  w.j a va  2  s  .co m
}

From source file:org.elasticsearch.analysis.common.DanishAnalyzerProvider.java

License:Apache License

DanishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DanishAnalyzer(Analysis.parseStopWords(env, settings, DanishAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);/*from  w  w w .j a v a  2 s .  c  o  m*/
}

From source file:org.elasticsearch.analysis.common.DutchAnalyzerProvider.java

License:Apache License

DutchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DutchAnalyzer(Analysis.parseStopWords(env, settings, DutchAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);/*from   w  w  w . j  av a  2  s.co  m*/
}

From source file:org.elasticsearch.analysis.common.DutchStemTokenFilterFactory.java

License:Apache License

DutchStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
        Settings settings) {//from w w w. j  a  va2 s  .co m
    super(indexSettings, name, settings);
    this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}

From source file:org.elasticsearch.analysis.common.EnglishAnalyzerProvider.java

License:Apache License

EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new EnglishAnalyzer(Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);//  www  .  j  a v  a  2  s. c  o  m
}

From source file:org.elasticsearch.analysis.common.FingerprintAnalyzerTests.java

License:Apache License

public void testFingerprint() throws Exception {
    // No stop words, space separator, max output of 255 chars.
    Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    // Mixed-case duplicates and punctuation collapse into one lower-cased, sorted fingerprint token.
    assertAnalyzesTo(analyzer, "foo bar@baz Baz $ foo foo FOO. FoO", new String[] { "bar baz foo" });
}

From source file:org.elasticsearch.analysis.common.FingerprintAnalyzerTests.java

License:Apache License

public void testReusableTokenStream() throws Exception {
    Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    // The same Analyzer instance must produce correct output on consecutive analyses.
    assertAnalyzesTo(analyzer, "foo bar baz Baz foo foo FOO. FoO", new String[] { "bar baz foo" });
    assertAnalyzesTo(analyzer, "xyz XYZ abc 123.2 abc", new String[] { "123.2 abc xyz" });
}

From source file:org.elasticsearch.analysis.common.FingerprintAnalyzerTests.java

License:Apache License

public void testAsciifolding() throws Exception {
    Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    // "gödel" must be ASCII-folded to "godel" before fingerprinting.
    // The accented character had been lost to an encoding error ("gdel"), which
    // would make these assertions fail: "gdel" does not fold to "godel".
    assertAnalyzesTo(a, "gödel escher bach", new String[] { "bach escher godel" });

    // Folded and unfolded variants of the same word collapse into a single token.
    assertAnalyzesTo(a, "gödel godel escher bach", new String[] { "bach escher godel" });
}