List of usage examples for org.apache.lucene.analysis.miscellaneous WordDelimiterFilter GENERATE_NUMBER_PARTS
int GENERATE_NUMBER_PARTS
To view the source code for org.apache.lucene.analysis.miscellaneous WordDelimiterFilter GENERATE_NUMBER_PARTS, click the Source link below.
"500-42" => "500" "42"
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer.java
License:Apache License
/**
 * Builds the per-field analysis chain: standard tokenization, lower-casing,
 * then word-delimiter splitting that emits word parts, number parts and
 * strips English possessives.
 *
 * <p>NOTE(review): {@code INDEX_ORIGINAL_TERM} is a flag declared elsewhere in
 * this class; presumably it controls whether the unsplit original token is
 * also indexed — confirm against its declaration.
 *
 * @param fieldName name of the field being analyzed (not used by this chain)
 * @param reader    reader supplying the raw field text
 * @return the tokenizer plus filter chain wrapped as {@link TokenStreamComponents}
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // Combine all word-delimiter behaviors into a single flag mask up front.
    final int delimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
            | this.INDEX_ORIGINAL_TERM
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS;
    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
    chain = new WordDelimiterFilter(chain, delimiterFlags, null);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java
License:Apache License
/**
 * Registers every token filter that is available under a fixed name without
 * user configuration. Each entry pairs the public filter name with a factory;
 * the boolean argument to {@code singleton} indicates whether the filter may
 * be used at multi-term (e.g. wildcard/fuzzy) analysis time.
 *
 * <p>NOTE(review): registration order here only affects list order; lookups
 * are by name — but the deprecated camelCase aliases ("edgeNGram", "nGram")
 * must be kept alongside their snake_case replacements until removed.
 *
 * @return the list of pre-configured token filters exposed by this plugin
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    // Uses an empty common-words set by default; real deployments configure their own.
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false,
            input -> new EdgeNGramTokenFilter(input,
                    EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false,
            input -> new EdgeNGramTokenFilter(input,
                    EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input,
                    LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false,
            input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS |
                    WordDelimiterFilter.GENERATE_NUMBER_PARTS |
                    WordDelimiterFilter.SPLIT_ON_CASE_CHANGE |
                    WordDelimiterFilter.SPLIT_ON_NUMERICS |
                    WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS |
                    WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS |
                    WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE |
                    WordDelimiterGraphFilter.SPLIT_ON_NUMERICS |
                    WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
From source file:org.owasp.dependencycheck.data.lucene.FieldAnalyzer.java
License:Apache License
/**
 * Assembles the indexing analysis chain for a field: alphanumeric
 * tokenization, aggressive word-delimiter splitting (keeping the original
 * term and concatenated word runs), lower-casing, then English stop-word
 * removal.
 *
 * @param fieldName the field name being analyzed
 * @param reader    the reader containing the input
 * @return the TokenStreamComponents for this field
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new AlphaNumericTokenizer(version, reader);
    // All splitting/cat-enation behaviors enabled in one mask for readability.
    final int delimiterFlags = WordDelimiterFilter.CATENATE_WORDS
            | WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterFilter.PRESERVE_ORIGINAL
            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterFilter.SPLIT_ON_NUMERICS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    TokenStream chain = new WordDelimiterFilter(tokenizer, delimiterFlags, null);
    chain = new LowerCaseFilter(version, chain);
    chain = new StopFilter(version, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:org.owasp.dependencycheck.data.lucene.SearchFieldAnalyzer.java
License:Apache License
/**
 * Builds the search-time analysis chain: alphanumeric tokenization,
 * word-delimiter splitting (preserving the original term), lower-casing,
 * URL tokenization, adjacent-token pair concatenation, and finally English
 * stop-word removal.
 *
 * <p>The pair-concatenating filter instance is retained in the
 * {@code concatenatingFilter} field so it can be accessed after analysis.
 *
 * @param fieldName the field that this lucene analyzer will process
 * @param reader    a reader containing the tokens
 * @return the token stream filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new AlphaNumericTokenizer(version, reader);
    final int delimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterFilter.PRESERVE_ORIGINAL
            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterFilter.SPLIT_ON_NUMERICS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    TokenStream chain = new WordDelimiterFilter(tokenizer, delimiterFlags, null);
    chain = new LowerCaseFilter(version, chain);
    chain = new UrlTokenizingFilter(chain);
    // Keep a handle on this stage; callers of this analyzer use it later.
    concatenatingFilter = new TokenPairConcatenatingFilter(chain);
    chain = concatenatingFilter;
    chain = new StopFilter(version, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(tokenizer, chain);
}