Example usage for org.apache.lucene.analysis.miscellaneous WordDelimiterFilter WordDelimiterFilter

List of usage examples for org.apache.lucene.analysis.miscellaneous WordDelimiterFilter WordDelimiterFilter

Introduction

On this page you can find example usage for org.apache.lucene.analysis.miscellaneous WordDelimiterFilter WordDelimiterFilter.

Prototype

public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) 

Source Link

Document

Creates a new WordDelimiterFilter using WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE as its charTypeTable

Usage

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java

License:Open Source License

/**
 * Builds the analysis chain for English text: standard tokenization, ASCII
 * folding, possessive stripping, word delimiting, lower-casing, stop-word
 * removal, and Porter stemming.
 *
 * @param fieldName the name of the field being analyzed (unused by this chain)
 * @return the assembled TokenStreamComponents
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new EnglishPossessiveFilter(result);
    // FIX: the original passed WordDelimiterFilter.ALPHA as the configuration
    // flags. ALPHA is a *character-type* constant (LOWER | UPPER), not a config
    // flag; it only worked because its value (0x03) happens to equal
    // GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS. Spell out the intended
    // flags explicitly — same numeric value, same runtime behavior.
    result = new WordDelimiterFilter(result,
            WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS,
            null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.MinimalAnalyzer.java

License:Open Source License

/**
 * Builds a minimal analysis chain: standard tokenization, ASCII folding,
 * lower-casing, possessive stripping, stop-word removal, word delimiting,
 * and Porter stemming.
 *
 * @param fieldName the name of the field being analyzed (unused by this chain)
 * @return the assembled TokenStreamComponents
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new StopFilter(result, stopwords);
    // FIX: the original passed WordDelimiterFilter.ALPHA as the configuration
    // flags. ALPHA is a *character-type* constant (LOWER | UPPER), not a config
    // flag; it only worked because its value (0x03) happens to equal
    // GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS. Spell out the intended
    // flags explicitly — same numeric value, same runtime behavior.
    result = new WordDelimiterFilter(result,
            WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS,
            null);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer.java

License:Apache License

/**
 * Builds the Oak analysis chain: standard tokenization, lower-casing, then
 * word delimiting that generates word and number parts, strips English
 * possessives, and optionally keeps the original term
 * (per {@code INDEX_ORIGINAL_TERM}).
 *
 * @param fieldName the field being analyzed (unused by this chain)
 * @param reader the reader supplying the raw input
 * @return the assembled TokenStreamComponents
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // Combine the delimiter configuration up front so the filter call stays readable.
    final int delimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
            | this.INDEX_ORIGINAL_TERM;
    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
    chain = new WordDelimiterFilter(chain, delimiterFlags, null);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java

License:Apache License

/**
 * Registers the pre-configured (no-settings) token filters this plugin
 * contributes. Each entry maps a filter name to a factory; the boolean flag
 * passed to {@code PreConfiguredTokenFilter.singleton} indicates whether the
 * filter may also be used at multi-term query analysis time.
 *
 * @return the list of pre-configured token filter registrations, in
 *         registration order
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    // common_grams with no configured words is effectively a no-op (empty set).
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(
            PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(
            PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(
            PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    // word_delimiter (non-graph) and word_delimiter_graph share the same flag set.
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                    | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}

From source file:org.owasp.dependencycheck.data.lucene.FieldAnalyzer.java

License:Apache License

/**
 * Creates the TokenStreamComponents/*from   w  w  w  .  ja v a  2 s. c o  m*/
 *
 * @param fieldName the field name being analyzed
 * @param reader the reader containing the input
 * @return the TokenStreamComponents
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new AlphaNumericTokenizer(version, reader);

    TokenStream stream = source;

    stream = new WordDelimiterFilter(stream,
            WordDelimiterFilter.CATENATE_WORDS | WordDelimiterFilter.GENERATE_WORD_PARTS
                    | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.PRESERVE_ORIGINAL
                    | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
            null);

    stream = new LowerCaseFilter(version, stream);
    stream = new StopFilter(version, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    return new TokenStreamComponents(source, stream);
}

From source file:org.owasp.dependencycheck.data.lucene.SearchFieldAnalyzer.java

License:Apache License

/**
 * Builds the search-side analysis chain: alphanumeric tokenization, word
 * delimiting (preserving originals), lower-casing, URL tokenization,
 * token-pair concatenation, and English stop-word removal.
 *
 * @param fieldName the field that this lucene analyzer will process
 * @param reader a reader containing the tokens
 * @return the token stream filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new AlphaNumericTokenizer(version, reader);

    // Combine the delimiter configuration up front so the filter call stays readable.
    final int delimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterFilter.PRESERVE_ORIGINAL
            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterFilter.SPLIT_ON_NUMERICS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    TokenStream chain = new WordDelimiterFilter(tokenizer, delimiterFlags, null);
    chain = new LowerCaseFilter(version, chain);
    chain = new UrlTokenizingFilter(chain);
    // Keep a handle on the concatenating filter; the enclosing analyzer uses it.
    concatenatingFilter = new TokenPairConcatenatingFilter(chain);
    chain = new StopFilter(version, concatenatingFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    return new TokenStreamComponents(tokenizer, chain);
}