Example usage for org.apache.lucene.analysis CharArraySet copy

Introduction

On this page you can find example usages of org.apache.lucene.analysis.CharArraySet copy.

Prototype

public static CharArraySet copy(final Set<?> set) 

Source Link

Document

Returns a copy of the given set as a CharArraySet. If the given set is itself a CharArraySet, its ignoreCase property is preserved; otherwise the copy is case sensitive.
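
A minimal, self-contained sketch of these semantics (the class name and sample words are ours, not taken from the sources below):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetCopyDemo {
    public static void main(String[] args) {
        // Copying a plain java.util.Set yields a case-sensitive CharArraySet.
        Set<String> words = new HashSet<>(Arrays.asList("foo", "bar"));
        CharArraySet copy = CharArraySet.copy(words);

        // The copy is independent: later changes to the source are not visible.
        words.add("baz");
        System.out.println(copy.contains("baz")); // false

        // Copying a CharArraySet preserves its ignoreCase property.
        CharArraySet ignoreCase = new CharArraySet(Arrays.asList("Foo"), true);
        System.out.println(CharArraySet.copy(ignoreCase).contains("FOO")); // true
    }
}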

Usage

From source file:org.apache.jena.query.text.filter.SelectiveFoldingFilter.java

License:Apache License

public SelectiveFoldingFilter(TokenStream input, CharArraySet whitelisted) {
    super(input);
    Objects.requireNonNull(whitelisted, "You must provide the set of whitelisted characters.");
    this.whitelisted = CharArraySet.unmodifiableSet(CharArraySet.copy(whitelisted));
}
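
The copy-then-wrap idiom above is defensive: the filter keeps a private, read-only snapshot of the caller's set, so later mutation by the caller cannot change the filter's behavior. A minimal sketch of the guarantee it buys (the names here are ours):

import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;

public class DefensiveCopyDemo {
    public static void main(String[] args) {
        CharArraySet callerOwned = new CharArraySet(Arrays.asList("a", "b"), true);
        CharArraySet safe = CharArraySet.unmodifiableSet(CharArraySet.copy(callerOwned));

        callerOwned.add("c");                   // later mutation by the caller...
        System.out.println(safe.contains("c")); // ...is not visible here: prints false

        // safe.add("d"); // would throw UnsupportedOperationException
    }
}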

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License:Apache License

/**
 * Creates a tokenizer based on param values
 * @param content - The text to tokenize
 * @param tokenizer - the type of tokenizer to use, CLASSIC or DEFAULT
 * @param stopWords - Provide a set of user defined stop words
 * @param addToDefault - If set to true, the stopWords will be added to the Lucene default stop set.
 * If false, then only the user provided words will be used as the stop set
 * @param stemFilterType - the type of stemming to apply
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            stopSet.add(word);
        }
        this.stopSet = stopSet;
    } else {
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
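
The copy in the first branch is what makes the defaults extensible: StandardAnalyzer.STOP_WORDS_SET is unmodifiable, so adding to it directly would throw. A minimal sketch of the same extend-the-defaults pattern, assuming a Lucene version that still exposes StandardAnalyzer.STOP_WORDS_SET as the snippet above does (class name and sample words are ours):

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class StopSetDemo {
    public static void main(String[] args) {
        List<String> userWords = Arrays.asList("lucene", "nutch");

        // STOP_WORDS_SET is unmodifiable; copy it before extending.
        CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        stopSet.addAll(userWords);

        System.out.println(stopSet.contains("the"));    // default stop word: true
        System.out.println(stopSet.contains("lucene")); // user-supplied word: true
    }
}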

From source file:org.codelibs.elasticsearch.index.analysis.SnowballAnalyzer.java

License:Apache License

/** Builds the named analyzer with the given stop words. */
public SnowballAnalyzer(String name, CharArraySet stopWords) {
    this(name);
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords));
}

From source file:org.elasticsearch.analysis.common.SnowballAnalyzer.java

License:Apache License

/** Builds the named analyzer with the given stop words. */
SnowballAnalyzer(String name, CharArraySet stopWords) {
    this(name);
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords));
}

From source file:org.elasticsearch.analysis.common.WordDelimiterGraphTokenFilterFactory.java

License:Apache License

public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
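
Outside of the Elasticsearch factory, the same copy can feed Lucene's WordDelimiterGraphFilter directly. A minimal sketch under that assumption (the tokenizer choice, flag combination, and sample input are ours):

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ProtectedWordsDemo {
    public static void main(String[] args) throws Exception {
        // Tokens that must never be split by the delimiter filter.
        Set<String> protectedWords = new HashSet<>(Arrays.asList("wi-fi"));

        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;

        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("wi-fi power-shot"));

        // As in the factory above: copy the plain Set into a CharArraySet.
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags,
                CharArraySet.copy(protectedWords));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // wi-fi, power, shot
        }
        ts.end();
        ts.close();
    }
}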

From source file:org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.java

License:Apache License

public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}

From source file:org.elasticsearch.index.analysis.Kuromoji2AnalyzerProvider.java

License:Apache License

public Kuromoji2AnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode mode = Kuromoji2TokenizerFactory.getMode(settings);
    final UserDictionary userDictionary = Kuromoji2TokenizerFactory.getUserDictionary(env, settings);
    analyzer = new JapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords),
            JapaneseAnalyzer.getDefaultStopTags());
}
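
Analysis.parseStopWords returns a plain Set<?>, while JapaneseAnalyzer wants a CharArraySet, hence the copy. A minimal standalone sketch, assuming the lucene-analysis-kuromoji module is on the classpath (the null user dictionary and sample stop word are ours):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;

public class KuromojiStopWordsDemo {
    public static void main(String[] args) {
        // Stand-in for Analysis.parseStopWords: a plain Set of stop words.
        Set<String> stopWords = new HashSet<>(Arrays.asList("これ"));

        // JapaneseAnalyzer expects a CharArraySet, so copy the plain Set.
        JapaneseAnalyzer analyzer = new JapaneseAnalyzer(null, // no user dictionary
                JapaneseTokenizer.Mode.SEARCH,
                CharArraySet.copy(stopWords),
                JapaneseAnalyzer.getDefaultStopTags());
        analyzer.close();
    }
}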

From source file:org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory.java

License:Apache License

public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings,
            "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}