List of usage examples for the org.apache.lucene.analysis.CharArraySet.copy method
public static CharArraySet copy(final Set<?> set)
From source file:org.apache.jena.query.text.filter.SelectiveFoldingFilter.java
License:Apache License
/**
 * Creates a filter over {@code input} that keeps its own unmodifiable copy of the
 * whitelisted character set.
 *
 * <p>The set is copied before being wrapped, so later changes the caller makes to
 * {@code whitelisted} are not seen by this filter.
 *
 * @param input the token stream handed to the superclass constructor
 * @param whitelisted the whitelisted characters; must not be {@code null}
 * @throws NullPointerException if {@code whitelisted} is {@code null}
 */
public SelectiveFoldingFilter(TokenStream input, CharArraySet whitelisted) {
    super(input);
    Objects.requireNonNull(whitelisted, "You must provide the list of whitelisted characters.");
    this.whitelisted = CharArraySet.unmodifiableSet(CharArraySet.copy(whitelisted));
}
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Creates a tokenizer based on param values
 *
 * @param content - The text to tokenize
 * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
 * @param stopWords - Provide a set of user defined stop words
 * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
 *                     If false, then only the user provided words will be used as the stop set
 * @param stemFilterType - the stem filter type stored on this tokenizer
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        // Work on a copy so the user words are added to a private set rather than
        // to the shared StandardAnalyzer.STOP_WORDS_SET constant.
        CharArraySet combined = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            combined.add(word);
        }
        this.stopSet = combined;
    } else {
        // Only the user supplied words; the second argument requests case-insensitive matching.
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
From source file:org.codelibs.elasticsearch.index.analysis.SnowballAnalyzer.java
License:Apache License
/**
 * Builds the named analyzer with the given stop words.
 *
 * <p>The supplied set is copied and the copy is stored wrapped as unmodifiable.
 *
 * @param name the analyzer name, forwarded to the single-argument constructor
 * @param stopWords the stop words to copy into this analyzer
 */
public SnowballAnalyzer(String name, CharArraySet stopWords) {
    this(name);
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords));
}
From source file:org.elasticsearch.analysis.common.SnowballAnalyzer.java
License:Apache License
/** Builds the named analyzer with the given stop words. */ SnowballAnalyzer(String name, CharArraySet stopWords) { this(name); stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords)); }
From source file:org.elasticsearch.analysis.common.WordDelimiterGraphTokenFilterFactory.java
License:Apache License
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {/* w w w. j ava 2 s . co m*/ super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens 
to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
From source file:org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.java
License:Apache License
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {/*w w w. jav a 2s .co m*/ super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to 
protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
From source file:org.elasticsearch.index.analysis.Kuromoji2AnalyzerProvider.java
License:Apache License
/**
 * Creates the provider and builds its {@code JapaneseAnalyzer}.
 *
 * <p>The stop words are parsed from the settings with
 * {@code JapaneseAnalyzer.getDefaultStopSet()} passed as the default, then copied into a
 * {@code CharArraySet} for the analyzer. Tokenizer mode and user dictionary are obtained
 * from {@code Kuromoji2TokenizerFactory}.
 *
 * @param indexSettings index settings forwarded to the superclass
 * @param env environment used when resolving stop words and the user dictionary
 * @param name provider name forwarded to the superclass
 * @param settings settings the analyzer options are read from
 */
public Kuromoji2AnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    final Set<?> configuredStopWords =
            Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = Kuromoji2TokenizerFactory.getMode(settings);
    final UserDictionary dictionary = Kuromoji2TokenizerFactory.getUserDictionary(env, settings);

    final CharArraySet stopSet = CharArraySet.copy(configuredStopWords);
    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopSet, JapaneseAnalyzer.getDefaultStopTags());
}
From source file:org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory.java
License:Apache License
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {/*from w w w .j ava 2s . c om*/ super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of 
tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }