List of usage examples for org.apache.lucene.analysis.miscellaneous WordDelimiterIterator DEFAULT_WORD_DELIM_TABLE
null DEFAULT_WORD_DELIM_TABLE
To view the source code for org.apache.lucene.analysis.miscellaneous WordDelimiterIterator DEFAULT_WORD_DELIM_TABLE.
Click Source Link
From source file:org.elasticsearch.analysis.common.WordDelimiterGraphTokenFilterFactory.java
License:Apache License
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {/*from w w w . ja va2s. com*/ super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
From source file:org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.java
License:Apache License
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {/* ww w . jav a 2 s . com*/ super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
From source file:org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.java
License:Apache License
/** * parses a list of MappingCharFilter style rules into a custom byte[] type table *///from w w w. jav a2 s. co m static byte[] parseTypes(Collection<String> rules) { SortedMap<Character, Byte> typeMap = new TreeMap<>(); for (String rule : rules) { Matcher m = typePattern.matcher(rule); if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]"); String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) types[mapping.getKey()] = mapping.getValue(); return types; }
From source file:org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory.java
License:Apache License
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {//from www . j a v a2 s.com super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
From source file:org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.java
License:Apache License
@Inject public WordDelimiterTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else {//from w w w .j a v a2 s . co m this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words", version); this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords); this.flags = flags; }
From source file:org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.java
License:Apache License
/** * parses a list of MappingCharFilter style rules into a custom byte[] type table *//* www . j a va 2 s.com*/ private byte[] parseTypes(Collection<String> rules) { SortedMap<Character, Byte> typeMap = new TreeMap<Character, Byte>(); for (String rule : rules) { Matcher m = typePattern.matcher(rule); if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]"); String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) types[mapping.getKey()] = mapping.getValue(); return types; }