List of usage examples for the org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer constructor UnicodeWhitespaceTokenizer()
public UnicodeWhitespaceTokenizer()
From source file: org.voyanttools.trombone.lucene.analysis.LexicalAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName) { if (fieldName.equals(TokenType.lexical.name()) && parameters.getParameterValue("tokenization", "").equals("wordBoundaries")) { Tokenizer tokenizer = new LowerCaseTokenizer(); return new TokenStreamComponents(tokenizer); } else if (fieldName.equals(TokenType.lexical.name()) && parameters.getParameterValue("tokenization", "").equals("whitespace")) { Tokenizer tokenizer = new UnicodeWhitespaceTokenizer(); return new TokenStreamComponents(tokenizer); } else if (lang.startsWith("zh") && fieldName.equals(TokenType.lexical.name())) { // Chinese Tokenizer tokenizer = new HMMChineseTokenizer(); return new TokenStreamComponents(tokenizer, tokenizer); } else if (lang.equals("bo") && fieldName.equals(TokenType.lexical.name())) { // Tibetan Tokenizer tokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang)); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } else if (lang.equals("grc") /* Ancient Greek */ || lang.equals("el") /* Modern Greek */) { Tokenizer tokenizer = new ICUTokenizer(); TokenStream stream = new GreekCustomFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } else { // default case Tokenizer tokenizer = new ICUTokenizer(); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); }//from w w w. j ava2 s. c o m }