Usage examples for the org.apache.lucene.analysis.util.CharTokenizer constructor
public CharTokenizer()
From source file:org.elasticsearch.analysis.common.CharGroupTokenizerFactory.java
License:Apache License
@Override public Tokenizer create() { return new CharTokenizer() { @Override//from w w w. ja v a 2 s. c o m protected boolean isTokenChar(int c) { if (tokenizeOnSpace && Character.isWhitespace(c)) { return false; } if (tokenizeOnLetter && Character.isLetter(c)) { return false; } if (tokenizeOnDigit && Character.isDigit(c)) { return false; } if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) { return false; } if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) { return false; } return !tokenizeOnChars.contains(c); } }; }
From source file:org.python.pydev.shared_core.index.CodeAnalyzer.java
License:Open Source License
public static TokenStreamComponents createDefaultComponents(String... ignoreWords) { Tokenizer src = new CharTokenizer() { @Override//w ww. j a v a 2s . c o m protected boolean isTokenChar(int c) { return Character.isJavaIdentifierPart(c); } @Override protected int normalize(int c) { return Character.toLowerCase(c); } }; TokenFilter tok = new LowerCaseFilter(src); CharArraySet stopWords = StopFilter.makeStopSet(ignoreWords); tok = new StopFilter(tok, stopWords); TokenStreamComponents tokenStreamComponents = new TokenStreamComponents(src, tok); return tokenStreamComponents; }