Example usage for the org.apache.lucene.analysis.util.CharTokenizer constructor CharTokenizer()

List of usage examples for the org.apache.lucene.analysis.util.CharTokenizer constructor CharTokenizer()

Introduction

On this page you can find example usages of the CharTokenizer() constructor of org.apache.lucene.analysis.util.CharTokenizer.

Prototype

public CharTokenizer() 

Source Link

Document

Creates a new CharTokenizer instance

Usage

From source file:org.elasticsearch.analysis.common.CharGroupTokenizerFactory.java

License:Apache License

/**
 * Builds a tokenizer whose token boundaries are driven by the factory's
 * {@code tokenizeOn*} settings.
 *
 * <p>A code point is treated as a separator (i.e. not part of a token) when
 * it falls into any enabled character category or is contained in
 * {@code tokenizeOnChars}; every other code point is a token character.
 *
 * @return a new {@link CharTokenizer} applying the rule above
 */
@Override
public Tokenizer create() {
    return new CharTokenizer() {
        @Override
        protected boolean isTokenChar(int c) {
            // Checks are evaluated left to right and short-circuit on the
            // first match: category flags first, then the explicit set.
            final boolean isSeparator =
                (tokenizeOnSpace && Character.isWhitespace(c))
                    || (tokenizeOnLetter && Character.isLetter(c))
                    || (tokenizeOnDigit && Character.isDigit(c))
                    || (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c))
                    || (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c))
                    || tokenizeOnChars.contains(c);
            return !isSeparator;
        }
    };
}

From source file:org.python.pydev.shared_core.index.CodeAnalyzer.java

License:Open Source License

/**
 * Assembles the default analysis chain: a tokenizer that keeps runs of
 * Java-identifier characters, followed by a lower-case filter and a stop
 * filter built from the given words.
 *
 * @param ignoreWords words handed to {@code StopFilter.makeStopSet} to build the stop set
 * @return components pairing the tokenizer with the final filter of the chain
 */
public static TokenStreamComponents createDefaultComponents(String... ignoreWords) {
    final Tokenizer identifierTokenizer = new CharTokenizer() {

        @Override
        protected boolean isTokenChar(int c) {
            return Character.isJavaIdentifierPart(c);
        }

        // NOTE(review): lower-casing happens here and again in the
        // LowerCaseFilter below; this looks redundant - confirm before removing either.
        @Override
        protected int normalize(int c) {
            return Character.toLowerCase(c);
        }
    };

    final TokenFilter lowerCased = new LowerCaseFilter(identifierTokenizer);
    final CharArraySet ignoredWordSet = StopFilter.makeStopSet(ignoreWords);
    final TokenFilter withoutIgnoredWords = new StopFilter(lowerCased, ignoredWordSet);

    return new TokenStreamComponents(identifierTokenizer, withoutIgnoredWords);
}