Example usage for org.apache.lucene.analysis.core UnicodeWhitespaceTokenizer UnicodeWhitespaceTokenizer

List of usage examples for org.apache.lucene.analysis.core UnicodeWhitespaceTokenizer UnicodeWhitespaceTokenizer

Introduction

On this page you can find an example usage of org.apache.lucene.analysis.core UnicodeWhitespaceTokenizer UnicodeWhitespaceTokenizer.

Prototype

public UnicodeWhitespaceTokenizer() 

Source Link

Document

Construct a new UnicodeWhitespaceTokenizer.

Usage

From source file:org.voyanttools.trombone.lucene.analysis.LexicalAnalyzer.java

License:Open Source License

/**
 * Builds the analysis chain for the given field, choosing a tokenizer by
 * field type, the caller-supplied "tokenization" parameter, and the
 * document language ({@code lang}).
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // The lexical field honours the explicit "tokenization" parameter and
    // language-specific tokenizers; other fields fall through to the
    // Greek/default handling below.
    final boolean isLexicalField = fieldName.equals(TokenType.lexical.name());

    if (isLexicalField && parameters.getParameterValue("tokenization", "").equals("wordBoundaries")) {
        // Explicit request: split on word boundaries, lower-casing as it tokenizes.
        Tokenizer wordBoundaryTokenizer = new LowerCaseTokenizer();
        return new TokenStreamComponents(wordBoundaryTokenizer);
    }
    if (isLexicalField && parameters.getParameterValue("tokenization", "").equals("whitespace")) {
        // Explicit request: split on Unicode whitespace only, no case folding.
        Tokenizer whitespaceTokenizer = new UnicodeWhitespaceTokenizer();
        return new TokenStreamComponents(whitespaceTokenizer);
    }
    if (isLexicalField && lang.startsWith("zh")) {
        // Chinese: HMM-based segmentation (no whitespace between words).
        Tokenizer chineseTokenizer = new HMMChineseTokenizer();
        return new TokenStreamComponents(chineseTokenizer, chineseTokenizer);
    }
    if (isLexicalField && lang.equals("bo")) {
        // Tibetan: ICU tokenizer with a Trombone-specific configuration,
        // followed by lower-casing.
        Tokenizer tibetanTokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang));
        TokenStream tibetanStream = new LowerCaseFilter(tibetanTokenizer);
        return new TokenStreamComponents(tibetanTokenizer, tibetanStream);
    }
    if (lang.equals("grc") || lang.equals("el")) {
        // Ancient ("grc") or Modern ("el") Greek: Greek-aware filtering is
        // applied for every field, not only the lexical one.
        Tokenizer greekTokenizer = new ICUTokenizer();
        TokenStream greekStream = new GreekCustomFilter(greekTokenizer);
        return new TokenStreamComponents(greekTokenizer, greekStream);
    }
    // Default: generic ICU tokenization followed by lower-casing.
    Tokenizer defaultTokenizer = new ICUTokenizer();
    TokenStream defaultStream = new LowerCaseFilter(defaultTokenizer);
    return new TokenStreamComponents(defaultTokenizer, defaultStream);
}