Example usage for org.apache.lucene.analysis.cn.smart HMMChineseTokenizer HMMChineseTokenizer

List of usage examples for org.apache.lucene.analysis.cn.smart HMMChineseTokenizer HMMChineseTokenizer

Introduction

On this page you can find an example usage for org.apache.lucene.analysis.cn.smart HMMChineseTokenizer HMMChineseTokenizer.

Prototype

public HMMChineseTokenizer() 

Source Link

Document

Creates a new HMMChineseTokenizer

Usage

From source file: org.carrot2.text.linguistic.lucene.ChineseTokenizerAdapter.java

License: Open Source License

/**
 * Creates the adapter with an empty reusable character buffer and a
 * smartcn HMM-based tokenizer for sentence/word segmentation.
 */
public ChineseTokenizerAdapter() {
    tempCharSequence = new MutableCharArray(new char[0]);
    sentenceTokenizer = new HMMChineseTokenizer();
}

From source file: org.elasticsearch.index.analysis.SmartChineseTokenizerTokenizerFactory.java

License: Apache License

@Override
public Tokenizer create() {
    // Each analysis chain gets its own tokenizer instance.
    final Tokenizer tokenizer = new HMMChineseTokenizer();
    return tokenizer;
}

From source file: org.elasticsearch.indices.analysis.smartcn.SmartChineseIndicesAnalysis.java

License: Apache License

/**
 * Registers the smartcn (Smart Chinese) analysis components with the
 * indices analysis service: the "smartcn" analyzer, the
 * "smartcn_tokenizer" tokenizer, its backwards-compatibility alias
 * "smartcn_sentence", and the no-op "smartcn_word" token filter.
 *
 * @param settings               node settings, forwarded to the superclass
 * @param indicesAnalysisService registry that exposes these pre-built
 *                               components to all indices
 */
@Inject
public SmartChineseIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Register smartcn analyzer
    indicesAnalysisService.analyzerProviderFactories().put("smartcn",
            new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer()));

    // Register smartcn_tokenizer tokenizer
    indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "smartcn_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    // New instance per call; the factory itself is shared.
                    return new HMMChineseTokenizer();
                }
            }));

    // Register smartcn_sentence tokenizer -- for backwards compat an alias to smartcn_tokenizer
    indicesAnalysisService.tokenizerFactories().put("smartcn_sentence",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "smartcn_sentence";
                }

                @Override
                public Tokenizer create() {
                    return new HMMChineseTokenizer();
                }
            }));

    // Register smartcn_word token filter -- noop (returns the stream unchanged),
    // kept only so configurations referencing "smartcn_word" keep working.
    indicesAnalysisService.tokenFilterFactories().put("smartcn_word",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "smartcn_word";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return tokenStream;
                }
            }));
}

From source file: org.voyanttools.trombone.lucene.analysis.LexicalAnalyzer.java

License: Open Source License

/**
 * Builds the analysis chain for a field, selecting the tokenizer (and
 * optional filters) from the requested "tokenization" parameter and the
 * document language held in {@code lang}.
 *
 * Consistency fix: the Chinese branch previously used
 * {@code new TokenStreamComponents(tokenizer, tokenizer)}; the single-arg
 * constructor is equivalent (it delegates to {@code (source, source)}),
 * so it now matches the other tokenizer-only branches.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    if (fieldName.equals(TokenType.lexical.name())
            && parameters.getParameterValue("tokenization", "").equals("wordBoundaries")) {
        // Letter-boundary tokenization with built-in lower-casing.
        Tokenizer tokenizer = new LowerCaseTokenizer();
        return new TokenStreamComponents(tokenizer);
    } else if (fieldName.equals(TokenType.lexical.name())
            && parameters.getParameterValue("tokenization", "").equals("whitespace")) {
        // Split on Unicode whitespace only; no case folding.
        Tokenizer tokenizer = new UnicodeWhitespaceTokenizer();
        return new TokenStreamComponents(tokenizer);
    } else if (lang.startsWith("zh") && fieldName.equals(TokenType.lexical.name())) { // Chinese
        // HMM-based word segmentation for Chinese text.
        Tokenizer tokenizer = new HMMChineseTokenizer();
        return new TokenStreamComponents(tokenizer);
    } else if (lang.equals("bo") && fieldName.equals(TokenType.lexical.name())) { // Tibetan
        Tokenizer tokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang));
        TokenStream stream = new LowerCaseFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
    } else if (lang.equals("grc") /* Ancient Greek */ || lang.equals("el") /* Modern Greek */) {
        // Project-specific Greek normalization filter -- presumably handles
        // Greek-specific casing/diacritics; see GreekCustomFilter.
        Tokenizer tokenizer = new ICUTokenizer();
        TokenStream stream = new GreekCustomFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
    } else { // default case: ICU segmentation + lower-casing
        Tokenizer tokenizer = new ICUTokenizer();
        TokenStream stream = new LowerCaseFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
    }
}