List of usage examples for org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer — constructor usage
public HMMChineseTokenizer()
From source file: org.carrot2.text.linguistic.lucene.ChineseTokenizerAdapter.java
License: Open Source License
/**
 * Creates the adapter with an HMM-based Chinese sentence tokenizer and an
 * initially empty, reusable character buffer.
 */
public ChineseTokenizerAdapter() {
    // Lucene smartcn tokenizer used for sentence/word segmentation.
    this.sentenceTokenizer = new HMMChineseTokenizer();
    // Reusable scratch buffer; starts empty and is grown on demand.
    this.tempCharSequence = new MutableCharArray(new char[0]);
}
From source file: org.elasticsearch.index.analysis.SmartChineseTokenizerTokenizerFactory.java
License: Apache License
/** Returns a fresh HMM-based Chinese tokenizer for each analysis chain. */
@Override
public Tokenizer create() {
    final Tokenizer tokenizer = new HMMChineseTokenizer();
    return tokenizer;
}
From source file: org.elasticsearch.indices.analysis.smartcn.SmartChineseIndicesAnalysis.java
License: Apache License
@Inject public SmartChineseIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) { super(settings); // Register smartcn analyzer indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer())); // Register smartcn_tokenizer tokenizer indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { @Override//from w w w. j a va2s . co m public String name() { return "smartcn_tokenizer"; } @Override public Tokenizer create() { return new HMMChineseTokenizer(); } })); // Register smartcn_sentence tokenizer -- for backwards compat an alias to smartcn_tokenizer indicesAnalysisService.tokenizerFactories().put("smartcn_sentence", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { @Override public String name() { return "smartcn_sentence"; } @Override public Tokenizer create() { return new HMMChineseTokenizer(); } })); // Register smartcn_word token filter -- noop indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { return "smartcn_word"; } @Override public TokenStream create(TokenStream tokenStream) { return tokenStream; } })); }
From source file: org.voyanttools.trombone.lucene.analysis.LexicalAnalyzer.java
License: Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName) { if (fieldName.equals(TokenType.lexical.name()) && parameters.getParameterValue("tokenization", "").equals("wordBoundaries")) { Tokenizer tokenizer = new LowerCaseTokenizer(); return new TokenStreamComponents(tokenizer); } else if (fieldName.equals(TokenType.lexical.name()) && parameters.getParameterValue("tokenization", "").equals("whitespace")) { Tokenizer tokenizer = new UnicodeWhitespaceTokenizer(); return new TokenStreamComponents(tokenizer); } else if (lang.startsWith("zh") && fieldName.equals(TokenType.lexical.name())) { // Chinese Tokenizer tokenizer = new HMMChineseTokenizer(); return new TokenStreamComponents(tokenizer, tokenizer); } else if (lang.equals("bo") && fieldName.equals(TokenType.lexical.name())) { // Tibetan Tokenizer tokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang)); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } else if (lang.equals("grc") /* Ancient Greek */ || lang.equals("el") /* Modern Greek */) { Tokenizer tokenizer = new ICUTokenizer(); TokenStream stream = new GreekCustomFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } else { // default case Tokenizer tokenizer = new ICUTokenizer(); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); }/*from w w w . j a v a 2 s.c o m*/ }