List of usage examples for the org.apache.lucene.analysis.ja.JapaneseTokenizer constructor:
public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode)
From source file:org.elasticsearch.index.analysis.Kuromoji2TokenizerFactory.java
License:Apache License
/**
 * Builds a {@link JapaneseTokenizer} from this factory's user dictionary,
 * punctuation flag and mode, then configures its n-best cost.
 *
 * <p>The cost applied is the configured {@code nBestCost}; when
 * {@code nBestExamples} is non-null it is raised to the value the tokenizer
 * calculates from those examples, if that value is larger.
 *
 * @return the configured tokenizer
 */
@Override
public Tokenizer create() {
    final JapaneseTokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    // Never go below the explicitly configured cost; examples can only raise it.
    final int effectiveCost = (nBestExamples == null)
            ? this.nBestCost
            : Math.max(this.nBestCost, tokenizer.calcNBestCost(nBestExamples));
    tokenizer.setNBestCost(effectiveCost);
    return tokenizer;
}
From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java
License:Apache License
/**
 * Looks up the "kuromoji_pos" token filter, checks that it is a
 * {@link KuromojiPartOfSpeechFilterFactory}, and verifies the tokens it
 * produces on top of a SEARCH-mode {@link JapaneseTokenizer}.
 *
 * <p>NOTE(review): the method name mentions the base-form filter, but the
 * filter exercised here is the part-of-speech one. The string literals below
 * also look like they lost their non-ASCII characters somewhere upstream --
 * confirm against the original test data.
 */
@Test
public void testBaseFormFilterFactory() throws IOException {
    final AnalysisService service = createAnalysisService();
    final TokenFilterFactory posFilter = service.tokenFilter("kuromoji_pos");
    assertThat(posFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    final String input = "????";
    final String[] expectedTokens = { "?", "?", "?", "", "" };

    final Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader(input));

    assertSimpleTSOutput(posFilter.create(japaneseTokenizer), expectedTokens);
}
From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java
License:Apache License
/**
 * Exercises the two reading-form token filters, "kuromoji_rf" and
 * "kuromoji_readingform". Both must be
 * {@link KuromojiReadingFormFilterFactory} instances; each is run over the
 * same source text with a fresh SEARCH-mode {@link JapaneseTokenizer}.
 *
 * <p>NOTE(review): the source text and the second expectation array look
 * like they lost their non-ASCII characters -- confirm against the original
 * test data.
 */
@Test
public void testReadingFormFilterFactory() throws IOException {
    final AnalysisService service = createAnalysisService();

    // First pass: "kuromoji_rf", checked against the romaji expectations.
    final TokenFilterFactory romajiFilter = service.tokenFilter("kuromoji_rf");
    assertThat(romajiFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    final String input = "?????";
    final String[] romajiTokens = { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" };
    final Tokenizer romajiTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    romajiTokenizer.setReader(new StringReader(input));
    assertSimpleTSOutput(romajiFilter.create(romajiTokenizer), romajiTokens);

    // Second pass: "kuromoji_readingform", checked against the katakana expectations.
    final Tokenizer katakanaTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    katakanaTokenizer.setReader(new StringReader(input));
    final String[] katakanaTokens = { "", "?", "?", "", "", "?", "" };
    final TokenFilterFactory katakanaFilter = service.tokenFilter("kuromoji_readingform");
    assertThat(katakanaFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    assertSimpleTSOutput(katakanaFilter.create(katakanaTokenizer), katakanaTokens);
}
From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java
License:Apache License
@Test public void testKatakanaStemFilter() throws IOException { AnalysisService analysisService = createAnalysisService(); TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_stemmer"); assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class)); String source = "??????????"; Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); // should be stemmed by default // (min len) should not be stemmed String[] expected_tokens_katakana = new String[] { "", "", "?", "??", "", "?", "?", "", "?", "", "", "", "?", "??", "?" }; assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); tokenFilter = analysisService.tokenFilter("kuromoji_ks"); assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class)); tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); // should not be stemmed since min len == 6 // should not be stemmed expected_tokens_katakana = new String[] { "", "", "?", "??", "", "?", "?", "", "?", "", "", "", "?", "??", "?" }; assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); }
From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java
License:Apache License
/**
 * Looks up the "ja_stop" token filter, checks that it is a
 * {@link JapaneseStopTokenFilterFactory}, and verifies the tokens it emits
 * on top of a SEARCH-mode {@link JapaneseTokenizer}.
 *
 * <p>NOTE(review): the string literals look like they lost their non-ASCII
 * characters -- confirm against the original test data.
 */
@Test
public void testJapaneseStopFilterFactory() throws IOException {
    final AnalysisService service = createAnalysisService();
    final TokenFilterFactory stopFilter = service.tokenFilter("ja_stop");
    assertThat(stopFilter, instanceOf(JapaneseStopTokenFilterFactory.class));

    final String input = "????";
    final String[] expectedTokens = { "?", "?", "?" };

    final Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader(input));

    assertSimpleTSOutput(stopFilter.create(japaneseTokenizer), expectedTokens);
}
From source file:org.elasticsearch.index.analysis.KuromojiTokenizerFactory.java
License:Apache License
/**
 * Creates a new {@link JapaneseTokenizer} using this factory's user
 * dictionary, punctuation flag and tokenization mode.
 *
 * @return a freshly constructed tokenizer
 */
@Override
public Tokenizer create() {
    final Tokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    return tokenizer;
}
From source file:org.elasticsearch.indices.analysis.KuromojiIndicesAnalysis.java
License:Apache License
@Inject public KuromojiIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) { super(settings); indicesAnalysisService.analyzerProviderFactories().put("kuromoji", new PreBuiltAnalyzerProviderFactory("kuromoji", AnalyzerScope.INDICES, new JapaneseAnalyzer())); indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { @Override// ww w.ja va2 s . co m public String name() { return "kuromoji_iteration_mark"; } @Override public Reader create(Reader reader) { return new JapaneseIterationMarkCharFilter(reader, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT); } })); indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { @Override public String name() { return "kuromoji_tokenizer"; } @Override public Tokenizer create() { return new JapaneseTokenizer(null, true, Mode.SEARCH); } })); indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { return "kuromoji_baseform"; } @Override public TokenStream create(TokenStream tokenStream) { return new JapaneseBaseFormFilter(tokenStream); } })); indicesAnalysisService.tokenFilterFactories().put("kuromoji_part_of_speech", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { return "kuromoji_part_of_speech"; } @Override public TokenStream create(TokenStream tokenStream) { return new JapanesePartOfSpeechStopFilter(tokenStream, JapaneseAnalyzer.getDefaultStopTags()); } })); indicesAnalysisService.tokenFilterFactories().put("kuromoji_readingform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { return "kuromoji_readingform"; } @Override public TokenStream create(TokenStream tokenStream) { 
return new JapaneseReadingFormFilter(tokenStream, true); } })); indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { return "kuromoji_stemmer"; } @Override public TokenStream create(TokenStream tokenStream) { return new JapaneseKatakanaStemFilter(tokenStream); } })); }