Example usage of the org.apache.lucene.analysis.ja.JapaneseTokenizer constructor

List of usage examples for the org.apache.lucene.analysis.ja.JapaneseTokenizer constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.ja.JapaneseTokenizer constructor.

Prototype

public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) 

Source Link

Document

Create a new JapaneseTokenizer.

Usage

From source file:org.elasticsearch.index.analysis.Kuromoji2TokenizerFactory.java

License:Apache License

/**
 * Builds a {@link JapaneseTokenizer} from this factory's user dictionary,
 * punctuation flag and mode, then sets its n-best cost.
 *
 * <p>The cost applied is the configured {@code nBestCost}, raised to the value
 * computed from {@code nBestExamples} when examples are present and that
 * computed value is larger.
 *
 * @return the configured tokenizer
 */
@Override
public Tokenizer create() {
    final JapaneseTokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);

    int effectiveCost = this.nBestCost;
    if (nBestExamples != null) {
        // Only ever raise the configured cost, never lower it.
        final int costFromExamples = tokenizer.calcNBestCost(nBestExamples);
        if (costFromExamples > effectiveCost) {
            effectiveCost = costFromExamples;
        }
    }

    tokenizer.setNBestCost(effectiveCost);
    return tokenizer;
}

From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java

License:Apache License

/**
 * Looks up the "kuromoji_pos" token filter, checks that it is a
 * {@code KuromojiPartOfSpeechFilterFactory}, and compares the filtered output
 * of a SEARCH-mode {@link JapaneseTokenizer} against the expected tokens.
 *
 * NOTE(review): the method name says "base form" but the filter under test is
 * the part-of-speech one - confirm whether the name is intentional.
 * NOTE(review): the string literals below look mis-encoded ('?' and empty
 * strings); presumably they were Japanese text - restore from the upstream source.
 */
@Test
public void testBaseFormFilterFactory() throws IOException {
    final AnalysisService service = createAnalysisService();
    final TokenFilterFactory posFilter = service.tokenFilter("kuromoji_pos");
    assertThat(posFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    final Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader("????"));

    final String[] expectedTokens = { "?", "?", "?", "", "" };
    assertSimpleTSOutput(posFilter.create(japaneseTokenizer), expectedTokens);
}

From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java

License:Apache License

/**
 * Exercises two reading-form token filters, both expected to be
 * {@code KuromojiReadingFormFilterFactory} instances: "kuromoji_rf", checked
 * against romaji readings, and "kuromoji_readingform", checked against
 * katakana readings. Each run uses a fresh SEARCH-mode tokenizer over the
 * same input text.
 *
 * NOTE(review): the input text and the katakana expectations look mis-encoded
 * ('?' and empty strings) - restore from the upstream source.
 */
@Test
public void testReadingFormFilterFactory() throws IOException {
    final AnalysisService service = createAnalysisService();

    // Phase 1: "kuromoji_rf" -> romaji readings.
    final TokenFilterFactory romajiFilter = service.tokenFilter("kuromoji_rf");
    assertThat(romajiFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    final String text = "?????";
    final String[] romajiReadings = { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" };

    final Tokenizer romajiTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    romajiTokenizer.setReader(new StringReader(text));
    assertSimpleTSOutput(romajiFilter.create(romajiTokenizer), romajiReadings);

    // Phase 2: "kuromoji_readingform" -> katakana readings, on a fresh tokenizer.
    final Tokenizer katakanaTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    katakanaTokenizer.setReader(new StringReader(text));
    final String[] katakanaReadings = { "", "?", "?", "", "", "?", "" };

    final TokenFilterFactory katakanaFilter = service.tokenFilter("kuromoji_readingform");
    assertThat(katakanaFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    assertSimpleTSOutput(katakanaFilter.create(katakanaTokenizer), katakanaReadings);
}

From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java

License:Apache License

/**
 * Runs the same input through two katakana stemmer filters, "kuromoji_stemmer"
 * and "kuromoji_ks", both expected to be {@code KuromojiKatakanaStemmerFactory}
 * instances, and compares each filtered token stream with its expected tokens.
 *
 * NOTE(review): the input text, the expected tokens and the original inline
 * comments look mis-encoded (the words the comments referred to are missing);
 * as a result both expectation arrays are currently identical - restore the
 * Japanese text from the upstream source.
 */
@Test
public void testKatakanaStemFilter() throws IOException {
    final AnalysisService service = createAnalysisService();

    final TokenFilterFactory defaultStemmer = service.tokenFilter("kuromoji_stemmer");
    assertThat(defaultStemmer, instanceOf(KuromojiKatakanaStemmerFactory.class));
    final String text = "??????????";

    final Tokenizer firstTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    firstTokenizer.setReader(new StringReader(text));

    // Original comments: one word "should be stemmed by default" and another,
    // tagged "(min len)", "should not be stemmed"; the words themselves were lost.
    final String[] expectedWithDefaults = { "", "", "?", "??", "", "?", "?", "", "?", "", "", "", "?",
            "??", "?" };
    assertSimpleTSOutput(defaultStemmer.create(firstTokenizer), expectedWithDefaults);

    final TokenFilterFactory customStemmer = service.tokenFilter("kuromoji_ks");
    assertThat(customStemmer, instanceOf(KuromojiKatakanaStemmerFactory.class));
    final Tokenizer secondTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    secondTokenizer.setReader(new StringReader(text));

    // Original comments: nothing should be stemmed here "since min len == 6"
    // (that setting is configured outside this method - TODO confirm).
    final String[] expectedWithMinLength = { "", "", "?", "??", "", "?", "?", "", "?", "", "", "", "?",
            "??", "?" };
    assertSimpleTSOutput(customStemmer.create(secondTokenizer), expectedWithMinLength);
}

From source file:org.elasticsearch.index.analysis.KuromojiAnalysisTests.java

License:Apache License

/**
 * Looks up the "ja_stop" token filter, checks that it is a
 * {@code JapaneseStopTokenFilterFactory}, and compares the filtered output of
 * a SEARCH-mode {@link JapaneseTokenizer} against the expected tokens.
 *
 * NOTE(review): the string literals below look mis-encoded ('?' placeholders);
 * presumably they were Japanese text - restore from the upstream source.
 */
@Test
public void testJapaneseStopFilterFactory() throws IOException {
    final AnalysisService service = createAnalysisService();
    final TokenFilterFactory stopFilter = service.tokenFilter("ja_stop");
    assertThat(stopFilter, instanceOf(JapaneseStopTokenFilterFactory.class));

    final Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader("????"));

    final String[] remainingTokens = { "?", "?", "?" };
    assertSimpleTSOutput(stopFilter.create(japaneseTokenizer), remainingTokens);
}

From source file:org.elasticsearch.index.analysis.KuromojiTokenizerFactory.java

License:Apache License

/**
 * Creates a new {@link JapaneseTokenizer} from this factory's user dictionary,
 * punctuation flag and tokenization mode.
 *
 * @return a freshly constructed tokenizer
 */
@Override
public Tokenizer create() {
    final Tokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    return tokenizer;
}

From source file:org.elasticsearch.indices.analysis.KuromojiIndicesAnalysis.java

License:Apache License

/**
 * Registers the pre-built Kuromoji analysis components with the given
 * {@link IndicesAnalysisService}:
 * <ul>
 *   <li>analyzer "kuromoji"</li>
 *   <li>char filter "kuromoji_iteration_mark"</li>
 *   <li>tokenizer "kuromoji_tokenizer"</li>
 *   <li>token filters "kuromoji_baseform", "kuromoji_part_of_speech",
 *       "kuromoji_readingform" and "kuromoji_stemmer"</li>
 * </ul>
 *
 * @param settings               settings passed through to the superclass constructor
 * @param indicesAnalysisService registry the components are added to
 */
@Inject
public KuromojiIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Analyzer backed by a default-constructed JapaneseAnalyzer.
    final PreBuiltAnalyzerProviderFactory kuromojiAnalyzer = new PreBuiltAnalyzerProviderFactory("kuromoji",
            AnalyzerScope.INDICES, new JapaneseAnalyzer());
    indicesAnalysisService.analyzerProviderFactories().put("kuromoji", kuromojiAnalyzer);

    // Char filter using the JapaneseIterationMarkCharFilter default normalization flags.
    final CharFilterFactory iterationMarkFactory = new CharFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_iteration_mark";
        }

        @Override
        public Reader create(Reader reader) {
            final boolean normalizeKanji = JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT;
            final boolean normalizeKana = JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT;
            return new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana);
        }
    };
    indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
            new PreBuiltCharFilterFactoryFactory(iterationMarkFactory));

    // Tokenizer: no user dictionary, punctuation flag set to true, SEARCH mode.
    final TokenizerFactory tokenizerFactory = new TokenizerFactory() {
        @Override
        public String name() {
            return "kuromoji_tokenizer";
        }

        @Override
        public Tokenizer create() {
            return new JapaneseTokenizer(null, true, Mode.SEARCH);
        }
    };
    indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
            new PreBuiltTokenizerFactoryFactory(tokenizerFactory));

    // Token filter wrapping the stream in a JapaneseBaseFormFilter.
    final TokenFilterFactory baseFormFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_baseform";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapaneseBaseFormFilter(tokenStream);
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform",
            new PreBuiltTokenFilterFactoryFactory(baseFormFactory));

    // Token filter using JapaneseAnalyzer's default stop tags.
    final TokenFilterFactory partOfSpeechFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_part_of_speech";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapanesePartOfSpeechStopFilter(tokenStream, JapaneseAnalyzer.getDefaultStopTags());
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_part_of_speech",
            new PreBuiltTokenFilterFactoryFactory(partOfSpeechFactory));

    // Token filter wrapping the stream in a JapaneseReadingFormFilter (second argument true).
    final TokenFilterFactory readingFormFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_readingform";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapaneseReadingFormFilter(tokenStream, true);
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_readingform",
            new PreBuiltTokenFilterFactoryFactory(readingFormFactory));

    // Token filter wrapping the stream in a JapaneseKatakanaStemFilter.
    final TokenFilterFactory stemmerFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_stemmer";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapaneseKatakanaStemFilter(tokenStream);
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer",
            new PreBuiltTokenFilterFactoryFactory(stemmerFactory));
}