Example usage for org.apache.lucene.analysis.ja.JapaneseAnalyzer.getDefaultStopTags()

List of usage examples for org.apache.lucene.analysis.ja.JapaneseAnalyzer.getDefaultStopTags()

Introduction

On this page you can find example usages of org.apache.lucene.analysis.ja.JapaneseAnalyzer.getDefaultStopTags().

Prototype

public static Set<String> getDefaultStopTags() 

Source Link

Usage

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Creates the parser and its {@code JapaneseAnalyzer}.
 *
 * <p>The analyzer is always built in {@code JapaneseTokenizer.Mode.NORMAL} with {@code null}
 * as its first constructor argument; only the stop-word set and the stop-tag set depend on
 * the flag.
 *
 * @param ignoreDefaultWordSet when {@code true}, the analyzer receives
 *        {@code JapaneseAnalyzer.getDefaultStopSet()} and
 *        {@code JapaneseAnalyzer.getDefaultStopTags()}; when {@code false}, it receives an
 *        empty {@code CharArraySet} and an empty {@code HashSet}
 * @throws IOException declared by this constructor
 */
public LuceneParser(boolean ignoreDefaultWordSet) throws IOException {
    // NOTE(review): the parameter name suggests the defaults are skipped when true, yet the
    // defaults are what gets installed in that case. Possibly "ignore" refers to tokens being
    // filtered out rather than the set being unused -- confirm against callers.
    final CharArraySet stopSet;
    final Set<String> stopTags;
    if (ignoreDefaultWordSet) {
        stopSet = JapaneseAnalyzer.getDefaultStopSet();
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        stopSet = new CharArraySet(new ArrayList<String>(), true);
        stopTags = new HashSet<String>();
    }
    analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, stopSet, stopTags);
}

From source file:hivemall.nlp.tokenizer.KuromojiUDF.java

License:Apache License

/**
 * Validates the argument count and captures the optional tokenizer settings.
 *
 * <p>Between one and four arguments are accepted. The second, third and fourth arguments,
 * when present, supply the tokenization mode, the stop-word array and the stop tags
 * respectively; otherwise {@code Mode.NORMAL}, {@code null} and
 * {@code JapaneseAnalyzer.getDefaultStopTags()} are used. The cached analyzer reference is
 * reset to {@code null}.
 *
 * @param arguments inspectors for the call arguments
 * @return a standard list object inspector over writable strings
 * @throws UDFArgumentException if fewer than one or more than four arguments are supplied
 */
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    final int argCount = arguments.length;
    if (argCount < 1 || argCount > 4) {
        throw new UDFArgumentException("Invalid number of arguments for `tokenize_ja`: " + argCount);
    }

    // Optional 2nd argument: tokenization mode.
    if (argCount >= 2) {
        this._mode = tokenizationMode(arguments[1]);
    } else {
        this._mode = Mode.NORMAL;
    }

    // Optional 3rd argument: constant array of stop words.
    if (argCount >= 3) {
        this._stopWordsArray = HiveUtils.getConstStringArray(arguments[2]);
    } else {
        this._stopWordsArray = null;
    }

    // Optional 4th argument: part-of-speech stop tags.
    if (argCount >= 4) {
        this._stoptags = stopTags(arguments[3]);
    } else {
        this._stoptags = JapaneseAnalyzer.getDefaultStopTags();
    }

    this._analyzer = null;

    return ObjectInspectorFactory
            .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}

From source file:hivemall.nlp.tokenizer.KuromojiUDF.java

License:Apache License

/**
 * Resolves the stop-tag set from a constant string-array argument.
 *
 * @param oi inspector from which the constant string array is read
 * @return {@code JapaneseAnalyzer.getDefaultStopTags()} when the array is {@code null}, an
 *         empty set when the array has no elements, otherwise a new set holding every
 *         non-null element of the array
 * @throws UDFArgumentException declared by this method
 */
@Nonnull
private static Set<String> stopTags(@Nonnull final ObjectInspector oi) throws UDFArgumentException {
    final String[] tags = HiveUtils.getConstStringArray(oi);
    if (tags == null) {
        return JapaneseAnalyzer.getDefaultStopTags();
    }
    if (tags.length == 0) {
        return Collections.emptySet();
    }

    final Set<String> nonNullTags = new HashSet<String>(tags.length);
    for (final String tag : tags) {
        if (tag == null) {
            // Null entries are skipped rather than stored in the set.
            continue;
        }
        nonNullTags.add(tag);
    }
    return nonNullTags;
}

From source file:org.elasticsearch.index.analysis.Kuromoji2AnalyzerProvider.java

License:Apache License

/**
 * Builds a {@code JapaneseAnalyzer} from the given settings.
 *
 * <p>Stop words come from {@code Analysis.parseStopWords} with
 * {@code JapaneseAnalyzer.getDefaultStopSet()} passed as the default; the tokenizer mode and
 * user dictionary are obtained from {@code Kuromoji2TokenizerFactory}. The stop tags are
 * always {@code JapaneseAnalyzer.getDefaultStopTags()}.
 *
 * @param indexSettings forwarded to the superclass constructor
 * @param env           environment passed to the stop-word and user-dictionary lookups
 * @param name          forwarded to the superclass constructor
 * @param settings      settings the stop words, mode and user dictionary are read from
 */
public Kuromoji2AnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> configuredStopWords = Analysis.parseStopWords(env, settings,
            JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = Kuromoji2TokenizerFactory.getMode(settings);
    final UserDictionary dictionary = Kuromoji2TokenizerFactory.getUserDictionary(env, settings);

    // The analyzer is handed a copy of the parsed stop words, not the parsed set itself.
    final CharArraySet stopWordSet = CharArraySet.copy(configuredStopWords);
    final Set<String> stopTags = JapaneseAnalyzer.getDefaultStopTags();
    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopWordSet, stopTags);
}

From source file:org.elasticsearch.index.analysis.KuromojiAnalyzerProvider.java

License:Apache License

/**
 * Builds a {@code JapaneseAnalyzer} from the given settings.
 *
 * <p>Stop words come from {@code Analysis.parseStopWords} with
 * {@code JapaneseAnalyzer.getDefaultStopSet()} passed as the default; the tokenizer mode and
 * user dictionary are obtained from {@code KuromojiTokenizerFactory}. The stop tags are
 * always {@code JapaneseAnalyzer.getDefaultStopTags()}.
 *
 * @param index         forwarded to the superclass constructor
 * @param indexSettings forwarded to the superclass constructor
 * @param env           environment passed to the stop-word and user-dictionary lookups
 * @param name          forwarded to the superclass constructor
 * @param settings      settings the stop words, mode and user dictionary are read from
 */
@Inject
public KuromojiAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env,
        @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);
    final Set<?> configuredStopWords = Analysis.parseStopWords(env, settings,
            JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);

    // The analyzer is handed a copy of the parsed stop words, not the parsed set itself.
    final CharArraySet stopWordSet = CharArraySet.copy(configuredStopWords);
    final Set<String> stopTags = JapaneseAnalyzer.getDefaultStopTags();
    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopWordSet, stopTags);
}

From source file:org.elasticsearch.indices.analysis.KuromojiIndicesAnalysis.java

License:Apache License

/**
 * Registers the pre-built Kuromoji analysis components with the indices analysis service.
 *
 * <p>One analyzer ({@code kuromoji}), one char filter ({@code kuromoji_iteration_mark}), one
 * tokenizer ({@code kuromoji_tokenizer}) and four token filters ({@code kuromoji_baseform},
 * {@code kuromoji_part_of_speech}, {@code kuromoji_readingform}, {@code kuromoji_stemmer})
 * are put into the corresponding factory maps. Each factory reports the same name it is
 * registered under.
 *
 * @param settings               forwarded to the superclass constructor
 * @param indicesAnalysisService service whose factory maps receive the registrations
 */
@Inject
public KuromojiIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Each name is declared once and shared by the map key and the factory's name().
    final String analyzerName = "kuromoji";
    final String iterationMarkName = "kuromoji_iteration_mark";
    final String tokenizerName = "kuromoji_tokenizer";
    final String baseFormName = "kuromoji_baseform";
    final String partOfSpeechName = "kuromoji_part_of_speech";
    final String readingFormName = "kuromoji_readingform";
    final String stemmerName = "kuromoji_stemmer";

    // Analyzer: a JapaneseAnalyzer built with its no-argument constructor.
    indicesAnalysisService.analyzerProviderFactories().put(analyzerName,
            new PreBuiltAnalyzerProviderFactory(analyzerName, AnalyzerScope.INDICES, new JapaneseAnalyzer()));

    // Char filter: iteration-mark filter using the filter's own kanji/kana default flags.
    indicesAnalysisService.charFilterFactories().put(iterationMarkName,
            new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
                @Override
                public String name() {
                    return iterationMarkName;
                }

                @Override
                public Reader create(Reader reader) {
                    return new JapaneseIterationMarkCharFilter(reader,
                            JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
                            JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
                }
            }));

    // Tokenizer: JapaneseTokenizer created with (null, true, Mode.SEARCH).
    indicesAnalysisService.tokenizerFactories().put(tokenizerName,
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return tokenizerName;
                }

                @Override
                public Tokenizer create() {
                    return new JapaneseTokenizer(null, true, Mode.SEARCH);
                }
            }));

    // Token filter: base form.
    indicesAnalysisService.tokenFilterFactories().put(baseFormName,
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return baseFormName;
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseBaseFormFilter(tokenStream);
                }
            }));

    // Token filter: part-of-speech stop filter driven by the default stop tags.
    indicesAnalysisService.tokenFilterFactories().put(partOfSpeechName,
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return partOfSpeechName;
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapanesePartOfSpeechStopFilter(tokenStream,
                            JapaneseAnalyzer.getDefaultStopTags());
                }
            }));

    // Token filter: reading form, constructed with its boolean flag set to true.
    indicesAnalysisService.tokenFilterFactories().put(readingFormName,
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return readingFormName;
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseReadingFormFilter(tokenStream, true);
                }
            }));

    // Token filter: katakana stemmer.
    indicesAnalysisService.tokenFilterFactories().put(stemmerName,
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return stemmerName;
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseKatakanaStemFilter(tokenStream);
                }
            }));
}

From source file:org.omegat.tokenizer.LuceneJapaneseTokenizer.java

License:Open Source License

/**
 * Produces the token stream for the given text.
 *
 * <p>When stemming is allowed, tags in the text are blanked out and the text is run through
 * a {@code JapaneseAnalyzer} in {@code Mode.SEARCH}. That analyzer uses the default stop
 * words and stop tags if {@code stopWordsAllowed} is {@code true}, and empty sets otherwise.
 * When stemming is not allowed, the text is tokenized with a {@code JapaneseTokenizer} in
 * {@code Mode.NORMAL} and wrapped in a {@code TagJoiningFilter}.
 *
 * @param strOrig          text to tokenize
 * @param stemsAllowed     selects the analyzer path ({@code true}) or the plain tokenizer
 *                         path ({@code false})
 * @param stopWordsAllowed on the analyzer path, whether the default stop words and stop tags
 *                         are handed to the analyzer; not consulted on the tokenizer path
 * @return the resulting token stream
 */
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) {
    if (stemsAllowed) {
        // Blank out tags when stemming only
        strOrig = blankOutTags(strOrig);
        CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet()
                : new CharArraySet(getBehavior(), 0, false);
        // Typed empty set instead of the raw Collections.EMPTY_SET, which needed an unchecked
        // conversion to Set<String>. The explicit type witness keeps the conditional
        // expression well-typed on compilers without target typing for ternaries.
        Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags()
                : Collections.<String>emptySet();
        return new JapaneseAnalyzer(getBehavior(), null, Mode.SEARCH, stopWords, stopTags).tokenStream("",
                new StringReader(strOrig));
    } else {
        return new TagJoiningFilter(new JapaneseTokenizer(new StringReader(strOrig), null, false, Mode.NORMAL));
    }
}