List of usage examples for org.apache.lucene.analysis.ja.JapaneseAnalyzer#getDefaultStopTags()
public static Set<String> getDefaultStopTags()
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
/**
 * Creates the parser and its {@link JapaneseAnalyzer}, using {@code JapaneseTokenizer.Mode.NORMAL}
 * and no user dictionary ({@code null}).
 *
 * <p>NOTE(review): when the flag is {@code true} the analyzer receives Lucene's default stop
 * words and stop tags; when {@code false} it receives empty sets. Confirm that this matches the
 * intended meaning of the parameter name.
 *
 * @param ignoreDefaultWordSet {@code true} to configure the analyzer with the default stop set
 *     and default stop tags, {@code false} to configure it with empty ones
 * @throws IOException declared by the signature; the visible code does not show which call
 *     raises it
 */
public LuceneParser(boolean ignoreDefaultWordSet) throws IOException {
    final CharArraySet stopSet;
    final Set<String> stopTags;
    if (ignoreDefaultWordSet) {
        stopSet = JapaneseAnalyzer.getDefaultStopSet();
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        stopSet = new CharArraySet(new ArrayList<String>(), true);
        stopTags = new HashSet<String>();
    }
    analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, stopSet, stopTags);
}
From source file:hivemall.nlp.tokenizer.KuromojiUDF.java
License:Apache License
@Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { final int arglen = arguments.length; if (arglen < 1 || arglen > 4) { throw new UDFArgumentException("Invalid number of arguments for `tokenize_ja`: " + arglen); }//from w w w. j av a 2 s. co m this._mode = (arglen >= 2) ? tokenizationMode(arguments[1]) : Mode.NORMAL; this._stopWordsArray = (arglen >= 3) ? HiveUtils.getConstStringArray(arguments[2]) : null; this._stoptags = (arglen >= 4) ? stopTags(arguments[3]) : JapaneseAnalyzer.getDefaultStopTags(); this._analyzer = null; return ObjectInspectorFactory .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); }
From source file:hivemall.nlp.tokenizer.KuromojiUDF.java
License:Apache License
/**
 * Resolves the stop-tag set from a constant string-array argument.
 *
 * @param oi inspector from which the constant string array is read
 * @return the default stop tags when the array is {@code null}; an empty set when the array has
 *     no elements; otherwise a new set holding every non-null element of the array
 * @throws UDFArgumentException declared by the signature; presumably raised by
 *     {@code HiveUtils.getConstStringArray} for unsuitable inspectors (confirm)
 */
@Nonnull
private static Set<String> stopTags(@Nonnull final ObjectInspector oi) throws UDFArgumentException {
    final String[] tags = HiveUtils.getConstStringArray(oi);
    if (tags == null) {
        return JapaneseAnalyzer.getDefaultStopTags();
    }
    if (tags.length == 0) {
        return Collections.emptySet();
    }

    final Set<String> nonNullTags = new HashSet<String>(tags.length);
    for (final String tag : tags) {
        if (tag == null) {
            continue; // null entries are skipped
        }
        nonNullTags.add(tag);
    }
    return nonNullTags;
}
From source file:org.elasticsearch.index.analysis.Kuromoji2AnalyzerProvider.java
License:Apache License
/**
 * Builds the provider's {@link JapaneseAnalyzer} from index settings.
 *
 * <p>Stop words are parsed from {@code settings}, with the default Japanese stop set passed to
 * the parser as the default. Mode and user dictionary come from
 * {@code Kuromoji2TokenizerFactory}. Stop tags are always the defaults.
 *
 * @param indexSettings forwarded to the superclass constructor
 * @param env environment passed to the stop-word and user-dictionary lookups
 * @param name forwarded to the superclass constructor
 * @param settings analyzer settings
 */
public Kuromoji2AnalyzerProvider(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    final Set<?> configuredStopWords =
            Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = Kuromoji2TokenizerFactory.getMode(settings);
    final UserDictionary dictionary = Kuromoji2TokenizerFactory.getUserDictionary(env, settings);

    // The analyzer gets its own copy of the parsed stop words.
    final CharArraySet stopSet = CharArraySet.copy(configuredStopWords);
    final Set<String> stopTags = JapaneseAnalyzer.getDefaultStopTags();

    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopSet, stopTags);
}
From source file:org.elasticsearch.index.analysis.KuromojiAnalyzerProvider.java
License:Apache License
/**
 * Builds the provider's {@link JapaneseAnalyzer} from index settings.
 *
 * <p>Stop words are parsed from {@code settings}, with the default Japanese stop set passed to
 * the parser as the default. Mode and user dictionary come from
 * {@code KuromojiTokenizerFactory}. Stop tags are always the defaults.
 *
 * @param index forwarded to the superclass constructor
 * @param indexSettings forwarded to the superclass constructor
 * @param env environment passed to the stop-word and user-dictionary lookups
 * @param name forwarded to the superclass constructor
 * @param settings analyzer settings
 */
@Inject
public KuromojiAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env,
        @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);

    final Set<?> configuredStopWords =
            Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);

    // The analyzer gets its own copy of the parsed stop words.
    final CharArraySet stopSet = CharArraySet.copy(configuredStopWords);
    final Set<String> stopTags = JapaneseAnalyzer.getDefaultStopTags();

    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopSet, stopTags);
}
From source file:org.elasticsearch.indices.analysis.KuromojiIndicesAnalysis.java
License:Apache License
/**
 * Registers pre-built Kuromoji components with the given {@code IndicesAnalysisService}: one
 * analyzer, one char filter, one tokenizer and four token filters, each under a
 * {@code kuromoji*} name.
 *
 * @param settings forwarded to the superclass constructor
 * @param indicesAnalysisService service whose factory maps receive the registrations
 */
@Inject
public KuromojiIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Analyzer: a default-constructed JapaneseAnalyzer.
    indicesAnalysisService.analyzerProviderFactories().put("kuromoji",
            new PreBuiltAnalyzerProviderFactory("kuromoji", AnalyzerScope.INDICES,
                    new JapaneseAnalyzer()));

    // Char filter: iteration-mark normalization using the filter's own default flags.
    indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
            new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_iteration_mark";
                }

                @Override
                public Reader create(Reader reader) {
                    return new JapaneseIterationMarkCharFilter(reader,
                            JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
                            JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
                }
            }));

    // Tokenizer: SEARCH mode with no user dictionary (null).
    indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "kuromoji_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    return new JapaneseTokenizer(null, true, Mode.SEARCH);
                }
            }));

    // Token filter: base-form filter.
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_baseform";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseBaseFormFilter(tokenStream);
                }
            }));

    // Token filter: part-of-speech stop filter driven by the default stop tags.
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_part_of_speech",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_part_of_speech";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapanesePartOfSpeechStopFilter(tokenStream,
                            JapaneseAnalyzer.getDefaultStopTags());
                }
            }));

    // Token filter: reading-form filter, constructed with its boolean flag set to true.
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_readingform",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_readingform";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseReadingFormFilter(tokenStream, true);
                }
            }));

    // Token filter: katakana stem filter.
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_stemmer";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseKatakanaStemFilter(tokenStream);
                }
            }));
}
From source file:org.omegat.tokenizer.LuceneJapaneseTokenizer.java
License:Open Source License
@Override protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) { if (stemsAllowed) { // Blank out tags when stemming only strOrig = blankOutTags(strOrig); CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet() : new CharArraySet(getBehavior(), 0, false); Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags() : Collections.EMPTY_SET; return new JapaneseAnalyzer(getBehavior(), null, Mode.SEARCH, stopWords, stopTags).tokenStream("", new StringReader(strOrig)); } else {//from w w w . j a v a 2 s . c o m return new TagJoiningFilter(new JapaneseTokenizer(new StringReader(strOrig), null, false, Mode.NORMAL)); } }