List of usage examples for org.apache.lucene.analysis.cn.smart SmartChineseAnalyzer tokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader)
Returns a TokenStream suitable for fieldName, tokenizing the contents of reader. From source file: hivemall.nlp.tokenizer.SmartcnUDF.java
License:Apache License
@Override public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { SmartChineseAnalyzer analyzer = _analyzer; if (analyzer == null) { CharArraySet stopwords = stopWords(_stopWordsArray); analyzer = new SmartChineseAnalyzer(stopwords); this._analyzer = analyzer; }//from w ww.j a va2 s.c o m Object arg0 = arguments[0].get(); if (arg0 == null) { return null; } String line = arg0.toString(); final List<Text> results = new ArrayList<Text>(32); TokenStream stream = null; try { stream = analyzer.tokenStream("", line); if (stream != null) { analyzeTokens(stream, results); } } catch (IOException e) { IOUtils.closeQuietly(analyzer); throw new HiveException(e); } finally { IOUtils.closeQuietly(stream); } return results; }
From source file:org.omegat.tokenizer.LuceneSmartChineseTokenizer.java
License:Open Source License
/**
 * Builds the token stream used to tokenize {@code strOrig}.
 *
 * <p>When stemming is not allowed, the text is split by a {@code SentenceTokenizer}
 * wrapped in a {@code WordTokenFilter}. Otherwise a new {@code SmartChineseAnalyzer},
 * configured with {@code getBehavior()} and the stop-word flag, produces the stream.
 *
 * @param strOrig the text to tokenize
 * @param stemsAllowed selects the analyzer-based stream when {@code true}
 * @param stopWordsAllowed passed through to the {@code SmartChineseAnalyzer} constructor
 * @return a token stream over {@code strOrig}
 */
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) {
    if (!stemsAllowed) {
        final SentenceTokenizer sentences = new SentenceTokenizer(new StringReader(strOrig));
        return new WordTokenFilter(sentences);
    }

    final SmartChineseAnalyzer smartAnalyzer =
            new SmartChineseAnalyzer(getBehavior(), stopWordsAllowed);
    return smartAnalyzer.tokenStream("", new StringReader(strOrig));
}