Usage examples for the process method of org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter
public List<SegToken> process(String sentence)
From source file: aak.as.preProcess.chinese.ZhSegmenter.java
License: Open Source License
public List<String> segmentWords(String text) { List<String> ret = new ArrayList<String>(); HHMMSegmenter segmenter = new HHMMSegmenter(); List<SegToken> ctokens = segmenter.process(text); for (SegToken ctoken : ctokens) { if (ctoken.startOffset < 0 || ctoken.endOffset > text.length()) continue; String word = text.substring(ctoken.startOffset, ctoken.endOffset); if (punctuation.contains(word)) continue; //System.out.println("<"+ctoken.startOffset+","+ ctoken.endOffset+">"); ret.add(word);//from w ww .j ava 2 s.c om } return ret; }