List of usage examples for the org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter constructor HHMMSegmenter()
HHMMSegmenter
From source file: aak.as.preProcess.chinese.ZhSegmenter.java
License: Open Source License
public List<String> segmentWords(String text) { List<String> ret = new ArrayList<String>(); HHMMSegmenter segmenter = new HHMMSegmenter(); List<SegToken> ctokens = segmenter.process(text); for (SegToken ctoken : ctokens) { if (ctoken.startOffset < 0 || ctoken.endOffset > text.length()) continue; String word = text.substring(ctoken.startOffset, ctoken.endOffset); if (punctuation.contains(word)) continue; //System.out.println("<"+ctoken.startOffset+","+ ctoken.endOffset+">"); ret.add(word);//from w w w. j a va 2 s.c o m } return ret; }