List of usage examples for org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute.getBaseForm()
/**
 * Accessor whose usage the examples in this listing demonstrate.
 *
 * @return the base form as a {@code String}; presumably may be {@code null},
 *         since one of the usages in this listing null-checks the result
 *         before using it (confirm against the Lucene documentation)
 */
public String getBaseForm();
From source file:aak.as.preProcess.japanese.JaSegmenter.java
License:Open Source License
@Override public List<String> segmentWords(String text) { List<String> ret = new ArrayList<String>(); StringReader textreader = new StringReader(text); JapaneseTokenizer segmenter = new JapaneseTokenizer(textreader, null, true, JapaneseTokenizer.Mode.SEARCH); JaStemmer.lemma.clear();/*from w w w . ja v a 2s.c om*/ CharTermAttribute termAtt = segmenter.getAttribute(CharTermAttribute.class); BaseFormAttribute baseAtt = segmenter.getAttribute(BaseFormAttribute.class); try { segmenter.reset(); while (segmenter.incrementToken()) { //segmenter.clearAttributes(); ret.add(termAtt.toString()); if (baseAtt.getBaseForm() != null) JaStemmer.lemma.put(termAtt.toString(), baseAtt.getBaseForm()); } segmenter.close(); } catch (IOException e) { // TODO Auto-generated catch block. e.printStackTrace(); } return ret; }
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readBaseForm(TokenStream tokenStream, LuceneToken token) { BaseFormAttribute baseForm = tokenStream.getAttribute(BaseFormAttribute.class); if (baseForm != null) { token.setBaseForm(baseForm.getBaseForm()); }//from w ww. ja va 2s. c o m }