Java tutorial
/*
 *    Copyright 2015 Textocat
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package com.textocat.textokit.postagger;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.textocat.textokit.commons.cas.FSUtils;
import com.textocat.textokit.morph.dictionary.AnnotationAdapterBase;
import com.textocat.textokit.morph.fs.Word;
import com.textocat.textokit.morph.model.Lemma;
import com.textocat.textokit.morph.model.Wordform;
import com.textocat.textokit.tokenizer.fstype.Token;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import java.util.BitSet;
import java.util.Collection;
import java.util.List;

/**
 * Annotation adapter that writes dictionary analysis results into the CAS as a
 * {@link Word} annotation holding one or more CAS wordform feature structures.
 * <p>
 * Uses Wordform.pos to set general lexical category, e.g., NOUN, VERB, etc.
 * <p>
 * Uses Wordform.grammems to set all grammatical categories, including the general
 * one.
 *
 * @author Rinat Gareev
 */
public class DefaultAnnotationAdapter extends AnnotationAdapterBase {

    /**
     * Creates a {@link Word} over {@code token} with one CAS wordform per dictionary
     * wordform, then adds the word to the CAS indexes.
     * <p>
     * For every dictionary wordform the lemma is looked up through {@code dict}
     * (not declared in this class; presumably inherited from
     * {@link AnnotationAdapterBase}). The CAS wordform receives the lemma id, the
     * lemma string, the part of speech derived from the lemma grammemes, and the
     * union of the wordform and lemma grammemes.
     *
     * @param jcas    CAS to create feature structures in
     * @param token   annotation to cover; it is cast to {@link Token}, so any other
     *                type fails with a {@link ClassCastException}
     * @param dictWfs dictionary wordforms to convert; an empty collection yields a
     *                word with an empty wordform array
     */
    @Override
    public void apply(JCas jcas, Annotation token, Collection<Wordform> dictWfs) {
        Word word = createWord(jcas, token);
        // The list is only appended to and then copied into an FSArray,
        // so a presized array-backed list is sufficient.
        List<com.textocat.textokit.morph.fs.Wordform> casWfList =
                Lists.newArrayListWithCapacity(dictWfs.size());
        for (Wordform wf : dictWfs) {
            com.textocat.textokit.morph.fs.Wordform casWf =
                    new com.textocat.textokit.morph.fs.Wordform(jcas);
            // Work on a copy: the bits are OR-ed with the lemma grammemes below, and
            // the BitSet returned by the dictionary wordform may be its internal
            // state (not visible from here), which must not be modified.
            BitSet grammems = (BitSet) wf.getGrammems().clone();
            Lemma lemma = dict.getLemma(wf.getLemmaId());
            // set lemma id
            casWf.setLemmaId(lemma.getId());
            // set lemma norm
            casWf.setLemma(lemma.getString());
            // set pos
            casWf.setPos(dict.getGramModel().getPos(lemma.getGrammems()));
            // set grammems: wordform grammemes united with the lemma ones
            grammems.or(lemma.getGrammems());
            List<String> gramSet = dict.getGramModel().toGramSet(grammems);
            casWf.setGrammems(FSUtils.toStringArray(jcas, gramSet));
            // set hosting word
            casWf.setWord(word);
            casWfList.add(casWf);
        }
        // set wordforms
        word.setWordforms(FSUtils.toFSArray(jcas, casWfList));
        word.addToIndexes();
    }

    /**
     * Creates a {@link Word} over {@code token} with a single CAS wordform, then
     * adds the word to the CAS indexes.
     * <p>
     * When {@code lexemeId} is given, the lemma string is taken from the dictionary
     * entry and the lemma id is set; otherwise {@code _lemma} is used if it is not
     * {@code null}. The lemma feature is left unset when no lemma string is
     * available. The 'pos' feature is not set by this method.
     *
     * @param jcas     CAS to create feature structures in
     * @param token    annotation to cover; it is cast to {@link Token}, so any other
     *                 type fails with a {@link ClassCastException}
     * @param lexemeId dictionary lemma id, may be {@code null}
     * @param _lemma   lemma string used only when {@code lexemeId} is {@code null};
     *                 may be {@code null}
     * @param posBits  grammeme bits converted into the wordform's grammeme strings
     */
    @Override
    public void apply(JCas jcas, Annotation token, Integer lexemeId, final String _lemma,
                      BitSet posBits) {
        Word word = createWord(jcas, token);
        com.textocat.textokit.morph.fs.Wordform casWf =
                new com.textocat.textokit.morph.fs.Wordform(jcas);
        String lemma = null;
        if (lexemeId != null) {
            Lemma lex = dict.getLemma(lexemeId);
            lemma = lex.getString();
            casWf.setLemmaId(lexemeId);
        } else if (_lemma != null) {
            lemma = _lemma;
        }
        if (lemma != null) {
            casWf.setLemma(lemma);
        }
        // TODO set 'pos' feature
        // casWf.setPos(...);
        List<String> gramSet = dict.getGramModel().toGramSet(posBits);
        casWf.setGrammems(FSUtils.toStringArray(jcas, gramSet));
        // set hosting word
        casWf.setWord(word);
        // set wordforms
        word.setWordforms(FSUtils.toFSArray(jcas, ImmutableList.of(casWf)));
        word.addToIndexes();
    }

    /**
     * Creates a {@link Word} spanning the same offsets as {@code token} and linked to
     * it. The word is not added to the CAS indexes here.
     */
    private static Word createWord(JCas jcas, Annotation token) {
        Word word = new Word(jcas);
        word.setBegin(token.getBegin());
        word.setEnd(token.getEnd());
        // TODO check token type
        word.setToken((Token) token);
        return word;
    }
}