List of usage examples for the edu.stanford.nlp.ling.CoreLabel no-argument constructor:
public CoreLabel()
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
/**
 * Returns the {@link CoreLabel} for the terminal at {@code labelIndex}.
 * <p>
 * If an original label was recorded for this position it is reused (after
 * making sure its {@code value()} is populated); otherwise a fresh label is
 * built from the parallel word/offset/tag arrays of the enclosing parser.
 *
 * @param labelIndex index of the terminal in the sentence
 * @return a CoreLabel carrying at least word/value, and offsets/tag when available
 */
private CoreLabel getCoreLabel(int labelIndex) {
    if (originalCoreLabels[labelIndex] != null) {
        CoreLabel terminalLabel = originalCoreLabels[labelIndex];
        // Some labels arrive with only word() set; mirror it into value().
        if (terminalLabel.value() == null && terminalLabel.word() != null) {
            terminalLabel.setValue(terminalLabel.word());
        }
        return terminalLabel;
    }
    // No original label recorded: rebuild one from the index/offset arrays.
    String wordStr = wordIndex.get(words[labelIndex]);
    CoreLabel terminalLabel = new CoreLabel();
    terminalLabel.setValue(wordStr);
    terminalLabel.setWord(wordStr);
    terminalLabel.setBeginPosition(beginOffsets[labelIndex]);
    terminalLabel.setEndPosition(endOffsets[labelIndex]);
    // Carry over the original POS tag when one was kept.
    if (originalTags[labelIndex] != null) {
        terminalLabel.setTag(originalTags[labelIndex].tag());
    }
    return terminalLabel;
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java
License:Open Source License
/**
 * Runs Stanford coreference resolution over the CAS.
 * <p>
 * For each sentence (ROOT) the method copies the tokens into Stanford
 * {@link CoreLabel}s, deep-copies the constituency tree, fills in dependency
 * annotations, extracts candidate mentions, runs the coref system, and writes
 * the resulting chains back as {@code CoreferenceChain}/{@code CoreferenceLink}
 * annotations into the CAS.
 *
 * @param aJCas the CAS to process
 * @throws AnalysisEngineProcessException if the coref system fails
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    modelProvider.configure(aJCas.getCas());

    List<Tree> trees = new ArrayList<Tree>();
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    List<List<CoreLabel>> sentenceTokens = new ArrayList<List<CoreLabel>>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (Token token : selectCovered(Token.class, root)) {
            tokens.add(tokenToWord(token));
        }
        sentenceTokens.add(tokens);

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        // deep copy of the tree. These are modified inside coref!
        Tree treeCopy = TreeUtils.createStanfordTree(root, tFact).treeSkeletonCopy();
        treeCopy.indexSpans();
        trees.add(treeCopy);

        // Build the sentence; note that a CoreLabel doubles as the sentence CoreMap here.
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, treeCopy);
        sentence.set(TokensAnnotation.class, tokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=590
        // We currently do not copy over dependencies from the CAS. This is supposed to fill
        // in the dependencies so we do not get NPEs.
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                tlp.typedDependencyHeadFinder());
        ParserAnnotatorUtils.fillInParseAnnotations(false, true, gsf, sentence, treeCopy,
                GrammaticalStructure.Extras.NONE);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=582
        SemanticGraph deps = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        for (IndexedWord vertex : deps.vertexSet()) {
            vertex.setWord(vertex.value());
        }

        // merge the new CoreLabels with the tree leaves
        MentionExtractor.mergeLabels(treeCopy, tokens);
        MentionExtractor.initializeUtterance(tokens);
    }

    Annotation document = new Annotation(aJCas.getDocumentText());
    document.set(SentencesAnnotation.class, sentences);

    Coreferencer coref = modelProvider.getResource();

    // extract all possible mentions
    // Reparsing only works when the full CoreNLP pipeline system is set up! Passing false here
    // disables reparsing.
    RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(false);
    List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(document, 0,
            coref.corefSystem.dictionaries());

    // add the relevant info to mentions and order them for coref
    Map<Integer, CorefChain> result;
    try {
        Document doc = coref.mentionExtractor.arrange(document, sentenceTokens, trees, allUnprocessedMentions);
        result = coref.corefSystem.coref(doc);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    for (CorefChain chain : result.values()) {
        CoreferenceLink last = null;
        for (CorefMention mention : chain.getMentionsInTextualOrder()) {
            // Stanford indices are 1-based (sentNum, startIndex) and endIndex is exclusive,
            // hence the -1 / -2 adjustments below.
            CoreLabel beginLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.startIndex - 1);
            CoreLabel endLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.endIndex - 2);
            CoreferenceLink link = new CoreferenceLink(aJCas, beginLabel.get(TokenKey.class).getBegin(),
                    endLabel.get(TokenKey.class).getEnd());

            if (mention.mentionType != null) {
                link.setReferenceType(mention.mentionType.toString());
            }

            if (last == null) {
                // This is the first mention. Here we'll initialize the chain
                CoreferenceChain corefChain = new CoreferenceChain(aJCas);
                corefChain.setFirst(link);
                corefChain.addToIndexes();
            } else {
                // For the other mentions, we'll add them to the chain.
                last.setNext(link);
            }
            last = link;
            link.addToIndexes();
        }
    }
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordDependencyConverter.java
License:Open Source License
@Override public void process(JCas aJCas) throws AnalysisEngineProcessException { String lang = language != null ? language : aJCas.getDocumentLanguage(); if (!languagePacks.containsKey(lang)) { throw new AnalysisEngineProcessException( new IllegalStateException("Unsupported language [" + aJCas.getDocumentLanguage() + "]")); }//from w w w .j ava2s.c om TreebankLanguagePack lp; try { lp = languagePacks.get(aJCas.getDocumentLanguage()).newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new AnalysisEngineProcessException(e); } List<CoreMap> sentences = new ArrayList<CoreMap>(); for (ROOT root : select(aJCas, ROOT.class)) { // Copy all relevant information from the tokens List<Token> tokens = selectCovered(Token.class, root); List<CoreLabel> coreTokens = new ArrayList<CoreLabel>(); for (Token token : tokens) { coreTokens.add(tokenToWord(token)); } // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace // it with PRN to avoid NPEs. TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) { @Override public Tree newTreeNode(String aParent, List<Tree> aChildren) { String parent = aParent; if ("PRN0".equals(parent)) { parent = "PRN"; } Tree node = super.newTreeNode(parent, aChildren); return node; } }; Tree tree = TreeUtils.createStanfordTree(root, tFact); Trees.convertToCoreLabels(tree); tree.indexSpans(); // Build the sentence CoreMap sentence = new CoreLabel(); sentence.set(TreeAnnotation.class, tree); sentence.set(TokensAnnotation.class, coreTokens); sentence.set(RootKey.class, root); sentences.add(sentence); doCreateDependencyTags(aJCas, lp, tree, tokens); } }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java
License:Open Source License
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { List<Token> casTokens = null; // Use value from language parameter, document language or fallback language - whatever // is available String language = getLanguage(aJCas); if (isWriteToken()) { casTokens = new ArrayList<Token>(); final String text = aText; final Tokenizer<?> tokenizer = getTokenizer(language, aText); int offsetInSentence = 0; List<?> tokens = tokenizer.tokenize(); outer: for (int i = 0; i < tokens.size(); i++) { final Object token = tokens.get(i); // System.out.println("Token class: "+token.getClass()); String t = null;/* w w w.j a va2 s .c o m*/ if (token instanceof String) { t = (String) token; } if (token instanceof CoreLabel) { CoreLabel l = (CoreLabel) token; t = l.word(); int begin = l.get(CharacterOffsetBeginAnnotation.class); int end = l.get(CharacterOffsetEndAnnotation.class); casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i)); offsetInSentence = end; continue; } if (token instanceof Word) { Word w = (Word) token; t = w.word(); } if (t == null) { throw new AnalysisEngineProcessException( new IllegalStateException("Unknown token type: " + token.getClass())); } // Skip whitespace while (isWhitespace(text.charAt(offsetInSentence))) { offsetInSentence++; if (offsetInSentence >= text.length()) { break outer; } } // Match if (text.startsWith(t, offsetInSentence)) { casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence, aZoneBegin + offsetInSentence + t.length(), i)); offsetInSentence = offsetInSentence + t.length(); } else { // System.out.println(aText); throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. 
Tokenizer: [" + t + "] CAS: [" + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length())))); } } } if (isWriteSentence()) { if (casTokens == null) { casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length()); } // Prepare the tokens for processing by WordToSentenceProcessor List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); for (Token token : casTokens) { CoreLabel l = new CoreLabel(); l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); l.setWord(token.getCoveredText()); tokensInDocument.add(l); } // The sentence splitter (probably) requires the escaped text, so we prepare it here PTBEscapingProcessor escaper = new PTBEscapingProcessor(); escaper.apply(tokensInDocument); // Apply the WordToSentenceProcessor to find the sentence boundaries WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences); List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); for (List<CoreLabel> sentence : sentencesInDocument) { int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class); createSentence(aJCas, begin, end); } } }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java
License:Open Source License
public static CoreLabel tokenToWord(Token aToken) { CoreLabel t = new CoreLabel(); t.setOriginalText(aToken.getCoveredText()); t.setWord(aToken.getCoveredText());//from w w w .j ava 2s . com t.setBeginPosition(aToken.getBegin()); t.setEndPosition(aToken.getEnd()); if (aToken.getLemma() != null) { t.setLemma(aToken.getLemma().getValue()); } if (aToken.getPos() != null) { t.setTag(aToken.getPos().getPosValue()); } return t; }
From source file:edu.cmu.ml.rtw.users.ssrivastava.RegexExtractor.java
public static CoreMap getStanfordSentence(DocumentNLP document, int sentIdx) { List<String> words = document.getSentenceTokenStrs(sentIdx); List<PoSTag> posTags = document.getSentencePoSTags(sentIdx); List<CoreLabel> tokenList = new ArrayList<CoreLabel>(); for (int i = 0; i < words.size(); i++) { /*Re-create Stanford tokens*/ CoreLabel token = new CoreLabel(); token.setWord(words.get(i));/*from ww w . ja v a 2 s. c o m*/ token.setTag(posTags.get(i).toString()); token.setNER("O"); token.setDocID(document.getName()); token.setSentIndex(sentIdx); token.setBeginPosition(document.getToken(sentIdx, i).getCharSpanStart()); token.setEndPosition(document.getToken(sentIdx, i).getCharSpanEnd()); //System.out.println(token.word()+" "+token.beginPosition()+" "+token.endPosition()); tokenList.add(token); } //Add NER labels for sentence List<Pair<TokenSpan, String>> ners = document.getNer(sentIdx); for (Pair<TokenSpan, String> p : ners) { for (int k = p.getFirst().getStartTokenIndex(); k < p.getFirst().getEndTokenIndex(); k++) { tokenList.get(k).setNER(p.getSecond()); } } //Convert to Stanford Sentence CoreMap sentence = new ArrayCoreMap(); sentence.set(TokensAnnotation.class, tokenList); sentence.set(CharacterOffsetBeginAnnotation.class, tokenList.get(0).beginPosition()); sentence.set(CharacterOffsetEndAnnotation.class, tokenList.get(words.size() - 1).endPosition()); return sentence; }
From source file:gate.stanford.NER.java
License:Open Source License
/**
 * Runs the Stanford NER tagger over the GATE document.
 * <p>
 * Tokens are grouped by sentence, classified with the CRF tagger, and the
 * resulting BIO-style label sequence is converted to span annotations by
 * detecting label transitions ("rising edge" starts an entity, "falling edge"
 * closes one).
 *
 * @throws ExecutionException if the document or required parameters are
 *         missing, or the tagger output size does not match the input
 */
@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Token Annotation Type provided!");
    }
    if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Sentence Annotation Type provided!");
    }

    AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
    AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
    if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) {
        long startTime = System.currentTimeMillis();
        fireStatusChanged("NER searching " + document.getName());
        fireProgressChanged(0);

        // prepare the input for CRFClassifier
        List<CoreLabel> sentenceForTagger = new ArrayList<CoreLabel>();

        // define a comparator for annotations by start offset
        OffsetComparator offsetComparator = new OffsetComparator();

        // read all the tokens and all the sentences, sorted by document order
        List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
        Collections.sort(sentencesList, offsetComparator);
        List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
        Collections.sort(tokensList, offsetComparator);

        Iterator<Annotation> sentencesIter = sentencesList.iterator();
        ListIterator<Annotation> tokensIter = tokensList.listIterator();

        List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
        Annotation currentToken = tokensIter.next();
        int sentIndex = 0;
        int sentCnt = sentencesAS.size();

        // go through sentence annotations in the document
        while (sentencesIter.hasNext()) {
            Annotation currentSentence = sentencesIter.next();

            // reset sentence-level processing variables
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();

            // while we have sane tokens (token ends before or at the sentence end)
            while (currentToken != null && currentToken.getEndNode().getOffset()
                    .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {

                // If we're only labelling Tokens within baseSentenceAnnotationType,
                // don't add the sentence if the Tokens aren't within the span of
                // baseSentenceAnnotationType
                if (currentToken.withinSpanOf(currentSentence)) {
                    tokensInCurrentSentence.add(currentToken);

                    // build a stanford nlp representation of the token and add it to the sequence
                    CoreLabel currentLabel = new CoreLabel();
                    currentLabel.setWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME));

                    sentenceForTagger.add(currentLabel);
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // if the sentence doesn't contain any tokens (which is a bit weird but
            // is possible) then don't try running the labeller
            if (sentenceForTagger.isEmpty())
                continue;

            // run the labeller
            List<CoreLabel> taggerResults = tagger.classifySentence(sentenceForTagger);

            // add the results
            // make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("NER labeller malfunction: the output size ("
                        + taggerResults.size() + ") is different from the input size ("
                        + tokensInCurrentSentence.size() + ")!");

            // proceed through the annotated sequence
            Iterator<CoreLabel> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();

            String previousLabel = outsideLabel;
            // NOTE(review): boxed Longs compared with != / == elsewhere would be unsafe;
            // here comparison is via autounboxing against int literals. entityEnd is unused.
            Long previousEnd = new Long(-1);
            Long entityStart = new Long(-1);
            Long entityEnd = new Long(-1);

            Annotation annot;
            String nerLabel = "";

            while (resIter.hasNext()) {
                // for each labelled token..
                annot = tokIter.next();
                CoreLabel word = resIter.next();
                nerLabel = word.get(CoreAnnotations.AnswerAnnotation.class);

                // falling edge transition: entity ends
                // guard against this triggering at document start
                if (!nerLabel.equals(previousLabel) && !previousLabel.equals(outsideLabel)
                        && entityStart != -1) {
                    // get final bound; add new annotation in output AS
                    try {
                        outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl());
                    } catch (InvalidOffsetException e) {
                        System.out.println("Token alignment problem:" + e);
                    }
                }

                // rising edge transition: entity starts
                if (!nerLabel.equals(previousLabel) && !nerLabel.equals(outsideLabel)) {
                    entityStart = annot.getStartNode().getOffset();
                }

                previousLabel = nerLabel;
                previousEnd = annot.getEndNode().getOffset();
            }

            // clean up, in case last token in sentence was in an entity
            if (!nerLabel.equals(outsideLabel)) {
                try {
                    outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl());
                } catch (InvalidOffsetException e) {
                    System.out.println("Token alignment problem:" + e);
                }
            }

            fireProgressChanged(sentIndex++ * 100 / sentCnt);
        } // while sentences

        fireProcessFinished();
        fireStatusChanged(document.getName() + " tagged in "
                + NumberFormat.getInstance()
                        .format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    } else {
        if (failOnMissingInputAnnotations) {
            throw new ExecutionException("No sentences or tokens to process in document "
                    + document.getName() + "\n" + "Please run a sentence splitter "
                    + "and tokeniser first!");
        } else {
            Utils.logOnce(logger, Level.INFO,
                    "NE labeller: no sentence or token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
        }
    }
}
From source file:lv.lnb.ner.tagFolders.java
License:Open Source License
/**
 * Reads a tab-separated (token / lemma / POS) file, runs the NER classifier
 * over it, aggregates consecutive identically-tagged tokens into named-entity
 * phrases, and records them via {@code counter.db_insert(writer)}.
 *
 * @param classifier the trained sequence classifier
 * @param filename   path of the input file (UTF-8, tab-separated columns)
 * @param doc_id     document id recorded with each entity
 * @param writer     sink for the aggregated entity counts
 * @return the number of lines read from the file
 */
private static int processFile(AbstractSequenceClassifier<CoreLabel> classifier, String filename,
        String doc_id, Writer writer) {
    NECounterSingleDoc counter = new NECounterSingleDoc(doc_id);
    List<CoreLabel> document = new ArrayList<CoreLabel>();
    int i = 0;
    try {
        // "ieeja" is Latvian for "input".
        BufferedReader ieeja = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        String line;
        while ((line = ieeja.readLine()) != null) {
            i++;
            // Skip structural markup lines entirely.
            if (line.contains("<doc") || line.contains("</doc>") || line.contains("<page")
                    || line.contains("</page>") || line.contains("<p>"))
                continue;
            String[] info = line.split("\t");
            CoreLabel word = new CoreLabel();
            if (line.contains("</p>")) {
                // NOTE(review): the marker label is fully populated but the `continue`
                // skips document.add(word), so it is discarded - looks unintentional
                // (same for the <g /> branch below); confirm against the classifier's
                // expected input format before changing.
                word.set(TextAnnotation.class, "<p/>");
                word.set(LemmaAnnotation.class, "<p/>");
                word.set(PartOfSpeechAnnotation.class, "-");
                continue;
            } else if (line.contains("<g />")) {
                word.set(TextAnnotation.class, "<g/>");
                word.set(LemmaAnnotation.class, "<g/>");
                word.set(PartOfSpeechAnnotation.class, "-");
                continue;
            } else if (info.length < 3) {
                // NOTE(review): malformed line - logged, but an EMPTY CoreLabel is
                // still added to `document` below; verify this is intended.
                System.err.printf("%d @ %s:%s", i, filename, line);
            } else {
                word.set(TextAnnotation.class, info[0]);
                word.set(LemmaAnnotation.class, info[1]);
                // Only the first character of the POS column (coarse POS) is kept.
                word.set(PartOfSpeechAnnotation.class, info[2].substring(0, 1));
            }
            document.add(word);
        }
        ieeja.close();
    } catch (IOException e1) {
        e1.printStackTrace();
    }

    List<CoreLabel> out = classifier.classify(document);

    // Merge runs of tokens sharing the same (non-trivial) answer tag into phrases.
    String prevtag = "";
    String name_part = "";
    String lemma_part = "";
    for (CoreLabel word : out) {
        String tag = word.get(AnswerAnnotation.class);
        // Tags shorter than 2 chars (e.g. "O") count as "no entity".
        if (tag.length() < 2)
            tag = "";
        if (!tag.equalsIgnoreCase(prevtag)) {
            // Tag changed: flush the previous entity (if any), start a new one.
            if (!prevtag.equalsIgnoreCase(""))
                counter.add(doc_id, name_part, lemma_part, prevtag);
            if (!tag.equalsIgnoreCase("")) {
                name_part = word.word();
                lemma_part = word.get(LemmaAnnotation.class);
            }
        } else if (!tag.equalsIgnoreCase("")) {
            // Same tag as before: extend the current entity phrase.
            name_part = name_part + " " + word.word();
            lemma_part = lemma_part + " " + word.get(LemmaAnnotation.class);
        }
        prevtag = tag;
    }
    // NOTE(review): an entity still open at end-of-document is never flushed here.
    try {
        counter.db_insert(writer);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return i;
}
From source file:lv.lumii.morphotagger.MorphoPipe.java
License:Open Source License
/**
 * Reads CoNLL-formatted input (one token per line, tab-separated columns,
 * blank line between sentences) and returns morphologically analyzed
 * sentences. Each sentence is wrapped with {@code "<s>"} boundary markers
 * before being passed to {@code LVMorphologyReaderAndWriter.analyzeLabels}.
 * <p>
 * Bug fix: the original called {@code extraColumns.trim()} and discarded the
 * result (Strings are immutable), so the intended removal of the trailing tab
 * never happened; the result is now assigned back.
 *
 * @param in reader positioned at the start of the CoNLL data
 * @return list of analyzed sentences (each a list of CoreLabels)
 * @throws IOException if reading fails
 */
public static List<List<CoreLabel>> readCONLL(BufferedReader in) throws IOException {
    String s;
    List<CoreLabel> sentence = new LinkedList<CoreLabel>();
    List<List<CoreLabel>> result = new LinkedList<List<CoreLabel>>();
    sentence.add(sentenceBoundaryMarker());

    while ((s = in.readLine()) != null) {
        if (s.trim().length() > 0) {
            String[] fields = s.split("\t");
            String token = fields[1];
            // "_" denotes an underscore token itself; otherwise underscores encode spaces.
            if (!token.equalsIgnoreCase("_"))
                token = token.replace('_', ' ');

            String extraColumns = "";
            if (saveColumns) {
                // Preserve columns 6+ verbatim, tab-separated.
                StringBuilder extra = new StringBuilder();
                for (int field_i = 6; field_i < fields.length; field_i++) {
                    extra.append(fields[field_i]).append('\t');
                }
                // FIX: assign the trimmed value (the original discarded trim()'s result).
                extraColumns = extra.toString().trim();
            }

            String syntax = "";
            if (fields.length >= 10)
                syntax = fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + fields[9];

            CoreLabel word = new CoreLabel();
            word.set(TextAnnotation.class, token);
            word.set(ParentAnnotation.class, syntax);
            word.set(ExtraColumnAnnotation.class, extraColumns);
            sentence.add(word);
        } else {
            // Blank line: close the current sentence, analyze it, start a new one.
            sentence.add(sentenceBoundaryMarker());
            result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
            sentence = new LinkedList<CoreLabel>();
            sentence.add(sentenceBoundaryMarker());
        }
    }
    // Flush a trailing sentence not terminated by a blank line.
    if (sentence.size() > 0) {
        sentence.add(sentenceBoundaryMarker());
        result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
    }
    return result;
}

/** Builds a {@code "<s>"} sentence-boundary marker label. */
private static CoreLabel sentenceBoundaryMarker() {
    CoreLabel stag = new CoreLabel();
    stag.set(TextAnnotation.class, "<s>");
    return stag;
}
From source file:lv.pipe.MorphoTagger.java
License:Open Source License
/**
 * Morphologically tags one sentence: converts its tokens to Stanford
 * {@link CoreLabel}s, runs the Latvian morphology analyzer and classifier,
 * and replaces the sentence's token annotations with enriched ones carrying
 * lemma, simple POS tag, full POS tag, and a pipe-delimited feature string.
 *
 * @param sentence the sentence annotation; returned unchanged if it has no tokens
 * @return the same sentence object, with tokens replaced by tagged versions
 */
public Annotation processSentence(Annotation sentence) {
    if (sentence.has(LabelTokens.class)) {
        List<Annotation> tokens = sentence.get(LabelTokens.class);
        // This is not working returns all "xf":
        // List<Word> sent = new ArrayList<Word>(tokens.size());
        // for (Annotation token : tokens) {
        // String word = token.get(TextLabel.class);
        // sent.add(new Word(word));
        // }
        // List<CoreLabel> coreLabels =
        // LVMorphologyReaderAndWriter.analyzeSentence2(sent);

        // Build plain CoreLabels from the token texts.
        List<CoreLabel> sent = new ArrayList<CoreLabel>(tokens.size());
        for (Annotation token : tokens) {
            String word = token.get(LabelText.class);
            CoreLabel wi = new CoreLabel();
            wi.setWord(word);
            sent.add(wi);
        }
        // Append the "<s>" sentence-boundary marker expected by the analyzer.
        CoreLabel sEnd = new CoreLabel();
        sEnd.setWord("<s>");
        sent.add(sEnd);

        List<CoreLabel> coreLabels = LVMorphologyReaderAndWriter.analyzeLabels(sent);
        morphoClassifier.classify(coreLabels);

        sentence.remove(LabelTokens.class);
        List<Annotation> tLabels = new ArrayList<Annotation>(coreLabels.size());
        int counter = 1;
        for (CoreLabel w : coreLabels) {
            Annotation tLabel = new Annotation();
            String token = w.getString(TextAnnotation.class);
            // token = token.replace(' ', '_');
            // Skip the boundary markers - they are not real tokens.
            if (token.contains("<s>"))
                continue;
            tLabel.setText(token);
            tLabel.set(LabelIndex.class, counter++);

            Word analysis = w.get(LVMorphologyAnalysis.class);
            // Pick the wordform matching the classifier's answer tag.
            Wordform mainwf = analysis.getMatchingWordform(w.getString(AnswerAnnotation.class), false);
            if (mainwf != null) {
                String lemma = mainwf.getValue(AttributeNames.i_Lemma);
                // lemma = lemma.replace(' ', '_');
                if (lemma == null || lemma.trim().isEmpty()) {
                    lemma = "_";
                    log.log(Level.SEVERE, "Empty lemma for {0}", token);
                }
                tLabel.setLemma(lemma);

                String answer = w.getString(AnswerAnnotation.class);
                if (answer == null || answer.trim().isEmpty()) {
                    answer = "_"; // no empty tag
                    log.log(Level.SEVERE, "Empty simple pos tag for {0}", token);
                }
                tLabel.set(LabelPosTagSimple.class, answer);

                String posTag = mainwf.getTag();
                if (posTag == null || posTag.trim().isEmpty()) {
                    posTag = "_";
                    log.log(Level.SEVERE, "Empty pos tag for {0}", token);
                }
                tLabel.set(LabelPosTag.class, posTag);

                // Attribute filters for the feature output.
                if (MINI_TAG)
                    mainwf.removeNonlexicalAttributes();
                if (LETA_FEATURES) {
                    addLETAfeatures(mainwf);
                    // mainwf.removeAttribute(AttributeNames.i_SourceLemma);
                    // FIXME - for derived words this may matter (de-prefixed lemmas)
                    mainwf.removeTechnicalAttributes();
                }

                // Word features, pipe-delimited.
                StringBuilder s = mainwf.pipeDelimitedEntries();
                if (FEATURES) { // all features that were used during training
                    Datum<String, String> d = morphoClassifier.makeDatum(coreLabels, counter,
                            morphoClassifier.featureFactory);
                    for (String feature : d.asFeatures()) {
                        // strip the trailing "|C" that these features carry
                        s.append(feature.substring(0, feature.length() - 2).replace(' ', '_'));
                        s.append('|');
                    }
                }
                // remove the final '|' separator, which is redundant
                s.deleteCharAt(s.length() - 1);
                s.append('\t');
                String morphoFeatures = s.toString();
                tLabel.set(LabelMorphoFeatures.class, morphoFeatures);
            } else {
                log.log(Level.SEVERE, "Empty main word form for {0}", token);
            }
            tLabels.add(tLabel);
        }
        sentence.set(LabelTokens.class, tLabels);
    }
    return sentence;
}