List of usage examples for edu.stanford.nlp.ling CoreLabel lemma
@Override
public String lemma()
From source file:SentencePair.java
License:Open Source License
private void lemmatize(List<POSTaggedToken> sentence) { String text = ""; /* Convert the sentence back to a single string */ for (POSTaggedToken tt : sentence) { text += tt.token + " "; }//from w w w .j a v a2s . c om Annotation d = new Annotation(text); nlp.annotate(d); for (CoreMap ss : d.get(CoreAnnotations.SentencesAnnotation.class)) { Iterator<CoreLabel> itToken = ss.get(CoreAnnotations.TokensAnnotation.class).iterator(); ListIterator<POSTaggedToken> itSentence = sentence.listIterator(); while (itToken.hasNext() && itSentence.hasNext()) { CoreLabel token = itToken.next(); POSTaggedToken tt = itSentence.next(); tt.lemma = token.lemma(); /* add a lemma to the POSTaggedToken */ itSentence.set(tt); } } }
From source file:ca.ualberta.exemplar.core.ParserMalt.java
License:Open Source License
private String[] sentenceToCoNLLInput(List<CoreLabel> tokens) { List<String> conllList = new ArrayList<String>(100); int num = 1;//from w ww . ja va 2 s. c o m for (CoreLabel token : tokens) { String word = token.word(); String lemmaA = token.lemma(); String lemma = lemmaA != null && lemmaA.length() > 0 ? lemmaA : "_"; String posA = token.get(PartOfSpeechAnnotation.class); String pos = posA != null && posA.length() > 0 ? posA : "_"; conllList.add(num + "\t" + word + "\t" + lemma + "\t" + pos + "\t" + pos + "\t" + "_"); num++; } String[] conll = new String[conllList.size()]; conll = conllList.toArray(conll); return conll; }
From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java
License:Open Source License
@Override public void annotate(Annotation annotation) { if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(TokensAnnotation.class)) { List<CoreLabel> tokens = annotation.get(TokensAnnotation.class); for (CoreLabel token : tokens) { boolean isWordStopword = stopwords.contains(token.word().toLowerCase()); boolean isLemmaStopword = checkLemma ? stopwords.contains(token.lemma().toLowerCase()) : false; Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword); token.set(StopwordAnnotator.class, pair); }//from w w w.ja v a 2 s . c o m } }
From source file:com.graphbrain.eco.StanfordLemmatizer.java
License:Open Source License
public List<String> lemmatize(String documentText, int returnType) { List<String> words = new LinkedList<>(); List<String> lemmas = new LinkedList<>(); // create an empty Annotation just with the given text Annotation document = new Annotation(documentText); // run all Annotators on this text this.pipeline.annotate(document); // Iterate over all of the sentences found List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap sentence : sentences) { // Iterate over all tokens in a sentence for (CoreLabel token : sentence.get(TokensAnnotation.class)) { // Retrieve and add the lemma for each word into the // list of lemmas words.add(token.word());//from ww w .j a va2 s .com lemmas.add(token.lemma()); // lemmas.add(token.get(LemmaAnnotation.class)); } } if (returnType == 0) { return lemmas; } else { return words; } }
From source file:edu.cuhk.hccl.util.NLPUtil.java
License:Apache License
/**
 * Extracts (adjective-lemma, noun-lemma) pairs from the text: for each noun token,
 * scans up to {@code searchRange} positions to the left and right for the nearest
 * adjective and pairs their lemmas. If the sentence contains any negation word,
 * the adjective lemma is prefixed with {@code NOT_PREFIX}.
 *
 * @param pipeline    CoreNLP pipeline used to annotate the text (must provide POS and lemma)
 * @param text        raw text to analyze
 * @param searchRange maximum token distance to look for a neighboring adjective
 * @return list of [adjective, noun] lemma pairs (shape per {@code addPair})
 */
public static ArrayList<String[]> extractNounPhrases(StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> wordPairs = new ArrayList<String[]>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    // NOTE(review): writes a static field from a static method — not thread-safe
    // if extractNounPhrases is called concurrently; confirm single-threaded use.
    MAX_STEPS = searchRange;
    for (CoreMap sentence : sentences) {
        List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
        // Sentence-level negation flag: any token whose lemma is in NEGATIONS
        // negates every pair extracted from this sentence.
        boolean hasNegation = false;
        for (CoreLabel label : labels) {
            if (NEGATIONS.contains(label.lemma().toLowerCase())) {
                hasNegation = true;
            }
        }
        for (int idx = 0; idx < labels.size(); idx++) {
            CoreLabel label = labels.get(idx);
            // Only noun tokens anchor a pair.
            if (NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) {
                // Widen the search window one step at a time, preferring the
                // closer adjective; left neighbor wins ties at equal distance.
                for (int step = 1; step <= MAX_STEPS; step++) {
                    // Math.max clamps at the sentence start (index may repeat there).
                    CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
                    if (JJ_TAGS.contains(leftLabel.tag())) {
                        if (hasNegation)
                            addPair(wordPairs, NOT_PREFIX + leftLabel.get(LemmaAnnotation.class),
                                    label.get(LemmaAnnotation.class));
                        else
                            addPair(wordPairs, leftLabel.get(LemmaAnnotation.class),
                                    label.get(LemmaAnnotation.class));
                        break; // nearest adjective found — stop widening
                    }
                    // Math.min clamps at the sentence end.
                    CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1));
                    if (JJ_TAGS.contains(rightLabel.tag())) {
                        if (hasNegation)
                            addPair(wordPairs, NOT_PREFIX + rightLabel.get(LemmaAnnotation.class),
                                    label.get(LemmaAnnotation.class));
                        else
                            addPair(wordPairs, rightLabel.get(LemmaAnnotation.class),
                                    label.get(LemmaAnnotation.class));
                        break; // nearest adjective found — stop widening
                    }
                }
            }
        }
    }
    return wordPairs;
}
From source file:edu.ucla.cs.scai.aztec.ir.tokenization.WordTokenizer.java
License:Apache License
public WordTokenizedDocument tokenize(String text, boolean lemmatize, boolean removeStopWords, boolean toLowerCase) { WordTokenizedDocument res = new WordTokenizedDocument(); Properties propsTokens = new Properties(); propsTokens.put("annotators", "tokenize, ssplit, pos, lemma, ner, regexner"); StanfordCoreNLP pipelineTokens = new StanfordCoreNLP(propsTokens); Annotation qaTokens = new Annotation(text); pipelineTokens.annotate(qaTokens);//from w w w . java2s. c o m List<CoreMap> sentences = qaTokens.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { WordTokenizedSentence s = new WordTokenizedSentence(); for (CoreLabel cl : (ArrayList<CoreLabel>) sentence.get(CoreAnnotations.TokensAnnotation.class)) { if (!removeStopWords || !stopwords.contains(cl.lemma())) { WordToken t = new WordToken(cl.word(), cl.lemma(), cl.tag()); if (lemmatize) { t.useLemma(); } if (toLowerCase) { t.useLowerCase(); } s.appendToken(t); } } res.appendSentence(s); } return res; }
From source file:edu.ucla.cs.scai.qa.questionclassifier.SyntacticTreeNode.java
public SyntacticTreeNode(Tree t, ArrayList<CoreLabel> tokens, SyntacticTreeNode parent) throws Exception { this.parent = parent; value = t.value();/*w w w . ja v a 2 s . c o m*/ if (t.isLeaf()) { CoreLabel c = tokens.remove(0); begin = c.beginPosition(); end = c.endPosition(); if (c == null) { throw new Exception("Mapping between TreeNode and CoreLabel not found"); } else { lemma = c.lemma(); ner = c.ner(); //System.out.println(value + " -> " + c.value()); if (!value.equals(c.value())) { throw new Exception("Different words have been matched!"); } } } else { boolean hasNPchildren = false; boolean hasWHNPchildren = false; boolean hasQPchildren = false; begin = Integer.MAX_VALUE; end = Integer.MIN_VALUE; for (Tree c : t.children()) { SyntacticTreeNode child = new SyntacticTreeNode(c, tokens, this); children.add(child); if (child.value.equals("NP")) { hasNPchildren = true; } else if (child.value.equals("QP")) { hasQPchildren = true; } else if (child.value.equals("WHNP")) { hasWHNPchildren = true; } begin = Math.min(begin, child.begin); end = Math.max(end, child.end); } if (value.equals("NP")) { if (hasNPchildren) { npCompound = true; } else if (hasQPchildren) { npQp = true; } else { npSimple = true; } } else if (value.equals("WHNP")) { //can a WHNP node have QP children? if (hasNPchildren || hasWHNPchildren) { whnpCompound = true; } else if (!hasQPchildren) { whnpSimple = true; } } } }
From source file:eu.modelwriter.semantic.stanford_corenlp.MorphologySimilarityProvider.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * Groups the incoming label->concepts mapping by lemma: all labels sharing a
 * lemma contribute their concept sets to that lemma's entry. Labels are joined
 * with spaces into one document and re-tokenized by the pipeline.
 * NOTE(review): a label containing whitespace is split into several tokens and
 * will not match {@code token.originalText()} — presumably labels are single
 * words; verify against callers.
 *
 * @see eu.modelwriter.semantic.ISemanticSimilarityProvider#getSemanticSimilarities(java.util.Map)
 */
public Map<String, Set<Object>> getSemanticSimilarities(Map<String, Set<Object>> labels) {
    final Map<String, Set<Object>> res = new LinkedHashMap<String, Set<Object>>();
    final Set<String> words = labels.keySet();
    if (words.isEmpty()) {
        return res; // nothing to annotate
    }

    // Join all labels into a single space-separated text for one pipeline run.
    final StringBuilder joined = new StringBuilder();
    for (String label : words) {
        joined.append(label).append(' ');
    }

    Annotation document = new Annotation(joined.substring(0, joined.length() - 1));
    PIPELINE.annotate(document);

    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            final String label = token.originalText();
            final String lemma = token.lemma();
            Set<Object> lemmaSet = res.get(lemma);
            if (lemmaSet == null) {
                lemmaSet = new LinkedHashSet<Object>();
                res.put(lemma, lemmaSet);
            }
            // Fold this label's concepts (if any) into the lemma's bucket.
            final Set<Object> concepts = labels.get(label);
            if (concepts != null) {
                lemmaSet.addAll(concepts);
            }
        }
    }
    return res;
}
From source file:ims.cs.corenlp.TokenAligner.java
License:Open Source License
/** * Splits a CoreNLP token based on a position. We split only the word form as we don't have sufficient information * to split the lemma./*w w w. j av a 2 s . co m*/ * @param token * @param absPosition * @return */ private CoreLabel[] splitToken(CoreLabel token, int absPosition) { String word = token.word(); String origText = token.originalText(); // initialize parts CoreLabel[] splitting = new CoreLabel[2]; splitting[0] = new CoreLabel(token); splitting[1] = new CoreLabel(token); // calculate split position int relPosition = absPosition - token.beginPosition(); // cut up original text if (origText.length() >= relPosition) { String origText1 = origText.substring(0, relPosition); String origText2 = origText.substring(relPosition); splitting[0].setOriginalText(origText1); splitting[1].setOriginalText(origText2); } // cut up predicted text if (word.length() >= relPosition) { String word1 = word.substring(0, relPosition); String word2 = word.substring(relPosition); splitting[0].setWord(word1); splitting[1].setWord(word2); } // we could do the same with POS and lemma, but that would be complicated ... splitting[0].setEndPosition(absPosition); /* set a new end as we just shortened this token */ splitting[1].setBeginPosition(absPosition); /* set a new position as we just shortened this token */ // copy lemmas splitting[0].setLemma(token.lemma()); splitting[1].setLemma(token.lemma()); return splitting; }
From source file:ims.cs.corenlp.TokenAligner.java
License:Open Source License
/** * Combines my token and a CoreNlp token using predicted information * @param tok/* w ww . j a va2 s. c om*/ * @param cl * @param currentCoreNlpSentenceIndex * @return */ public static Token combineTokensPred(Token tok, CoreLabel cl, int currentCoreNlpSentenceIndex) { Token combined = new Token(tok); combined.predText = cl.word(); combined.predLemma = cl.lemma(); combined.predPosition = -1; /* will be determined by document aligner */ combined.predPosTag = cl.tag(); combined.predSentencePosition = currentCoreNlpSentenceIndex; combined.predNer = Helper.translateNer(cl.ner()); combined.predByteCount = new ByteCount(cl.beginPosition(), cl.endPosition()); return combined; }