List of usage examples for edu.stanford.nlp.ling IndexedWord get
@Override
public <VALUE> VALUE get(Class<? extends TypesafeMap.Key<VALUE>> key)
From source file:ca.mcgill.cs.crown.procedure.ParseExtractor.java
License:Creative Commons License
/** * Gets the candidate hypernyms form the provided subdef * * @returns a mapping from the candidate to the heuristics that generated it *//*from w ww . j a v a 2s. c om*/ MultiMap<String, String> getCandidates(SemanticGraph dependencies, String subdef, POS spos_) { MultiMap<String, String> candidates = new HashMultiMap<String, String>(); char sensePos = toChar(spos_); Collection<IndexedWord> roots = dependencies.getRoots(); next_root: for (IndexedWord root : roots) { String word = root.get(TextAnnotation.class); String lemma = root.get(LemmaAnnotation.class); String pos = root.get(PartOfSpeechAnnotation.class); char lemmaPos = pos.substring(0, 1).toLowerCase().charAt(0); String lemmaLc = lemma.toLowerCase(); //System.out.println("testing: " + lemma + "/" + pos); // If the lemma is a verb, check for phrasal verbal particle (e.g., // "lead on", "edge out") and if present, add them to the lemma if (lemmaPos == 'v') { List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { if (e.getRelation().getShortName().equals("prt")) { IndexedWord dep = e.getDependent(); lemma = lemma + " " + dep.get(LemmaAnnotation.class); break; } } } // Heuristic 1: root matches exact POS if (lemmaPos == sensePos) { // Edge case for Heuristics 7: If the lemma is a noun and is // saying that this is an instance (e.g., "An instance of ..."), // then we take the dependent noun from instance // // Terrible example: // The second of the two Books of Chronicles and the // fourteenth book of the Old Testament of the Bible. 
// boolean foundExistentialDependent = false; if (lemma.equals("instance") || lemma.equals("example") || lemma.equals("first") || lemma.equals("second") || lemma.equals("third") || lemma.equals("fourth") || lemma.equals("fifth") || lemma.equals("sixth") || lemma.equals("series")) { // Check that there's actually a prepositional phrase // attached List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { if (e.getRelation().getShortName().equals("prep")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase() .charAt(0); //System.out.println("HEURISTIC 7"); if (depPos == sensePos) { candidates.put(depLemma, "Heuristic-7"); addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-7"); foundExistentialDependent = true; } } } } if (foundExistentialDependent) continue next_root; // Heuristic 10: In the case of noun phrases, take the last noun // in the phrase, e.g., "Molten material", "pringtime snow // runoff" List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); boolean foundDependent = false; for (SemanticGraphEdge e : edges) { if (e.getRelation().getShortName().equals("dep")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0); //System.out.println("HEURISTIC 10"); if (depPos == sensePos) { foundDependent = true; candidates.put(depLemma, "Heuristic-10"); addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-10"); } } } if (!foundDependent) { //System.out.println("HEURISTIC 1"); candidates.put(lemma, "Heuristic-1"); addSiblings(root, candidates, sensePos, dependencies, "Heuristic-1"); } } // Heuristic 2: subdef is either (1) one word or (2) two or more // word that *must be connected by a conjunction, and (3) the lemma // has the wrong part of speech, 
but could have the same POS (i.e., // the lemma was probably POS-tagged incorrectly). if (sensePos != lemmaPos) { // Only one word in the subdef, which can manifest itself as the // graph having no vertices! (size == 0) if (dependencies.size() < 1) { // System.out.println("HEURISTIC 2a"); IIndexWord iword = dict.getIndexWord(lemma, spos_); if (iword != null) candidates.put(lemma, "Heuristic-2a"); else { // Sometimes adjectves get lemmatized to a verb form // which is in correct. Check to see if the token // matches String token = root.get(TextAnnotation.class); iword = dict.getIndexWord(token, spos_); if (iword != null) candidates.put(token, "Heuristic-2a"); } } else { // System.out.println("HEURISTIC 2b"); Set<IndexedWord> tmp = new HashSet<IndexedWord>(); List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { // System.out.printf("edge from %s -> %s %s%n", lemma, // e.getRelation().getShortName(), // e.getRelation().getLongName()); if (e.getRelation().getShortName().equals("conj")) { if (tmp.size() == 0) tmp.add(root); tmp.add(e.getDependent()); } } if (!tmp.isEmpty()) { for (IndexedWord iw : tmp) { String lem = iw.get(LemmaAnnotation.class); IIndexWord iword = dict.getIndexWord(lem, spos_); if (iword != null) candidates.put(lem, "Heuristic-2b"); else { // Sometimes adjectves get lemmatized to a verb // form which is in correct. Check to see if // the token matches String token = iw.get(TextAnnotation.class); iword = dict.getIndexWord(token, spos_); if (iword != null) candidates.put(token, "Heuristic-2b"); } } //System.out.println(tmp); } } } // Heuristics 3: the subdef is phrased as an overly-general description // of a person using "one", e.g., "one who does X". Replace this with // "person" if (sensePos == 'n' && (lemma.equals("one") || lemma.equals("someone"))) { // check the dependency graph for a "who" attachment // TODO // ... 
or be lazy and just check for the token Matcher m = WHO.matcher(subdef); if (m.find()) { candidates.put("person", "Heuristic-3: Person"); } } // Heuristic 4: if the root lemma is an adjective and the target // sense is a noun, look for a modifying a noun or set of nouns, // report those /// // Example: "a small, arched passageway" if (sensePos == 'n' && lemmaPos == 'j') { //System.out.println("HEURISTIC 4"); List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { // System.out.printf("edge from %s -> %s %s%n", lemma, // e.getRelation().getShortName(), // e.getRelation().getLongName()); if (e.getRelation().getShortName().equals("appos") || e.getRelation().getShortName().equals("dep")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); // System.out.println("!!! " + depLemma); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0); if (depPos == sensePos) { candidates.put(depLemma, "Heuristic-4: Head Noun"); addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-4: Head Noun"); } //break; } } } // Heuristic 5: if the root lemma is a verb and the target sense is // a noun, look for a subject noun if (sensePos == 'n' && lemmaPos == 'v') { List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { if (e.getRelation().getShortName().equals("nsubj")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0); if (depPos == sensePos) { candidates.put(depLemma, "Heuristic-5: Subject Noun"); addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-5: Subject Noun"); } break; } } } // Heuristic 6: if the root lemma is an existential quantifier or // something like it (e.g., "Any of ...") and // the target sense is a noun, look for a subject noun if (sensePos == 'n' && 
lemmaPos == 'd') { List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { // System.out.printf("edge from %s -> %s %s%n", lemma, // e.getRelation().getShortName(), // e.getRelation().getLongName()); if (e.getRelation().getShortName().equals("prep") || e.getRelation().getShortName().equals("dep")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0); // System.out.println(depLemma + "/" + depPos); // This should be the common case if (depPos == sensePos) { candidates.put(depLemma, "Heuristic-6: Existential Example"); addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-6: Existential Example"); } // This is for some really (really) unusually parsed // edge cases else { List<SemanticGraphEdge> depEdges = dependencies.outgoingEdgeList(dep); for (SemanticGraphEdge e2 : depEdges) { if (e2.getRelation().getShortName().equals("rcmod")) { IndexedWord dep2 = e2.getDependent(); String depLemma2 = dep2.get(LemmaAnnotation.class); char depPos2 = dep2.get(PartOfSpeechAnnotation.class).substring(0, 1) .toLowerCase().charAt(0); if (depPos2 == sensePos) { candidates.put(depLemma2, "Heuristic-6: Existential Example"); addSiblings(dep2, candidates, sensePos, dependencies, "Heuristic-6: Existential Example"); } } } } } } } // Heuristic 8: if the root lemma is a verb and the sense is an // adjective, but the verb is modified by an adverb, this catches // that cases that Heuristics 2 does not if (sensePos == 'j' && lemmaPos == 'v') { Set<IndexedWord> tmp = new HashSet<IndexedWord>(); List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { // System.out.printf("edge from %s -> %s %s%n", lemma, // e.getRelation().getShortName(), // e.getRelation().getLongName()); if (e.getRelation().getShortName().equals("advmod")) { IIndexWord iword = 
dict.getIndexWord(lemma, spos_); if (iword != null) candidates.put(lemma, "Heuristic-8: Adv-modified Verb"); else { // Sometimes adjectves get lemmatized to a verb // form which is in correct. Check to see if // the token matches String token = root.get(TextAnnotation.class); iword = dict.getIndexWord(token, spos_); if (iword != null) candidates.put(token, "Heuristic-8: Adv-modified Verb"); } } } } // Heuristic 9: if the sense is an adjective and the root lemma // begins with with a negative *and* the gloss contains something // like "not [x]", then pull out the "x" and use it as the hypernym if (sensePos == 'j' && lemma.equals("not")) { List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { // System.out.printf("edge from %s -> %s %s%n", lemma, // e.getRelation().getShortName(), // e.getRelation().getLongName()); if (e.getRelation().getShortName().equals("dep")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0); if (depPos == sensePos) { candidates.put(depLemma, "Heuristic-9: negated adj"); addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-9: negated adj"); } break; } } } // Heuristic 11: if the sense is a verb and the root lemma // is "to", this is probably a case of mistaken POS-tagging if (sensePos == 'v' && lemma.equals("to")) { List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root); for (SemanticGraphEdge e : edges) { if (e.getRelation().getShortName().equals("pobj")) { IndexedWord dep = e.getDependent(); IIndexWord iword = dict.getIndexWord(lemma, spos_); if (iword != null) candidates.put(lemma, "Heuristic-11: verbal infinitive"); else { // Sometimes verbs get lemmatized to a noun form // that is incorrect. 
Check to see if the token // matches String token = dep.get(TextAnnotation.class); iword = dict.getIndexWord(token, spos_); if (iword != null) candidates.put(token, "Heuristic-9: verbal infinitive"); } } } } } return candidates; }
From source file:ca.mcgill.cs.crown.procedure.ParseExtractor.java
License:Creative Commons License
/** * If we know we want {@code toAdd}, get all of its siblings that are joined * by conjunctions as candidates too//from w ww .ja v a 2s . co m */ void addSiblings(IndexedWord toAdd, MultiMap<String, String> candidates, char targetPos, SemanticGraph parse, String reason) { List<SemanticGraphEdge> edges = parse.outgoingEdgeList(toAdd); for (SemanticGraphEdge e : edges) { if (e.getRelation().getShortName().equals("conj")) { IndexedWord dep = e.getDependent(); String depLemma = dep.get(LemmaAnnotation.class); char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0); if (targetPos == depPos) { if (targetPos != 'v') { candidates.put(depLemma, reason + " (In conjunction)"); } // Check for phrasal verb particles else { List<SemanticGraphEdge> depEdges = parse.outgoingEdgeList(dep); for (SemanticGraphEdge e2 : depEdges) { if (e2.getRelation().getShortName().equals("prt")) { IndexedWord dep2 = e.getDependent(); depLemma = depLemma + " " + dep2.get(LemmaAnnotation.class); break; } } } } } } }
From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro.java
License:Open Source License
/**
 * Converts the CoreNLP dependency graphs attached to {@code document} into
 * DKPro {@code Dependency} annotations on {@code aJCas}.
 *
 * <p>For each sentence: every graph root becomes a self-referencing ROOT
 * dependency, and every edge becomes a {@code Dependency} whose type is
 * resolved through {@code mappingProvider}.
 *
 * @param aJCas           the CAS receiving the dependency annotations
 * @param document        the CoreNLP annotation holding parsed sentences
 * @param mappingProvider maps dependency labels to UIMA types
 * @param internStrings   if true, intern the relation-label strings
 */
public static void convertDependencies(JCas aJCas, Annotation document, MappingProvider mappingProvider,
        boolean internStrings) {
    for (CoreMap s : document.get(SentencesAnnotation.class)) {
        SemanticGraph graph = s.get(CollapsedDependenciesAnnotation.class);
        //SemanticGraph graph = s.get(EnhancedDependenciesAnnotation.class);
        // If there are no dependencies for this sentence, skip it. Might well mean we
        // skip all sentences because normally either there are dependencies for all or for
        // none.
        if (graph == null) {
            continue;
        }
        // Each root is modeled as a "root" dependency whose governor and
        // dependent are the same token.
        for (IndexedWord root : graph.getRoots()) {
            Dependency dep = new ROOT(aJCas);
            dep.setDependencyType("root");
            dep.setDependent(root.get(TokenKey.class));
            dep.setGovernor(root.get(TokenKey.class));
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(DependencyFlavor.BASIC);
            dep.addToIndexes();
        }
        for (SemanticGraphEdge edge : graph.edgeListSorted()) {
            Token dependent = edge.getDependent().get(TokenKey.class);
            Token governor = edge.getGovernor().get(TokenKey.class);
            // For the type mapping, we use getShortName() instead, because the <specific>
            // actually doesn't change the relation type
            String labelUsedForMapping = edge.getRelation().getShortName();
            // The nndepparser may produce labels in which the shortName contains a colon.
            // These represent language-specific labels of the UD, cf:
            // http://universaldependencies.github.io/docs/ext-dep-index.html
            labelUsedForMapping = StringUtils.substringBefore(labelUsedForMapping, ":");
            // Need to use toString() here to get "<shortname>_<specific>"
            String actualLabel = edge.getRelation().toString();
            Type depRel = mappingProvider.getTagType(labelUsedForMapping);
            Dependency dep = (Dependency) aJCas.getCas().createFS(depRel);
            dep.setDependencyType(internStrings ? actualLabel.intern() : actualLabel);
            dep.setDependent(dependent);
            dep.setGovernor(governor);
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            // Extra edges come from collapsing/propagation, not the basic tree.
            dep.setFlavor(edge.isExtra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC);
            dep.addToIndexes();
        }
    }
}
From source file:edu.cmu.deiis.annotator.StanfordCoreNLPAnnotator.java
License:Open Source License
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { Annotation document = this.processor.process(jCas.getDocumentText()); String lastNETag = "O"; int lastNEBegin = -1; int lastNEEnd = -1; for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) { // create the token annotation int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class); int end = tokenAnn.get(CharacterOffsetEndAnnotation.class); String pos = tokenAnn.get(PartOfSpeechAnnotation.class); String lemma = tokenAnn.get(LemmaAnnotation.class); Token token = new Token(jCas, begin, end); token.setPos(pos);//from w w w . j a va2 s . com token.setLemma(lemma); token.addToIndexes(); // hackery to convert token-level named entity tag into phrase-level tag String neTag = tokenAnn.get(NamedEntityTagAnnotation.class); if (neTag.equals("O") && !lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } else { if (lastNETag.equals("O")) { lastNEBegin = begin; } else if (lastNETag.equals(neTag)) { // do nothing - begin was already set } else { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); lastNEBegin = begin; } lastNEEnd = end; } lastNETag = neTag; } if (!lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } // add sentences and trees for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) { // add the sentence annotation int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class); int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class); Sentence sentence = new Sentence(jCas, sentBegin, sentEnd); sentence.addToIndexes(); // add the syntactic tree annotation List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class); Tree tree = sentenceAnn.get(TreeAnnotation.class); if 
(tree.children().length != 1) { throw new RuntimeException("Expected single root node, found " + tree); } tree = tree.firstChild(); tree.indexSpans(0); TopTreebankNode root = new TopTreebankNode(jCas); root.setTreebankParse(tree.toString()); // TODO: root.setTerminals(v) this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns); // get the dependencies SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class); // convert Stanford nodes to UIMA annotations List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence); Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>(); for (IndexedWord stanfordNode : dependencies.vertexSet()) { int indexBegin = stanfordNode.get(BeginIndexAnnotation.class); int indexEnd = stanfordNode.get(EndIndexAnnotation.class); int tokenBegin = tokens.get(indexBegin).getBegin(); int tokenEnd = tokens.get(indexEnd - 1).getEnd(); DependencyNode node; if (dependencies.getRoots().contains(stanfordNode)) { node = new TopDependencyNode(jCas, tokenBegin, tokenEnd); } else { node = new DependencyNode(jCas, tokenBegin, tokenEnd); } stanfordToUima.put(stanfordNode, node); } // create relation annotations for each Stanford dependency ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create(); ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create(); for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) { DependencyRelation relation = new DependencyRelation(jCas); DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor()); DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent()); String relationType = stanfordEdge.getRelation().toString(); if (head == null || child == null || relationType == null) { throw new RuntimeException(String.format( "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation, child, head)); } 
relation.setHead(head); relation.setChild(child); relation.setRelation(relationType); relation.addToIndexes(); headRelations.put(child, relation); childRelations.put(head, relation); } // set the relations for each node annotation for (DependencyNode node : stanfordToUima.values()) { List<DependencyRelation> heads = headRelations.get(node); node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size())); if (heads != null) { FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads); } List<DependencyRelation> children = childRelations.get(node); node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size())); if (children != null) { FSCollectionFactory.fillArrayFS(node.getChildRelations(), children); } node.addToIndexes(); } } // map from spans to named entity mentions Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>(); for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention); } // add mentions for all entities identified by the coreference system List<NamedEntity> entities = new ArrayList<NamedEntity>(); List<List<Token>> sentenceTokens = new ArrayList<List<Token>>(); for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence)); } Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class); for (CorefChain chain : corefChains.values()) { List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>(); for (CorefMention corefMention : chain.getMentionsInTextualOrder()) { // figure out the character span of the token List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1); int begin = tokens.get(corefMention.startIndex - 1).getBegin(); int end = tokens.get(corefMention.endIndex - 2).getEnd(); // use an existing named entity mention when possible; otherwise create a new one 
NamedEntityMention mention = spanMentionMap.get(new Span(begin, end)); if (mention == null) { mention = new NamedEntityMention(jCas, begin, end); mention.addToIndexes(); } mentions.add(mention); } // create an entity for the mentions Collections.sort(mentions, new Comparator<NamedEntityMention>() { @Override public int compare(NamedEntityMention m1, NamedEntityMention m2) { return m1.getBegin() - m2.getBegin(); } }); // create mentions and add them to entity NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, mentions.size())); int index = 0; for (NamedEntityMention mention : mentions) { mention.setMentionedEntity(entity); entity.setMentions(index, mention); index += 1; } entities.add(entity); } // add singleton entities for any named entities not picked up by coreference system for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { if (mention.getMentionedEntity() == null) { NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, 1)); entity.setMentions(0, mention); mention.setMentionedEntity(entity); entity.getMentions(); entities.add(entity); } } // sort entities by document order Collections.sort(entities, new Comparator<NamedEntity>() { @Override public int compare(NamedEntity o1, NamedEntity o2) { return getFirstBegin(o1) - getFirstBegin(o2); } private int getFirstBegin(NamedEntity entity) { int min = Integer.MAX_VALUE; for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) { if (mention.getBegin() < min) { min = mention.getBegin(); } } return min; } }); // add entities to document for (NamedEntity entity : entities) { entity.addToIndexes(); } }
From source file:edu.cmu.deiis.annotators.StanfordAnnotator.java
License:Open Source License
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { Annotation document = this.processor.process(jCas.getDocumentText()); String lastNETag = "O"; int lastNEBegin = -1; int lastNEEnd = -1; for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) { // create the token annotation int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class); int end = tokenAnn.get(CharacterOffsetEndAnnotation.class); String pos = tokenAnn.get(PartOfSpeechAnnotation.class); String lemma = tokenAnn.get(LemmaAnnotation.class); Token token = new Token(jCas, begin, end); token.setPos(pos);/* w w w.j a v a 2 s .co m*/ token.setLemma(lemma); token.addToIndexes(); // hackery to convert token-level named entity tag into phrase-level tag String neTag = tokenAnn.get(NamedEntityTagAnnotation.class); if (neTag.equals("O") && !lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } else { if (lastNETag.equals("O")) { lastNEBegin = begin; } else if (lastNETag.equals(neTag)) { // do nothing - begin was already set } else { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); lastNEBegin = begin; } lastNEEnd = end; } lastNETag = neTag; } if (!lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } // add sentences and trees for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) { // add the sentence annotation int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class); int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class); Sentence sentence = new Sentence(jCas, sentBegin, sentEnd); sentence.addToIndexes(); // add the syntactic tree annotation List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class); Tree tree = sentenceAnn.get(TreeAnnotation.class); if 
(tree.children().length != 1) { throw new RuntimeException("Expected single root node, found " + tree); } tree = tree.firstChild(); tree.indexSpans(0); TopTreebankNode root = new TopTreebankNode(jCas); root.setTreebankParse(tree.toString()); // TODO: root.setTerminals(v) this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns); // get the dependencies SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class); // convert Stanford nodes to UIMA annotations List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence); Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>(); for (IndexedWord stanfordNode : dependencies.vertexSet()) { int indexBegin = stanfordNode.get(BeginIndexAnnotation.class); int indexEnd = stanfordNode.get(EndIndexAnnotation.class); int tokenBegin = tokens.get(indexBegin).getBegin(); int tokenEnd = tokens.get(indexEnd - 1).getEnd(); DependencyNode node; if (dependencies.getRoots().contains(stanfordNode)) { node = new TopDependencyNode(jCas, tokenBegin, tokenEnd); } else { node = new DependencyNode(jCas, tokenBegin, tokenEnd); } stanfordToUima.put(stanfordNode, node); } // create relation annotations for each Stanford dependency ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create(); ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create(); for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) { DependencyRelation relation = new DependencyRelation(jCas); DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor()); DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent()); String relationType = stanfordEdge.getRelation().toString(); if (head == null || child == null || relationType == null) { throw new RuntimeException(String.format( "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation, child, head)); } 
relation.setHead(head); relation.setChild(child); relation.setRelation(relationType); relation.addToIndexes(); headRelations.put(child, relation); childRelations.put(head, relation); } // set the relations for each node annotation for (DependencyNode node : stanfordToUima.values()) { List<DependencyRelation> heads = headRelations.get(node); node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size())); if (heads != null) { FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads); } List<DependencyRelation> children = childRelations.get(node); node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size())); if (children != null) { FSCollectionFactory.fillArrayFS(node.getChildRelations(), children); } node.addToIndexes(); } } // map from spans to named entity mentions Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>(); for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention); } // add mentions for all entities identified by the coreference system List<NamedEntity> entities = new ArrayList<NamedEntity>(); List<List<Token>> sentenceTokens = new ArrayList<List<Token>>(); for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence)); } Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class); for (CorefChain chain : corefChains.values()) { List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>(); for (CorefMention corefMention : chain.getMentionsInTextualOrder()) { // figure out the character span of the token List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1); int begin = tokens.get(corefMention.startIndex - 1).getBegin(); int end = tokens.get(corefMention.endIndex - 2).getEnd(); // use an existing named entity mention when possible; otherwise create a new one 
NamedEntityMention mention = spanMentionMap.get(new Span(begin, end)); if (mention == null) { mention = new NamedEntityMention(jCas, begin, end); //String line = mention.getCoveredText(); //System.out.println(line); mention.addToIndexes(); } mentions.add(mention); } // create an entity for the mentions Collections.sort(mentions, new Comparator<NamedEntityMention>() { @Override public int compare(NamedEntityMention m1, NamedEntityMention m2) { return m1.getBegin() - m2.getBegin(); } }); // create mentions and add them to entity NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, mentions.size())); int index = 0; for (NamedEntityMention mention : mentions) { mention.setMentionedEntity(entity); entity.setMentions(index, mention); index += 1; } entities.add(entity); } // add singleton entities for any named entities not picked up by coreference system for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { if (mention.getMentionedEntity() == null) { NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, 1)); entity.setMentions(0, mention); mention.setMentionedEntity(entity); entity.getMentions(); entities.add(entity); } } // sort entities by document order Collections.sort(entities, new Comparator<NamedEntity>() { @Override public int compare(NamedEntity o1, NamedEntity o2) { return getFirstBegin(o1) - getFirstBegin(o2); } private int getFirstBegin(NamedEntity entity) { int min = Integer.MAX_VALUE; for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) { if (mention.getBegin() < min) { min = mention.getBegin(); } } return min; } }); // add entities to document for (NamedEntity entity : entities) { //NamedEntityMention mention=entity.getMentions(3); //System.out.println(mention.getBegin()); entity.addToIndexes(); } }
From source file:knu.univ.lingvo.coref.Mention.java
License:Open Source License
/**
 * Walks up the dependency graph from the mention's head word and returns the
 * nearest ancestor whose POS tag starts with "V", paired with the relation
 * by which the head attaches to its first parent.
 *
 * <p>Returns an empty {@code Pair} (both elements null) when the head node
 * cannot be found, has no verbal ancestor reachable this way, or any lookup
 * fails.
 *
 * @param m the mention whose governing verb is sought
 * @return pair of (dependent verb or null, first parent relation or null)
 */
private static Pair<IndexedWord, String> findDependentVerb(Mention m) {
    Pair<IndexedWord, String> ret = new Pair<IndexedWord, String>();
    // presumably converts the mention's 0-based head index to the 1-based
    // index scheme used by the dependency graph — TODO confirm
    int headIndex = m.headIndex + 1;
    try {
        IndexedWord w = m.dependency.getNodeByIndex(headIndex);
        if (w == null)
            return ret;
        while (true) {
            IndexedWord p = null;
            for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
                // only the relation seen on the very first iteration is
                // recorded; later parents overwrite p but not the relation
                if (ret.second() == null) {
                    String relation = parent.first().getShortName();
                    ret.setSecond(relation);
                }
                // if a node has multiple parents, the last one wins
                p = parent.second();
            }
            // stop at the top of the graph (p == null) or at the first verb
            if (p == null || p.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("V")) {
                ret.setFirst(p);
                break;
            }
            // guard against self-loops that would otherwise spin forever
            if (w == p)
                return ret;
            w = p;
        }
    } catch (Exception e) {
        // NOTE(review): broad catch silently swallows any failure (e.g. a
        // missing node index) and returns the partial result — consider
        // narrowing to the specific exception getNodeByIndex can throw
        return ret;
    }
    return ret;
}
From source file:org.textmining.annotator.StanfordCoreNlpAnnotator.java
License:Open Source License
/**
 * Runs the configured Stanford CoreNLP pipeline over the document text and
 * converts its output into UIMA annotations on the JCas: Tokens (with POS and
 * lemma), phrase-level NamedEntityMentions, Sentences, treebank parse nodes,
 * dependency nodes/relations, and coreference-derived NamedEntities.
 *
 * @param jCas the CAS whose document text is analyzed and annotated in place
 * @throws AnalysisEngineProcessException per the UIMA annotator contract
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    // ---- Tokens and phrase-level named entities ----
    // Stanford emits one NE tag per token; consecutive tokens carrying the
    // same non-"O" tag are merged below into a single NamedEntityMention.
    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            // A run of identical NE tags just ended: emit the accumulated span.
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                // First token of a new NE run.
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                // Tag changed without an intervening "O": close the previous
                // mention and start a new run at this token.
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    // Flush a NE run that extends to the end of the token stream.
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        // Drop the artificial ROOT wrapper, then compute token spans per node.
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            // BeginIndex/EndIndex are token offsets within the sentence; map
            // them back to character offsets via the covered Token list.
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            // Index each relation from both ends: by its child node (the
            // relations leading to its head) and by its head node (the
            // relations leading to its children).
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            // (sentNum and startIndex are 1-based; endIndex is 1-based and
            // exclusive, hence the -2 to reach the last covered token)
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                // NOTE(review): int subtraction can overflow in general;
                // safe here only because character offsets are non-negative.
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions(); // NOTE(review): return value unused — appears to be a no-op; confirm and remove
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        // Smallest begin offset over all of the entity's mentions.
        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(),
                    NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }
    //end of process-method
}