List of usage examples for edu.stanford.nlp.semgraph SemanticGraph getRoots
public Collection<IndexedWord> getRoots()
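getRoots() returns every root vertex of the dependency graph. For a normally parsed sentence this is a single IndexedWord, but the collection can be empty (for an empty graph) or hold several roots, so iterating is the safe pattern. Before the project-specific examples below, here is a minimal, self-contained sketch of the call. It is a hedged illustration, not code from any of the projects on this page: it assumes a CoreNLP release that ships the depparse annotator, and the class name GetRootsDemo and the sample sentence are invented for the example.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

public class GetRootsDemo {
    public static void main(String[] args) {
        // Build a pipeline that produces dependency graphs
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, depparse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document = new Annotation("The quick brown fox jumps over the lazy dog.");
        pipeline.annotate(document);

        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            SemanticGraph graph = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
            // getRoots() returns the root vertices of the graph; iterate rather
            // than calling getFirstRoot() so an empty graph is handled gracefully.
            for (IndexedWord root : graph.getRoots()) {
                System.out.println(root.word() + "/" + root.tag());
            }
        }
    }
}

For the sample sentence this should print something like jumps/VBZ, the single root of the dependency graph.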
From source file:ca.mcgill.cs.crown.procedure.ParseExtractor.java
License:Creative Commons License
/**
 * Gets the candidate hypernyms from the provided subdef.
 *
 * @return a mapping from the candidate to the heuristics that generated it
 */
MultiMap<String, String> getCandidates(SemanticGraph dependencies, String subdef, POS spos_) {
    MultiMap<String, String> candidates = new HashMultiMap<String, String>();
    char sensePos = toChar(spos_);
    Collection<IndexedWord> roots = dependencies.getRoots();
    next_root: for (IndexedWord root : roots) {
        String word = root.get(TextAnnotation.class);
        String lemma = root.get(LemmaAnnotation.class);
        String pos = root.get(PartOfSpeechAnnotation.class);
        char lemmaPos = pos.substring(0, 1).toLowerCase().charAt(0);
        String lemmaLc = lemma.toLowerCase();
        //System.out.println("testing: " + lemma + "/" + pos);

        // If the lemma is a verb, check for a phrasal verb particle (e.g.,
        // "lead on", "edge out") and, if present, append it to the lemma
        if (lemmaPos == 'v') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("prt")) {
                    IndexedWord dep = e.getDependent();
                    lemma = lemma + " " + dep.get(LemmaAnnotation.class);
                    break;
                }
            }
        }

        // Heuristic 1: root matches exact POS
        if (lemmaPos == sensePos) {
            // Edge case for Heuristic 7: if the lemma is a noun saying that
            // this is an instance (e.g., "An instance of ..."), then we take
            // the dependent noun of "instance".
            //
            // Terrible example:
            //   The second of the two Books of Chronicles and the
            //   fourteenth book of the Old Testament of the Bible.
            boolean foundExistentialDependent = false;
            if (lemma.equals("instance") || lemma.equals("example") || lemma.equals("first")
                    || lemma.equals("second") || lemma.equals("third") || lemma.equals("fourth")
                    || lemma.equals("fifth") || lemma.equals("sixth") || lemma.equals("series")) {
                // Check that there's actually a prepositional phrase attached
                List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
                for (SemanticGraphEdge e : edges) {
                    if (e.getRelation().getShortName().equals("prep")) {
                        IndexedWord dep = e.getDependent();
                        String depLemma = dep.get(LemmaAnnotation.class);
                        char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase()
                                .charAt(0);
                        //System.out.println("HEURISTIC 7");
                        if (depPos == sensePos) {
                            candidates.put(depLemma, "Heuristic-7");
                            addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-7");
                            foundExistentialDependent = true;
                        }
                    }
                }
            }
            if (foundExistentialDependent)
                continue next_root;

            // Heuristic 10: in the case of noun phrases, take the last noun
            // in the phrase, e.g., "Molten material", "springtime snow runoff"
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            boolean foundDependent = false;
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);
                    //System.out.println("HEURISTIC 10");
                    if (depPos == sensePos) {
                        foundDependent = true;
                        candidates.put(depLemma, "Heuristic-10");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-10");
                    }
                }
            }
            if (!foundDependent) {
                //System.out.println("HEURISTIC 1");
                candidates.put(lemma, "Heuristic-1");
                addSiblings(root, candidates, sensePos, dependencies, "Heuristic-1");
            }
        }

        // Heuristic 2: the subdef is either (1) one word or (2) two or more
        // words that must be connected by a conjunction, and (3) the lemma
        // has the wrong part of speech but could have the same POS (i.e.,
        // the lemma was probably POS-tagged incorrectly).
        if (sensePos != lemmaPos) {
            // Only one word in the subdef, which can manifest itself as the
            // graph having no vertices! (size == 0)
            if (dependencies.size() < 1) {
                //System.out.println("HEURISTIC 2a");
                IIndexWord iword = dict.getIndexWord(lemma, spos_);
                if (iword != null)
                    candidates.put(lemma, "Heuristic-2a");
                else {
                    // Sometimes adjectives get lemmatized to a verb form,
                    // which is incorrect. Check whether the token matches
                    String token = root.get(TextAnnotation.class);
                    iword = dict.getIndexWord(token, spos_);
                    if (iword != null)
                        candidates.put(token, "Heuristic-2a");
                }
            } else {
                //System.out.println("HEURISTIC 2b");
                Set<IndexedWord> tmp = new HashSet<IndexedWord>();
                List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
                for (SemanticGraphEdge e : edges) {
                    //System.out.printf("edge from %s -> %s %s%n", lemma,
                    //        e.getRelation().getShortName(), e.getRelation().getLongName());
                    if (e.getRelation().getShortName().equals("conj")) {
                        if (tmp.size() == 0)
                            tmp.add(root);
                        tmp.add(e.getDependent());
                    }
                }
                if (!tmp.isEmpty()) {
                    for (IndexedWord iw : tmp) {
                        String lem = iw.get(LemmaAnnotation.class);
                        IIndexWord iword = dict.getIndexWord(lem, spos_);
                        if (iword != null)
                            candidates.put(lem, "Heuristic-2b");
                        else {
                            // Sometimes adjectives get lemmatized to a verb
                            // form, which is incorrect. Check whether the
                            // token matches
                            String token = iw.get(TextAnnotation.class);
                            iword = dict.getIndexWord(token, spos_);
                            if (iword != null)
                                candidates.put(token, "Heuristic-2b");
                        }
                    }
                    //System.out.println(tmp);
                }
            }
        }

        // Heuristic 3: the subdef is phrased as an overly-general description
        // of a person using "one", e.g., "one who does X". Replace this with
        // "person"
        if (sensePos == 'n' && (lemma.equals("one") || lemma.equals("someone"))) {
            // check the dependency graph for a "who" attachment
            // TODO
            // ... or be lazy and just check for the token
            Matcher m = WHO.matcher(subdef);
            if (m.find()) {
                candidates.put("person", "Heuristic-3: Person");
            }
        }

        // Heuristic 4: if the root lemma is an adjective and the target
        // sense is a noun, look for a modified noun or set of nouns and
        // report those.
        //
        // Example: "a small, arched passageway"
        if (sensePos == 'n' && lemmaPos == 'j') {
            //System.out.println("HEURISTIC 4");
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                //System.out.printf("edge from %s -> %s %s%n", lemma,
                //        e.getRelation().getShortName(), e.getRelation().getLongName());
                if (e.getRelation().getShortName().equals("appos")
                        || e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    //System.out.println("!!! " + depLemma);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);
                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-4: Head Noun");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-4: Head Noun");
                    }
                    //break;
                }
            }
        }

        // Heuristic 5: if the root lemma is a verb and the target sense is
        // a noun, look for a subject noun
        if (sensePos == 'n' && lemmaPos == 'v') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("nsubj")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);
                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-5: Subject Noun");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-5: Subject Noun");
                    }
                    break;
                }
            }
        }

        // Heuristic 6: if the root lemma is an existential quantifier or
        // something like it (e.g., "Any of ...") and the target sense is a
        // noun, look for a subject noun
        if (sensePos == 'n' && lemmaPos == 'd') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                //System.out.printf("edge from %s -> %s %s%n", lemma,
                //        e.getRelation().getShortName(), e.getRelation().getLongName());
                if (e.getRelation().getShortName().equals("prep")
                        || e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);
                    //System.out.println(depLemma + "/" + depPos);
                    // This should be the common case
                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-6: Existential Example");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-6: Existential Example");
                    }
                    // This is for some really (really) unusually parsed edge cases
                    else {
                        List<SemanticGraphEdge> depEdges = dependencies.outgoingEdgeList(dep);
                        for (SemanticGraphEdge e2 : depEdges) {
                            if (e2.getRelation().getShortName().equals("rcmod")) {
                                IndexedWord dep2 = e2.getDependent();
                                String depLemma2 = dep2.get(LemmaAnnotation.class);
                                char depPos2 = dep2.get(PartOfSpeechAnnotation.class).substring(0, 1)
                                        .toLowerCase().charAt(0);
                                if (depPos2 == sensePos) {
                                    candidates.put(depLemma2, "Heuristic-6: Existential Example");
                                    addSiblings(dep2, candidates, sensePos, dependencies,
                                            "Heuristic-6: Existential Example");
                                }
                            }
                        }
                    }
                }
            }
        }

        // Heuristic 8: if the root lemma is a verb, the sense is an
        // adjective, and the verb is modified by an adverb; this catches
        // cases that Heuristic 2 does not
        if (sensePos == 'j' && lemmaPos == 'v') {
            Set<IndexedWord> tmp = new HashSet<IndexedWord>();
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                //System.out.printf("edge from %s -> %s %s%n", lemma,
                //        e.getRelation().getShortName(), e.getRelation().getLongName());
                if (e.getRelation().getShortName().equals("advmod")) {
                    IIndexWord iword = dict.getIndexWord(lemma, spos_);
                    if (iword != null)
                        candidates.put(lemma, "Heuristic-8: Adv-modified Verb");
                    else {
                        // Sometimes adjectives get lemmatized to a verb form,
                        // which is incorrect. Check whether the token matches
                        String token = root.get(TextAnnotation.class);
                        iword = dict.getIndexWord(token, spos_);
                        if (iword != null)
                            candidates.put(token, "Heuristic-8: Adv-modified Verb");
                    }
                }
            }
        }

        // Heuristic 9: if the sense is an adjective and the root lemma
        // begins with a negative *and* the gloss contains something like
        // "not [x]", then pull out the "x" and use it as the hypernym
        if (sensePos == 'j' && lemma.equals("not")) {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                //System.out.printf("edge from %s -> %s %s%n", lemma,
                //        e.getRelation().getShortName(), e.getRelation().getLongName());
                if (e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);
                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-9: negated adj");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-9: negated adj");
                    }
                    break;
                }
            }
        }

        // Heuristic 11: if the sense is a verb and the root lemma is "to",
        // this is probably a case of mistaken POS-tagging
        if (sensePos == 'v' && lemma.equals("to")) {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("pobj")) {
                    IndexedWord dep = e.getDependent();
                    IIndexWord iword = dict.getIndexWord(lemma, spos_);
                    if (iword != null)
                        candidates.put(lemma, "Heuristic-11: verbal infinitive");
                    else {
                        // Sometimes verbs get lemmatized to a noun form that
                        // is incorrect. Check whether the token matches
                        String token = dep.get(TextAnnotation.class);
                        iword = dict.getIndexWord(token, spos_);
                        if (iword != null)
                            candidates.put(token, "Heuristic-11: verbal infinitive");
                    }
                }
            }
        }
    }
    return candidates;
}
From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro.java
License:Open Source License
public static void convertDependencies(JCas aJCas, Annotation document, MappingProvider mappingProvider,
        boolean internStrings) {
    for (CoreMap s : document.get(SentencesAnnotation.class)) {
        SemanticGraph graph = s.get(CollapsedDependenciesAnnotation.class);
        //SemanticGraph graph = s.get(EnhancedDependenciesAnnotation.class);

        // If there are no dependencies for this sentence, skip it. Might well mean we
        // skip all sentences, because normally there are dependencies either for all or for none.
        if (graph == null) {
            continue;
        }

        for (IndexedWord root : graph.getRoots()) {
            Dependency dep = new ROOT(aJCas);
            dep.setDependencyType("root");
            dep.setDependent(root.get(TokenKey.class));
            dep.setGovernor(root.get(TokenKey.class));
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(DependencyFlavor.BASIC);
            dep.addToIndexes();
        }

        for (SemanticGraphEdge edge : graph.edgeListSorted()) {
            Token dependent = edge.getDependent().get(TokenKey.class);
            Token governor = edge.getGovernor().get(TokenKey.class);

            // For the type mapping, we use getShortName() instead, because the <specific>
            // actually doesn't change the relation type
            String labelUsedForMapping = edge.getRelation().getShortName();

            // The nndepparser may produce labels in which the shortName contains a colon.
            // These represent language-specific labels of the UD, cf:
            // http://universaldependencies.github.io/docs/ext-dep-index.html
            labelUsedForMapping = StringUtils.substringBefore(labelUsedForMapping, ":");

            // Need to use toString() here to get "<shortname>_<specific>"
            String actualLabel = edge.getRelation().toString();

            Type depRel = mappingProvider.getTagType(labelUsedForMapping);
            Dependency dep = (Dependency) aJCas.getCas().createFS(depRel);
            dep.setDependencyType(internStrings ? actualLabel.intern() : actualLabel);
            dep.setDependent(dependent);
            dep.setGovernor(governor);
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(edge.isExtra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC);
            dep.addToIndexes();
        }
    }
}
From source file:edu.cmu.deiis.annotator.StanfordCoreNLPAnnotator.java
License:Open Source License
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {
            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to the entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by the coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(),
                    NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }
}
From source file:edu.jhu.hlt.concrete.stanford.PreNERCoreMapWrapper.java
License:Open Source License
private List<Dependency> makeDependencies(SemanticGraph graph) {
    List<Dependency> depList = new ArrayList<Dependency>();
    for (IndexedWord root : graph.getRoots()) {
        // this mimics CoreNLP's handling
        String rel = GrammaticalRelation.ROOT.getLongName().replaceAll("\\s+", "");
        int dep = root.index() - 1;
        Dependency depend = DependencyFactory.create(dep, rel);
        depList.add(depend);
    }
    for (SemanticGraphEdge edge : graph.edgeListSorted()) {
        String rel = edge.getRelation().toString().replaceAll("\\s+", "");
        int gov = edge.getSource().index() - 1;
        int dep = edge.getTarget().index() - 1;
        Dependency depend = DependencyFactory.create(dep, rel, gov);
        depList.add(depend);
    }
    return depList;
}
From source file:featureExtractor.NLPFeatures.java
static void processLine(String text, int lineId) throws IOException {
    bw_root.write(Integer.toString(lineId));
    bw_subj.write(Integer.toString(lineId));
    bw_underRoot.write(Integer.toString(lineId));
    bw_nerType.write(Integer.toString(lineId));

    //text = "A gigantic Hong Kong set was constructed in downtown Detroit. The set was so big that the Detroit People Mover track ended up becoming part of the set and shooting had to be adjusted to allow the track to move through the set. ";//"One of three new television series scheduled for release in 2014 based on DC Comics characters. The others being Constantine (2014) and The Flash (2014). ";

    HashMap<String, Integer> nerCount = new HashMap<>();
    int superlativePOS = 0;

    try {
        Annotation document = new Annotation(text);
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);

            // getting root words
            for (IndexedWord rword : dependencies.getRoots()) {
                //System.out.println(rword.lemma());
                //System.out.println(rword.ner());
                if (rword.ner().equals("O"))
                    bw_root.write("\t" + rword.ner() + ":" + rword.lemma());
                //else if(rword.ner().equals("PERSON"))
                else
                    bw_root.write("\t" + rword.ner() + ":" + rword.originalText());
                /* else bw_root.write(" entity_" + rword.ner()); */

                // under root
                for (IndexedWord child : dependencies.getChildren(rword)) {
                    //System.out.println("here: " + child.originalText());
                    /* if(child.ner().equals("PERSON")) bw_underRoot.write(" " + child.originalText()); else */
                    if (!child.ner().equals("O"))
                        bw_underRoot.write("\t" + child.ner() + ":" + child.originalText());
                }

                // nsubj | nsubjpass words
                GrammaticalRelation[] subjects = { EnglishGrammaticalRelations.NOMINAL_SUBJECT,
                        EnglishGrammaticalRelations.NOMINAL_PASSIVE_SUBJECT };
                for (IndexedWord current : dependencies.descendants(rword)) {
                    for (IndexedWord nsubWord : dependencies.getChildrenWithRelns(current,
                            Arrays.asList(subjects))) {
                        //System.out.println("wow: " + nsubWord.originalText());
                        if (!nsubWord.ner().equals("O"))
                            bw_subj.write("\t" + nsubWord.ner() + ":" + nsubWord.originalText());
                        else {
                            //System.out.println(nsubWord.lemma());
                            bw_subj.write("\t" + nsubWord.ner() + ":" + nsubWord.lemma());
                        }
                        /* else bw_subj.write(" entity_"+nsubWord.ner()); */
                    }
                }
            }

            // NER type frequency
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                if (pos.equals("JJS") || pos.equals("RBS"))
                    superlativePOS++;
                nerCount.putIfAbsent(ne, 0);
                nerCount.put(ne, nerCount.get(ne) + 1);
            }
            //System.out.println("dependency graph:\n" + dependencies);
        }
    } catch (Exception e) {
        System.out.println("IGNORED:");
    }

    bw_nerType.write("\t" + Integer.toString(superlativePOS));
    for (String ne : ners) {
        if (nerCount.containsKey(ne))
            bw_nerType.write("\t" + nerCount.get(ne).toString());
        else
            bw_nerType.write("\t0");
    }

    bw_root.write("\n");
    bw_underRoot.write("\n");
    bw_nerType.write("\n");
    bw_subj.write("\n");

    if (lineId % 25 == 0) {
        bw_root.flush();
        bw_underRoot.flush();
        bw_nerType.flush();
        bw_subj.flush();
    }
}
From source file:ie.pars.bnc.preprocess.ProcessNLP.java
License:Open Source License
/**
 * @param inputStreamFile
 * @param morphology
 * @param posTagger
 * @param parser
 * @return
 * @throws Exception
 */
public static StringBuilder parseBNCXML(InputStream inputStreamFile, Morphology morphology,
        MaxentTagger posTagger, ParserGrammar parser) throws Exception {
    StringBuilder results = new StringBuilder();
    int counterSent = 0;
    List<List<List<WordLemmaTag>>> parseBNCXMLTokenized = parseBNCXMLTokenized(inputStreamFile);
    for (List<List<WordLemmaTag>> xparseBNCXMLL : parseBNCXMLTokenized) {
        results.append("<p>\n");
        for (List<WordLemmaTag> para : xparseBNCXMLL) {
            if (counterSent++ % 20 == 0) {
                System.out.print(".");
            }
            results.append("<s>\n");
            List<TaggedWord> tagSentence = posTagger.tagSentence(para, true);
            Tree parseTree = parser.parse(tagSentence);
            GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
                    parser.treebankLanguagePack().punctuationWordRejectFilter(),
                    parser.getTLPParams().typedDependencyHeadFinder());
            Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
            SemanticGraph depTree = new SemanticGraph(deps);

            for (int i = 0; i < tagSentence.size(); ++i) {
                int head = -1;
                String deprel = null;
                // if (depTree != null) {
                Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index)
                        .collect(Collectors.toSet());
                IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
                if (node != null) {
                    List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
                    if (!edgeList.isEmpty()) {
                        assert edgeList.size() == 1;
                        head = edgeList.get(0).getGovernor().index();
                        deprel = edgeList.get(0).getRelation().toString();
                    } else if (rootSet.contains(i + 1)) {
                        head = 0;
                        deprel = "ROOT";
                    }
                }
                // }

                // Write the token
                TaggedWord lexHead = null;
                if (head > 0) {
                    lexHead = tagSentence.get(head - 1);
                }
                results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
            }
            results.append("</s>\n");
        }
        results.append("</p>\n");
    }
    System.out.println("");
    inputStreamFile.close();
    return results;
}
From source file:ie.pars.bnc.preprocess.ProcessNLP.java
License:Open Source License
private static StringBuilder parseTheSentence(String sentence, Morphology morphology, MaxentTagger posTagger,
        ParserGrammar parser, String sid) {
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory();
    // TokenizerFactory<WordLemmaTag> tokenizerFactory;
    // TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    // TokenizerFactory<Word> factory1 = PTBTokenizer.factory();

    StringBuilder results = new StringBuilder();
    results.append("<s id='" + sid + "'>\n");

    StringReader sr = new StringReader(sentence);
    Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
    List<Word> tokenize = tokenizer.tokenize();

    List<TaggedWord> tagSentence = posTagger.tagSentence(tokenize);
    Tree parseTree = parser.parse(tagSentence);
    GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
            parser.treebankLanguagePack().punctuationWordRejectFilter(),
            parser.getTLPParams().typedDependencyHeadFinder());
    Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
    SemanticGraph depTree = new SemanticGraph(deps);

    for (int i = 0; i < tagSentence.size(); ++i) {
        int head = -1;
        String deprel = null;
        // if (depTree != null) {
        Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
        IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
        if (node != null) {
            List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
            if (!edgeList.isEmpty()) {
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
            } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
            }
        }
        // }

        // Write the token
        TaggedWord lexHead = null;
        if (head > 0) {
            lexHead = tagSentence.get(head - 1);
        }
        results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
    }
    results.append("</s>\n");
    return results;
}
From source file:org.sam_agent.csparser.ContinuousParser.java
License:Open Source License
/**
 * Parse a sentence with the Stanford Parser, returning a JSON string of the
 * dependencies and part-of-speech tags.
 * @param text
 * @return
 */
public String parse(String text) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> sentencesList = new ArrayList<String>();

    for (CoreMap sentence : sentences) {
        String sentenceString = sentence.get(CoreAnnotations.TextAnnotation.class);
        String wordsJSON = stringify(sentence);
        SemanticGraph dependencies = sentence
                .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
        String dependenciesJSON = stringify(dependencies);
        String rootsJSON = stringify(dependencies.getRoots());
        sentencesList.add(String.format("{\"sentence\":\"%s\",%s,%s,\"roots\":%s}", sentenceString, wordsJSON,
                dependenciesJSON, rootsJSON));
    }

    return String.format("{\"input\":\"%s\",\"sentences\":[%s]}", text, String.join(",", sentencesList));
}