Example usage for edu.stanford.nlp.semgraph SemanticGraph getRoots

List of usage examples for edu.stanford.nlp.semgraph SemanticGraph getRoots

Introduction

On this page you can find example usages of edu.stanford.nlp.semgraph SemanticGraph.getRoots().

Prototype

public Collection<IndexedWord> getRoots() 
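
A minimal, self-contained sketch of how getRoots() is usually reached: build a CoreNLP pipeline, annotate some text, fetch each sentence's SemanticGraph, and iterate over the roots. The pipeline configuration, class name, and example sentence are illustrative assumptions, not taken from the usage examples below.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

public class GetRootsExample {
    public static void main(String[] args) {
        // Assumes the models for the "depparse" annotator are on the classpath.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document = new Annotation("The cat sat on the mat.");
        pipeline.annotate(document);

        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
            // getRoots() returns a Collection: a graph can have zero or several
            // roots, although most well-formed sentences have exactly one.
            for (IndexedWord root : dependencies.getRoots()) {
                System.out.println(root.word() + "/" + root.tag());
            }
        }
    }
}

For this sentence the root is typically the main verb, so the sketch would print something like sat/VBD.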

Usage

From source file:ca.mcgill.cs.crown.procedure.ParseExtractor.java

License:Creative Commons License

/** 
 * Gets the candidate hypernyms from the provided subdef.
 *
 * @return a mapping from each candidate to the heuristics that generated it
 */
MultiMap<String, String> getCandidates(SemanticGraph dependencies, String subdef, POS spos_) {

    MultiMap<String, String> candidates = new HashMultiMap<String, String>();
    char sensePos = toChar(spos_);

    Collection<IndexedWord> roots = dependencies.getRoots();
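    // A subdef's parse may have more than one root; the labeled loop lets
    // Heuristic 7 below skip straight to the next root once it fires.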
    next_root: for (IndexedWord root : roots) {
        String word = root.get(TextAnnotation.class);
        String lemma = root.get(LemmaAnnotation.class);
        String pos = root.get(PartOfSpeechAnnotation.class);
        char lemmaPos = pos.substring(0, 1).toLowerCase().charAt(0);

        String lemmaLc = lemma.toLowerCase();

        //System.out.println("testing: " + lemma + "/" + pos);

        // If the lemma is a verb, check for a phrasal verb particle (e.g.,
        // "lead on", "edge out") and, if present, append it to the lemma
        if (lemmaPos == 'v') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("prt")) {
                    IndexedWord dep = e.getDependent();
                    lemma = lemma + " " + dep.get(LemmaAnnotation.class);
                    break;
                }
            }
        }

        // Heuristic 1: root matches exact POS
        if (lemmaPos == sensePos) {

            // Edge case for Heuristic 7: If the lemma is a noun and is
            // saying that this is an instance (e.g., "An instance of ..."),
            // then we take the dependent noun from instance
            //
            // Terrible example:
            //   The second of the two Books of Chronicles and the
            //   fourteenth book of the Old Testament of the Bible.
            //
            boolean foundExistentialDependent = false;
            if (lemma.equals("instance") || lemma.equals("example") || lemma.equals("first")
                    || lemma.equals("second") || lemma.equals("third") || lemma.equals("fourth")
                    || lemma.equals("fifth") || lemma.equals("sixth") || lemma.equals("series")) {
                // Check that there's actually a prepositional phrase
                // attached
                List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);

                for (SemanticGraphEdge e : edges) {
                    if (e.getRelation().getShortName().equals("prep")) {
                        IndexedWord dep = e.getDependent();
                        String depLemma = dep.get(LemmaAnnotation.class);
                        char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase()
                                .charAt(0);

                        //System.out.println("HEURISTIC 7");
                        if (depPos == sensePos) {
                            candidates.put(depLemma, "Heuristic-7");
                            addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-7");
                            foundExistentialDependent = true;
                        }
                    }
                }
            }
            if (foundExistentialDependent)
                continue next_root;

            // Heuristic 10: In the case of noun phrases, take the last noun
            // in the phrase, e.g., "Molten material", "springtime snow
            // runoff"
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            boolean foundDependent = false;
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    //System.out.println("HEURISTIC 10");
                    if (depPos == sensePos) {
                        foundDependent = true;
                        candidates.put(depLemma, "Heuristic-10");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-10");
                    }
                }
            }

            if (!foundDependent) {
                //System.out.println("HEURISTIC 1");
                candidates.put(lemma, "Heuristic-1");
                addSiblings(root, candidates, sensePos, dependencies, "Heuristic-1");
            }
        }

        // Heuristic 2: the subdef is either (1) one word or (2) two or more
        // words that must be connected by a conjunction, and (3) the lemma
        // has the wrong part of speech but could have the same POS (i.e.,
        // the lemma was probably POS-tagged incorrectly).
        if (sensePos != lemmaPos) {

            // Only one word in the subdef, which can manifest itself as the
            // graph having no vertices! (size == 0)
            if (dependencies.size() < 1) {
                // System.out.println("HEURISTIC 2a");
                IIndexWord iword = dict.getIndexWord(lemma, spos_);
                if (iword != null)
                    candidates.put(lemma, "Heuristic-2a");
                else {
                    // Sometimes adjectives get lemmatized to an incorrect
                    // verb form.  Check to see if the token matches.
                    String token = root.get(TextAnnotation.class);
                    iword = dict.getIndexWord(token, spos_);
                    if (iword != null)
                        candidates.put(token, "Heuristic-2a");
                }
            } else {
                // System.out.println("HEURISTIC 2b");
                Set<IndexedWord> tmp = new HashSet<IndexedWord>();
                List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
                for (SemanticGraphEdge e : edges) {
                    // System.out.printf("edge from %s -> %s %s%n", lemma,
                    //                   e.getRelation().getShortName(),
                    //                   e.getRelation().getLongName());
                    if (e.getRelation().getShortName().equals("conj")) {
                        if (tmp.size() == 0)
                            tmp.add(root);
                        tmp.add(e.getDependent());
                    }
                }
                if (!tmp.isEmpty()) {
                    for (IndexedWord iw : tmp) {
                        String lem = iw.get(LemmaAnnotation.class);
                        IIndexWord iword = dict.getIndexWord(lem, spos_);
                        if (iword != null)
                            candidates.put(lem, "Heuristic-2b");
                        else {
                            // Sometimes adjectives get lemmatized to an
                            // incorrect verb form.  Check to see if the
                            // token matches.
                            String token = iw.get(TextAnnotation.class);
                            iword = dict.getIndexWord(token, spos_);
                            if (iword != null)
                                candidates.put(token, "Heuristic-2b");
                        }
                    }
                    //System.out.println(tmp);
                }
            }
        }

        // Heuristic 3: the subdef is phrased as an overly-general description
        // of a person using "one", e.g., "one who does X".  Replace this with
        // "person"
        if (sensePos == 'n' && (lemma.equals("one") || lemma.equals("someone"))) {
            // check the dependency graph for a "who" attachment

            // TODO

            // ... or be lazy and just check for the token
            Matcher m = WHO.matcher(subdef);
            if (m.find()) {
                candidates.put("person", "Heuristic-3: Person");
            }
        }

        // Heuristic 4: if the root lemma is an adjective and the target
        // sense is a noun, look for a noun or set of nouns that it
        // modifies, and report those
        //
        // Example: "a small, arched passageway"
        if (sensePos == 'n' && lemmaPos == 'j') {
            //System.out.println("HEURISTIC 4");
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                   e.getRelation().getShortName(),
                //                   e.getRelation().getLongName());

                if (e.getRelation().getShortName().equals("appos")
                        || e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    // System.out.println("!!! " + depLemma);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-4: Head Noun");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-4: Head Noun");
                    }
                    //break;

                }
            }

        }

        // Heuristic 5: if the root lemma is a verb and the target sense is
        // a noun, look for a subject noun
        if (sensePos == 'n' && lemmaPos == 'v') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("nsubj")) {
                    IndexedWord dep = e.getDependent();

                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-5: Subject Noun");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-5: Subject Noun");
                    }
                    break;

                }
            }
        }

        // Heuristic 6: if the root lemma is an existential quantifier or
        // something like it (e.g., "Any of ...") and the target sense is
        // a noun, look for the noun being quantified
        if (sensePos == 'n' && lemmaPos == 'd') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                    e.getRelation().getShortName(),
                //                    e.getRelation().getLongName());

                if (e.getRelation().getShortName().equals("prep")
                        || e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();

                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    // System.out.println(depLemma + "/" + depPos);

                    // This should be the common case
                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-6: Existential Example");
                        addSiblings(dep, candidates, sensePos, dependencies,
                                "Heuristic-6: Existential Example");
                    }
                    // This is for some really (really) unusually parsed
                    // edge cases
                    else {
                        List<SemanticGraphEdge> depEdges = dependencies.outgoingEdgeList(dep);
                        for (SemanticGraphEdge e2 : depEdges) {

                            if (e2.getRelation().getShortName().equals("rcmod")) {
                                IndexedWord dep2 = e2.getDependent();
                                String depLemma2 = dep2.get(LemmaAnnotation.class);
                                char depPos2 = dep2.get(PartOfSpeechAnnotation.class).substring(0, 1)
                                        .toLowerCase().charAt(0);

                                if (depPos2 == sensePos) {
                                    candidates.put(depLemma2, "Heuristic-6: Existential Example");
                                    addSiblings(dep2, candidates, sensePos, dependencies,
                                            "Heuristic-6: Existential Example");
                                }
                            }
                        }
                    }
                }
            }
        }

        // Heuristic 8: if the root lemma is a verb and the sense is an
        // adjective, but the verb is modified by an adverb, this catches
        // the cases that Heuristic 2 does not
        if (sensePos == 'j' && lemmaPos == 'v') {

            Set<IndexedWord> tmp = new HashSet<IndexedWord>();
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                   e.getRelation().getShortName(),
                //                   e.getRelation().getLongName());
                if (e.getRelation().getShortName().equals("advmod")) {
                    IIndexWord iword = dict.getIndexWord(lemma, spos_);
                    if (iword != null)
                        candidates.put(lemma, "Heuristic-8: Adv-modified Verb");
                    else {
                        // Sometimes adjectives get lemmatized to an incorrect
                        // verb form.  Check to see if the token matches.
                        String token = root.get(TextAnnotation.class);
                        iword = dict.getIndexWord(token, spos_);
                        if (iword != null)
                            candidates.put(token, "Heuristic-8: Adv-modified Verb");
                    }
                }
            }
        }

        // Heuristic 9: if the sense is an adjective and the root lemma
        // begins with a negative *and* the gloss contains something
        // like "not [x]", then pull out the "x" and use it as the hypernym
        if (sensePos == 'j' && lemma.equals("not")) {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                    e.getRelation().getShortName(),
                //                    e.getRelation().getLongName());

                if (e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();

                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-9: negated adj");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-9: negated adj");
                    }
                    break;

                }
            }
        }

        // Heuristic 11: if the sense is a verb and the root lemma
        // is "to", this is probably a case of mistaken POS-tagging
        if (sensePos == 'v' && lemma.equals("to")) {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("pobj")) {
                    IndexedWord dep = e.getDependent();
                    IIndexWord iword = dict.getIndexWord(lemma, spos_);
                    if (iword != null)
                        candidates.put(lemma, "Heuristic-11: verbal infinitive");
                    else {
                        // Sometimes verbs get lemmatized to a noun form
                        // that is incorrect.  Check to see if the token
                        // matches
                        String token = dep.get(TextAnnotation.class);
                        iword = dict.getIndexWord(token, spos_);
                        if (iword != null)
                            candidates.put(token, "Heuristic-9: verbal infinitive");
                    }
                }
            }
        }

    }
    return candidates;
}

From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro.java

License:Open Source License

public static void convertDependencies(JCas aJCas, Annotation document, MappingProvider mappingProvider,
        boolean internStrings) {
    for (CoreMap s : document.get(SentencesAnnotation.class)) {
        SemanticGraph graph = s.get(CollapsedDependenciesAnnotation.class);
        //SemanticGraph graph = s.get(EnhancedDependenciesAnnotation.class);

        // If there are no dependencies for this sentence, skip it. Might well mean we
        // skip all sentences because normally either there are dependencies for all or for
        // none.
        if (graph == null) {
            continue;
        }

        for (IndexedWord root : graph.getRoots()) {
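            // A UIMA Dependency needs both endpoints, and the CoreNLP graph has
            // no separate ROOT node, so the root token is attached to itself.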
            Dependency dep = new ROOT(aJCas);
            dep.setDependencyType("root");
            dep.setDependent(root.get(TokenKey.class));
            dep.setGovernor(root.get(TokenKey.class));
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(DependencyFlavor.BASIC);
            dep.addToIndexes();
        }

        for (SemanticGraphEdge edge : graph.edgeListSorted()) {
            Token dependent = edge.getDependent().get(TokenKey.class);
            Token governor = edge.getGovernor().get(TokenKey.class);

            // For the type mapping, we use getShortName() instead, because the <specific>
            // actually doesn't change the relation type
            String labelUsedForMapping = edge.getRelation().getShortName();

            // The nndepparser may produce labels in which the shortName contains a colon.
            // These represent language-specific labels of the UD, cf: 
            // http://universaldependencies.github.io/docs/ext-dep-index.html
            labelUsedForMapping = StringUtils.substringBefore(labelUsedForMapping, ":");

            // Need to use toString() here to get "<shortname>_<specific>"
            String actualLabel = edge.getRelation().toString();

            Type depRel = mappingProvider.getTagType(labelUsedForMapping);
            Dependency dep = (Dependency) aJCas.getCas().createFS(depRel);
            dep.setDependencyType(internStrings ? actualLabel.intern() : actualLabel);
            dep.setDependent(dependent);
            dep.setGovernor(governor);
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(edge.isExtra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC);
            dep.addToIndexes();
        }
    }
}

From source file:edu.cmu.deiis.annotator.StanfordCoreNLPAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }

}

From source file:edu.cmu.deiis.annotators.StanfordAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                //String line = mention.getCoveredText();
                //System.out.println(line);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        //NamedEntityMention mention=entity.getMentions(3);
        //System.out.println(mention.getBegin());
        entity.addToIndexes();
    }

}

From source file:edu.jhu.hlt.concrete.stanford.PreNERCoreMapWrapper.java

License:Open Source License

private List<Dependency> makeDependencies(SemanticGraph graph) {
    List<Dependency> depList = new ArrayList<Dependency>();
    for (IndexedWord root : graph.getRoots()) {
        // this mimics CoreNLP's handling
        String rel = GrammaticalRelation.ROOT.getLongName().replaceAll("\\s+", "");
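        // IndexedWord.index() is 1-based, so subtracting 1 yields a 0-based
        // token index for the target dependency structure.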
        int dep = root.index() - 1;
        Dependency depend = DependencyFactory.create(dep, rel);
        depList.add(depend);
    }
    for (SemanticGraphEdge edge : graph.edgeListSorted()) {
        String rel = edge.getRelation().toString().replaceAll("\\s+", "");
        int gov = edge.getSource().index() - 1;
        int dep = edge.getTarget().index() - 1;
        Dependency depend = DependencyFactory.create(dep, rel, gov);
        depList.add(depend);
    }
    return depList;
}

From source file:featureExtractor.NLPFeatures.java

static void processLine(String text, int lineId) throws IOException {
    bw_root.write(Integer.toString(lineId));
    bw_subj.write(Integer.toString(lineId));
    bw_underRoot.write(Integer.toString(lineId));
    bw_nerType.write(Integer.toString(lineId));

    //text = "A gigantic Hong Kong set was constructed in downtown Detroit. The set was so big that the Detroit People Mover track ended up becoming part of the set and shooting had to be adjusted to allow the track to move through the set.  ";//"One of three new television series scheduled for release in 2014 based on DC Comics characters. The others being Constantine (2014) and The Flash (2014).  ";
    HashMap<String, Integer> nerCount = new HashMap<>();
    int superlativePOS = 0;

    try {
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

        for (CoreMap sentence : sentences) {
            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
            // getting root words
            for (IndexedWord rword : dependencies.getRoots()) {
                //System.out.println(rword.lemma());
                //System.out.println(rword.ner());
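                // Named-entity roots keep their surface form; all other roots
                // are written as lemmas.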
                if (rword.ner().equals("O"))
                    bw_root.write("\t" + rword.ner() + ":" + rword.lemma());
                //else if(rword.ner().equals("PERSON"))
                else
                    bw_root.write("\t" + rword.ner() + ":" + rword.originalText());
                /*
                else
                bw_root.write(" entity_" + rword.ner());
                */
                // under root
                for (IndexedWord child : dependencies.getChildren(rword)) {
                    //System.out.println("here: " + child.originalText());
                    /*
                    if(child.ner().equals("PERSON"))
                    bw_underRoot.write(" " + child.originalText());
                    else*/
                    if (!child.ner().equals("O"))
                        bw_underRoot.write("\t" + child.ner() + ":" + child.originalText());
                }

                // nsubj | nsubjpass words
                GrammaticalRelation[] subjects = { EnglishGrammaticalRelations.NOMINAL_SUBJECT,
                        EnglishGrammaticalRelations.NOMINAL_PASSIVE_SUBJECT };
                for (IndexedWord current : dependencies.descendants(rword))
                    for (IndexedWord nsubWord : dependencies.getChildrenWithRelns(current,
                            Arrays.asList(subjects))) {
                        //System.out.println("wow: " + nsubWord.originalText());
                        if (!nsubWord.ner().equals("O"))
                            bw_subj.write("\t" + nsubWord.ner() + ":" + nsubWord.originalText());
                        else {
                            //System.out.println(nsubWord.lemma());
                            bw_subj.write("\t" + nsubWord.ner() + ":" + nsubWord.lemma());
                        } /*
                          else
                          bw_subj.write(" entity_"+nsubWord.ner());
                          */
                    }
            }

            // NER Types frequency
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

                if (pos.equals("JJS") || pos.equals("RBS"))
                    superlativePOS++;

                nerCount.putIfAbsent(ne, 0);
                nerCount.put(ne, nerCount.get(ne) + 1);
            }

            //System.out.println("dependency graph:\n" + dependencies);
        }
    } catch (Exception e) {
        System.out.println("IGNORED:");
    }

    bw_nerType.write("\t" + Integer.toString(superlativePOS));

    for (String ne : ners) {
        if (nerCount.containsKey(ne))
            bw_nerType.write("\t" + nerCount.get(ne).toString());
        else
            bw_nerType.write("\t0");
    }
    bw_root.write("\n");
    bw_underRoot.write("\n");
    bw_nerType.write("\n");
    bw_subj.write("\n");
    if (lineId % 25 == 0) {
        bw_root.flush();
        bw_underRoot.flush();
        bw_nerType.flush();
        bw_subj.flush();
    }
}

From source file:ie.pars.bnc.preprocess.ProcessNLP.java

License:Open Source License

/**
 *
 * @param inputStreamFile
 * @param morphology
 * @param posTagger
 * @param parser
 * @return
 * @throws Exception
 */
public static StringBuilder parseBNCXML(InputStream inputStreamFile, Morphology morphology,
        MaxentTagger posTagger, ParserGrammar parser) throws Exception {
    StringBuilder results = new StringBuilder();
    int counterSent = 0;
    List<List<List<WordLemmaTag>>> parseBNCXMLTokenized = parseBNCXMLTokenized(inputStreamFile);
    for (List<List<WordLemmaTag>> xparseBNCXMLL : parseBNCXMLTokenized) {
        results.append("<p>\n");
        for (List<WordLemmaTag> para : xparseBNCXMLL) {
            if (counterSent++ % 20 == 0) {
                System.out.print(".");
            }
            results.append("<s>\n");
            List<TaggedWord> tagSentence = posTagger.tagSentence(para, true);

            Tree parseTree = parser.parse(tagSentence);

            GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
                    parser.treebankLanguagePack().punctuationWordRejectFilter(),
                    parser.getTLPParams().typedDependencyHeadFinder());

            Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
            SemanticGraph depTree = new SemanticGraph(deps);

            for (int i = 0; i < tagSentence.size(); ++i) {

                int head = -1;
                String deprel = null;
                //                    if (depTree != null) {
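                // Collect the 1-based indices of the root tokens; tokens in this
                // set are written with head 0 and relation "ROOT" below.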
                Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index)
                        .collect(Collectors.toSet());
                IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
                if (node != null) {
                    List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
                    if (!edgeList.isEmpty()) {
                        assert edgeList.size() == 1;
                        head = edgeList.get(0).getGovernor().index();
                        deprel = edgeList.get(0).getRelation().toString();
                    } else if (rootSet.contains(i + 1)) {
                        head = 0;
                        deprel = "ROOT";
                    }
                }
                //     }

                // Write the token
                TaggedWord lexHead = null;
                if (head > 0) {
                    lexHead = tagSentence.get(head - 1);
                }
                results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
            }
            results.append("</s>\n");
        }
        results.append("</p>\n");
    }
    System.out.println("");
    inputStreamFile.close();

    return results;
}

From source file:ie.pars.bnc.preprocess.ProcessNLP.java

License:Open Source License

private static StringBuilder parseTheSentence(String sentence, Morphology morphology, MaxentTagger posTagger,
        ParserGrammar parser, String sid) {
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory();
    //        TokenizerFactory<WordLemmaTag> tokenizerFactory;
    //        TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory() , "");
    //        TokenizerFactory<Word> factory1 = PTBTokenizer.factory();

    StringBuilder results = new StringBuilder();
    results.append("<s id='" + sid + "'>\n");

    StringReader sr = new StringReader(sentence);
    Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
    List<Word> tokenize = tokenizer.tokenize();

    List<TaggedWord> tagSentence = posTagger.tagSentence(tokenize);

    Tree parseTree = parser.parse(tagSentence);

    GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
            parser.treebankLanguagePack().punctuationWordRejectFilter(),
            parser.getTLPParams().typedDependencyHeadFinder());

    Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
    SemanticGraph depTree = new SemanticGraph(deps);

    for (int i = 0; i < tagSentence.size(); ++i) {

        int head = -1;
        String deprel = null;
        //                    if (depTree != null) {
        Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
        IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
        if (node != null) {
            List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
            if (!edgeList.isEmpty()) {
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
            } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
            }
        }
        //     }

        // Write the token
        TaggedWord lexHead = null;
        if (head > 0) {
            lexHead = tagSentence.get(head - 1);
        }
        results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
    }
    results.append("</s>\n");
    return results;
}

From source file:org.sam_agent.csparser.ContinuousParser.java

License:Open Source License

/**
 * Parse a sentence with the Stanford Parser, returning a JSON string of the dependencies and part-of-speech tags.
 * @param text
 * @return
 */
public String parse(String text) {

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> sentencesList = new ArrayList<String>();

    for (CoreMap sentence : sentences) {
        String sentenceString = sentence.get(CoreAnnotations.TextAnnotation.class);
        String wordsJSON = stringify(sentence);
        SemanticGraph dependencies = sentence
                .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
        String dependenciesJSON = stringify(dependencies);
        String rootsJSON = stringify(dependencies.getRoots());
        sentencesList.add(String.format("{\"sentence\":\"%s\",%s,%s,\"roots\":%s}", sentenceString, wordsJSON,
                dependenciesJSON, rootsJSON));
    }

    return String.format("{\"input\":\"%s\",\"sentences\":[%s]}", text, String.join(",", sentencesList));
}

From source file:org.textmining.annotator.StanfordCoreNlpAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }

    //end of process-method
}