Example usage for edu.stanford.nlp.ling IndexedWord get

List of usage examples for edu.stanford.nlp.ling IndexedWord get

Introduction

On this page you can find example usage for edu.stanford.nlp.ling IndexedWord get.

Prototype

@Override
public <VALUE> VALUE get(Class<? extends TypesafeMap.Key<VALUE>> key)
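
IndexedWord implements TypesafeMap, so get retrieves the value stored under an annotation-key class, and the key class also fixes the return type. Below is a minimal sketch of a typical call site using standard key classes from edu.stanford.nlp.ling.CoreAnnotations; the describe helper itself is hypothetical and not part of any source file listed here.

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;

class IndexedWordGetSketch {
    // Read three common typesafe annotations off a parsed token.
    static String describe(IndexedWord word) {
        String text = word.get(CoreAnnotations.TextAnnotation.class);         // surface form
        String lemma = word.get(CoreAnnotations.LemmaAnnotation.class);       // base form
        String pos = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);  // Penn Treebank tag
        return text + "/" + lemma + "/" + pos;
    }
}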

Usage

From source file:ca.mcgill.cs.crown.procedure.ParseExtractor.java

License:Creative Commons License

/**
 * Gets the candidate hypernyms from the provided subdef.
 *
 * @return a mapping from each candidate to the heuristics that generated it
 */
MultiMap<String, String> getCandidates(SemanticGraph dependencies, String subdef, POS spos_) {

    MultiMap<String, String> candidates = new HashMultiMap<String, String>();
    char sensePos = toChar(spos_);

    Collection<IndexedWord> roots = dependencies.getRoots();
    next_root: for (IndexedWord root : roots) {
        String word = root.get(TextAnnotation.class);
        String lemma = root.get(LemmaAnnotation.class);
        String pos = root.get(PartOfSpeechAnnotation.class);
        char lemmaPos = pos.substring(0, 1).toLowerCase().charAt(0);

        String lemmaLc = lemma.toLowerCase();

        //System.out.println("testing: " + lemma + "/" + pos);

        // If the lemma is a verb, check for a phrasal verb particle (e.g.,
        // "lead on", "edge out") and, if present, append it to the lemma
        if (lemmaPos == 'v') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("prt")) {
                    IndexedWord dep = e.getDependent();
                    lemma = lemma + " " + dep.get(LemmaAnnotation.class);
                    break;
                }
            }
        }

        // Heuristic 1: root matches exact POS
        if (lemmaPos == sensePos) {

            // Edge case for Heuristic 7: If the lemma is a noun that is
            // saying that this is an instance (e.g., "An instance of ..."),
            // then we take the dependent noun of "instance"
            //
            // Terrible example:
            //   The second of the two Books of Chronicles and the
            //   fourteenth book of the Old Testament of the Bible.
            //
            boolean foundExistentialDependent = false;
            if (lemma.equals("instance") || lemma.equals("example") || lemma.equals("first")
                    || lemma.equals("second") || lemma.equals("third") || lemma.equals("fourth")
                    || lemma.equals("fifth") || lemma.equals("sixth") || lemma.equals("series")) {
                // Check that there's actually a prepositional phrase
                // attached
                List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);

                for (SemanticGraphEdge e : edges) {
                    if (e.getRelation().getShortName().equals("prep")) {
                        IndexedWord dep = e.getDependent();
                        String depLemma = dep.get(LemmaAnnotation.class);
                        char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase()
                                .charAt(0);

                        //System.out.println("HEURISTIC 7");
                        if (depPos == sensePos) {
                            candidates.put(depLemma, "Heuristic-7");
                            addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-7");
                            foundExistentialDependent = true;
                        }
                    }
                }
            }
            if (foundExistentialDependent)
                continue next_root;

            // Heuristic 10: In the case of noun phrases, take the last noun
            // in the phrase, e.g., "Molten material", "springtime snow
            // runoff"
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            boolean foundDependent = false;
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    //System.out.println("HEURISTIC 10");
                    if (depPos == sensePos) {
                        foundDependent = true;
                        candidates.put(depLemma, "Heuristic-10");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-10");
                    }
                }
            }

            if (!foundDependent) {
                //System.out.println("HEURISTIC 1");
                candidates.put(lemma, "Heuristic-1");
                addSiblings(root, candidates, sensePos, dependencies, "Heuristic-1");
            }
        }

        // Heuristic 2: the subdef is either (1) one word or (2) two or more
        // words that must be connected by a conjunction, and (3) the lemma
        // has the wrong part of speech but could have the same POS (i.e.,
        // the lemma was probably POS-tagged incorrectly).
        if (sensePos != lemmaPos) {

            // Only one word in the subdef, which can manifest itself as the
            // graph having no vertices! (size == 0)
            if (dependencies.size() < 1) {
                // System.out.println("HEURISTIC 2a");
                IIndexWord iword = dict.getIndexWord(lemma, spos_);
                if (iword != null)
                    candidates.put(lemma, "Heuristic-2a");
                else {
                    // Sometimes adjectives get lemmatized to a verb form
                    // which is incorrect.  Check to see if the token
                    // matches
                    String token = root.get(TextAnnotation.class);
                    iword = dict.getIndexWord(token, spos_);
                    if (iword != null)
                        candidates.put(token, "Heuristic-2a");
                }
            } else {
                // System.out.println("HEURISTIC 2b");
                Set<IndexedWord> tmp = new HashSet<IndexedWord>();
                List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
                for (SemanticGraphEdge e : edges) {
                    // System.out.printf("edge from %s -> %s %s%n", lemma,
                    //                   e.getRelation().getShortName(),
                    //                   e.getRelation().getLongName());
                    if (e.getRelation().getShortName().equals("conj")) {
                        if (tmp.size() == 0)
                            tmp.add(root);
                        tmp.add(e.getDependent());
                    }
                }
                if (!tmp.isEmpty()) {
                    for (IndexedWord iw : tmp) {
                        String lem = iw.get(LemmaAnnotation.class);
                        IIndexWord iword = dict.getIndexWord(lem, spos_);
                        if (iword != null)
                            candidates.put(lem, "Heuristic-2b");
                        else {
                            // Sometimes adjectives get lemmatized to a verb
                            // form which is incorrect.  Check to see if
                            // the token matches
                            String token = iw.get(TextAnnotation.class);
                            iword = dict.getIndexWord(token, spos_);
                            if (iword != null)
                                candidates.put(token, "Heuristic-2b");
                        }
                    }
                    //System.out.println(tmp);
                }
            }
        }

        // Heuristic 3: the subdef is phrased as an overly general description
        // of a person using "one", e.g., "one who does X".  Replace this with
        // "person"
        if (sensePos == 'n' && (lemma.equals("one") || lemma.equals("someone"))) {
            // check the dependency graph for a "who" attachment

            // TODO

            // ... or be lazy and just check for the token
            Matcher m = WHO.matcher(subdef);
            if (m.find()) {
                candidates.put("person", "Heuristic-3: Person");
            }
        }

        // Heuristic 4: if the root lemma is an adjective and the target
        // sense is a noun, look for a noun or set of nouns that it
        // modifies, and report those
        //
        // Example: "a small, arched passageway"
        if (sensePos == 'n' && lemmaPos == 'j') {
            //System.out.println("HEURISTIC 4");
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                   e.getRelation().getShortName(),
                //                   e.getRelation().getLongName());

                if (e.getRelation().getShortName().equals("appos")
                        || e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();
                    String depLemma = dep.get(LemmaAnnotation.class);
                    // System.out.println("!!! " + depLemma);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-4: Head Noun");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-4: Head Noun");
                    }
                    //break;

                }
            }

        }

        // Heuristic 5: if the root lemma is a verb and the target sense is
        // a noun, look for a subject noun
        if (sensePos == 'n' && lemmaPos == 'v') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("nsubj")) {
                    IndexedWord dep = e.getDependent();

                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-5: Subject Noun");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-5: Subject Noun");
                    }
                    break;

                }
            }
        }

        // Heuristic 6: if the root lemma is an existential quantifier or
        // something like it (e.g., "Any of ...") and the target sense is
        // a noun, look for the noun being quantified
        if (sensePos == 'n' && lemmaPos == 'd') {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                    e.getRelation().getShortName(),
                //                    e.getRelation().getLongName());

                if (e.getRelation().getShortName().equals("prep")
                        || e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();

                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    // System.out.println(depLemma + "/" + depPos);

                    // This should be the common case
                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-6: Existential Example");
                        addSiblings(dep, candidates, sensePos, dependencies,
                                "Heuristic-6: Existential Example");
                    }
                    // This is for some really (really) unusually parsed
                    // edge cases
                    else {
                        List<SemanticGraphEdge> depEdges = dependencies.outgoingEdgeList(dep);
                        for (SemanticGraphEdge e2 : depEdges) {

                            if (e2.getRelation().getShortName().equals("rcmod")) {
                                IndexedWord dep2 = e2.getDependent();
                                String depLemma2 = dep2.get(LemmaAnnotation.class);
                                char depPos2 = dep2.get(PartOfSpeechAnnotation.class).substring(0, 1)
                                        .toLowerCase().charAt(0);

                                if (depPos2 == sensePos) {
                                    candidates.put(depLemma2, "Heuristic-6: Existential Example");
                                    addSiblings(dep2, candidates, sensePos, dependencies,
                                            "Heuristic-6: Existential Example");
                                }
                            }
                        }
                    }
                }
            }
        }

        // Heuristic 8: if the root lemma is a verb and the sense is an
        // adjective, but the verb is modified by an adverb, this catches
        // cases that Heuristic 2 does not
        if (sensePos == 'j' && lemmaPos == 'v') {

            Set<IndexedWord> tmp = new HashSet<IndexedWord>();
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                   e.getRelation().getShortName(),
                //                   e.getRelation().getLongName());
                if (e.getRelation().getShortName().equals("advmod")) {
                    IIndexWord iword = dict.getIndexWord(lemma, spos_);
                    if (iword != null)
                        candidates.put(lemma, "Heuristic-8: Adv-modified Verb");
                    else {
                        // Sometimes adjectives get lemmatized to a verb
                        // form which is incorrect.  Check to see if
                        // the token matches
                        String token = root.get(TextAnnotation.class);
                        iword = dict.getIndexWord(token, spos_);
                        if (iword != null)
                            candidates.put(token, "Heuristic-8: Adv-modified Verb");
                    }
                }
            }
        }

        // Heuristic 9: if the sense is an adjective and the root lemma
        // is a negation *and* the gloss contains something like "not [x]",
        // then pull out the "x" and use it as the hypernym
        if (sensePos == 'j' && lemma.equals("not")) {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                // System.out.printf("edge from %s -> %s %s%n", lemma,
                //                    e.getRelation().getShortName(),
                //                    e.getRelation().getLongName());

                if (e.getRelation().getShortName().equals("dep")) {
                    IndexedWord dep = e.getDependent();

                    String depLemma = dep.get(LemmaAnnotation.class);
                    char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);

                    if (depPos == sensePos) {
                        candidates.put(depLemma, "Heuristic-9: negated adj");
                        addSiblings(dep, candidates, sensePos, dependencies, "Heuristic-9: negated adj");
                    }
                    break;

                }
            }
        }

        // Heuristic 11: if the sense is a verb and the root lemma
        // is "to", this is probably a case of mistaken POS-tagging
        if (sensePos == 'v' && lemma.equals("to")) {
            List<SemanticGraphEdge> edges = dependencies.outgoingEdgeList(root);
            for (SemanticGraphEdge e : edges) {
                if (e.getRelation().getShortName().equals("pobj")) {
                    IndexedWord dep = e.getDependent();
                    IIndexWord iword = dict.getIndexWord(lemma, spos_);
                    if (iword != null)
                        candidates.put(lemma, "Heuristic-11: verbal infinitive");
                    else {
                        // Sometimes verbs get lemmatized to a noun form
                        // that is incorrect.  Check to see if the token
                        // matches
                        String token = dep.get(TextAnnotation.class);
                        iword = dict.getIndexWord(token, spos_);
                        if (iword != null)
                            candidates.put(token, "Heuristic-9: verbal infinitive");
                    }
                }
            }
        }

    }
    return candidates;
}
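
For context, getCandidates expects a dependency parse of the sub-definition as produced by a standard CoreNLP pipeline. The driver below is a minimal, hypothetical sketch, not part of the original file; the pipeline configuration and the POS constant (edu.mit.jwi.item.POS, matching the IIndexWord lookups above) are assumptions.

Properties props = new Properties();
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,parse");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

String subdef = "Molten material from a volcano.";
Annotation doc = new Annotation(subdef);
pipeline.annotate(doc);

for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
    SemanticGraph deps = sentence
            .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
    // POS.NOUN selects the noun sense inventory in the JWI dictionary
    MultiMap<String, String> candidates = getCandidates(deps, subdef, POS.NOUN);
}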

From source file:ca.mcgill.cs.crown.procedure.ParseExtractor.java

License:Creative Commons License

/**
 * If we know we want {@code toAdd}, get all of its siblings that are joined
 * by conjunctions as candidates too.
 */
void addSiblings(IndexedWord toAdd, MultiMap<String, String> candidates, char targetPos, SemanticGraph parse,
        String reason) {
    List<SemanticGraphEdge> edges = parse.outgoingEdgeList(toAdd);
    for (SemanticGraphEdge e : edges) {
        if (e.getRelation().getShortName().equals("conj")) {
            IndexedWord dep = e.getDependent();
            String depLemma = dep.get(LemmaAnnotation.class);
            char depPos = dep.get(PartOfSpeechAnnotation.class).substring(0, 1).toLowerCase().charAt(0);
            if (targetPos == depPos) {
                if (targetPos != 'v') {
                    candidates.put(depLemma, reason + " (In conjunction)");
                }
                // Check for phrasal verb particles
                else {
                    List<SemanticGraphEdge> depEdges = parse.outgoingEdgeList(dep);
                    for (SemanticGraphEdge e2 : depEdges) {
                        if (e2.getRelation().getShortName().equals("prt")) {
                            IndexedWord dep2 = e2.getDependent();
                            depLemma = depLemma + " " + dep2.get(LemmaAnnotation.class);
                            break;
                        }
                    }
                }
            }
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro.java

License:Open Source License

public static void convertDependencies(JCas aJCas, Annotation document, MappingProvider mappingProvider,
        boolean internStrings) {
    for (CoreMap s : document.get(SentencesAnnotation.class)) {
        SemanticGraph graph = s.get(CollapsedDependenciesAnnotation.class);
        //SemanticGraph graph = s.get(EnhancedDependenciesAnnotation.class);

        // If there are no dependencies for this sentence, skip it. Might well mean we
        // skip all sentences because normally either there are dependencies for all or for
        // none.
        if (graph == null) {
            continue;
        }

        for (IndexedWord root : graph.getRoots()) {
            Dependency dep = new ROOT(aJCas);
            dep.setDependencyType("root");
            dep.setDependent(root.get(TokenKey.class));
            dep.setGovernor(root.get(TokenKey.class));
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(DependencyFlavor.BASIC);
            dep.addToIndexes();
        }

        for (SemanticGraphEdge edge : graph.edgeListSorted()) {
            Token dependent = edge.getDependent().get(TokenKey.class);
            Token governor = edge.getGovernor().get(TokenKey.class);

            // For the type mapping, we use getShortName() instead, because the <specific>
            // actually doesn't change the relation type
            String labelUsedForMapping = edge.getRelation().getShortName();

            // The nndepparser may produce labels in which the shortName contains a colon.
            // These represent language-specific labels of the UD, cf: 
            // http://universaldependencies.github.io/docs/ext-dep-index.html
            labelUsedForMapping = StringUtils.substringBefore(labelUsedForMapping, ":");

            // Need to use toString() here to get "<shortname>_<specific>"
            String actualLabel = edge.getRelation().toString();

            Type depRel = mappingProvider.getTagType(labelUsedForMapping);
            Dependency dep = (Dependency) aJCas.getCas().createFS(depRel);
            dep.setDependencyType(internStrings ? actualLabel.intern() : actualLabel);
            dep.setDependent(dependent);
            dep.setGovernor(governor);
            dep.setBegin(dep.getDependent().getBegin());
            dep.setEnd(dep.getDependent().getEnd());
            dep.setFlavor(edge.isExtra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC);
            dep.addToIndexes();
        }
    }
}
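
For reference, the colon-stripping step above uses Apache Commons Lang StringUtils; a quick sketch of its behavior (inputs are illustrative):

String a = StringUtils.substringBefore("nmod:poss", ":"); // "nmod" (language-specific UD subtype dropped)
String b = StringUtils.substringBefore("dobj", ":");      // "dobj" (no separator, so the input is returned unchanged)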

From source file:edu.cmu.deiis.annotator.StanfordCoreNLPAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }

}

From source file:edu.cmu.deiis.annotators.StanfordAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                //String line = mention.getCoveredText();
                //System.out.println(line);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        //NamedEntityMention mention=entity.getMentions(3);
        //System.out.println(mention.getBegin());
        entity.addToIndexes();
    }

}

From source file:knu.univ.lingvo.coref.Mention.java

License:Open Source License

private static Pair<IndexedWord, String> findDependentVerb(Mention m) {
    Pair<IndexedWord, String> ret = new Pair<IndexedWord, String>();
    int headIndex = m.headIndex + 1;
    try {
        IndexedWord w = m.dependency.getNodeByIndex(headIndex);
        if (w == null)
            return ret;
        while (true) {
            IndexedWord p = null;
            for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
                if (ret.second() == null) {
                    String relation = parent.first().getShortName();
                    ret.setSecond(relation);
                }
                p = parent.second();
            }
            if (p == null || p.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("V")) {
                ret.setFirst(p);
                break;
            }
            if (w == p)
                return ret;
            w = p;
        }
    } catch (Exception e) {
        return ret;
    }
    return ret;
}

From source file:org.textmining.annotator.StanfordCoreNlpAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }

    //end of process-method
}