Example usage for edu.stanford.nlp.trees Tree toString

List of usage examples for edu.stanford.nlp.trees Tree toString

Introduction

On this page you can find example usages of edu.stanford.nlp.trees Tree toString.

Prototype

@Override
public String toString() 

Source Link

Document

Converts parse tree to string in Penn Treebank format.
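
The toString output can also be parsed back into a Tree, so the two directions round-trip. A minimal sketch of that round trip (assuming a standard Stanford parser/CoreNLP jar on the classpath; the class name is illustrative):

import edu.stanford.nlp.trees.Tree;

public class TreeToStringDemo {
    public static void main(String[] args) {
        // build a Tree from a Penn Treebank bracketing...
        Tree tree = Tree.valueOf("(ROOT (S (NP (NNP John)) (VP (VBD slept)) (. .)))");
        // ...and toString() renders it back in the same format
        System.out.println(tree.toString());
    }
}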

Usage

From source file:edu.cmu.ark.SentenceSimplifier.java

License:Open Source License

/**
 * e.g., In January, John wore his winter coat. -> John wore his winter coat in January.
 *
 * @param input the question whose intermediate tree may be rewritten
 */
private void moveLeadingPPsAndQuotes(Question input) {
    String tregexOpStr;
    TregexPattern matchPattern;
    TregexMatcher matcher;
    Tree mainvp = null;
    Tree subj = null;

    tregexOpStr = "ROOT < (S|SINV=mainclause < (NP|SBAR=subj !$++ /,/) < VP=mainvp "
            + " [ < (PP=modifier < NP) " //must be a PP with an NP object
            + "| < (S=modifier < SBAR|NP <<# VB|VBD|VBP|VBZ) ] ) "; //OR: a quote, which is an S clause with a subject and finite main verb
    //the modifiers to move must be immediately followed by commas

    matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
    matcher = matchPattern.matcher(input.getIntermediateTree());

    List<Tree> modifiers = new ArrayList<Tree>();
    while (matcher.find()) {
        if (mainvp == null) {
            mainvp = matcher.getNode("mainvp").deeperCopy();
            subj = matcher.getNode("subj").deeperCopy();
        }
        Tree mainclause = matcher.getNode("mainclause");
        Tree modifier = matcher.getNode("modifier").deeperCopy();
        int idx = mainclause.indexOf(modifier);
        if (modifiers.contains(modifier))
            continue; //just in case the tregex expression catches duplicates
        //add commas and quotation marks if they appeared in the original
        if (idx > 0 && mainclause.getChild(idx - 1).label().toString().equals("``")) {
            modifiers.add(AnalysisUtilities.getInstance().readTreeFromString("(, ,)"));
            modifiers.add(AnalysisUtilities.getInstance().readTreeFromString("(`` ``)"));
            Tree sbar = factory.newTreeNode("SBAR", new ArrayList<Tree>());
            sbar.addChild(modifier);
            modifiers.add(sbar);
            modifiers.add(AnalysisUtilities.getInstance().readTreeFromString("('' '')"));
        } else {
            modifiers.add(modifier);
        }
    }

    if (mainvp != null) { //any matches?
        for (Tree m : modifiers) {
            mainvp.addChild(m);
        }

        Tree newTree = factory.newTreeNode("ROOT", new ArrayList<Tree>());
        Tree clause = factory.newTreeNode("S", new ArrayList<Tree>());
        newTree.addChild(clause);
        clause.addChild(subj);
        clause.addChild(mainvp);

        AnalysisUtilities.addPeriodIfNeeded(newTree);
        addQuotationMarksIfNeeded(newTree);
        if (GlobalProperties.getDebug())
            System.err.println("moveLeadingModifiers: " + newTree.toString());
        input.setIntermediateTree(newTree);
        if (GlobalProperties.getComputeFeatures())
            input.setFeatureValue("movedLeadingPPs", 1.0);
    }

}
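
The method above assembles a new tree node by node and serializes it with toString for debug output. A condensed sketch of that build-and-print pattern using only the public Tree API (the example sentence and labels are illustrative):

import java.util.ArrayList;

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class RebuildDemo {
    public static void main(String[] args) {
        TreeFactory factory = new LabeledScoredTreeFactory();
        // subtrees as they might come out of Tregex matches
        Tree subj = Tree.valueOf("(NP (NNP John))");
        Tree mainvp = Tree.valueOf("(VP (VBD slept))");
        // reassemble them under a fresh ROOT/S, as the method above does
        Tree clause = factory.newTreeNode("S", new ArrayList<Tree>());
        clause.addChild(subj);
        clause.addChild(mainvp);
        Tree newTree = factory.newTreeNode("ROOT", new ArrayList<Tree>());
        newTree.addChild(clause);
        // prints: (ROOT (S (NP (NNP John)) (VP (VBD slept))))
        System.out.println(newTree.toString());
    }
}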

From source file:edu.cmu.ark.SentenceSimplifier.java

License:Open Source License

/**
 * e.g., John, hoping to get a good grade, studied. -> John hoped to get a good grade.
 *   Walking to the store, John saw Susan -> John was walking to the store.
 *
 *   NOTE: This method produces false positives for sentences like,
 *            "Broadly speaking, the project was successful."
 *         where the participial phrase does not modify the subject.
 *
 * @param extracted the collection to which new simplified questions are added
 * @param input the question whose intermediate tree is analyzed
 */
private void extractNounParticipialModifiers(Collection<Question> extracted, Question input) {
    String tregexOpStr;
    TregexPattern matchPattern;
    TregexMatcher matcher;

    tregexOpStr = "ROOT < (S " + " [ << (NP < (NP=subj  $++ (/,/ $+ (VP=modifier <# VBN|VBG|VP=tense )))) " //modifiers that appear after nouns
            + " | < (S !< NP|SBAR < (VP=modifier <# VBN|VBG|VP=tense) $+ (/,/ $+ NP=subj)) " //modifiers before the subject. e.g., Founded by John, the company...
            + " | < (SBAR < (S !< NP|SBAR < (VP=modifier <# VBN|VBG=tense)) $+ (/,/ $+ NP=subj)) " //e.g., While walking to the store, John saw Susan.
            + " | < (PP=modifier !< NP <# VBG=tense $+ (/,/ $+ NP=subj)) ] ) " // e.g., Walking to the store, John saw Susan.
            + " <<# /^VB.*$/=maintense "; //tense determined by top-most verb

    matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
    matcher = matchPattern.matcher(input.getIntermediateTree());
    while (matcher.find()) {
        Tree nountree = matcher.getNode("subj");
        Tree vptree = matcher.getNode("modifier");
        Tree verb = matcher.getNode("tense");
        if (vptree == null || nountree == null || verb == null)
            return;
        nountree = nountree.deeperCopy();
        makeDeterminerDefinite(nountree);

        if (vptree.label().toString().equals("PP"))
            vptree.label().setValue("VP");
        String verbPOS = findTense(matcher.getNode("maintense"));

        String newTreeStr;
        if (verb.label().toString().equals("VBG")) {
            //for present participials, change the tense to the tense of the main verb
            //e.g., walking to the store -> walked to the store
            String verbLemma = AnalysisUtilities.getInstance().getLemma(verb.getChild(0).label().toString(),
                    verb.label().toString());
            String newVerb = AnalysisUtilities.getInstance().getSurfaceForm(verbLemma, verbPOS);
            int verbIndex = vptree.indexOf(verb);
            vptree = vptree.deeperCopy();
            vptree.removeChild(verbIndex);
            vptree.addChild(verbIndex,
                    AnalysisUtilities.getInstance().readTreeFromString("(" + verbPOS + " " + newVerb + ")"));
            newTreeStr = "(ROOT (S " + matcher.getNode("subj").toString() + " " + vptree.toString()
                    + " (. .)))";
        } else {
            //for past participials, add a copula
            //e.g., John, exhausted, -> John was exhausted
            //(or for conjunctions, just add the copula---kind of a hack to make the Moby Dick sentence work out)
            String auxiliary;
            if (verbPOS.equals("VBP") || verbPOS.equals("VBD")) {
                if (isPlural(nountree))
                    auxiliary = "(VBD were)";
                else
                    auxiliary = "(VBD was)";
            } else {
                if (isPlural(nountree))
                    auxiliary = "(VB are)";
                else
                    auxiliary = "(VBZ is)";
            }

            newTreeStr = "(ROOT (S " + nountree + " (VP " + auxiliary + " " + vptree + ") (. .)))";
        }

        Tree newTree = AnalysisUtilities.getInstance().readTreeFromString(newTreeStr);
        correctTense(newTree.getChild(0).getChild(0), newTree.getChild(0));
        addQuotationMarksIfNeeded(newTree);

        if (GlobalProperties.getDebug())
            System.err.println("extractNounParticipialModifiers: " + newTree.toString());
        Question newTreeWithFeatures = input.deeperCopy();
        newTreeWithFeatures.setIntermediateTree(newTree);
        if (GlobalProperties.getComputeFeatures())
            newTreeWithFeatures.setFeatureValue("extractedFromParticipial", 1.0); //old feature name
        if (GlobalProperties.getComputeFeatures())
            newTreeWithFeatures.setFeatureValue("extractedFromNounParticipial", 1.0);
        extracted.add(newTreeWithFeatures);
    }

}
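
Here toString output is spliced into a new bracketing string and re-parsed, which is the core trick of the method above. A stripped-down sketch of that splice (sentence and labels are illustrative):

import edu.stanford.nlp.trees.Tree;

public class SpliceDemo {
    public static void main(String[] args) {
        Tree subj = Tree.valueOf("(NP (NNP John))");
        Tree vp = Tree.valueOf("(VP (VBD walked) (PP (TO to) (NP (DT the) (NN store))))");
        // string concatenation calls Tree.toString() implicitly
        String newTreeStr = "(ROOT (S " + subj + " " + vp + " (. .)))";
        Tree newTree = Tree.valueOf(newTreeStr);
        System.out.println(newTree.toString());
    }
}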

From source file:edu.cmu.deiis.annotator.StanfordCoreNLPAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }

}
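
The annotator stores the Penn Treebank string via root.setTreebankParse(tree.toString()). A self-contained sketch of just that step outside UIMA (assuming the usual CoreNLP pipeline setup; the parser models must be available):

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

public class ParseStringDemo {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document = new Annotation("John wore his winter coat.");
        pipeline.annotate(document);
        for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
            Tree tree = sentence.get(TreeAnnotation.class);
            // the flat string a consumer like setTreebankParse would store
            System.out.println(tree.toString());
        }
    }
}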

From source file:edu.cmu.deiis.annotators.StanfordAnnotator.java

License:Open Source License

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                //String line = mention.getCoveredText();
                //System.out.println(line);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        //NamedEntityMention mention=entity.getMentions(3);
        //System.out.println(mention.getBegin());
        entity.addToIndexes();
    }

}

From source file:edu.cornell.law.entitylinking.utils.Utility.java

public static List<String> getInnerNounPhrases(String paragraph) {
    List<String> nounPhrases = new ArrayList<String>();
    try {
        StringTokenizer tokenizer = new StringTokenizer(paragraph, ".;?,:");
        while (tokenizer.hasMoreTokens()) {
            Annotation document = new Annotation(tokenizer.nextToken());
            pipeline.annotate(document);
            Tree tree = null;
            // these are all the sentences in this document
            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
            List<CoreMap> sentences = document.get(SentencesAnnotation.class);

            for (CoreMap sentence : sentences) {
                // the parse tree of the current sentence
                tree = sentence.get(TreeAnnotation.class);

                List<Tree> phraseList = new ArrayList<Tree>();
                for (Tree subtree : tree) {
                    if ((subtree.label().value().equals("NP")) || (subtree.label().value().equals("WHNP"))) {
                        phraseList.add(subtree);
                    }
                }

                if (!phraseList.isEmpty()) {
                    boolean skipPhrase = false;
                    for (Tree subList : phraseList) {
                        StringBuilder phraseString = new StringBuilder();
                        String phrase = subList.toString();
                        String[] tokens = phrase.split(" ");
                        for (String token : tokens) {
                            if (token.contains("(")) {
                                if (token.contains("(NP")) {
                                    // Check if there are more NP or WHNP in it
                                    String subPhrase = phrase.replaceFirst("\\(NP", "");
                                    if ((subPhrase.contains("(NP")) || (subPhrase.contains("(WHNP"))) {
                                        skipPhrase = true;
                                        break;
                                    }
                                } else if (token.contains("(WHNP")) {
                                    // Check if there are more NP or WHNP in it
                                    String subPhrase = phrase.replaceFirst("\\(WHNP", "");
                                    if ((subPhrase.contains("(NP")) || (subPhrase.contains("(WHNP"))) {
                                        skipPhrase = true;
                                        break;
                                    }
                                } else {
                                    // do nothing, just drop the bracket token
                                }
                            } else {
                                token = token.replace(")", "");
                                phraseString.append(token + " ");
                                skipPhrase = false;
                            }

                        }
                        if (!skipPhrase) {
                            String temp = phraseString.toString().trim();
                            // strip a leading determiner, case-insensitively
                            if (temp.matches("(?i)the .*"))
                                temp = temp.replaceFirst("(?i)^the ", "");
                            else if (temp.matches("(?i)an .*"))
                                temp = temp.replaceFirst("(?i)^an ", "");
                            else if (temp.matches("(?i)a .*"))
                                temp = temp.replaceFirst("(?i)^a ", "");

                            if (temp.contains(" or ")) {
                                String[] nptokens = temp.split(" or ");
                                for (String s : nptokens) {
                                    nounPhrases.add(s);
                                }
                            } else {
                                nounPhrases.add(temp);
                            }
                        }
                    }
                    }
                }
            }
        }
    } catch (OutOfMemoryError e) {
        System.out.println("Result too long to read into memory");
    }
    return nounPhrases;
}
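
The method above recovers phrase text by splitting toString() output on spaces and stripping brackets. A sketch of a less fragile variant that walks the leaves instead (hasNestedNP is a hypothetical helper, not part of the original class):

import edu.stanford.nlp.trees.Tree;

public class InnerNounPhrasesDemo {
    // true if this NP/WHNP contains another NP/WHNP strictly below it
    static boolean hasNestedNP(Tree np) {
        for (Tree sub : np) {
            if (sub == np) continue;
            String v = sub.label().value();
            if (v.equals("NP") || v.equals("WHNP")) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        Tree tree = Tree.valueOf(
                "(ROOT (S (NP (DT the) (NN court)) (VP (VBD cited) (NP (NP (DT a) (NN case)) (PP (IN from) (NP (NNP Ohio)))))))");
        for (Tree subtree : tree) {
            String v = subtree.label().value();
            if ((v.equals("NP") || v.equals("WHNP")) && !hasNestedNP(subtree)) {
                StringBuilder sb = new StringBuilder();
                for (Tree leaf : subtree.getLeaves()) {
                    if (sb.length() > 0) sb.append(' ');
                    sb.append(leaf.value());
                }
                System.out.println(sb);
            }
        }
    }
}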

From source file:edu.rpi.tw.linkipedia.search.nlp.NaturalLanguageProcessor.java

License:Open Source License

private List<String> getNounPhraseFromParseTree(Tree parse) {

    List<String> phraseList = new ArrayList<String>();
    for (Tree subtree : parse) {
        if (subtree.label().value().equals("NP")) {
            String subtreeString = subtree.toString();
            if (subtreeString.lastIndexOf("(NP") != subtreeString.indexOf("(NP"))
                continue;
            //System.out.println(subtree);
            List<LabeledWord> words = subtree.labeledYield();
            String currentPhrase = "";
            for (LabeledWord word : words) {

                currentPhrase += word.word() + "|" + word.tag() + " ";
            }
            currentPhrase = currentPhrase.trim();
            phraseList.add(currentPhrase);
        }
    }

    return phraseList;

}

From source file:elkfed.expletives.ExpletiveInstance.java

License:Apache License

public ExpletiveInstance(Tree root, Tree pronoun, String id) {
    _root = root;
    _pronoun = pronoun;
    _id = id;

    List<Tree> wordsT = root.getLeaves();
    List<Label> posT = root.preTerminalYield();
    // get words and POS into arrays so that
    // we can inspect the pronoun's surrounding context
    String[] words = new String[wordsT.size()];
    String[] pos = new String[wordsT.size()];
    if (!root.dominates(pronoun)) {
        System.err.format("%s does not dominate %s. WTF?", root, pronoun);
    }
    for (int here = 0; here < wordsT.size(); here++) {
        Tree w1 = wordsT.get(here);
        Label p1 = posT.get(here);
        words[here] = w1.toString();
        pos[here] = p1.value();
        if (w1 == pronoun) {
            _idx = here;
        } else if (pronoun.dominates(w1)) {
            _idx = here;
            pronoun = w1;
        }
    }
    assert _idx >= 0 : String.format("wanted %s in %s", pronoun, root);
    assert pos[_idx].equals("PRP") : String.format("wanted PRP got '%s'", pos[_idx]);
    _words = words;
    _pos = pos;
}

From source file:elkfed.expletives.TrainingData.java

License:Apache License

public static void extractExamples(String file, Set<String> anaphoricPronouns,
        List<ExpletiveInstance> instances) throws FileNotFoundException, IOException {
    TreeReader tr = new PennTreeReader(new FileReader(file), new LabeledScoredTreeFactory(),
            new BobChrisTreeNormalizer());
    Tree t;
    String file_id = file.substring(file.length() - 8, file.length() - 4);
    int sent_idx = 1;
    while ((t = tr.readTree()) != null) {
        //t.pennPrint();
        int word_idx = 1;
        for (Tree t1 : t.getLeaves()) {
            String s = t1.toString();
            if ("it".equals(s) || "It".equals(s)) {
                String id = String.format("%s:S%d:%d-%d", file_id, sent_idx, word_idx, word_idx);
                ExpletiveInstance inst = new ExpletiveInstance(t, t1, id);
                boolean is_positive = anaphoricPronouns.contains(id);
                inst.setFeature(PairInstance.FD_POSITIVE, !is_positive);
                instances.add(inst);
                String cls = is_positive ? "+1" : "-1";
                System.out.format("%s\t%s\t(%s)\n", s, id, cls);
            }
            word_idx++;
        }
        //System.out.println();
        //System.out.println(t);
        sent_idx++;
    }

}
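
On a leaf node, toString() is just the token text, which is what the t1.toString() call above relies on. A minimal sketch (the example tree is made up):

import java.io.StringReader;

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;

public class LeafDemo {
    public static void main(String[] args) throws Exception {
        TreeReader tr = new PennTreeReader(
                new StringReader("(ROOT (S (NP (PRP It)) (VP (VBD rained)) (. .)))"),
                new LabeledScoredTreeFactory());
        Tree t = tr.readTree();
        for (Tree leaf : t.getLeaves()) {
            System.out.println(leaf.toString()); // prints: It / rained / .
        }
        tr.close();
    }
}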

From source file:elkfed.mmax.importer.DetermineMinSpan.java

License:Apache License

/** adds min_ids and min_words attributes so that
 *  BART's chunk-based coref resolution works
 */
public static void addMinSpan(int start, Tree tree, IMarkable tag, List<String> tokens) {
    List<Tree> leaves = tree.getLeaves();
    Tree startNode;
    Tree endNode;
    try {
        startNode = leaves.get(tag.getLeftmostDiscoursePosition() - start);
        endNode = leaves.get(tag.getRightmostDiscoursePosition() - start);
        if (".".equals(endNode.parent(tree).value())) {
            //System.err.println("Sentence-final dot in "+
            //        tokens.subList(tag.start, tag.end + 1)+ "removed.");
            endNode = leaves.get(tag.getRightmostDiscoursePosition() - start - 1);
        }
    } catch (IndexOutOfBoundsException ex) {
        System.out.format("indices not found: %d,%d in %s [wanted: %s] [ctx: %s]",
                tag.getLeftmostDiscoursePosition() - start, tag.getRightmostDiscoursePosition() - start, leaves,
                tokens.subList(tag.getLeftmostDiscoursePosition(), tag.getRightmostDiscoursePosition() + 1),
                tokens.subList(start, tag.getLeftmostDiscoursePosition()));
        throw ex;
    }

    Tree parentNode = startNode;
    while (parentNode != null && !parentNode.dominates(endNode)) {
        parentNode = parentNode.parent(tree);
    }

    if (parentNode == null) {
        System.err.println("Could not match tree (1)");
        return;
    }

    if (startNode.leftCharEdge(tree) != parentNode.leftCharEdge(tree)
            || endNode.rightCharEdge(tree) != parentNode.rightCharEdge(tree)) {
        System.err.println("Could not match tree (2)");
        return;
    }

    Tree oldParent = parentNode;
    ModCollinsHeadFinder hf = new ModCollinsHeadFinder();
    // use the head finder to narrow down the span.
    // stop if (a) the head is no longer an NP or
    // (b) the NP is a conjunction
    go_up: while (true) {
        for (Tree t : parentNode.getChildrenAsList()) {
            if (t.value().equals("CC")) {
                break go_up;
            }
        }
        Tree headDtr = hf.determineHead(parentNode);
        if (headDtr == null || !headDtr.value().equals("NP")) {
            break;
        }
        parentNode = headDtr;
    }
    if (parentNode != oldParent) {
        List<Tree> newLeaves = parentNode.getLeaves();
        int newStart = start + find_same(leaves, newLeaves.get(0));
        int newEnd = newStart + newLeaves.size() - 1;
        if (newStart <= tag.getLeftmostDiscoursePosition()) {
            if (tag.getLeftmostDiscoursePosition() - newStart > 1) {
                System.err.println("NP node is too big:" + parentNode.toString() + " wanted:" + tokens
                        .subList(tag.getLeftmostDiscoursePosition(), tag.getRightmostDiscoursePosition() + 1)
                        + " in: " + tree);
                return;
            }
            for (int i = newStart - start; i < tag.getLeftmostDiscoursePosition() - start; i++) {
                System.err.println("additional prefix in syntax:" + leaves.get(i));
            }
            // switch NP boundary and tag boundary
            // (even [Connie Cheung]) => min_words="Connie Cheung"
            int tmp = tag.getLeftmostDiscoursePosition();
            tag.adjustSpan(newStart, tag.getRightmostDiscoursePosition());
            newStart = tmp;
        }
        assert newEnd <= tag.getRightmostDiscoursePosition();
        // this relies on MiniDiscourse's default word numbering
        // which is ugly but should generally work...
        if (newStart == newEnd) {
            tag.setAttributeValue("min_ids", "word_" + (newStart + 1));
        } else {
            tag.setAttributeValue("min_ids", String.format("word_%d..word_%d", newStart + 1, newEnd + 1));
        }
        StringBuffer buf = new StringBuffer();
        for (Tree t : newLeaves) {
            buf.append(t.toString().toLowerCase());
            buf.append(' ');
        }
        buf.setLength(buf.length() - 1);
        tag.setAttributeValue("min_words", buf.toString());
    }
}

From source file:elkfed.mmax.importer.ImportOntonotes.java

License:Apache License

/** adds pos and chunk information */
private void addParseInfo(int start, Tree tree) {
    /* Retrieve chunk tags from the parse tree and add chunk markables */
    boolean inNP = false;
    int startNP = -1;
    int wordLoc = 0;
    int depth = 0;
    for (String tok : tree.toString().replaceAll("\\)", ") ").split("\\s+")) {
        if (tok.matches("\\(NP")) {
            inNP = true;
            startNP = wordLoc;
            depth = 0;
        }
        if ((inNP) && (tok.matches(".*\\)"))) {
            depth--;
        }
        if ((inNP) && (tok.matches("\\(.*"))) {
            depth++;
        }
        if (tok.matches(".+\\)")) {
            wordLoc++;
        }
        if ((depth == 0) && (inNP)) {
            inNP = false;
            Tag t = new Tag();
            t.tag = DEFAULT_CHUNK_LEVEL;
            t.attrs.put("tag", "np");
            t.start = start + startNP;
            t.end = start + wordLoc - 1;
            tags.add(t);
        }
    }

    /* Retrieve POS tags from the parse tree */
    List<Label> taggedSent = new ArrayList<Label>(tree.preTerminalYield());
    for (int i = 0; i < taggedSent.size(); i++) {
        Tag t = new Tag();
        t.tag = DEFAULT_POS_LEVEL;
        t.start = t.end = start + i;
        String tag = taggedSent.get(i).value();
        t.attrs.put("tag", tag.toLowerCase());
        tags.add(t);
    }
}
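
The chunker above recovers NP spans by tokenizing tree.toString() with regexes. A sketch of the same computation done on the tree itself, which avoids the string surgery (sentence and labels are illustrative):

import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;

import edu.stanford.nlp.trees.Tree;

public class NpChunkSpansDemo {
    public static void main(String[] args) {
        Tree tree = Tree.valueOf(
                "(ROOT (S (NP (DT the) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat))))))");
        List<Tree> leaves = tree.getLeaves();
        // map each leaf to its token index (identity map, since words can repeat)
        Map<Tree, Integer> index = new IdentityHashMap<Tree, Integer>();
        for (int i = 0; i < leaves.size(); i++) {
            index.put(leaves.get(i), i);
        }
        for (Tree subtree : tree) {
            if (subtree.label().value().equals("NP")) {
                List<Tree> span = subtree.getLeaves();
                int first = index.get(span.get(0));
                int last = index.get(span.get(span.size() - 1));
                System.out.println("np [" + first + "," + last + "] " + subtree.toString());
            }
        }
    }
}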