Engines.Test.StanfordParser.TreeHandling.java Source code

Introduction

Here is the source code for Engines.Test.StanfordParser.TreeHandling.java
Source

package Engines.Test.StanfordParser;

/**
* USING THE EDITED EXAMPLE: 
* 
* A Stanford CoreNLP tree visualizer.  
* Copyright (C) 2014  Long Qiu 
     
* This program is free software; you can redistribute it and/or 
* modify it under the terms of the GNU General Public License 
* as published by the Free Software Foundation; either version 2 
* of the License, or (at your option) any later version. 
     
* This program is distributed in the hope that it will be useful, 
* but WITHOUT ANY WARRANTY; without even the implied warranty of 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
* GNU General Public License for more details. 
     
* You should have received a copy of the GNU General Public License 
* along with this program; if not, write to the Free Software 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. 
*/

/*
 * Copyright (c) 1995, 2008, Oracle and/or its affiliates. All rights reserved. 
 * 
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met: 
 * 
 *   - Redistributions of source code must retain the above copyright 
 *     notice, this list of conditions and the following disclaimer. 
 * 
 *   - Redistributions in binary form must reproduce the above copyright 
 *     notice, this list of conditions and the following disclaimer in the 
 *     documentation and/or other materials provided with the distribution. 
 * 
 *   - Neither the name of Oracle or the names of its 
 *     contributors may be used to endorse or promote products derived 
 *     from this software without specific prior written permission. 
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR 
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

/*
 * The Dynamic Tree part is based on an example provided by Richard Stanford, 
 * a tutorial reader. 
 */

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import javax.swing.tree.DefaultMutableTreeNode;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.EnglishGrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.Labeled;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;

public class TreeHandling {
    // Shared CoreNLP pipeline; created by lazyInit() and used by parseArticle().
    private static StanfordCoreNLP pipeline;
    // Factory used by makeTree() and putOnBranch() to create tree nodes.
    static TreeFactory lstf = new LabeledScoredTreeFactory();
    // Value of the "annotators" property handed to the pipeline.
    private static final String ANALYZERS = "tokenize, ssplit, pos, lemma, parse";
    // Pipeline configuration; null until lazyInit() has populated it.
    static Properties props;

    /**
     * Manual smoke test: runs {@link #test(String)} on four sample texts, prints
     * "DONE", then prints the dependencies of the same texts via
     * {@link #showDepTree(String)}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String harderror = "the the in Kurhaus the are, 60s in Institute the libraries the originally district. other line is officer of County, [[dojin www.atimes.com/atimes/China/FB04Ad04.html last linseed Poms elsewhere of Tianyong]] overall the Albums]] federal .";
        final String allowed = "Here are some example sentences for SCNTreeViewer. There are a few symbols in Scala that are special and cannot be defined or used as method names. You have to understand this.";
        final String test1 = "]] and [[anti-colorado]] and [[san jases and the british army]] for the film was created by [[paris county league]] and [[tamil nadu state assembly election, 1971|1967 elections]]. ";
        final String test2 = "A quick brown fox jumped over the lazy dog. ";

        // Processing order matters for the console output, so keep it in one place.
        final String[] samples = { harderror, allowed, test1, test2 };

        for (String sample : samples) {
            test(sample);
        }
        System.out.println("DONE");

        for (String sample : samples) {
            showDepTree(sample);
        }
    }

    /**
     * Creates the shared CoreNLP pipeline on first use; later calls return
     * immediately once a pipeline exists.
     *
     * The guard is on {@code pipeline} rather than {@code props}: if building the
     * pipeline throws, nothing is published, so a later call retries instead of
     * leaving a configured-but-null pipeline behind.
     */
    private static void lazyInit() {
        if (pipeline != null) {
            return;
        }
        Properties config = new Properties();
        // Setting "pos.model" explicitly to
        // edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger
        // was tried here and had no effect, so only the annotator list is configured.
        config.setProperty("annotators", ANALYZERS);

        // Build first, publish afterwards: props only becomes non-null on success.
        pipeline = new StanfordCoreNLP(config);
        props = config;
        System.err.println("Loading done.");
    }

    /**
     * Wraps {@code text} in an {@link Annotation} and runs the shared pipeline on it.
     *
     * @param text the raw text to annotate
     * @return the annotated document
     */
    public static Annotation parseArticle(String text) {
        // This method is public, so it cannot assume a caller already initialised
        // the pipeline; lazyInit() is a no-op once initialisation has happened.
        lazyInit();
        Annotation document = new Annotation(text);
        // run all Annotators on this text
        pipeline.annotate(document);
        return document;
    }

    /**
     * Collects the {@code TreeAnnotation} of every sentence in {@code document},
     * in sentence order.
     *
     * @param document an annotated document
     * @return one entry per sentence; empty when the document carries no
     *         {@code SentencesAnnotation} (e.g. it was never sentence-split)
     */
    public static ArrayList<Tree> getParseTree(Annotation document) {
        ArrayList<Tree> forest = new ArrayList<Tree>();
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        if (sentences == null) {
            // No sentence annotation present: nothing to collect, and iterating null would throw.
            return forest;
        }
        for (CoreMap sentence : sentences) {
            // The parse tree of the current sentence. The entry is added as-is so
            // that positions in the result stay aligned with the sentence list.
            // (The dependency graph without punctuation would be available via
            // BasicDependenciesAnnotation instead.)
            forest.add(sentence.get(TreeAnnotation.class));
        }
        return forest;
    }

    /**
     * Returns the typed dependencies of {@code parseTree} from two grammatical
     * structures merged into one collection: first one built with the Penn
     * Treebank punctuation-word accept filter, then one built with the default
     * constructor.
     *
     * NOTE(review): the intent stated by the original author is "dependencies,
     * punctuation included"; whether the two sets overlap depends on CoreNLP's
     * filter semantics -- confirm before de-duplicating.
     *
     * @param parseTree constituency parse of a sentence
     * @return the collection returned by the first structure, with the second
     *         structure's dependencies appended to it
     */
    public static Collection<TypedDependency> getDeps(Tree parseTree) {
        GrammaticalStructure punctuationAware = new EnglishGrammaticalStructure(parseTree,
                new PennTreebankLanguagePack().punctuationWordAcceptFilter());
        // typedDependencies() is used on purpose: typedDependenciesCollapsed() eats the prepositions, etc.
        Collection<TypedDependency> merged = punctuationAware.typedDependencies();

        GrammaticalStructure standard = new EnglishGrammaticalStructure(parseTree);
        merged.addAll(standard.typedDependencies());
        return merged;
    }

    /**
     * Parses {@code text} and prints the typed dependencies of each sentence to
     * standard output, one collection per line.
     *
     * The Swing tree that used to be assembled here was never displayed or
     * returned, so that dead work has been dropped.
     *
     * @param text the raw text to parse
     */
    public static void showDepTree(String text) {
        // lazyInit() returns immediately once initialisation has happened.
        lazyInit();
        Annotation document = parseArticle(text);
        for (Tree tree : getParseTree(document)) {
            System.out.println(getDeps(tree));
        }
    }

    /**
     * Mirrors a parse tree as a Swing {@link DefaultMutableTreeNode} hierarchy.
     * Each Swing node carries the {@code nodeString()} of the matching tree node
     * as its user object.
     *
     * @param root node to fill in for {@code tree}; a new node is created when null
     * @param tree the parse (sub)tree to convert
     * @return the node representing {@code tree}, with all descendants attached
     */
    public static DefaultMutableTreeNode toDMTree(DefaultMutableTreeNode root, Tree tree) {
        DefaultMutableTreeNode node = (root != null) ? root : new DefaultMutableTreeNode();
        node.setUserObject(tree.nodeString());

        Tree[] kids = tree.children();
        for (int i = 0; i < kids.length; i++) {
            // Every child gets its own fresh Swing node, filled in recursively.
            node.add(toDMTree(new DefaultMutableTreeNode(), kids[i]));
        }
        return node;
    }

    /**
     * Mirrors a dependency graph as a Swing {@link DefaultMutableTreeNode}
     * hierarchy, starting at {@code root}.
     *
     * A node's text is the word's value. If the graph has an edge whose dependent
     * is that word, the text is prefixed with {@code "<-relation- "} using the
     * first such edge found.
     *
     * @param root         word to start from; the graph's first root is used when null
     * @param dependencies the dependency graph to walk
     * @return the Swing node for {@code root} with its children attached recursively
     */
    public static DefaultMutableTreeNode toDMTree(IndexedWord root, SemanticGraph dependencies) {
        final IndexedWord head = (root != null) ? root : dependencies.getFirstRoot();

        String text = head.value();
        for (SemanticGraphEdge edge : dependencies.edgeIterable()) {
            if (!edge.getDependent().equals(head)) {
                continue;
            }
            // Only the first incoming edge found is used for the prefix.
            text = "<-" + edge.getRelation() + "- " + text;
            break;
        }

        DefaultMutableTreeNode node = new DefaultMutableTreeNode();
        node.setUserObject(text);
        for (IndexedWord dependent : dependencies.getChildList(head)) {
            node.add(toDMTree(dependent, dependencies));
        }
        return node;
    }

    /**
     * Convenience wrapper around
     * {@link #makeTree(LinkedList, boolean, Tree)}: copies {@code tdl} into a
     * private work list (so the caller's collection is left untouched) and
     * builds a new tree with {@code fail} set to true.
     *
     * @param tdl the typed dependencies to assemble
     * @return whatever {@code makeTree} returns for these dependencies
     */
    public static Tree makeTreeRobust(Collection<TypedDependency> tdl) {
        LinkedList<TypedDependency> pending = new LinkedList<TypedDependency>(tdl);
        return makeTree(pending, true, null);
    }

    /**
     * Assembles a tree from typed dependencies. Each dependency is offered to
     * {@code putOnBranch}; one that cannot be placed yet is moved to the tail of
     * the work list and retried on a later pass. The number of passes is bounded
     * by the initial size of the work list.
     *
     * The work list is consumed: {@code toAssemble} is modified by this method.
     *
     * Historical note (Feb 22): a governor could be missing, which fails the tree
     * construction process. GrammaticalStructure.root() can be used instead,
     * whose POS nodes would have to be removed.
     *
     * @param toAssemble the dependencies still to be placed; mutated by this call
     * @param fail set true to return a null tree if the construction fails, i.e.
     *             dependencies are left over when the pass budget is used up
     * @param tree null to build a new tree, or a partial tree to start with
     * @return the first child of the assembled tree; null when construction
     *         failed and {@code fail} is true, or when nothing was attached
     */
    public static Tree makeTree(LinkedList<TypedDependency> toAssemble, boolean fail, Tree tree) {
        if (tree == null) {
            tree = lstf.newTreeNode("DUMMYROOT", null);
        }

        // A null entry is the end-of-pass marker; each time it reaches the head
        // of the list one full pass over the pending dependencies is finished.
        toAssemble.add(null);
        int counter = toAssemble.size();

        while (!toAssemble.isEmpty()) {
            //1. pick the next dep
            TypedDependency dep = toAssemble.removeFirst();
            if (dep == null) {
                if (toAssemble.isEmpty()) {
                    // Everything has been placed; no need to burn the remaining passes.
                    break;
                }
                if (counter-- > 0) {
                    toAssemble.add(null);
                    continue;
                }
                // Pass budget exhausted with dependencies still pending.
                if (fail) {
                    // Previously tree was set to null here and then dereferenced
                    // below, throwing instead of returning null.
                    return null;
                }
                break;
            }

            //2. assemble it onto the tree
            Tree newRoot = putOnBranch(dep, tree);
            if (newRoot != null) {
                //2.1 success -> it stays removed from the work list
                tree = newRoot;
            } else {
                //2.2 fail -> put it back at the tail of the work list
                toAssemble.addLast(dep);
            }
        }

        // An empty dependency list (or a partial tree without children) leaves
        // nothing to return; getChild(0) would throw on it.
        if (tree.children().length == 0) {
            return null;
        }
        return tree.getChild(0);
    }

    /**
     * Tries to attach the node for {@code dep} somewhere in {@code tree}.
     *
     * The new node is created from the governor's backing label and its value is
     * replaced by {@code "[<-relation-] dependentWord"}. Because every node is
     * built this way, a node's label index is the index of its governor; the
     * index comparisons below rely on that.
     *
     * @param dep  the dependency to place
     * @param tree the (sub)tree to place it in; modified on success
     * @return {@code tree} when the node was attached at or below it, otherwise null
     */
    private static Tree putOnBranch(TypedDependency dep, Tree tree) {
        // The node starts without children; children are only adopted below.
        Tree mySubtree = lstf.newTreeNode(dep.gov().backingLabel(), new LinkedList<Tree>());
        mySubtree.setValue("[<-" + dep.reln() + "-] " + dep.dep().value());//nudge in the dependency relation information

        if (tree.children().length == 0) {
            // Constant-first comparison so a label without a value cannot throw.
            if ("DUMMYROOT".equals(tree.label().value())) {
                tree.addChild(mySubtree);
                return tree;
            }
            //Shouldn't happen
            System.err.println("Forgot to add a child earlier.");
            return null;
        }

        final int depIndex = ((CoreLabel) ((Labeled) dep.dep()).label()).index();
        // children() is read once per loop, so removing children inside the loop
        // does not disturb the iteration.
        for (Tree child : tree.children()) {
            //if dep is child's parent, insert dep between child and its parent
            if (((CoreLabel) child.label()).index() == depIndex) {
                tree.removeChild(tree.objectIndexOf(child));
                mySubtree.addChild(child);
            }
        }
        // Attach as soon as ANY child was adopted. The previous "> 1" threshold
        // let a single adopted child stay detached from tree, so it was lost
        // whenever the sibling search below failed or recursed.
        if (mySubtree.children().length > 0) {
            tree.addChild(mySubtree);
            return tree;
        }

        final int govIndex = ((CoreLabel) ((Labeled) dep.gov()).label()).index();
        for (Tree child : tree.children()) {
            //if dep is child's sibling (both share the same governor)
            if (((CoreLabel) child.label()).index() == govIndex) {
                tree.addChild(mySubtree);
                return tree;
            }

            // otherwise look for a place further down, recursively
            if (child.children().length > 0 && putOnBranch(dep, child) != null) {
                return tree;
            }
        }
        // No place for this dependency in the current tree (yet).
        return null;
    }

    /**
     * Smoke test for the lexicalized parser: loads the English PCFG model,
     * tokenizes and parses {@code text}, and computes the CC-processed typed
     * dependencies. The result is discarded; the call only shows that the
     * chain runs through for the given input.
     *
     * @param text the raw text to parse
     */
    public static void test(String text) {
        TreebankLanguagePack languagePack = new PennTreebankLanguagePack();
        GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();

        LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
        parser.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });

        TokenizerFactory<CoreLabel> tokenizers = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> tokens = tokenizers.getTokenizer(new StringReader(text)).tokenize();

        Tree parse = parser.apply(tokens);
        // Called for its effect of exercising the dependency extraction only.
        structureFactory.newGrammaticalStructure(parse).typedDependenciesCCprocessed(true);
    }

}