edu.albany.cubism.util.StanfordChineseParser.java Source code


Introduction

Here is the source code for edu.albany.cubism.util.StanfordChineseParser.java, a wrapper around the Stanford LexicalizedParser that segments and parses Chinese sentences, extracts typed dependencies, and builds a dependency graph for relation extraction. A short usage sketch follows the listing.

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package edu.albany.cubism.util;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import edu.albany.cubism.relation.Edge;
import edu.albany.cubism.relation.Graph;
import edu.albany.cubism.relation.Node;

/**
 *
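 * Wraps the Stanford LexicalizedParser for Chinese: it segments raw sentences,
 * produces phrase-structure trees and typed dependencies, and builds a
 * dependency graph for relation extraction.
 *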
 * @author mehrdad
 */
public class StanfordChineseParser {
    Segmenter seg = new Segmenter();

    public class ParserParts {

        public String dependency; // The dependencies of a sentence
        public String tree; // The tree of a sentence

        public ParserParts(String dependency, String tree) {
            this.dependency = dependency;
            this.tree = tree;
        }
    }

    Tree t;
    //    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    //    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz");
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz");
    //    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/spanishPCFG.ser.gz");
    //LexicalizedParser lp = new LexicalizedParser("lib/stanford-parser-2012-02-03/grammar/englishFactored.ser.gz");
    TokenizerFactory tf = PTBTokenizer.factory(new WordTokenFactory(), "");
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    int count = 0;
    int subj_count = 0;
    int ad_subj_count = 0;
    int ad_count = 0;
    Collection tdl;
    //    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    //    TreebankLanguagePack tlp = new SpanishTreebankLanguagePack();

    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

    public Tree getTree() {
        return t;
    }

    public ParserParts parse(String sentence) {
        try {
            System.out.println("--------- Start Parsing ---------");
            // Reset previous results so a failed parse is not mistaken for the last successful one.
            t = null;
            tdl = null;
            //         FileInputStream fis = new FileInputStream(filepath);
            //         DataInputStream dis = new DataInputStream(fis);
            //         BufferedReader br = new BufferedReader(new InputStreamReader(fis));

            // prepare parser, tokenizer and tree printer

            System.out.println("Parse sentence: " + sentence);
            if (sentence.trim().length() == 0) {
                ParserParts newParserPartsNode = new ParserParts("", "");
                return newParserPartsNode;
            }
            List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
            //lp.parse(tokens);
            //t = lp.getBestParse();
            t = (Tree) lp.apply(tokens);
            System.out.println("penn Structure: ");
            t.pennPrint();
            GrammaticalStructure gs = gsf.newGrammaticalStructure(t);
            //tdl = gs.typedDependenciesCollapsed();
            tdl = gs.typedDependencies();
            System.out.println("Relationship: " + tdl);
            System.out.println("--------- End Parsing --------- " + count);
        } catch (Exception e) {
            System.out.println("Exception while parsing sentence: " + sentence);
            e.printStackTrace();
        }
        if (t == null || tdl == null) {
            // The parse failed before a tree or dependency list was produced.
            return new ParserParts("", "");
        }
        ParserParts newParserPartsNode = new ParserParts(tdl.toString(), t.toString());
        return newParserPartsNode;
    }

    public void showAllDependencies(Collection tdl, String tag) {
        boolean added = false;
        boolean ad_added = false;
        boolean has_subj = false;
        count++;
        for (Iterator it = tdl.iterator(); it.hasNext();) {
            TypedDependency td = (TypedDependency) it.next();
            if (td.reln().getLongName().indexOf("subject") != -1) {
                has_subj = true;
                if (!added) {
                    subj_count++;
                    if (tag.equalsIgnoreCase("Action-Directive")) {
                        ad_subj_count++;
                    }
                    added = true;
                }
            }

            if (tag.equalsIgnoreCase("Action-Directive") && !ad_added) {
                ad_count++;
                ad_added = true;
                System.out.println("This utterance is " + tag);
            }
            /*
             * System.out.println("Dependency " + (++count));
             * System.out.println("Dependent: " + td.dep().value());
             * System.out.println("Governor: " + td.gov().value());
             * System.out.println("Relation: " + td.reln().getLongName());
             System.out.println("---------------");
             */
        }
        if (!has_subj) {
            printTree(getTree());
        }
        System.out.println("Total action-directive: " + ad_count);
        System.out.println("has subj: " + ad_subj_count);
        System.out.println("Total utterances: " + count);
        System.out.println("Total subjects of the utterances: " + subj_count);
    }

    public Collection getDependencies() {
        return tdl;
    }

    public void printTree(Tree t) {
        tp.printTree(t);
        tp.printTree(t.headTerminal(new CollinsHeadFinder())); // alternative head finder: SemanticHeadFinder
        //System.out.println("tree label: " + t.label());
        List trees = t.subTreeList();

        for (int i = 0; i < trees.size(); i++) {
            Tree sbt = (Tree) trees.get(i);
            /*
             * if (!sbt.isLeaf()) { trees.addAll(sbt.subTreeList()); }
             */
            //System.out.println("sbt lable: " + sbt.label());
        }
        //System.out.println("done");
        List<Tree> leaves = t.getLeaves();
        for (int i = 0; i < leaves.size(); i++) {
            Tree leaf = leaves.get(i);
            //if (leaf.parent() != null) {
            System.out.println(leaf.pennString() + " " + leaf.value());
            //}
        }
        /*
         * Set dependencies = t.dependencies(); Iterator it =
         * dependencies.iterator(); while (it.hasNext()) { Dependency dependency
         * = (Dependency)it.next(); System.out.println(dependency.toString());
         * System.out.println(dependency.name()); }
         */
    }

    public void buildDependcyGraph(String sentence, Graph g) {
        String segmented = this.seg.segment(sentence);
        parse(segmented);
        if (getTree() == null) {
            // Parsing failed; leave the graph empty. (Assigning null to the
            // parameter would not affect the caller's reference anyway.)
            return;
        }
        ArrayList<TaggedWord> alWords = getTree().taggedYield();

        //Get relationships
        //String penn = sp.getTree().pennString();
        //fill in the required information (TypedDependency indices start from 1)
        //        Graph g = new Graph();
        //test for undirected graph
        //        Graph bidirectg = new Graph();
        Collection<TypedDependency> colTD = getDependencies();
        Iterator it = colTD.iterator();
        while (it.hasNext()) {
            TypedDependency td = (TypedDependency) it.next();

            //            String govWord = (td.gov()).nodeString();
            String govWord = (td.gov()).toString();
            govWord = govWord.substring(0, govWord.lastIndexOf("-"));
            int govId = td.gov().index();
            String govTag = "";
            if (govId == 0) {
                govTag = "";
            } else {
                govTag = alWords.get(govId - 1).tag();
            }

            //            String depWord = td.dep().nodeString();
            String depWord = td.dep().toString();
            depWord = depWord.substring(0, depWord.lastIndexOf("-"));
            int depId = td.dep().index();
            String depTag = "";
            if (depId == 0) {
                depTag = "";
            } else {
                depTag = alWords.get(depId - 1).tag();
            }

            String rel = td.reln().getShortName();

            Node sNode = new Node(govId, govWord, govTag);
            Node dNode = new Node(depId, depWord, depTag);
            //edu.albany.ils.remnd.relation.Edge edge = new edu.albany.ils.remnd.relation.Edge(sNode, dNode, rel);
            //g.addEdge(edge);
            //updated by CSLin
            g.checkNewEdge(sNode, dNode, rel);

            //            bidirectg.checkNewEdge(sNode, dNode, rel);
            //            bidirectg.checkNewEdge(dNode, sNode, "reversedRelation");
        }
        colTD.clear(); // note: this also clears the parser's stored dependency collection

        //        Node head=bidirectg.getNode("government");
        //        Node tail=bidirectg.getNode("veins");
        //        List<Node> nodList=bidirectg.shortestPath(head, tail);
        //        for(int i=0; i<nodList.size(); i++)
        //            System.out.println(nodList.get(i).getName());
        //        System.out.print("");
        //        return g;
    }

    /**
     * Checks whether targetConcept occurs in a sub-sentence (e.g. a relative
     * clause) that lies outside the highICWord, in which case the highICWord
     * can be dropped.
     *
     * @param targetConcept e.g. "federal bureaucracy"
     * @param highICWord    e.g. "home"
     * @return true if targetConcept is in the sub-sentence outside the
     *         highICWord
     */
    public boolean checkSubSentence(Graph g, String targetConcept, String highICWord) {
        ArrayList<String> relSubSentence = new ArrayList<String>();
        relSubSentence.add("rcmod");
        //root
        Node nRoot = g.getNode("ROOT");

        //root verb
        ArrayList<Edge> rootedge = (ArrayList<Edge>) (g.outFlow(nRoot));
        Node rootVerb = rootedge.get(0).getChild();

        //node of target concept and high IC word
        Node nTarget = g.getHead(targetConcept);
        Node nHighICWord = g.getNode(highICWord);

        //check if nHighICWord directly links to nTarget
        while (true) {
            ArrayList<Edge> edges = (ArrayList<Edge>) (g.inFlow(nHighICWord));
            Edge edge = edges.get(0);

            Node node = edge.getParent();

            if (node == nTarget) {
                return false;
            } else if (node == rootVerb) {
                break;
            } else {
                nHighICWord = node;
            }
        }

        //check if the nTarget is in the sub-sentence
        nHighICWord = g.getNode(highICWord);

        while (true) {
            ArrayList<Edge> edges = (ArrayList<Edge>) (g.inFlow(nTarget));
            Edge edge = edges.get(0);

            if (relSubSentence.contains(edge.getRelation())) {
                return true;
            }

            Node node = edge.getParent();

            if (node == nHighICWord) {
                return false;
            } else if (node == rootVerb) {
                break;
            } else {
                nTarget = node;
            }
        }

        return false;
    }

    public static void main(String[] args) {
        StanfordChineseParser sp_ = new StanfordChineseParser();
        Graph g = new Graph();
        //        sp_.buildDependcyGraph("I loved Morocco and I'd probably go back before going to Ireland.", g);
        sp_.buildDependcyGraph("?", g);
        //        boolean bSubSen = sp_.checkSubSentence(g, "affirmative action", "sore");
        System.out.println();
    }
}
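
Usage

For context, a minimal usage sketch (not part of the original file). It assumes the Stanford parser jar, the chinesePCFG model, and the edu.albany.cubism classes (Segmenter, Graph, Node) are on the classpath; the class name StanfordChineseParserDemo and the example sentences are only illustrations.

import edu.albany.cubism.relation.Graph;
import edu.albany.cubism.util.StanfordChineseParser;

public class StanfordChineseParserDemo {
    public static void main(String[] args) {
        StanfordChineseParser parser = new StanfordChineseParser();

        // parse() expects a pre-segmented (whitespace-delimited) sentence.
        StanfordChineseParser.ParserParts parts = parser.parse("我 喜欢 自然 语言 处理");
        System.out.println("Tree: " + parts.tree);
        System.out.println("Dependencies: " + parts.dependency);

        // buildDependcyGraph() segments a raw sentence itself and fills the graph in place.
        Graph g = new Graph();
        parser.buildDependcyGraph("我喜欢自然语言处理", g);
    }
}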