org.sam_agent.csparser.ContinuousParser.java Source code

Java tutorial

Introduction

Here is the source code for org.sam_agent.csparser.ContinuousParser.java

Source

/*
Continuous Parser - provides POS tags and typed dependencies in JSON format over HTTP and stdio.
Copyright (C) 2015 Samuel Brian
    
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

package org.sam_agent.csparser;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.ParserAnnotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.CoreMap;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

import java.util.*;

public class ContinuousParser {

    /** Classpath location of the default English PCFG parser model bundled with CoreNLP. */
    public static final String ModelPath = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    /** CoreNLP version this configuration targets. */
    public static final String Version = "v3.5.2";

    private StanfordCoreNLP pipeline;

    /**
     * Initialises the Stanford Parser with required configuration and the given parser model.
     * @param modelPath classpath location of the parser model; {@code null} falls back to {@link #ModelPath}
     */
    public ContinuousParser(String modelPath) {
        initCoreNLP_v3_5_2(modelPath);
    }

    /**
     * Initialises the Stanford Parser with required configuration and the built-in model.
     */
    public ContinuousParser() {
        this(ModelPath);
    }

    /**
     * CoreNLP pipeline initialisation for v3.5.2.
     * @param modelPath classpath location of the parser model; {@code null} falls back to {@link #ModelPath}
     */
    void initCoreNLP_v3_5_2(String modelPath) {
        if (modelPath == null) {
            modelPath = ModelPath;
        }
        Properties props = new Properties();
        //props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        props.setProperty("annotators", "tokenize, ssplit, parse, lemma, ner");

        props.setProperty("parse.flags", "-makeCopulaHead -retainTmpSubcategories");
        props.setProperty("parse.model", modelPath);
        props.setProperty("parse.originalDependencies", "true");
        props.setProperty("parse.maxlen", "80");
        props.setProperty("parse.extradependencies", "REF_COLLAPSED_AND_SUBJ");

        props.setProperty("parse.collapsed", "true");
        props.setProperty("parse.outputFormat", "typedDependenciesCollapsed"); // probably has no effect
        //props.setProperty("parse.DEBUG", "true"); // verbose

        //        props.setProperty("parse.retainTmpSubcategories", "true");
        props.setProperty("tokenize.tokenized", "true");
        props.setProperty("tokenize.tagSeparator", "/");
        //        props.setProperty("ner.markTimeRanges", "true");

        pipeline = new StanfordCoreNLP(props);
        System.err.println("Parser ready.");
    }

    /**
     * Parse a sentence with the Stanford Parser, returning a JSON string of the dependencies
     * and part-of-speech tags.
     * @param text the input text; may contain multiple sentences
     * @return a JSON object string with keys {@code input} and {@code sentences}
     */
    public String parse(String text) {

        // create an empty Annotation just with the given text
        Annotation document = new Annotation(text);

        // run all Annotators on this text
        pipeline.annotate(document);

        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        List<String> sentencesList = new ArrayList<String>();

        for (CoreMap sentence : sentences) {
            String sentenceString = sentence.get(CoreAnnotations.TextAnnotation.class);
            String wordsJSON = stringify(sentence);
            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
            String dependenciesJSON = stringify(dependencies);
            String rootsJSON = stringify(dependencies.getRoots());
            // esc() the raw sentence text so quotes/newlines in the input cannot break the JSON
            sentencesList.add(String.format("{\"sentence\":\"%s\",%s,%s,\"roots\":%s}", esc(sentenceString),
                    wordsJSON, dependenciesJSON, rootsJSON));
        }

        // esc() the original input for the same reason
        return String.format("{\"input\":\"%s\",\"sentences\":[%s]}", esc(text), String.join(",", sentencesList));
    }

    /**
     * Serialise the root words of a dependency graph as a JSON array of "word-index" tokens.
     * @param roots the graph roots
     * @return a JSON array string, e.g. {@code ["ran-2"]}
     */
    public String stringify(Collection<IndexedWord> roots) {
        List<String> rootTokens = new ArrayList<String>();
        for (IndexedWord root : roots) {
            rootTokens.add(String.format("\"%s-%d\"", esc(root.value()), root.index()));
        }
        return "[" + String.join(",", rootTokens) + "]";
    }

    /**
     * Serialise the typed dependencies of a sentence as a JSON fragment.
     * @param dependencies the sentence's semantic graph
     * @return a JSON fragment of the form {@code "dependencies":[...]}
     */
    public String stringify(SemanticGraph dependencies) {
        List<String> depsList = new ArrayList<String>();

        for (SemanticGraphEdge eit : dependencies.edgeIterable()) {
            String rel = eit.getRelation().toString();
            IndexedWord gov = eit.getGovernor(), dep = eit.getDependent();
            // escape the words: tokens such as quotes would otherwise produce invalid JSON
            String arg0 = esc(gov.word()) + "-" + gov.index();
            String arg1 = esc(dep.word()) + "-" + dep.index();
            depsList.add(String.format("{\"rel\":\"%s\",\"arg0\":\"%s\",\"arg1\":\"%s\"}", rel, arg0, arg1));
        }

        return String.format("\"dependencies\":[%s]", String.join(",", depsList));
    }

    /**
     * Serialise per-token information (POS tags, lemmas, Timex annotations) of a sentence
     * as a JSON fragment.
     * @param words the sentence CoreMap holding a TokensAnnotation
     * @return a JSON fragment with keys {@code pos}, {@code lemma}, {@code timex} and {@code timexGroups}
     */
    public String stringify(CoreMap words) {

        List<String> posList = new ArrayList<String>();
        List<String> posMap = new ArrayList<String>();
        List<String> lemmaMap = new ArrayList<String>();
        List<String> timexMap = new ArrayList<String>();
        // groups the "word-index" tokens that share a single Timex expression, keyed by Timex id
        Map<String, List<String>> timexIdMap = new HashMap<String, List<String>>();

        for (CoreLabel token : words.get(CoreAnnotations.TokensAnnotation.class)) {

            String word = token.get(CoreAnnotations.TextAnnotation.class);
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            String wordToken = esc(word) + "-" + token.index();

            posList.add(String.format("{\"token\":\"%s\",\"pos\":\"%s\"}", esc(word), esc(pos)));
            posMap.add(String.format("\"%s\":\"%s\"", wordToken, esc(pos)));
            lemmaMap.add(String.format("\"%s\":\"%s\"", wordToken, esc(lemma)));

            Timex t = token.get(TimeAnnotations.TimexAnnotation.class);
            if (t != null) {
                String tid = t.tid();
                if (timexIdMap.containsKey(tid)) {
                    // later token of an already-seen Timex: just record its membership
                    timexIdMap.get(tid).add("\"" + wordToken + "\"");
                    continue;
                }

                List<String> tokens = new ArrayList<String>();
                tokens.add("\"" + wordToken + "\"");
                timexIdMap.put(tid, tokens);

                // first token of this Timex: serialise all XML attributes of the TIMEX3 element
                List<String> attributesList = new ArrayList<String>();
                Element xml = t.toXmlElement();
                NamedNodeMap attrs = xml.getAttributes();
                for (int i = 0; i < attrs.getLength(); i++) {
                    Node item = attrs.item(i);
                    String name = item.getNodeName();
                    String value = item.getNodeValue();
                    // attribute values come from free text (e.g. altVal) and must be escaped too
                    attributesList.add(String.format("\"%s\":\"%s\"", esc(name), esc(value)));
                }
                String json = String.format("\"%s\":{%s}", t.tid(), String.join(",", attributesList));
                timexMap.add(json);
            }
        }

        String posListJSON = "[" + String.join(", ", posList) + "]";
        String posMapJSON = "{" + String.join(", ", posMap) + "}";
        String lemmaMapJSON = "{" + String.join(", ", lemmaMap) + "}";
        String timexMapJSON = "{" + String.join(", ", timexMap) + "}";

        List<String> temp = new ArrayList<String>();
        for (String tid : timexIdMap.keySet()) {
            temp.add(String.format("\"%s\":[%s]", tid, String.join(",", timexIdMap.get(tid))));
        }
        String timexIdMapJSON = "{" + String.join(",", temp) + "}";

        return String.format("\"pos\":{\"map\":%s,\"list\":%s},\"lemma\":%s,\"timex\":%s,\"timexGroups\":%s",
                posMapJSON, posListJSON, lemmaMapJSON, timexMapJSON, timexIdMapJSON);
    }

    /**
     * Serialise an exception as a JSON error object.
     * @param e the exception to report
     * @return a JSON object string of the form {@code {"error":"..."}}
     */
    public static String stringify(Exception e) {
        return String.format("{\"error\":\"%s\"}",
                e.getMessage() != null ? esc(e.getMessage()) : "no error message provided");
    }

    /**
     * Escape a string for embedding inside a JSON string literal.
     * Backslashes are escaped first so that subsequently inserted escapes are not doubled.
     * @param str the raw string
     * @return the escaped string
     */
    public static String esc(String str) {
        return str.replace("\\", "\\\\")
                .replace("\"", "\\\"")
                .replace("\n", "\\n")
                .replace("\r", "\\r")
                .replace("\t", "\\t");
    }

    /**
     * Copy an array into a mutable list.
     * @param array the source array
     * @return a new mutable {@link ArrayList} containing the array's elements in order
     */
    public static List<String> array2list(String[] array) {
        // Arrays.asList is a fixed-size view, so wrap it in a mutable ArrayList
        return new ArrayList<String>(Arrays.asList(array));
    }

}