Java tutorial: POS tags and typed dependencies with Stanford CoreNLP
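This tutorial walks through ContinuousParser, the parsing core of Continuous Parser: a small wrapper around Stanford CoreNLP (v3.5.2, with the bundled englishPCFG model) that annotates text and serialises each sentence's part-of-speech tags, lemmas, collapsed typed dependencies and TIMEX (temporal) annotations as a single JSON object. The full listing follows; a usage sketch comes after it.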
/*
 * Continuous Parser - provides POS tags and typed dependencies in JSON format
 * over HTTP and stdio.
 * Copyright (C) 2015 Samuel Brian
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package org.sam_agent.csparser;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.CoreMap;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

import java.util.*;

public class ContinuousParser {

    public static final String ModelPath =
            "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    public static final String Version = "v3.5.2";

    private StanfordCoreNLP pipeline;

    /**
     * Initialises the Stanford Parser with the required configuration and the
     * given parser model.
     * @param modelPath path to the parser model, or null for the built-in model
     */
    public ContinuousParser(String modelPath) {
        initCoreNLP_v3_5_2(modelPath);
    }

    /**
     * Initialises the Stanford Parser with the required configuration and the
     * built-in model.
     */
    public ContinuousParser() {
        this(ModelPath);
    }

    /**
     * CoreNLP initialisation for v3.5.2.
     * @param modelPath path to the parser model
     */
    void initCoreNLP_v3_5_2(String modelPath) {
        if (modelPath == null) {
            modelPath = ModelPath;
        }

        Properties props = new Properties();
        //props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        props.setProperty("annotators", "tokenize, ssplit, parse, lemma, ner");
        props.setProperty("parse.flags", "-makeCopulaHead -retainTmpSubcategories");
        props.setProperty("parse.model", modelPath);
        props.setProperty("parse.originalDependencies", "true");
        props.setProperty("parse.maxlen", "80");
        props.setProperty("parse.extradependencies", "REF_COLLAPSED_AND_SUBJ");
        props.setProperty("parse.collapsed", "true");
        props.setProperty("parse.outputFormat", "typedDependenciesCollapsed"); // probably has no effect
        //props.setProperty("parse.DEBUG", "true"); // verbose
        //props.setProperty("parse.retainTmpSubcategories", "true");
        props.setProperty("tokenize.tokenized", "true");
        props.setProperty("tokenize.tagSeparator", "/");
        //props.setProperty("ner.markTimeRanges", "true");

        pipeline = new StanfordCoreNLP(props);
        System.err.println("Parser ready.");
    }
    /**
     * Parses a sentence with the Stanford Parser, returning a JSON string of
     * the dependencies and part-of-speech tags.
     * @param text the text to parse
     * @return a JSON string describing each sentence in the text
     */
    public String parse(String text) {
        // Create an empty Annotation with just the given text.
        Annotation document = new Annotation(text);

        // Run all annotators on this text.
        pipeline.annotate(document);

        // These are all the sentences in this document. A CoreMap is
        // essentially a Map that uses class objects as keys and has values
        // with custom types.
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

        List<String> sentencesList = new ArrayList<String>();
        for (CoreMap sentence : sentences) {
            String sentenceString = sentence.get(CoreAnnotations.TextAnnotation.class);
            String wordsJSON = stringify(sentence);

            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
            String dependenciesJSON = stringify(dependencies);
            String rootsJSON = stringify(dependencies.getRoots());

            sentencesList.add(String.format("{\"sentence\":\"%s\",%s,%s,\"roots\":%s}",
                    esc(sentenceString), wordsJSON, dependenciesJSON, rootsJSON));
        }

        return String.format("{\"input\":\"%s\",\"sentences\":[%s]}",
                esc(text), String.join(",", sentencesList));
    }

    /**
     * Serialises the root words of a dependency graph as a JSON array of
     * "word-index" tokens.
     */
    public String stringify(Collection<IndexedWord> roots) {
        List<String> rootTokens = new ArrayList<String>();
        for (IndexedWord root : roots) {
            rootTokens.add(String.format("\"%s-%d\"", esc(root.value()), root.index()));
        }
        return "[" + String.join(",", rootTokens) + "]";
    }

    /**
     * Serialises the typed dependencies of a sentence as a JSON array of
     * {rel, arg0, arg1} objects.
     */
    public String stringify(SemanticGraph dependencies) {
        List<String> depsList = new ArrayList<String>();
        for (SemanticGraphEdge edge : dependencies.edgeIterable()) {
            String rel = edge.getRelation().toString();
            IndexedWord gov = edge.getGovernor(), dep = edge.getDependent();
            String arg0 = esc(gov.word()) + "-" + gov.index();
            String arg1 = esc(dep.word()) + "-" + dep.index();
            depsList.add(String.format("{\"rel\":\"%s\",\"arg0\":\"%s\",\"arg1\":\"%s\"}",
                    rel, arg0, arg1));
        }
        return String.format("\"dependencies\":[%s]", String.join(",", depsList));
    }

    /**
     * Serialises the POS tags, lemmas and TIMEX annotations of a sentence's
     * tokens as JSON maps keyed by "word-index" tokens.
     */
    public String stringify(CoreMap words) {
        List<String> posList = new ArrayList<String>();
        List<String> posMap = new ArrayList<String>();
        List<String> lemmaMap = new ArrayList<String>();
        List<String> timexMap = new ArrayList<String>();
        Map<String, List<String>> timexIdMap = new HashMap<String, List<String>>();

        for (CoreLabel token : words.get(CoreAnnotations.TokensAnnotation.class)) {
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            String wordToken = esc(word) + "-" + token.index();

            posList.add(String.format("{\"token\":\"%s\",\"pos\":\"%s\"}", esc(word), esc(pos)));
            posMap.add(String.format("\"%s\":\"%s\"", wordToken, esc(pos)));
            lemmaMap.add(String.format("\"%s\":\"%s\"", wordToken, esc(lemma)));

            Timex t = token.get(TimeAnnotations.TimexAnnotation.class);
            if (t != null) {
                String tid = t.tid();
                if (timexIdMap.containsKey(tid)) {
                    // A later token of an already-seen TIMEX expression:
                    // just group it under the same id.
                    timexIdMap.get(tid).add("\"" + wordToken + "\"");
                    continue;
                }
                List<String> tokens = new ArrayList<String>();
                tokens.add("\"" + wordToken + "\"");
                timexIdMap.put(tid, tokens);

                // First token of a TIMEX expression: serialise its XML
                // attributes (type, value, etc.) as a JSON object.
                List<String> attributesList = new ArrayList<String>();
                Element xml = t.toXmlElement();
                NamedNodeMap attrs = xml.getAttributes();
                for (int i = 0; i < attrs.getLength(); i++) {
                    Node item = attrs.item(i);
                    String name = item.getNodeName();
                    String value = item.getNodeValue();
                    attributesList.add(String.format("\"%s\":\"%s\"", esc(name), esc(value)));
                }
                String json = String.format("\"%s\":{%s}", tid, String.join(",", attributesList));
                timexMap.add(json);
            }
        }

        String posListJSON = "[" + String.join(", ", posList) + "]";
        String posMapJSON = "{" + String.join(", ", posMap) + "}";
        String lemmaMapJSON = "{" + String.join(", ", lemmaMap) + "}";
        String timexMapJSON = "{" + String.join(", ", timexMap) + "}";

        List<String> timexGroups = new ArrayList<String>();
        for (String tid : timexIdMap.keySet()) {
            timexGroups.add(String.format("\"%s\":[%s]", tid, String.join(",", timexIdMap.get(tid))));
        }
        String timexIdMapJSON = "{" + String.join(",", timexGroups) + "}";

        return String.format("\"pos\":{\"map\":%s,\"list\":%s},\"lemma\":%s,\"timex\":%s,\"timexGroups\":%s",
                posMapJSON, posListJSON, lemmaMapJSON, timexMapJSON, timexIdMapJSON);
    }

    /**
     * Serialises an exception as a JSON error object.
     */
    public static String stringify(Exception e) {
        return String.format("{\"error\":\"%s\"}",
                e.getMessage() != null ? esc(e.getMessage()) : "no error message provided");
    }

    /**
     * Escapes backslashes, double quotes and newlines so a string can be
     * embedded in a JSON string literal. Backslashes must be escaped first so
     * the other escapes are not doubled.
     * @param str the string to escape
     * @return the escaped string
     */
    public static String esc(String str) {
        return str.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n");
    }

    public static List<String> array2list(String[] array) {
        List<String> list = new ArrayList<String>(array.length);
        Collections.addAll(list, array);
        return list;
    }
}
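To exercise the class, a minimal driver along the following lines can be added. This is a sketch, not part of the original program (which, per the license header, serves results over HTTP and stdio through code not shown in this listing): the ParserDemo class and its main method are hypothetical, and they assume the CoreNLP 3.5.2 jar and its models jar are on the classpath so the built-in englishPCFG model can be loaded.

package org.sam_agent.csparser;

/** Hypothetical demo driver; illustrates ContinuousParser usage only. */
public class ParserDemo {
    public static void main(String[] args) {
        // Loading the built-in model requires the CoreNLP models jar on the classpath.
        ContinuousParser parser = new ContinuousParser();

        String text = args.length > 0
                ? String.join(" ", args)
                : "Stanford parsed this sentence on Tuesday.";

        try {
            // parse() returns one JSON object of the form:
            // {"input":"...","sentences":[{"sentence":"...",
            //    "pos":{"map":{...},"list":[...]},"lemma":{...},
            //    "timex":{...},"timexGroups":{...},
            //    "dependencies":[{"rel":"...","arg0":"...","arg1":"..."},...],
            //    "roots":["word-1",...]}]}
            System.out.println(parser.parse(text));
        } catch (Exception e) {
            // Reuse the class's own JSON error format for any runtime failure.
            System.out.println(ContinuousParser.stringify(e));
        }
    }
}

Building the pipeline is the expensive step, which is why the constructor does it once and parse() can then be called repeatedly on new input; that is the "continuous" part of the design.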