package edu.stanford.nlp.pipeline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CyclicCoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.pipeline.Annotation.ParsePLAnnotation;
import edu.stanford.nlp.pipeline.Annotation.WordsPLAnnotation;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.util.CoreMap;
/**
 * This class will add parse information to an Annotation.
 *
 * <p>Two input layouts are supported, checked in this order:
 * <ol>
 * <li>If the Annotation contains tokenized words as a
 *     {@code List<List<CoreLabel>>} under
 *     {@code Annotation.WordsPLAnnotation.class}, one tree per word list is
 *     produced and the resulting {@code List<Tree>} is stored under
 *     {@code Annotation.ParsePLAnnotation.class}. On this path only the word
 *     text of each token is handed to the parser.</li>
 * <li>Otherwise, if the Annotation contains sentences under
 *     {@code CoreAnnotations.SentencesAnnotation}, each sentence's tokens
 *     ({@code CoreAnnotations.TokensAnnotation}) are passed to the parser
 *     unchanged, so any POS tags they carry are available to it. The tree is
 *     stored in the sentence's coremap under
 *     {@code CoreAnnotations.TreeAnnotation}, together with the basic,
 *     collapsed and CC-processed dependency graphs.</li>
 * </ol>
 * If neither key is present, {@link #annotate(Annotation)} throws a
 * {@code RuntimeException}.
 *
 * @author Jenny Finkel
 */
public class ParserAnnotator implements Annotator {

  /** Whether progress and debugging output is written to {@code System.err}. */
  private final boolean verbose;

  /** The parser used to build constituent trees. */
  private final LexicalizedParser parser;

  /**
   * Do not parse sentences larger than this sentence length; such sentences
   * get a flat fall-through tree instead. A non-positive value disables the
   * limit.
   */
  int maxSentenceLength;

  /** Creates a verbose annotator with no sentence length limit. */
  public ParserAnnotator() {
    this(true, -1);
  }

  /**
   * Creates an annotator whose model location is read from the
   * {@code parser.model} system property, falling back to
   * {@code LexicalizedParser.DEFAULT_PARSER_LOC}.
   *
   * @param verbose whether to write progress output to {@code System.err}
   * @param maxSent maximum sentence length to parse; non-positive means no limit
   */
  public ParserAnnotator(boolean verbose, int maxSent) {
    this(System.getProperty("parser.model", LexicalizedParser.DEFAULT_PARSER_LOC), verbose, maxSent);
  }

  /**
   * Creates an annotator that loads its parser model from the given location.
   *
   * @param parserLoc location of the parser model
   * @param verbose whether to write progress output to {@code System.err}
   * @param maxSent maximum sentence length to parse; non-positive means no limit
   */
  public ParserAnnotator(String parserLoc,
                         boolean verbose,
                         int maxSent) {
    this(loadModel(parserLoc, verbose), verbose, maxSent);
  }

  /**
   * Creates an annotator around an already constructed parser.
   *
   * @param parser the parser to use
   * @param verbose whether to write progress output to {@code System.err}
   * @param maxSent maximum sentence length to parse; non-positive means no limit
   */
  public ParserAnnotator(LexicalizedParser parser, boolean verbose, int maxSent) {
    this.verbose = verbose;
    this.parser = parser;
    this.maxSentenceLength = maxSent;
  }

  /**
   * Loads a parser model and sets the {@code -retainTmpSubcategories} option
   * flag on it.
   *
   * @param parserLoc location of the parser model
   * @param verbose whether to announce the load on {@code System.err}
   * @return the configured parser
   */
  private static LexicalizedParser loadModel(String parserLoc, boolean verbose) {
    if (verbose) {
      System.err.println("Loading Parser Model [" + parserLoc + "] ...");
    }
    LexicalizedParser result = new LexicalizedParser(parserLoc);
    result.setOptionFlags("-retainTmpSubcategories");
    return result;
  }

  /**
   * Adds parse information to the given annotation. See the class
   * documentation for the two supported input layouts.
   *
   * @param annotation the annotation to read tokens from and write trees to
   * @throws RuntimeException if the annotation contains neither
   *         {@code WordsPLAnnotation} nor {@code SentencesAnnotation}
   */
  public void annotate(Annotation annotation) {
    if (annotation.containsKey(WordsPLAnnotation.class)) {
      annotateWordLists(annotation);
    } else if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // parse a tree for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        annotateSentence(sentence);
      }
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  /**
   * Parses every word list stored under {@code WordsPLAnnotation} and stores
   * the resulting trees, in the same order, under {@code ParsePLAnnotation}.
   */
  private void annotateWordLists(Annotation annotation) {
    List<List<? extends CoreLabel>> sentences = annotation.get(WordsPLAnnotation.class);
    if (verbose) {
      System.err.println("Adding Parser annotation...");
      System.err.println(" for: " + sentences);
    }
    List<Tree> trees = new ArrayList<Tree>();
    for (List<? extends CoreLabel> words : sentences) {
      trees.add(doOneSentence(words));
    }
    annotation.set(ParsePLAnnotation.class, trees);
    if (verbose) {
      for (Tree tree : trees) {
        System.err.println("output:\n" + tree.pennString() + "\n");
      }
    }
  }

  /**
   * Parses the tokens of one sentence coremap and stores the constituent tree
   * and the dependency graphs derived from it in that coremap.
   */
  private void annotateSentence(CoreMap sentence) {
    List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (verbose) {
      System.err.println("Parsing: " + words);
    }
    // generate the constituent tree
    Tree tree;
    if (withinLengthLimit(words.size())) {
      tree = parser.apply(words);
    } else {
      tree = xTree(words);
    }
    // make sure all tree nodes are CoreLabels
    // TODO: why isn't this always true? something fishy is going on
    convertToCoreLabels(tree);
    // index nodes, i.e., add start and end token positions to all nodes
    // this is needed by other annotators down stream, e.g., the NFLAnnotator
    tree.indexSpans(0);
    sentence.set(CoreAnnotations.TreeAnnotation.class, tree);
    if (verbose) {
      System.err.println("Tree is:");
      tree.pennPrint(System.err);
    }
    addDependencies(sentence, tree);
  }

  /**
   * Derives the collapsed, basic and CC-processed dependency graphs from the
   * tree and stores them in the sentence. This is best effort: any exception
   * is reported on {@code System.err} and the sentence is left without
   * dependency annotations.
   */
  private void addDependencies(CoreMap sentence, Tree tree) {
    try {
      // Build all three graphs before storing any of them, so that a failure
      // never leaves the sentence with only some of the graphs set.
      SemanticGraph deps = generateCollapsedDependencies(tree);
      SemanticGraph uncollapsedDeps = generateUncollapsedDependencies(tree);
      SemanticGraph ccDeps = generateCCProcessedDependencies(tree);
      if (verbose) {
        System.err.println("SDs:");
        System.err.println(deps.toString("plain"));
      }
      sentence.set(CoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
      sentence.set(CoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
      sentence.set(CoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
    } catch (Exception e) {
      System.err.println("WARNING: Exception caught during extraction of Stanford dependencies. Will ignore and continue...");
      e.printStackTrace();
    }
  }

  /**
   * Tells whether a sentence of the given length should be handed to the
   * parser. Sentences of exactly {@code maxSentenceLength} tokens are still
   * parsed; only larger ones are skipped.
   *
   * @param numWords number of tokens in the sentence
   * @return {@code true} if no limit is set or the sentence is within it
   */
  private boolean withinLengthLimit(int numWords) {
    return maxSentenceLength <= 0 || numWords <= maxSentenceLength;
  }

  /**
   * Builds the dependency graph for the tree without collapsing or
   * CC-processing (lemmatize and threadSafe enabled, extras excluded).
   *
   * @param tree the constituent tree
   * @return the dependency graph
   */
  public static SemanticGraph generateUncollapsedDependencies(Tree tree) {
    return generateDependencies(tree, false, false, false, true, true);
  }

  /**
   * Builds the collapsed dependency graph for the tree (no CC-processing;
   * lemmatize and threadSafe enabled, extras excluded).
   *
   * @param tree the constituent tree
   * @return the dependency graph
   */
  public static SemanticGraph generateCollapsedDependencies(Tree tree) {
    return generateDependencies(tree, true, false, false, true, true);
  }

  /**
   * Builds the collapsed, CC-processed dependency graph for the tree
   * (lemmatize and threadSafe enabled, extras excluded).
   *
   * @param tree the constituent tree
   * @return the dependency graph
   */
  public static SemanticGraph generateCCProcessedDependencies(Tree tree) {
    return generateDependencies(tree, true, true, false, true, true);
  }

  /**
   * Builds a dependency graph by delegating to
   * {@code SemanticGraphFactory.makeFromTree} with the given flags, which
   * are passed through unchanged.
   *
   * @param tree the constituent tree
   * @param collapse passed through as the factory's collapse flag
   * @param ccProcess passed through as the factory's ccProcess flag
   * @param includeExtras passed through as the factory's includeExtras flag
   * @param lemmatize passed through as the factory's lemmatize flag
   * @param threadSafe passed through as the factory's threadSafe flag
   * @return the dependency graph
   */
  public static SemanticGraph generateDependencies(Tree tree,
                                                   boolean collapse,
                                                   boolean ccProcess,
                                                   boolean includeExtras,
                                                   boolean lemmatize,
                                                   boolean threadSafe) {
    return SemanticGraphFactory.makeFromTree(tree, collapse, ccProcess, includeExtras, lemmatize, threadSafe);
  }

  /**
   * Converts the tree labels to CoreLabels, in place and recursively.
   * We need this because we store additional info in the CoreLabel, like
   * token span. A label that is not already a CoreLabel is replaced by a new
   * CoreLabel carrying only the old label's value.
   *
   * @param tree the root of the (sub)tree to convert
   */
  public static void convertToCoreLabels(Tree tree) {
    Label l = tree.label();
    if (!(l instanceof CoreLabel)) {
      CoreLabel cl = new CoreLabel();
      cl.setValue(l.value());
      tree.setLabel(cl);
    }
    for (Tree kid : tree.children()) {
      convertToCoreLabels(kid);
    }
  }

  /**
   * Parses one tokenized sentence, or builds a flat fall-through tree if the
   * sentence exceeds the length limit.
   *
   * <p>Only the word text of each input token is copied into the labels given
   * to the parser; other annotations on the input tokens (for example POS
   * tags) are not carried over.
   *
   * @param words the tokens of the sentence
   * @return the parse tree, or the fall-through tree
   */
  private Tree doOneSentence(List<? extends CoreLabel> words) {
    // convert to CyclicCoreLabels because the parser hates CoreLabels
    List<CyclicCoreLabel> newWords = new ArrayList<CyclicCoreLabel>();
    for (CoreLabel fl : words) {
      CyclicCoreLabel ml = new CyclicCoreLabel();
      ml.setWord(fl.word());
      ml.setValue(fl.word());
      newWords.add(ml);
    }
    if (withinLengthLimit(newWords.size())) {
      return parser.apply(newWords);
    }
    return xTree(newWords);
  }

  /**
   * Construct a fall through tree in case we can't parse this sentence:
   * a root labelled "X" with one "X" child per word, each of which has the
   * word as its single leaf.
   *
   * @param words the tokens of the sentence
   * @return the flat tree
   */
  private Tree xTree(List<? extends HasWord> words) {
    TreeFactory lstf = new LabeledScoredTreeFactory();
    List<Tree> preterminals = new ArrayList<Tree>();
    for (HasWord obj : words) {
      Tree leaf = lstf.newLeaf(obj.word());
      preterminals.add(lstf.newTreeNode("X", Collections.singletonList(leaf)));
    }
    return lstf.newTreeNode("X", preterminals);
  }
}
|