ParserAnnotator.java :  » Natural-Language-Processing » Stanford-CoreNLP » edu » stanford » nlp » pipeline » Java Open Source

Java Open Source » Natural Language Processing » Stanford CoreNLP 
Stanford CoreNLP » edu » stanford » nlp » pipeline » ParserAnnotator.java
package edu.stanford.nlp.pipeline;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CyclicCoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.pipeline.Annotation.ParsePLAnnotation;
import edu.stanford.nlp.pipeline.Annotation.WordsPLAnnotation;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.util.CoreMap;


/**
 * This class will add parse information to an Annotation.
 * <p>
 * Two input layouts are supported, checked in this order:
 * <ol>
 * <li>If the Annotation contains the tokenized words as a
 * {@code List<List<CoreLabel>>} under
 * {@code Annotation.WordsPLAnnotation.class}, one tree per word list is
 * stored on the Annotation under
 * {@code Annotation.ParsePLAnnotation.class} as a {@code List<Tree>}.
 * On this path only the word text of each token is copied into the parser
 * input.</li>
 * <li>Otherwise, if the Annotation contains sentences under
 * {@code CoreAnnotations.SentencesAnnotation}, each sentence's tokens
 * ({@code CoreAnnotations.TokensAnnotation}) are handed to the parser as-is
 * and the resulting tree is stored in that sentence's CoreMap under
 * {@code CoreAnnotations.TreeAnnotation}, together with the collapsed,
 * basic and CC-processed dependency graphs.</li>
 * </ol>
 * If neither key is present, {@link #annotate(Annotation)} throws a
 * {@code RuntimeException}.
 *
 * @author Jenny Finkel
 */
public class ParserAnnotator implements Annotator {

  /** Whether progress messages and results are echoed to {@code System.err}. */
  private final boolean verbose;

  /** The parser applied to every sentence that is within the length limit. */
  private final LexicalizedParser parser;

  /**
   * Do not parse sentences larger than this sentence length; longer sentences
   * receive a flat fallback tree instead. A value of zero or less disables
   * the limit.
   */
  int maxSentenceLength;

  /** Creates a verbose annotator with no sentence length limit. */
  public ParserAnnotator() {
    this(true, -1);
  }

  /**
   * Creates an annotator whose model location is read from the
   * {@code parser.model} system property, falling back to
   * {@code LexicalizedParser.DEFAULT_PARSER_LOC}.
   *
   * @param verbose whether to print progress and results to {@code System.err}
   * @param maxSent maximum sentence length to parse; zero or less means no limit
   */
  public ParserAnnotator(boolean verbose, int maxSent) {
    this(System.getProperty("parser.model", LexicalizedParser.DEFAULT_PARSER_LOC), verbose, maxSent);
  }

  /**
   * Creates an annotator that loads its parser model from the given location.
   *
   * @param parserLoc location of the serialized parser model
   * @param verbose whether to print progress and results to {@code System.err}
   * @param maxSent maximum sentence length to parse; zero or less means no limit
   */
  public ParserAnnotator(String parserLoc,
                         boolean verbose,
                         int maxSent) {
    this(loadModel(parserLoc, verbose), verbose, maxSent);
  }

  /**
   * Creates an annotator around an already constructed parser.
   *
   * @param parser the parser to use
   * @param verbose whether to print progress and results to {@code System.err}
   * @param maxSent maximum sentence length to parse; zero or less means no limit
   */
  public ParserAnnotator(LexicalizedParser parser, boolean verbose, int maxSent) {
    this.verbose = verbose;
    this.parser = parser;
    maxSentenceLength = maxSent;
  }

  /**
   * Loads the parser model and sets the {@code -retainTmpSubcategories}
   * option flag on it.
   *
   * @param parserLoc location of the serialized parser model
   * @param verbose whether to announce the load on {@code System.err}
   * @return the configured parser
   */
  private static LexicalizedParser loadModel(String parserLoc, boolean verbose) {
    if (verbose) {
      System.err.println("Loading Parser Model [" + parserLoc + "] ...");
    }
    LexicalizedParser result = new LexicalizedParser(parserLoc);
    result.setOptionFlags("-retainTmpSubcategories");
    return result;
  }

  /**
   * Adds parse trees (and, on the sentence path, dependency graphs) to the
   * given annotation. See the class documentation for the two supported
   * input layouts.
   *
   * @param annotation the annotation to enrich in place
   * @throws RuntimeException if the annotation contains neither
   *         {@code WordsPLAnnotation} nor {@code SentencesAnnotation}
   */
  public void annotate(Annotation annotation) {
    if (annotation.containsKey(WordsPLAnnotation.class)) {
      annotateWordLists(annotation);
    } else if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // parse a tree for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        annotateSentence(sentence);
      }
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  /**
   * Parses every word list stored under {@code WordsPLAnnotation} and stores
   * the resulting trees, in the same order, under {@code ParsePLAnnotation}.
   */
  private void annotateWordLists(Annotation annotation) {
    List<List<? extends CoreLabel>> sentences = annotation.get(WordsPLAnnotation.class);
    if (verbose) {
      System.err.println("Adding Parser annotation...");
      System.err.println("  for: " + sentences);
    }

    List<Tree> trees = new ArrayList<Tree>(sentences.size());
    for (List<? extends CoreLabel> words : sentences) {
      trees.add(doOneSentence(words));
    }

    annotation.set(ParsePLAnnotation.class, trees);

    if (verbose) {
      for (Tree tree : trees) {
        System.err.println("output:\n" + tree.pennString() + "\n");
      }
    }
  }

  /**
   * Parses one sentence CoreMap: stores its constituent tree under
   * {@code TreeAnnotation} and then attaches the dependency graphs.
   */
  private void annotateSentence(CoreMap sentence) {
    List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (verbose) {
      System.err.println("Parsing: " + words);
    }

    // generate the constituent tree
    Tree tree;
    if (withinLengthLimit(words.size())) {
      tree = parser.apply(words);
    } else {
      tree = xTree(words);
    }

    // make sure all tree nodes are CoreLabels
    // TODO: why isn't this always true? something fishy is going on
    convertToCoreLabels(tree);

    // index nodes, i.e., add start and end token positions to all nodes
    // this is needed by other annotators down stream, e.g., the NFLAnnotator
    tree.indexSpans(0);

    sentence.set(CoreAnnotations.TreeAnnotation.class, tree);
    if (verbose) {
      System.err.println("Tree is:");
      tree.pennPrint(System.err);
    }

    attachDependencies(sentence, tree);
  }

  /**
   * Builds the collapsed, basic and CC-processed dependency graphs for the
   * tree and stores them on the sentence. This is deliberately best-effort:
   * if any graph cannot be built, a warning is printed, none of the three
   * graphs is stored, and processing continues with the tree annotation
   * left intact.
   */
  private void attachDependencies(CoreMap sentence, Tree tree) {
    try {
      SemanticGraph deps = generateCollapsedDependencies(tree);
      SemanticGraph uncollapsedDeps = generateUncollapsedDependencies(tree);
      SemanticGraph ccDeps = generateCCProcessedDependencies(tree);
      if (verbose) {
        System.err.println("SDs:");
        System.err.println(deps.toString("plain"));
      }
      sentence.set(CoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
      sentence.set(CoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
      sentence.set(CoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
    } catch (Exception e) {
      System.err.println("WARNING: Exception caught during extraction of Stanford dependencies. Will ignore and continue...");
      e.printStackTrace();
    }
  }

  /**
   * Decides whether a sentence of the given length should be parsed.
   * A sentence is skipped only when it is strictly larger than
   * {@link #maxSentenceLength}; a sentence of exactly that length is parsed.
   *
   * @param sentenceLength number of tokens in the sentence
   * @return {@code true} if no limit is set or the sentence fits within it
   */
  private boolean withinLengthLimit(int sentenceLength) {
    return maxSentenceLength <= 0 || sentenceLength <= maxSentenceLength;
  }

  /**
   * Builds the dependency graph with collapsing and CC processing both off.
   *
   * @param tree the constituent tree
   * @return the resulting graph
   */
  public static SemanticGraph generateUncollapsedDependencies(Tree tree) {
    return generateDependencies(tree, false, false, false, true, true);
  }

  /**
   * Builds the dependency graph with collapsing on and CC processing off.
   *
   * @param tree the constituent tree
   * @return the resulting graph
   */
  public static SemanticGraph generateCollapsedDependencies(Tree tree) {
    return generateDependencies(tree, true, false, false, true, true);
  }

  /**
   * Builds the dependency graph with collapsing and CC processing both on.
   *
   * @param tree the constituent tree
   * @return the resulting graph
   */
  public static SemanticGraph generateCCProcessedDependencies(Tree tree) {
    return generateDependencies(tree, true, true, false, true, true);
  }

  /**
   * Builds a dependency graph from a tree by delegating to
   * {@code SemanticGraphFactory.makeFromTree} with the given flags, which are
   * passed through unchanged and in the same order.
   *
   * @param tree the constituent tree
   * @param collapse forwarded to the factory
   * @param ccProcess forwarded to the factory
   * @param includeExtras forwarded to the factory
   * @param lemmatize forwarded to the factory
   * @param threadSafe forwarded to the factory
   * @return the graph produced by the factory
   */
  public static SemanticGraph generateDependencies(Tree tree,
      boolean collapse,
      boolean ccProcess,
      boolean includeExtras,
      boolean lemmatize,
      boolean threadSafe) {
    return SemanticGraphFactory.makeFromTree(tree, collapse, ccProcess, includeExtras, lemmatize, threadSafe);
  }

  /**
   * Converts the tree labels to CoreLabels, recursively over the whole tree.
   * We need this because we store additional info in the CoreLabel, like
   * token span. Labels that already are CoreLabels are left untouched; any
   * other label is replaced by a fresh CoreLabel carrying only its value.
   *
   * @param tree the root of the (sub)tree to convert in place
   */
  public static void convertToCoreLabels(Tree tree) {
    Label l = tree.label();
    if (!(l instanceof CoreLabel)) {
      CoreLabel cl = new CoreLabel();
      cl.setValue(l.value());
      tree.setLabel(cl);
    }

    for (Tree kid : tree.children()) {
      convertToCoreLabels(kid);
    }
  }

  /**
   * Parses one word list from the {@code WordsPLAnnotation} path.
   * Only the word text of each token is carried over to the parser input.
   *
   * @param words the tokens of one sentence
   * @return the parse tree, or the flat fallback tree if the sentence
   *         exceeds the length limit
   */
  private Tree doOneSentence(List<? extends CoreLabel> words) {
    // convert to CyclicCoreLabels because the parser hates CoreLabels
    List<CyclicCoreLabel> newWords = new ArrayList<CyclicCoreLabel>(words.size());
    for (CoreLabel fl : words) {
      CyclicCoreLabel ml = new CyclicCoreLabel();
      ml.setWord(fl.word());
      ml.setValue(fl.word());
      newWords.add(ml);
    }

    if (withinLengthLimit(newWords.size())) {
      return parser.apply(newWords);
    }
    return xTree(newWords);
  }

  /**
   * Construct a fall through tree in case we can't parse this sentence:
   * a root labeled "X" with one "X" child per word, each holding that word
   * as its single leaf.
   *
   * @param words the words of the sentence
   * @return the flat fallback tree
   */
  private Tree xTree(List<? extends HasWord> words) {
    TreeFactory lstf = new LabeledScoredTreeFactory();
    List<Tree> lst2 = new ArrayList<Tree>(words.size());
    for (HasWord obj : words) {
      Tree t = lstf.newLeaf(obj.word());
      Tree t2 = lstf.newTreeNode("X", Collections.singletonList(t));
      lst2.add(t2);
    }
    return lstf.newTreeNode("X", lst2);
  }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.