// StanfordCoreNLP.java :  » Natural-Language-Processing » Stanford-CoreNLP » edu » stanford » nlp » pipeline » Java Open Source
//
// Java Open Source » Natural Language Processing » Stanford CoreNLP
// Stanford CoreNLP » edu » stanford » nlp » pipeline » StanfordCoreNLP.java
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.pipeline;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import nu.xom.Attribute;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.ProcessingInstruction;
import nu.xom.Serializer;

import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TrueCaseAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TrueCaseTextAnnotation;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Factory;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;

/**
 * This is a pipeline that takes in a string and returns various analyzed
 * linguistic forms.
 * The String is tokenized via a tokenizer (such as PTBTokenizerAnnotator), and
 * then other sequence model style annotation can be used to add things like
 * lemmas, POS tags, and named entities.  These are returned as a list of CoreLabels.
 * Other analysis components build and store parse trees, dependency graphs, etc.
 * <p>
 * This class is designed to apply multiple Annotators
 * to an Annotation.  The idea is that you first
 * build up the pipeline by adding Annotators, and then
 * you take the objects you wish to annotate and pass
 * them in and get in return a fully annotated object.
 * Please see the package level javadocs for sample usage
 * and a more complete description.
 * <p>
 * The main entry point for the API is StanfordCoreNLP.process()
 * <p>
 * <i>Implementation note:</i> There are other annotation pipelines, but they
 * don't extend this one. Look for classes that implement Annotator and which
 * have "Pipeline" in their name.
 *
 * @author Jenny Finkel
 * @author Anna Rafferty
 * @author Christopher Manning
 * @author Mihai Surdeanu
 * @author Steven Bethard
 */

public class StanfordCoreNLP extends AnnotationPipeline {

  /*
   * List of all known annotator property names.
   * These are the values accepted in the comma/whitespace separated
   * "annotators" property that construct() reads.
   * Add new annotators and/or annotators from other groups here!
   */
  public static final String STANFORD_TOKENIZE = "tokenize";
  public static final String STANFORD_CLEAN_XML = "cleanxml";
  public static final String STANFORD_SSPLIT = "ssplit";
  public static final String STANFORD_POS = "pos";
  public static final String STANFORD_LEMMA = "lemma";
  public static final String STANFORD_NER = "ner";
  public static final String STANFORD_REGEXNER = "regexner";
  public static final String STANFORD_GENDER = "gender";
  public static final String STANFORD_TRUECASE = "truecase";
  public static final String STANFORD_PARSE = "parse";
  public static final String STANFORD_DETERMINISTIC_COREF = "dcoref";

  /** Formats the constituent parse trees for display (created with the "penn" format in construct()) */
  private TreePrint constituentTreePrinter;
  /** Formats the dependency parse trees for human-readable display (created with the "typedDependenciesCollapsed" format) */
  private TreePrint dependencyTreePrinter;
  /** Converts the constituent tree to a set of dependencies (for display) */
  private GrammaticalStructureFactory gsf;

  /** Stores the overall number of words processed; incremented by annotate() with the token count of each annotation */
  private int numWords;

  /** Maintains the shared pool of annotators; static, created on first use by getDefaultAnnotatorPool() */
  private static AnnotatorPool pool = null;

  /** The properties this pipeline was constructed with; assigned in construct() and exposed by getProperties() */
  private Properties properties;

  /**
   * Prints the list of properties that can be defined to run the pipeline,
   * followed by the command line specific properties.
   * <p>
   * The list of supported annotators printed here mirrors the annotator names
   * this class actually constructs (see the STANFORD_* constants); keep the
   * two in sync when adding an annotator.
   *
   * @param os PrintStream to print usage to
   */
  private static void printRequiredProperties(PrintStream os) {
    os.println("The following properties can be defined:");
    os.println("(if -props or -annotators is not passed, default properties will be loaded via the classpath)");
    os.println("\t\"annotators\" - comma separated list of annotators");
    // The previous text advertised "coref" (never constructed by this class) and
    // omitted "regexner" and "gender" (both constructed by this class).
    os.println("\t\tThe following annotators are supported: tokenize, cleanxml, ssplit, pos, lemma, ner, regexner, gender, truecase, parse, dcoref");

    os.println("\n\tIf annotator \"pos\" is defined:");
    os.println("\t\"pos.model\" - path towards the POS tagger model");

    os.println("\n\tIf annotator \"ner\" is defined:");
    os.println("\t\"ner.model.3class\" - path towards the three-class NER model");
    os.println("\t\"ner.model.7class\" - path towards the seven-class NER model");
    os.println("\t\"ner.model.MISCclass\" - path towards the NER model with a MISC class");

    os.println("\n\tIf annotator \"truecase\" is defined:");
    os.println("\t\"truecase.model\" - path towards the true-casing model; default: " + DefaultPaths.DEFAULT_TRUECASE_MODEL);
    os.println("\t\"truecase.bias\" - class bias of the true case model; default: " + TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
    os.println("\t\"truecase.mixedcasefile\" - path towards the mixed case file; default: " + DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);

    os.println("\n\tIf annotator \"parse\" is defined:");
    os.println("\t\"parser.model\" - path towards the PCFG parser model");

    /* XXX: unstable, do not use for now
    os.println("\n\tIf annotator \"srl\" is defined:");
    os.println("\t\"srl.verb.args\" - path to the file listing verbs and their core arguments (\"verbs.core_args\")");
    os.println("\t\"srl.model.id\" - path prefix for the role identification model (adds \".model.gz\" and \".fe\" to this prefix)");
    os.println("\t\"srl.model.cls\" - path prefix for the role classification model (adds \".model.gz\" and \".fe\" to this prefix)");
    os.println("\t\"srl.model.jic\" - path to the directory containing the joint model's \"model.gz\", \"fe\" and \"je\" files");
    os.println("\t                  (if not specified, the joint model will not be used)");
    */

    os.println("\nCommand line properties:");
    os.println("\t\"file\" - run the pipeline on the content of this file, or on the content of the files in this directory");
    os.println("\t         XML output is generated for every input file \"file\" as file.xml");
    os.println("\t\"extension\" - if -file used with a directory, process only the files with this extension");
    os.println("\t\"filelist\" - run the pipeline on the list of files given in this file");
    os.println("\t             XML output is generated for every input file as file.outputExtension");
    os.println("\t\"outputDirectory\" - where to put XML output (defaults to the current directory)");
    os.println("\t\"outputExtension\" - extension to use for the output file (defaults to \".xml\").  Don't forget the dot!");
    os.println("\t\"replaceExtension\" - flag to chop off the last extension before adding outputExtension to file");
    os.println("\t\"noClobber\" - don't automatically override (clobber) output files that already exist");
    os.println("\nIf none of the above are present, run the pipeline in an interactive shell (default properties will be loaded from the classpath).");
    os.println("The shell accepts input from stdin and displays the output at stdout.");
    os.println();
  }

  /**
   * Fetches a mandatory property.
   * If the property is absent, the problem and the usage text are written to
   * System.err and a RuntimeException is thrown.
   *
   * @param props the properties to look in
   * @param name the property key
   * @return the non-null value stored under {@code name}
   * @throws RuntimeException if {@code props} has no value for {@code name}
   */
  private static String getProperty(Properties props, String name) {
    String value = props.getProperty(name);
    if (value != null) {
      return value;
    }
    System.err.println("Missing property \"" + name + "\"!");
    printRequiredProperties(System.err);
    throw new RuntimeException("Missing property: \"" + name + '\"');
  }

  /**
   * Fetches an optional property.
   *
   * @param props the properties to look in
   * @param name the property key
   * @param defaultValue value returned when {@code name} is not defined
   * @return the stored value, or {@code defaultValue} when there is none
   */
  private static String getProperty(Properties props, String name, String defaultValue) {
    String value = props.getProperty(name, defaultValue);
    return value;
  }

  /**
   * Finds the default properties file in the classpath and loads it.
   * Two resource names are tried in order: "StanfordCoreNLP" and
   * "edu.stanford.nlp.pipeline.StanfordCoreNLP"; the first one that loads wins.
   *
   * @return the properties read from the first resource that could be loaded
   * @throws RuntimeException if neither resource could be loaded
   */
  private static Properties loadPropertiesFromClasspath() {
    String[] candidates = { "StanfordCoreNLP", "edu.stanford.nlp.pipeline.StanfordCoreNLP" };
    for (int i = 0; i < candidates.length; i++) {
      Properties found = loadProperties(candidates[i]);
      if (found != null) {
        return found;
      }
    }
    throw new RuntimeException("ERROR: Could not find properties file in the classpath!");
  }

  /**
   * Loads a properties resource through the current thread's context class loader.
   *
   * @param name resource name, with or without the ".properties" suffix
   * @return the loaded properties, or null when the lookup or the read fails
   */
  private static Properties loadProperties(String name) {
    ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
    return loadProperties(name, contextLoader);
  }

  /** File name suffix stripped from, and then re-appended to, resource names by loadProperties */
  private static final String PROPS_SUFFIX = ".properties";

  /**
   * Loads a properties resource from the classpath.
   * <p>
   * The name is normalized before the lookup: a trailing ".properties" is
   * removed, every '.' is turned into '/', and ".properties" is appended
   * again, so both "a.b.Name" and "a.b.Name.properties" resolve to
   * "a/b/Name.properties".
   *
   * @param name resource name, with or without the ".properties" suffix
   * @param loader class loader used for the lookup; when null (a thread's
   *        context class loader may be null) the system class loader is used
   * @return the loaded properties, or null if the resource does not exist or
   *         cannot be read
   */
  private static Properties loadProperties(String name, ClassLoader loader){
    // Thread.getContextClassLoader() is allowed to return null; fall back
    // instead of failing with a NullPointerException below.
    if (loader == null) {
      loader = ClassLoader.getSystemClassLoader();
    }

    if (name.endsWith(PROPS_SUFFIX)) {
      name = name.substring(0, name.length() - PROPS_SUFFIX.length());
    }
    name = name.replace('.', '/') + PROPS_SUFFIX;

    // Returns null on lookup failures
    System.err.println("Searching for resource: " + name);
    InputStream in = loader.getResourceAsStream(name);
    if (in == null) {
      return null;
    }

    try {
      Properties result = new Properties();
      result.load(in); // Can throw IOException
      return result;
    } catch (IOException e) {
      // Callers treat null as "not found"; say why so the failure is not silent.
      System.err.println("WARNING: could not read properties resource \"" + name + "\": " + e);
      return null;
    } finally {
      try {
        in.close();
      } catch (IOException ignored) {
        // best effort: the stream has already been consumed (or has failed)
      }
    }
  }

  /**
   * Shared initialization used by all constructors: sets up the tree
   * printers, resolves the properties, and adds the requested annotators to
   * this pipeline in a fixed order.
   *
   * @param pool source of annotators; when null the default static pool is used
   * @param props pipeline properties; when null they are loaded from the
   *        classpath, and when they lack "annotators" they are layered on top
   *        of the classpath properties
   * @param enforceRequirements when true, every requested annotator must have
   *        all of its prerequisite annotators requested as well
   * @throws RuntimeException if no properties can be found, if "annotators"
   *         is missing, or if a prerequisite annotator is not requested
   */
  private void construct(AnnotatorPool pool, Properties props, boolean enforceRequirements) {
    this.numWords = 0;
    this.constituentTreePrinter = new TreePrint("penn");
    this.dependencyTreePrinter = new TreePrint("typedDependenciesCollapsed");
    this.gsf = new PennTreebankLanguagePack().grammaticalStructureFactory();

    if(props == null){
      // if undefined, find the props file in the classpath
      props = loadPropertiesFromClasspath();
    } else if (props.getProperty("annotators") == null) {
      // this happens when some command line options are specified (e.g just "-filelist") but no properties file is.
      // we use the options that are given and let them override the default properties from the class path properties.
      Properties fromClassPath = loadPropertiesFromClasspath();
      fromClassPath.putAll(props);
      props = fromClassPath;
    }
    this.properties = props;
    if(pool == null) {
      // if undefined, use the default pool
      pool = getDefaultAnnotatorPool(props);
    }

    // define requirements
    Map<String, List<String>> requires = new HashMap<String, List<String>>();
    if(enforceRequirements){
      requires.put(STANFORD_TOKENIZE, Arrays.<String>asList());
      requires.put(STANFORD_CLEAN_XML, Arrays.asList(STANFORD_TOKENIZE));
      requires.put(STANFORD_SSPLIT, Arrays.asList(STANFORD_TOKENIZE));
      requires.put(STANFORD_POS, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT));
      requires.put(STANFORD_LEMMA, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS));
      requires.put(STANFORD_NER, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA));
      requires.put(STANFORD_REGEXNER, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT));
      requires.put(STANFORD_GENDER, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT));
      requires.put(STANFORD_TRUECASE, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA));
      requires.put(STANFORD_PARSE, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS));
      requires.put(STANFORD_DETERMINISTIC_COREF, Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_NER, STANFORD_PARSE));
    }

    // every annotator this class can build, in the order they are added to the pipeline
    List<String> knownAnnotators = Arrays.asList(STANFORD_TOKENIZE, STANFORD_CLEAN_XML, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_REGEXNER, STANFORD_GENDER, STANFORD_TRUECASE, STANFORD_PARSE, STANFORD_DETERMINISTIC_COREF);

    // the annotators the caller asked for
    HashSet<String> annoNames = new HashSet<String>(Arrays.asList(getProperty(props, "annotators").split("[, \t]+")));

    // Only names in knownAnnotators are ever added below, so any other name
    // would be dropped without a trace; tell the user instead.
    // (split() yields an empty first element when the value starts with a separator.)
    for (String requested: annoNames) {
      if (requested.length() > 0 && ! knownAnnotators.contains(requested)) {
        System.err.println("WARNING: ignoring unknown annotator \"" + requested + "\"");
      }
    }

    // now construct the annotators from the given props in a reasonable order
    for (String name: knownAnnotators) {
      if (! annoNames.contains(name)) {
        continue;
      }

      // check for any required annotators
      if(requires.containsKey(name)){
        for (String required: requires.get(name)) {
          if (! annoNames.contains(required)) {
            String fmt = "annotator \"%s\" requires annotator \"%s\"";
            throw new RuntimeException(String.format(fmt, name, required));
          }
        }
      }

      // create the annotator and add it to the list
      Annotator an = pool.get(name);
      this.addAnnotator(an);
    }
  }

  /**
   * Constructs a pipeline using as properties the properties file found in the classpath.
   * Delegates with null Properties, which makes construct() load them from the classpath;
   * annotator requirements are enforced.
   */
  public StanfordCoreNLP() {
    this((Properties) null);
  }

  /**
   * Construct a basic pipeline. The Properties will be used to determine
   * which annotators to create, and a default AnnotatorPool will be used
   * to create the annotators. Annotator requirements are enforced.
   *
   * @param props pipeline properties; may be null, in which case they are loaded from the classpath
   */
  public StanfordCoreNLP(Properties props)  {
    this(props, true);
  }

  /**
   * Construct a basic pipeline that takes its annotators from the given pool.
   * Annotator requirements are enforced.
   *
   * @param pool pool used to create the annotators; when null the default pool is used
   * @param props pipeline properties; may be null, in which case they are loaded from the classpath
   */
  public StanfordCoreNLP(AnnotatorPool pool, Properties props) {
    this(pool, props, true);
  }

  /**
   * Construct a basic pipeline using the default AnnotatorPool.
   *
   * @param props pipeline properties; may be null, in which case they are loaded from the classpath
   * @param enforceRequirements whether each requested annotator must have its prerequisite annotators requested too
   */
  public StanfordCoreNLP(Properties props, boolean enforceRequirements)  {
    this(null, props, enforceRequirements);
  }

  /**
   * Construct a basic pipeline. The Properties will be used to determine
   * which annotators to create, and the AnnotatorPool will be used to create
   * the specified annotators.
   *
   * @param pool pool used to create the annotators; when null the default pool is used
   * @param props pipeline properties; may be null, in which case they are loaded from the classpath
   * @param enforceRequirements whether each requested annotator must have its prerequisite annotators requested too
   */
  public StanfordCoreNLP(AnnotatorPool pool, Properties props, boolean enforceRequirements) {
    construct(pool, props, enforceRequirements);
  }

  /**
   * Constructs a pipeline with the properties read from this file, which must be found in the classpath.
   * Annotator requirements are enforced.
   *
   * @param propsFileNamePrefix name of the properties resource, with or without the ".properties" suffix
   */
  public StanfordCoreNLP(String propsFileNamePrefix) {
    this(propsFileNamePrefix, true);
  }

  public StanfordCoreNLP(String propsFileNamePrefix, boolean enforceRequirements) {
    Properties props = loadProperties(propsFileNamePrefix);
    if(props == null){
      throw new RuntimeException("ERROR: cannot find properties file \"" + propsFileNamePrefix + "\" in the classpath!");
    }
    construct(null, props, enforceRequirements);
  }

  /**
   * Fetches the Properties object used to construct this Annotator.
   * The object itself is returned, not a copy.
   *
   * @return the properties this pipeline was built from
   */
  public Properties getProperties() {
    return this.properties;
  }

  /**
   * Returns the shared, static annotator pool, creating it on the first call.
   * <p>
   * On creation one factory per known annotator name is registered. Each
   * factory is an anonymous class that reads its settings from the
   * {@code props} passed to that first call.
   * <p>
   * NOTE(review): because an existing pool is returned as-is, the
   * {@code props} of any later call are ignored by this method; pipelines
   * built afterwards with different properties still get factories bound to
   * the first properties object. When the factories are actually invoked
   * depends on AnnotatorPool, which is not visible here - confirm.
   *
   * @param props properties the registered factories read their settings from
   *        (only used when the pool does not exist yet)
   * @return the shared annotator pool
   */
  private static synchronized AnnotatorPool getDefaultAnnotatorPool(final Properties props) {
    // if the pool already exists reuse!
    if(pool != null) return pool;

    pool = new AnnotatorPool();

    //
    // tokenizer: breaks text into a sequence of tokens
    // this is required for all following annotators!
    //
    pool.register(STANFORD_TOKENIZE, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        return new PTBTokenizerAnnotator(false);
      }
    });

    //
    // XML cleaner: configured through clean.xmltags, clean.sentenceendingtags
    // and clean.allowflawedxml, each falling back to a CleanXmlAnnotator default
    //
    pool.register(STANFORD_CLEAN_XML, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        String xmlTags = 
          props.getProperty("clean.xmltags", 
                            CleanXmlAnnotator.DEFAULT_XML_TAGS);
        String sentenceEndingTags = 
          props.getProperty("clean.sentenceendingtags",
                            CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS);
        String allowFlawedString = props.getProperty("clean.allowflawedxml");
        boolean allowFlawed = CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS;
        if (allowFlawedString != null)
          allowFlawed = Boolean.valueOf(allowFlawedString);
        return new CleanXmlAnnotator(xmlTags, sentenceEndingTags, allowFlawed);
      }
    });

    //
    // sentence splitter: splits the above sequence of tokens into sentences
    // this is required when processing entire documents or text consisting of multiple sentences
    //
    pool.register(STANFORD_SSPLIT, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        WordsToSentencesAnnotator wts = new WordsToSentencesAnnotator(false);

        // regular boundaries (comma separated list; entries are not trimmed)
        String bounds = props.getProperty("ssplit.boundariesToDiscard");
        if(bounds != null){
          String [] toks = bounds.split(",");
          // for(int i = 0; i < toks.length; i ++) System.err.println("BOUNDARY: " + toks[i]);
          wts.setSentenceBoundaryToDiscard(new HashSet<String>(Arrays.asList(toks)));
        }

        // HTML boundaries (comma separated list; entries are not trimmed)
        bounds = props.getProperty("ssplit.htmlBoundariesToDiscard");
        if(bounds != null){
          String [] toks = bounds.split(",");
          wts.addHtmlSentenceBoundaryToDiscard(new HashSet<String>(Arrays.asList(toks)));
        }

        return wts;
      }
    });

    //
    // POS tagger
    //
    pool.register(STANFORD_POS, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        try {
          // pos.maxlen is optional; without it the length limit is Integer.MAX_VALUE
          String maxLenStr = props.getProperty("pos.maxlen");
          int maxLen = Integer.MAX_VALUE;
          if(maxLenStr != null) maxLen = Integer.parseInt(maxLenStr);
          return new POSTaggerAnnotator(getProperty(props, "pos.model", DefaultPaths.DEFAULT_POS_MODEL), true, maxLen);
        } catch (Exception e) {
          // any failure (including a malformed pos.maxlen) surfaces as an unchecked exception
          throw new RuntimeException(e);
        }
      }
    });

    //
    // Lemmatizer
    //
    pool.register(STANFORD_LEMMA, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        return new MorphaAnnotator(false);
      }
    });

    //
    // NER
    //
    pool.register(STANFORD_NER, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        // collect the model paths: each pair is (property name, default path);
        // "ner.model" has no default, so it only contributes when set explicitly
        List<String> models = new ArrayList<String>();
        List<Pair<String, String>> modelNames = new ArrayList<Pair<String,String>>();
        modelNames.add(new Pair<String, String>("ner.model", null));
        modelNames.add(new Pair<String, String>("ner.model.3class", DefaultPaths.DEFAULT_NER_THREECLASS_MODEL));
        modelNames.add(new Pair<String, String>("ner.model.7class", DefaultPaths.DEFAULT_NER_MUC_MODEL));
        modelNames.add(new Pair<String, String>("ner.model.MISCclass", DefaultPaths.DEFAULT_NER_CONLL_MODEL));

        for (Pair<String, String> name: modelNames) {
          String model = props.getProperty(name.first, name.second);
          // an empty value skips that model
          if (model != null && model.length() > 0) {
            models.add(model);
          }
        }
        if (models.isEmpty()) {
          throw new RuntimeException("no NER models specified");
        }
        NERClassifierCombiner nerCombiner;
        try {
          nerCombiner = new NERClassifierCombiner(models.toArray(new String[models.size()]));
          // numeric classifiers are applied unless ner.applyNumericClassifiers says otherwise
          boolean applyNumericClassifiers = Boolean.parseBoolean(
                  props.getProperty("ner.applyNumericClassifiers", "true"));
          nerCombiner.setApplyNumericClassifiers(applyNumericClassifiers);
        } catch (FileNotFoundException e) {
          throw new RuntimeException(e);
        }
        // ms 2009, no longer needed: the functionality of all these annotators is now included in NERClassifierCombiner
        /*
        AnnotationPipeline pipeline = new AnnotationPipeline();
        pipeline.addAnnotator(new NERCombinerAnnotator(nerCombiner, false));
        pipeline.addAnnotator(new NumberAnnotator(false));
        pipeline.addAnnotator(new TimeWordAnnotator(false));
        pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));
        return pipeline;
        */
        return new NERCombinerAnnotator(nerCombiner, false);
      }
    });

    //
    // Regex NER
    //
    pool.register(STANFORD_REGEXNER, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        String mapping = props.getProperty("regexner.mapping", DefaultPaths.DEFAULT_REGEXNER_RULES);
        String ignoreCase = props.getProperty("regexner.ignorecase", "false");
        return new RegexNERAnnotator(mapping, Boolean.valueOf(ignoreCase));
      }
    });

    //
    // Gender Annotator
    //
    pool.register(STANFORD_GENDER, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        return new GenderAnnotator(false, props.getProperty("gender.firstnames", DefaultPaths.DEFAULT_GENDER_FIRST_NAMES));
      }
    });


    //
    // True caser
    //
    pool.register(STANFORD_TRUECASE, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        String model = props.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL);
        String bias = props.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
        String mixed = props.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
        return new TrueCaseAnnotator(model, bias, mixed, false);
      }
    });

    //
    // Parser
    //
    pool.register(STANFORD_PARSE, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        String maxLenStr = props.getProperty("parser.maxlen");

        // parser.maxlen is optional; -1 is passed on when it is not set.
        // Unlike the POS factory, a malformed value propagates as NumberFormatException here.
        int maxLen = -1;
        if(maxLenStr != null) maxLen = Integer.parseInt(maxLenStr);
        ParserAnnotator anno = new ParserAnnotator(getProperty(props, "parser.model", DefaultPaths.DEFAULT_PARSER_MODEL), PropertiesUtils.hasProperty(props, "parser.debug"), maxLen);

        // no longer supported. we now generate both collapsed and uncollapsed dependencies
        /*
        String val = props.getProperty("parser.collapse.deps");
        if(val != null) anno.setCollapse(Boolean.parseBoolean(val));
        val = props.getProperty("parser.ccprocess.deps");
        if(val != null) anno.setCcProcess(Boolean.parseBoolean(val));
        val = props.getProperty("parser.extras.deps");
        if(val != null) anno.setIncludeExtras(Boolean.parseBoolean(val));
        val = props.getProperty("parser.lemmatize.deps");
        if(val != null) anno.setLemmatize(Boolean.parseBoolean(val));
        */

        return anno;
      }
    });

    //
    // Coreference resolution
    //
    pool.register(STANFORD_DETERMINISTIC_COREF, new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        return new DeterministicCorefAnnotator(props);
      }
    });

    //
    // add more annotators here!
    //
    return pool;
  }

  /**
   * Fetches an annotator from the shared static pool.
   * Failures are reported on System.err and signalled by a null result
   * rather than by an exception.
   *
   * @param name annotator name to look up in the pool
   * @return the annotator, or null if the pool has not been created yet or
   *         the lookup throws IllegalArgumentException
   */
  public static synchronized Annotator getExistingAnnotator(String name) {
    if (pool != null) {
      try {
        return pool.get(name);
      } catch (IllegalArgumentException e) {
        System.err.println("ERROR: attempted to fetch annotator \"" + name + "\" but the annotator pool does not store any such type!");
        return null;
      }
    }

    System.err.println("ERROR: attempted to fetch annotator \"" + name + "\" before the annotator pool was created!");
    return null;
  }

  /**
   * Runs the pipeline's annotators on the given annotation and then adds the
   * number of tokens found in it to the running word count.
   *
   * @param annotation the annotation to process
   * @see edu.stanford.nlp.pipeline.AnnotationPipeline#annotate(edu.stanford.nlp.pipeline.Annotation)
   */
  public void annotate(Annotation annotation) {
    super.annotate(annotation);

    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens == null) {
      // nothing tokenized, nothing to count
      return;
    }
    numWords += tokens.size();
  }

  /** Return a String that gives detailed human-readable information about
   *  how much time was spent by each annotator and by the entire annotation
   *  pipeline.
   *  <p>
   *  When timing is enabled, the token count and, if any time has been
   *  recorded, the throughput in tokens per second are appended to the
   *  superclass report.
   *
   *  @return Human readable information on time spent in processing.
   */
  @Override
  public String timingInformation() {
    StringBuilder sb = new StringBuilder(super.timingInformation());
    if (TIME && numWords >= 0) {
      // the total is divided by 1000 to obtain seconds, i.e. it is treated as milliseconds
      long total = this.getTotalTime();
      sb.append(" for ").append(this.numWords).append(" tokens");
      if (total > 0) {
        double seconds = ((double) total) / 1000;
        sb.append(" at ");
        sb.append(String.format("%f", numWords / seconds));
        sb.append(" tokens/sec.");
      } else {
        // no measurable time yet: a rate would come out as Infinity or NaN, so omit it
        sb.append('.');
      }
    }
    return sb.toString();
  }

  /**
   * Wraps the given text in a new Annotation and runs the entire pipeline on it.
   *
   * @param text The text to process
   * @return An Annotation object containing the output of all annotators
   */
  public Annotation process(String text) {
    Annotation result = new Annotation(text);
    this.annotate(result);
    return result;
  }

  /**
   * Displays the output of all annotators in a format easily readable by people.
   * <p>
   * Printed, when present: the document id with sentence/token counts, then
   * for every sentence its text, token-level annotations, constituent and
   * dependency trees and MachineReading entity/relation mentions, and finally
   * the coreference links. Everything is written to {@code os}, which is
   * flushed before returning.
   * <p>
   * The optional property "printable.relation.beam" (default 0.0) is passed
   * to each relation mention to decide whether it is printed.
   *
   * @param annotation Contains the output of all annotators
   * @param os The output stream
   * @throws NumberFormatException if "printable.relation.beam" is set but is not a number
   */
  public void prettyPrint(Annotation annotation, PrintWriter os) {
    String beamAsString = properties.getProperty("printable.relation.beam");
    double beam = 0.0;
    if (beamAsString != null) {
      beam = Double.parseDouble(beamAsString);
    }

    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);

    // Display docid if available
    String docId =  annotation.get(CoreAnnotations.DocIDAnnotation.class);
    if (docId != null) {
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      int nSentences = (sentences != null)? sentences.size():0;
      int nTokens = (tokens != null)? tokens.size():0;
      os.printf("Document: ID=%s (%d sentences, %d tokens)\n", docId, nSentences, nTokens);
    }

    // the token-level annotations to display; identical for every sentence, so built once
    String[] tokenAnnotations = new String[]{
            "Text", "PartOfSpeech", "Lemma", "Answer", "NamedEntityTag", "CharacterOffsetBegin", "CharacterOffsetEnd", "NormalizedNamedEntityTag", "TrueCase", "TrueCaseText" };

    // display each sentence in this annotation
    if (sentences != null) {
      for(int i = 0, sz = sentences.size(); i < sz; i ++) {
        CoreMap sentence = sentences.get(i);
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        os.printf("Sentence #%d (%d tokens):\n", (i + 1), tokens.size());

        String text = sentence.get(CoreAnnotations.TextAnnotation.class);
        os.println(text);

        // display the token-level annotations
        for (CoreLabel token: tokens) {
          os.print(token.toShorterString(tokenAnnotations) + " ");
        }
        os.println();

        // display the parse tree for this sentence
        Tree tree = sentence.get(CoreAnnotations.TreeAnnotation.class);
        if (tree != null){
          constituentTreePrinter.printTree(tree, os);
          dependencyTreePrinter.printTree(tree, os);
        }

        // display MachineReading entities and relations
        // (written to os like the rest of the report; they used to go to System.err,
        // so they were missing from the requested output)
        List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
        if(entities != null){
          os.println("Extracted the following MachineReading entity mentions:");
          for(EntityMention e: entities){
            os.println("\t" + e);
          }
        }
        List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
        if(relations != null){
          os.println("Extracted the following MachineReading relation mentions:");
          for(RelationMention r: relations){
            if(r.printableObject(beam)){
              os.println(r);
            }
          }
        }
      }
    }

    // display the old-style doc-level coref annotations
    // this is not supported anymore!
    //String corefAnno = annotation.get(CorefPLAnnotation.class);
    //if(corefAnno != null) os.println(corefAnno);

    // display the new-style coreference graph
    List<Pair<IntTuple, IntTuple>> graph = annotation.get(CoreAnnotations.CorefGraphAnnotation.class);
    if(graph != null && sentences != null) {
      // token lists per sentence, so that links can be resolved to words
      List<List<CoreLabel>> sents = new ArrayList<List<CoreLabel>>();
      for (CoreMap sentence : sentences) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        sents.add(tokens);
      }

      os.println("Coreference links:");
      for(Pair<IntTuple, IntTuple> link: graph){
        IntTuple src = link.first;
        IntTuple dst = link.second;
        // all offsets start at 1!
        os.println("\t" + src + " -> " + dst + ", that is: \"" +
            sents.get(src.get(0) - 1).get(src.get(1) - 1).get(TextAnnotation.class) + "\" -> \"" +
            sents.get(dst.get(0) - 1).get(dst.get(1) - 1).get(TextAnnotation.class) + "\"");
      }
    }

    os.flush();
  }

  /**
   * Wrapper around xmlPrint(Annotation, OutputStream).
   * Added for backward compatibility.
   * The XML is rendered into an in-memory buffer as UTF-8 and then printed
   * to the writer as a single string; the writer is not flushed here.
   *
   * @param annotation Contains the output of all annotators
   * @param pw The writer the XML text is printed to
   * @throws IOException if serializing the XML document fails
   */
  public void xmlPrint(Annotation annotation, PrintWriter pw) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    xmlPrint(annotation, buffer);
    String xml = buffer.toString("UTF-8");
    pw.print(xml);
  }

  /**
   * Displays the output of all annotators in XML format.
   * The annotation is converted with annotationToDoc() and serialized as
   * UTF-8 with an indent of 2; the serializer is flushed before returning.
   *
   * @param annotation Contains the output of all annotators
   * @param os The output stream
   * @throws IOException if writing the XML document fails
   */
  public void xmlPrint(Annotation annotation, OutputStream os) throws IOException {
    Document document = annotationToDoc(annotation);

    Serializer serializer = new Serializer(os, "UTF-8");
    serializer.setIndent(2);
    serializer.setMaxLength(0);

    serializer.write(document);
    serializer.flush();
  }

  // the namespace is set in the XSLT file, so the XML elements built by
  // annotationToDoc() are created with a null namespace URI
  private static final String NAMESPACE_URI = null;
  // stylesheet referenced by the xml-stylesheet processing instruction of the generated document
  private static final String STYLESHEET_NAME = "CoreNLP-to-HTML.xsl";

  /**
   * Converts the given annotation to an XML document.
   * <p>
   * The document starts with an xml-stylesheet processing instruction that
   * points at STYLESHEET_NAME, followed by a "root" element holding a single
   * "document" element.  The document element contains, in this order:
   * an optional "docId", a "sentences" element with one "sentence" child per
   * sentence (its tokens and, when present, the parse tree, the three
   * dependency views and the MachineReading entities/relations), and an
   * optional "coreference" element.
   *
   * @param annotation Contains the output of all annotators
   * @return A newly created XML document describing the annotation
   */
  public Document annotationToDoc(Annotation annotation) {
    //
    // create the XML document with the root node pointing to the namespace URL
    //
    Element root = new Element("root", NAMESPACE_URI);
    Document xmlDoc = new Document(root);
    ProcessingInstruction pi = new ProcessingInstruction("xml-stylesheet",
          "href=\"" + STYLESHEET_NAME + "\" type=\"text/xsl\"");
    xmlDoc.insertChild(pi, 0);
    Element docElem = new Element("document", NAMESPACE_URI);
    root.appendChild(docElem);

    String docId = annotation.get(CoreAnnotations.DocIDAnnotation.class);
    if (docId != null) {
      // The two-argument Element constructor is (name, namespaceURI), so the id
      // must be attached as text content instead of being passed to the constructor.
      setSingleElement(docElem, "docId", NAMESPACE_URI, docId);
    }
    Element sentencesElem = new Element("sentences", NAMESPACE_URI);
    docElem.appendChild(sentencesElem);

    //
    // save the info for each sentence in this doc
    //
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences != null) {
      // sentence ids are numbered from 1
      int sentCount = 1;
      for (CoreMap sentence : sentences) {
        Element sentElem = new Element("sentence", NAMESPACE_URI);
        sentElem.addAttribute(new Attribute("id", Integer.toString(sentCount)));
        sentCount++;

        // add the word table with all token-level annotations
        Element wordTable = new Element("tokens", NAMESPACE_URI);
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (int j = 0; j < tokens.size(); j++) {
          Element wordInfo = new Element("token", NAMESPACE_URI);
          // token ids are numbered from 1 as well
          addWordInfo(wordInfo, tokens.get(j), j + 1, NAMESPACE_URI);
          wordTable.appendChild(wordInfo);
        }
        sentElem.appendChild(wordTable);

        // add tree info; the dependency views are only written when a parse tree exists
        Tree tree = sentence.get(CoreAnnotations.TreeAnnotation.class);

        if (tree != null) {
          // add the constituent tree for this sentence
          Element parseInfo = new Element("parse", NAMESPACE_URI);
          addConstituentTreeInfo(parseInfo, tree);
          sentElem.appendChild(parseInfo);

          // add the dependencies for this sentence
          Element depInfo = new Element("basic-dependencies", NAMESPACE_URI);
          addDependencyTreeInfo(depInfo, sentence.get(CoreAnnotations.BasicDependenciesAnnotation.class), tokens, NAMESPACE_URI);
          sentElem.appendChild(depInfo);

          depInfo = new Element("collapsed-dependencies", NAMESPACE_URI);
          addDependencyTreeInfo(depInfo, sentence.get(CoreAnnotations.CollapsedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
          sentElem.appendChild(depInfo);

          depInfo = new Element("collapsed-ccprocessed-dependencies", NAMESPACE_URI);
          addDependencyTreeInfo(depInfo, sentence.get(CoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
          sentElem.appendChild(depInfo);
        }

        // add the MR entities and relations; relations are only written
        // when the sentence has at least one entity mention
        List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
        List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
        if (entities != null && entities.size() > 0) {
          Element mrElem = new Element("MachineReading", NAMESPACE_URI);
          Element entElem = new Element("entities", NAMESPACE_URI);
          addEntities(entities, entElem, NAMESPACE_URI);
          mrElem.appendChild(entElem);

          if (relations != null) {
            Element relElem = new Element("relations", NAMESPACE_URI);
            addRelations(relations, relElem, NAMESPACE_URI, properties.getProperty("printable.relation.beam"));
            mrElem.appendChild(relElem);
          }

          sentElem.appendChild(mrElem);
        }

        // add the sentence to the root
        sentencesElem.appendChild(sentElem);
      }
    }

    //
    // add the coref graph
    //
    List<Pair<IntTuple, IntTuple>> graph = annotation.get(CoreAnnotations.CorefGraphAnnotation.class);
    if (graph != null) {
      Element corefInfo = new Element("coreference", NAMESPACE_URI);
      addCorefGraphInfo(corefInfo, graph, NAMESPACE_URI);
      docElem.appendChild(corefInfo);
    }

    //
    // save any document-level annotations here
    //

    return xmlDoc;
  }

  /**
   * Generates the XML content for a constituent tree: the tree is printed
   * with this pipeline's constituentTreePrinter and the resulting text is
   * appended to the given element.
   *
   * @param treeInfo The element that receives the printed tree
   * @param tree The constituent tree to print
   */
  private void addConstituentTreeInfo(Element treeInfo, Tree tree) {
    StringWriter buffer = new StringWriter();
    // second argument switches on the writer's auto-flush mode
    PrintWriter treeWriter = new PrintWriter(buffer, true);
    constituentTreePrinter.printTree(tree, treeWriter);
    treeInfo.appendChild(buffer.toString());
  }

  /**
   * Generates the XML content for a dependency graph.  Each edge becomes a
   * "dep" element whose "type" attribute is the relation name and whose
   * "governor" and "dependent" children carry the node index ("idx") and the
   * word of the corresponding token.  A null graph adds nothing.
   *
   * @param depInfo The element that receives one "dep" child per edge
   * @param graph The dependency graph, may be null
   * @param tokens The tokens of the sentence, used to look up the words
   * @param curNS The current namespace
   */
  private void addDependencyTreeInfo(Element depInfo, SemanticGraph graph, List<CoreLabel> tokens, String curNS) {
    if (graph == null) {
      return;
    }

    for (SemanticGraphEdge edge : graph.edgeList()) {
      // the relation name is written with all whitespace removed
      String relationName = edge.getRelation().toString().replaceAll("\\s+", "");
      int govIndex = edge.getSource().index();
      int depIndex = edge.getTarget().index();

      Element depElem = new Element("dep", curNS);
      depElem.addAttribute(new Attribute("type", relationName));

      // node indices are shifted down by one to address the token list
      Element govElem = new Element("governor", curNS);
      govElem.addAttribute(new Attribute("idx", Integer.toString(govIndex)));
      govElem.appendChild(tokens.get(govIndex - 1).word());
      depElem.appendChild(govElem);

      Element dependentElem = new Element("dependent", curNS);
      dependentElem.addAttribute(new Attribute("idx", Integer.toString(depIndex)));
      dependentElem.appendChild(tokens.get(depIndex - 1).word());
      depElem.appendChild(dependentElem);

      depInfo.appendChild(depElem);
    }
  }

  /**
   * Generates the XML content for a dependency tree derived from a
   * constituent tree.  The typed dependencies are obtained from the
   * grammatical structure built by gsf; each one becomes a "dep" element
   * with a "governor" and a "dependent" child.  A null tree adds nothing.
   *
   * @param depInfo The element that receives one "dep" child per dependency
   * @param tree The constituent tree, may be null
   * @param curNS The current namespace
   */
  @SuppressWarnings("unused")
  private void addDependencyTreeInfo(Element depInfo, Tree tree, String curNS) {
    if (tree == null) {
      return;
    }

    GrammaticalStructure structure = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> typedDeps = structure.typedDependencies();
    for (TypedDependency typedDep : typedDeps) {
      Element depElem = new Element("dep", curNS);
      depElem.addAttribute(new Attribute("type", typedDep.reln().getShortName()));

      Element govElem = new Element("governor", curNS);
      govElem.addAttribute(new Attribute("idx", Integer.toString(typedDep.gov().index())));
      govElem.appendChild(typedDep.gov().value());
      depElem.appendChild(govElem);

      Element dependentElem = new Element("dependent", curNS);
      dependentElem.addAttribute(new Attribute("idx", Integer.toString(typedDep.dep().index())));
      dependentElem.appendChild(typedDep.dep().value());
      depElem.appendChild(dependentElem);

      depInfo.appendChild(depElem);
    }
  }

  /**
   * Generates the XML content for MachineReading entities: the XML form of
   * every mention is appended, in list order, to the given element.
   *
   * @param entities The entity mentions to write
   * @param top The element that receives one child per mention
   * @param curNS The current namespace
   */
  private static void addEntities(List<EntityMention> entities, Element top, String curNS) {
    for (EntityMention mention : entities) {
      top.appendChild(mention.toXML(curNS));
    }
  }

  /**
   * Generates the XML content for MachineReading relations.  Only the
   * relations that report themselves as printable for the given beam are
   * written.
   *
   * @param relations The relation mentions to consider
   * @param top The element that receives one child per printable relation
   * @param curNS The current namespace
   * @param beamAsString The beam as a decimal string; null means a beam of 0
   */
  private static void addRelations(List<RelationMention> relations, Element top, String curNS, String beamAsString){
    double beam = (beamAsString == null) ? 0 : Double.parseDouble(beamAsString);
    for (RelationMention relation : relations) {
      if (!relation.printableObject(beam)) {
        continue;
      }
      top.appendChild(relation.toXML(curNS));
    }
  }

  /**
   * Generates the XML content for the coreference graph.  Every pair in the
   * graph becomes a "link" element with a "source" child (built from the
   * first tuple) followed by a "destination" child (built from the second).
   *
   * @param corefInfo The element that receives one "link" child per pair
   * @param graph The coreference links
   * @param curNS The current namespace
   */
  private static void addCorefGraphInfo(Element corefInfo, List<Pair<IntTuple, IntTuple>> graph, String curNS) {
    for (Pair<IntTuple, IntTuple> link : graph) {
      Element linkElem = new Element("link", curNS);
      linkElem.appendChild(corefEndpoint("source", link.first, curNS));
      linkElem.appendChild(corefEndpoint("destination", link.second, curNS));
      corefInfo.appendChild(linkElem);
    }
  }

  /**
   * Builds one end of a coreference link: an element of the given name whose
   * "sentence" and "token" attributes hold positions 0 and 1 of the tuple.
   */
  private static Element corefEndpoint(String elemName, IntTuple position, String curNS) {
    Element endpoint = new Element(elemName, curNS);
    endpoint.addAttribute(new Attribute("sentence", Integer.toString(position.get(0))));
    endpoint.addAttribute(new Attribute("token", Integer.toString(position.get(1))));
    return endpoint;
  }

  /**
   * Fills a "token" element with the annotations found on one token.
   * <p>
   * The element gets an "id" attribute and then, in this order, the children
   * word, lemma, CharacterOffsetBegin/CharacterOffsetEnd, POS, NER,
   * NormalizedNER, TrueCase and TrueCaseText.  Word and lemma are always
   * looked up; the remaining children are only considered when the token
   * contains the corresponding key.
   *
   * @param wordInfo The "token" element to populate
   * @param token The token whose annotations are written
   * @param id The position of the token in its sentence, as supplied by the caller
   * @param curNS The current namespace
   */
  private static void addWordInfo(Element wordInfo, CoreMap token, int id, String curNS) {
    // store the position of this word in the sentence
    wordInfo.addAttribute(new Attribute("id", Integer.toString(id)));

    String word = token.get(TextAnnotation.class);
    setSingleElement(wordInfo, "word", curNS, word);
    String lemma = token.get(LemmaAnnotation.class);
    setSingleElement(wordInfo, "lemma", curNS, lemma);

    // the character offsets are written as a pair or not at all
    boolean hasBothOffsets = token.containsKey(CharacterOffsetBeginAnnotation.class)
        && token.containsKey(CharacterOffsetEndAnnotation.class);
    if (hasBothOffsets) {
      String beginOffset = Integer.toString(token.get(CharacterOffsetBeginAnnotation.class));
      setSingleElement(wordInfo, "CharacterOffsetBegin", curNS, beginOffset);
      String endOffset = Integer.toString(token.get(CharacterOffsetEndAnnotation.class));
      setSingleElement(wordInfo, "CharacterOffsetEnd", curNS, endOffset);
    }

    if (token.containsKey(PartOfSpeechAnnotation.class)) {
      String pos = token.get(PartOfSpeechAnnotation.class);
      setSingleElement(wordInfo, "POS", curNS, pos);
    }

    if (token.containsKey(NamedEntityTagAnnotation.class)) {
      String ner = token.get(NamedEntityTagAnnotation.class);
      setSingleElement(wordInfo, "NER", curNS, ner);
    }

    if (token.containsKey(NormalizedNamedEntityTagAnnotation.class)) {
      String normalizedNer = token.get(NormalizedNamedEntityTagAnnotation.class);
      setSingleElement(wordInfo, "NormalizedNER", curNS, normalizedNer);
    }

    // the two true-case elements are built by hand rather than through setSingleElement()
    if (token.containsKey(CoreAnnotations.TrueCaseAnnotation.class)) {
      Element trueCaseElem = new Element("TrueCase", curNS);
      trueCaseElem.appendChild(token.get(TrueCaseAnnotation.class));
      wordInfo.appendChild(trueCaseElem);
    }
    if (token.containsKey(CoreAnnotations.TrueCaseTextAnnotation.class)) {
      Element trueCaseTextElem = new Element("TrueCaseText", curNS);
      trueCaseTextElem.appendChild(token.get(TrueCaseTextAnnotation.class));
      wordInfo.appendChild(trueCaseTextElem);
    }

    // A per-token "coref" element (sentence.token of the coreference destination)
    // used to be sketched here in commented-out form; it is not emitted.
  }

  /**
   * Helper method for addWordInfo().  Appends a child element with the given
   * name, namespace and text value to tokenElement; when the value is null
   * nothing is appended.
   *
   * @param tokenElement The element that receives the new child
   * @param elemName The name of the new XML element
   * @param curNS    The current namespace
   * @param value    The text content of the new element, may be null
   */
  private static void setSingleElement(Element tokenElement, String elemName, String curNS, String value) {
    Element child = new Element(elemName, curNS);
    if (value == null) {
      // no value: the freshly created element is simply dropped
      return;
    }
    child.appendChild(value);
    tokenElement.appendChild(child);
  }

  /**
   * Runs an interactive shell where input text is processed with the given pipeline.
   * <p>
   * Each non-empty line read from stdin is processed on its own and the
   * result is pretty-printed to stdout.  The shell ends when the user enters
   * "q" (in any case) or when stdin reaches end-of-input.
   *
   * @param pipeline The pipeline to be used
   * @throws IOException If IO problem with stdin
   */
  private static void shell(StanfordCoreNLP pipeline) throws IOException {
    BufferedReader is = new BufferedReader(new InputStreamReader(System.in));
    PrintWriter os = new PrintWriter(System.out);
    System.out.println("Entering interactive shell. Type q to quit.");
    while (true) {
      System.out.print("NLP> ");
      String line = is.readLine();
      // readLine() yields null once stdin is exhausted (Ctrl-D, closed pipe,
      // redirected file); without this check the loop would never terminate
      // and would keep printing the prompt
      if (line == null || line.equalsIgnoreCase("q")) {
        break;
      }
      // empty lines are ignored and the prompt is shown again
      if (line.length() > 0) {
        Annotation anno = pipeline.process(line);
        pipeline.prettyPrint(anno, os);
      }
    }
  }

  /**
   * Fetch the list of files contained in this directory. If path points to a file returns just this file.
   * Directories are searched recursively.
   *
   * @param path Path towards the input directory or file
   * @param extension If not null and path is directory extracts only files with this extension
   * @return The list of files stored in this directory or just the file pointed to by path;
   *         empty if path is neither a file nor a directory
   */
  private static List<File> fetchFiles(File path, String extension) {
    List<File> files = new ArrayList<File>();
    // an explicitly named file is always accepted, whatever its extension
    if (path.isFile()) {
      extension = null;
    }
    fetchFiles(path, extension, files);
    return files;
  }

  /**
   * Recursive worker for fetchFiles(File, String): adds path itself if it is
   * a matching file, or descends into it if it is a directory.
   *
   * @param path The file or directory to inspect
   * @param extension Required suffix of the absolute path, or null to accept every file
   * @param files Accumulator that receives the matching files
   */
  private static void fetchFiles(File path, String extension, List<File> files) {
    if (path.isFile()) {
      if (extension == null || path.getAbsolutePath().endsWith(extension)) {
        files.add(path);
      }
      return;
    }
    if (!path.isDirectory()) {
      return;
    }
    File[] childPaths = path.listFiles();
    if (childPaths == null) {
      // listFiles() returns null when the directory cannot be read or an I/O
      // error occurs; skip it instead of failing with a NullPointerException
      System.err.println("WARNING: cannot list the contents of directory " + path.getAbsolutePath() + "; skipping it.");
      return;
    }
    for (File childPath : childPaths) {
      fetchFiles(childPath, extension, files);
    }
  }

  /**
   * Reads a list of file names, one per line, from the given text file.
   * Leading and trailing whitespace is trimmed from every line, and lines
   * that are empty after trimming are skipped.
   *
   * @param fileName Path of the text file that lists the input files
   * @return The listed files, in the order in which they appear
   * @throws IOException If the list file cannot be opened or read
   */
  private static List<File> readFileList(String fileName) throws IOException {
    List<File> files = new ArrayList<File>();
    BufferedReader is = new BufferedReader(new FileReader(fileName));
    try {
      String line;
      while ((line = is.readLine()) != null) {
        String name = line.trim();
        // a blank line (e.g. a trailing newline at the end of the list) would
        // otherwise turn into an unusable File("") entry
        if (name.length() > 0) {
          files.add(new File(name));
        }
      }
    } finally {
      // release the file handle even when reading fails
      is.close();
    }
    return files;
  }

  /**
   * Reads everything that is left in the given reader and returns it as one
   * string.  The reader is not closed.
   *
   * @param is The reader to drain
   * @return All remaining characters of the reader, possibly the empty string
   * @throws IOException If reading fails
   */
  private static String readBuffer(BufferedReader is) throws IOException {
    StringBuilder text = new StringBuilder();
    char[] chunk = new char[4096];
    // read(char[], int, int) reports -1 once the end of the stream is reached
    for (int count = is.read(chunk, 0, chunk.length); count >= 0; count = is.read(chunk, 0, chunk.length)) {
      text.append(chunk, 0, count);
    }
    return text.toString();
  }

  /**
   * Runs the pipeline on every given file and writes the result as XML.
   * <p>
   * The output file name is built from the "outputDirectory" property
   * (default ".") and the input file's name.  If "replaceExtension" is set,
   * everything from the last dot on is removed; the "outputExtension"
   * property (default ".xml") is then appended unless the name already ends
   * with it.  A file is skipped when its output would overwrite the input
   * itself, or when "noClobber" is set and the output already exists.
   *
   * @param pipeline The pipeline used to annotate the text of each file
   * @param files The input files
   * @param props The properties controlling where and how the output is written
   * @throws IOException If a file cannot be read or its output cannot be written
   */
  private static void processFiles(StanfordCoreNLP pipeline, List<File> files, Properties props) throws IOException {
    for (File file : files) {
      String outputFilename = new File(props.getProperty("outputDirectory", "."), file.getName()).getPath();
      if (props.getProperty("replaceExtension") != null) {
        int lastDot = outputFilename.lastIndexOf('.');
        // for paths like "./zzz", lastDot will be 0
        if (lastDot > 0) {
          outputFilename = outputFilename.substring(0, lastDot);
        }
      }
      String extension = props.getProperty("outputExtension", ".xml");
      // ensure we don't make filenames with doubled extensions like .xml.xml
      if (!outputFilename.endsWith(extension)) {
        outputFilename += extension;
      }
      // normalize filename for the upcoming comparison
      outputFilename = new File(outputFilename).getCanonicalPath();

      // TODO this could fail if there are softlinks, etc. -- need some sort of sameFile tester
      if (outputFilename.equals(file.getCanonicalPath())) {
        System.out.println("Skipping " + file.getName() + ": output file " + outputFilename + " has the same filename as the input file -- assuming you don't actually want to do this.");
        continue;
      }
      if (props.getProperty("noClobber") != null && new File(outputFilename).exists()) {
        System.out.println("Skipping " + file.getName() + ": output file " + outputFilename + " already exists.  Pass -override to automatically override all files.");
        continue;
      }

      System.err.println("Processing file " + file.getAbsolutePath() + " ... (writing to " + outputFilename + ")");

      // read the whole input; the reader is closed even when reading fails
      String text;
      BufferedReader is = new BufferedReader(new FileReader(file));
      try {
        text = readBuffer(is);
      } finally {
        is.close();
      }

      // the output file is opened before processing, as before; it is now
      // closed even when the pipeline or the XML serialization throws
      OutputStream os = new FileOutputStream(outputFilename);
      try {
        Annotation annotation = pipeline.process(text);
        pipeline.xmlPrint(annotation, os);
      } finally {
        os.close();
      }
    }
  }

  /**
   * Command-line entry point; can be used for testing or for command-line
   * text processing.
   * <p>
   * The arguments are turned into properties.  If "h" or "help" is among
   * them, the required properties are printed to stderr and the program
   * ends.  Otherwise a pipeline is built and run in one of three modes:
   * <ul>
   * <li>"file" is set: the named file, or the files found under the named
   *     directory (optionally filtered by "extension"), are processed;</li>
   * <li>"filelist" is set: the files listed in the named file are processed;</li>
   * <li>neither is set: an interactive shell is started.</li>
   * </ul>
   * With no arguments at all the pipeline is constructed with null
   * properties and is expected to locate a properties file in the classpath.
   * <p>
   * Example usage:<br>
   * java -mx6g edu.stanford.nlp.pipeline.StanfordCoreNLP properties
   *
   * @param args List of required properties
   * @throws java.io.IOException If IO problem
   * @throws ClassNotFoundException If class loading problem
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    Timing tim = new Timing();

    // Turn the command line into properties.  An empty command line leaves
    // them null, in which case the processor searches the classpath for the
    // properties file.
    Properties props = null;
    if (args.length > 0) {
      props = StringUtils.argsToProperties(args);
      boolean helpRequested = props.containsKey("h") || props.containsKey("help");
      if (helpRequested) {
        printRequiredProperties(System.err);
        return;
      }
    }

    // Build the pipeline and continue with the properties it ended up using.
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    props = pipeline.getProperties();
    long setupTime = tim.report();

    // blank line after all the loading statements to make output more readable
    System.err.println();

    if (props.containsKey("file")) {
      // one file or a directory of files
      File inputPath = new File(props.getProperty("file"));
      List<File> files = fetchFiles(inputPath, props.getProperty("extension"));
      processFiles(pipeline, files, props);
    } else if (props.containsKey("filelist")) {
      // a list of files
      List<File> files = readFileList(props.getProperty("filelist"));
      processFiles(pipeline, files, props);
    } else {
      // nothing to read from disk: run the interactive shell
      shell(pipeline);
    }

    if (TIME) {
      System.err.println();
      System.err.println(pipeline.timingInformation());
      System.err.println("Pipeline setup: " +
          Timing.toSecondsString(setupTime) + " sec.");
      System.err.println("Total time for StanfordCoreNLP pipeline: " +
          tim.toSecondsString() + " sec.");
    }
  }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.