POSTaggerAnnotator.java :  » Natural-Language-Processing » Stanford-CoreNLP » edu » stanford » nlp » pipeline » Java Open Source

Java Open Source » Natural Language Processing » Stanford CoreNLP 
Stanford CoreNLP » edu » stanford » nlp » pipeline » POSTaggerAnnotator.java
package edu.stanford.nlp.pipeline;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.pipeline.Annotation.WordsPLAnnotation;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Timing;

/**
 * Wrapper for the maxent part of speech tagger.
 * <p>
 * {@link #annotate(Annotation)} tags either the word lists stored under
 * {@code WordsPLAnnotation} or, if those are absent, the tokens of every
 * sentence stored under {@code CoreAnnotations.SentencesAnnotation}. In both
 * cases a token list longer than the configured maximum sentence length is
 * tagged in consecutive chunks of at most that many tokens.
 * 
 * @author Anna Rafferty
 * 
 */
public class POSTaggerAnnotator implements Annotator {

  /** The underlying tagger; assigned by {@code loadModel} during construction. */
  MaxentTagger pos; // = null;

  /** Whether loading of the model is reported through a {@code Timing}. */
  private final boolean verbose;

  /**
   * Largest number of tokens handed to the tagger in one call. A value of zero
   * or less means "no limit".
   */
  private int maxSentenceLength;

  /**
   * Creates a verbose annotator using the model named by the
   * {@code pos.model} system property, or the tagger's default model path
   * when that property is not set.
   * 
   * @throws Exception
   *           if the model cannot be loaded
   */
  public POSTaggerAnnotator() throws Exception {
    this(true);
  }

  /**
   * Creates an annotator using the model named by the {@code pos.model}
   * system property, or the tagger's default model path when that property is
   * not set.
   * 
   * @param verbose
   *          whether to report model loading
   * @throws Exception
   *           if the model cannot be loaded
   */
  public POSTaggerAnnotator(boolean verbose) throws Exception {
    this(System.getProperty("pos.model", MaxentTagger.DEFAULT_NLP_GROUP_MODEL_PATH), verbose);
  }

  /**
   * Creates an annotator with no limit on sentence length.
   * 
   * @param posLoc
   *          location of the tagger model
   * @param verbose
   *          whether to report model loading
   * @throws Exception
   *           if the model cannot be loaded
   */
  public POSTaggerAnnotator(String posLoc, boolean verbose) throws Exception {
    this(posLoc, verbose, Integer.MAX_VALUE);
  }

  /**
   * Creates an annotator.
   * 
   * @param posLoc
   *          location of the tagger model
   * @param verbose
   *          whether to report model loading
   * @param maxSentenceLength
   *          largest number of tokens tagged in one tagger call; zero or less
   *          means no limit
   * @throws Exception
   *           if the model cannot be loaded
   */
  public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength) throws Exception {
    this.verbose = verbose;
    this.maxSentenceLength = maxSentenceLength;
    loadModel(posLoc);
  }

  /**
   * Changes the largest number of tokens tagged in one tagger call.
   * 
   * @param maxLen
   *          the new limit; zero or less means no limit
   */
  public void setMaxSentenceLength(int maxLen) {
    this.maxSentenceLength = maxLen;
  }

  /**
   * Loads the tagger model into {@link #pos}, reporting the load through a
   * {@code Timing} when this annotator is verbose.
   * 
   * @param loc
   *          location of the tagger model
   * @throws Exception
   *           if the model cannot be loaded
   */
  private void loadModel(String loc) throws Exception {
    Timing timer = new Timing();
    if (verbose) {
      timer.doing("Loading POS Model [" + loc + ']');
    }
    pos = new MaxentTagger(loc);
    if (verbose) {
      timer.done();
    }
  }

  /**
   * Adds part-of-speech tags, in place, to the tokens held by the annotation.
   * Word lists under {@code WordsPLAnnotation} take precedence; otherwise the
   * tokens of each sentence under {@code SentencesAnnotation} are tagged.
   * 
   * @param annotation
   *          the annotation whose tokens should be tagged
   * @throws RuntimeException
   *           if the annotation has neither word lists nor sentences
   */
  public void annotate(Annotation annotation) {
    if (annotation.has(WordsPLAnnotation.class)) {
      List<List<? extends CoreLabel>> sentences = annotation.get(WordsPLAnnotation.class);
      for (List<? extends CoreLabel> words : sentences) {
        processText(words);
      }
    } else if (annotation.has(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // Goes through the same length-limited tagging as processText so the
        // configured maximum sentence length applies on this path as well.
        List<TaggedWord> tagged = tagTokens(tokens);
        for (int i = 0; i < tokens.size(); ++i) {
          tokens.get(i).set(PartOfSpeechAnnotation.class, tagged.get(i).tag());
        }
      }
    } else {
      throw new RuntimeException("unable to find words/tokens in: " + annotation);
    }
  }

  /**
   * Takes in a list of words and POS tags them. Tagging is done in place - the
   * returned CoreLabels are the same ones you passed in, with tags added.
   * Lists longer than the maximum sentence length are tagged chunk by chunk.
   * 
   * @param text
   *          List of tokens to tag
   * @return Tokens with tags
   */
  public List<? extends CoreLabel> processText(List<? extends CoreLabel> text) {
    Iterator<TaggedWord> taggedIter = tagTokens(text).iterator();
    for (CoreLabel word : text) {
      word.setTag(taggedIter.next().tag());
    }
    return text;
  }

  /**
   * Runs the tagger over the tokens without modifying them. When the list is
   * longer than the maximum sentence length it is split into consecutive
   * chunks of at most that many tokens; each chunk is tagged on its own and
   * the results are concatenated in order.
   * 
   * @param tokens
   *          the tokens to tag
   * @return the tagger output for all tokens, in the same order
   */
  private List<TaggedWord> tagTokens(List<? extends CoreLabel> tokens) {
    int size = tokens.size();
    // A non-positive limit cannot define a chunk size, so it disables chunking
    // rather than looping forever or producing an invalid sub-list range.
    if (maxSentenceLength <= 0 || size <= maxSentenceLength) {
      return pos.apply(tokens);
    }

    List<TaggedWord> tagged = new ArrayList<TaggedWord>(size);
    int start = 0;
    while (start < size) {
      // Written as start + min(...) so the sum cannot overflow for large limits.
      int end = start + Math.min(maxSentenceLength, size - start);
      List<? extends CoreLabel> chunk = tokens.subList(start, end);
      tagged.addAll(pos.apply(chunk));
      start = end;
    }
    return tagged;
  }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.