WordToSentenceProcessor.java :  » Natural-Language-Processing » Stanford-Named-Entity-Recognizer » edu » stanford » nlp » process » Java Open Source

Java Open Source » Natural Language Processing » Stanford Named Entity Recognizer 
Stanford Named Entity Recognizer » edu » stanford » nlp » process » WordToSentenceProcessor.java
package edu.stanford.nlp.process;

import java.io.File;
import java.net.URL;
import java.util.*;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;

/**
 * Transforms a Document of Words into a Document of Sentences by grouping the
 * Words.  The word stream is assumed to already be adequately tokenized,
 * and this class just divides the list into sentences, perhaps discarding
 * some separator tokens based on the setting of the following three sets
 * and two patterns:
 * <ul>
 * <li>sentenceBoundaryTokens are tokens that are left in a sentence, but are
 * to be regarded as ending a sentence.  A canonical example is a period.
 * If two of these follow each other, the second will be a sentence
 * consisting of only the sentenceBoundaryToken.
 * <li>sentenceBoundaryFollowers are tokens that are left in a sentence, and
 * which can follow a sentenceBoundaryToken while still belonging to
 * the previous sentence.  They cannot begin a sentence (except at the
 * beginning of a document).  A canonical example is a close parenthesis
 * ')'.
 * <li>sentenceBoundaryToDiscard are tokens which separate sentences and
 * which should be thrown away.  In web documents, a typical example would
 * be a '{@code <p>}' tag.  If two of these follow each other, they are
 * coalesced: no empty Sentence is output.  The end-of-file is not
 * represented in this Set, but the code behaves as if it were a member.
 * <li>sentenceRegionBeginPattern A regular expression for marking the start
 * of a sentence region.  Not included in the sentence.  When this pattern
 * is set, every token outside a region is skipped.
 * <li>sentenceRegionEndPattern A regular expression for marking the end
 * of a sentence region.  Not included in the sentence.  A token matching
 * it also ends the sentence being built.
 * </ul>
 *
 * @author Joseph Smarr (jsmarr@stanford.edu)
 * @author Christopher Manning
 * @author Teg Grenager (grenager@stanford.edu)
 * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
 *
 * @param <IN> The type of the tokens in the sentences
 * @param <L> The type of the labels
 * @param <F> The type of the features
 */
public class WordToSentenceProcessor<IN, L, F> extends AbstractListProcessor<IN, List<IN>, L, F> {

  private static final boolean DEBUG = false;

  /**
   * Set of tokens (Strings) that qualify as sentence-final tokens.
   */
  private final Set<String> sentenceBoundaryTokens;

  /**
   * Set of tokens (Strings) that qualify as tokens that can follow
   * what normally counts as an end of sentence token, and which are
   * attributed to the preceding sentence.  For example ")" coming after
   * a period.
   */
  private final Set<String> sentenceBoundaryFollowers;

  /**
   * Set of tokens (Strings) that are sentence boundaries to be discarded.
   */
  private final Set<String> sentenceBoundaryToDiscard;

  /**
   * Pattern whose match opens a sentence region; may be null, in which case
   * no token is skipped for being outside a region.
   */
  private final Pattern sentenceRegionBeginPattern;

  /**
   * Pattern whose match closes a sentence region and ends the current
   * sentence; may be null.
   */
  private final Pattern sentenceRegionEndPattern;


  /**
   * Returns a List of sentences where each element is the list of tokens
   * making up one sentence of the input. Specifically, reads through each
   * word in the input and breaks off a sentence after finding a valid
   * sentence boundary token or end of input.
   * Note that for this to work, the words in the
   * input document must have been tokenized with a tokenizer that makes
   * sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
   * <p>
   * Per token, the checks are applied in this order: region-begin skipping,
   * boundary follower, discarded boundary, region end, kept boundary token,
   * ordinary token.
   *
   * @param words A list of already tokenized words (each must implement
   *     HasWord or CoreMap, or be a String)
   * @return A list of sentences, each a list of the input tokens
   * @throws IllegalArgumentException if a token is of an unsupported type
   * @see #WordToSentenceProcessor(Set, Set, Set)
   * @see edu.stanford.nlp.ling.Sentence
   */
  public List<List<IN>> process(List<IN> words) {
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = new ArrayList<IN>();
    List<IN> lastSentence = null;
    boolean insideRegion = false;
    for (IN o : words) {
      String word = wordOf(o);
      if (DEBUG) {
        EncodingPrintWriter.err.println("Word is " + word, "UTF-8");
      }

      // With a region-begin pattern configured, everything outside a region
      // is dropped, including the token that opens the region.
      if (sentenceRegionBeginPattern != null && ! insideRegion) {
        if (sentenceRegionBeginPattern.matcher(word).matches()) {
          insideRegion = true;
        }
        if (DEBUG) {
          System.err.println("  outside region");
        }
        continue;
      }

      // A follower seen before any new sentence content is appended to the
      // already emitted previous sentence (the same list object that is in
      // the result).
      if (sentenceBoundaryFollowers.contains(word) && lastSentence != null && currentSentence.isEmpty()) {
        lastSentence.add(o);
        if (DEBUG) {
          System.err.println("  added to last");
        }
        continue;
      }

      boolean newSent = false;
      if (sentenceBoundaryToDiscard.contains(word)) {
        // separator: ends the sentence and is itself thrown away
        newSent = true;
      } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
        // region end marker: ends the sentence and is itself thrown away
        insideRegion = false;
        newSent = true;
      } else if (sentenceBoundaryTokens.contains(word)) {
        currentSentence.add(o);
        if (DEBUG) {
          System.err.println("  is sentence boundary; added to current");
        }
        newSent = true;
      } else {
        currentSentence.add(o);
        if (DEBUG) {
          System.err.println("  added to current");
        }
      }

      // Only non-empty sentences are emitted, which is what coalesces
      // consecutive discarded separators.
      if (newSent && ! currentSentence.isEmpty()) {
        if (DEBUG) {
          System.err.println("  beginning new sentence");
        }
        sentences.add(currentSentence);
        lastSentence = currentSentence;
        currentSentence = new ArrayList<IN>();
      }
    }

    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if ( ! currentSentence.isEmpty()) {
      sentences.add(currentSentence);
    }
    return sentences;
  }

  /**
   * Extracts the String form of a token used for all boundary checks.
   *
   * @param o The token; a HasWord, a String, or a CoreMap (checked in that order)
   * @return The token's word; for a CoreMap this is whatever is stored under
   *     WordAnnotation
   * @throws IllegalArgumentException if the token is none of the supported types
   */
  private String wordOf(IN o) {
    if (o instanceof HasWord) {
      return ((HasWord) o).word();
    }
    if (o instanceof String) {
      return (String) o;
    }
    if (o instanceof CoreMap) {
      return ((CoreMap) o).get(CoreAnnotations.WordAnnotation.class);
    }
    throw new IllegalArgumentException("Expected token to be a HasWord, String, or CoreMap but got: "
            + (o == null ? "null" : o.getClass().getName()));
  }


  /**
   * Create a <code>WordToSentenceProcessor</code> using a sensible default
   * list of tokens to split on.  The default set is: {".","?","!"}.
   */
  public WordToSentenceProcessor() {
    this(new HashSet<String>(Arrays.asList(".", "?", "!")));
  }

  /**
   * Flexibly set the set of acceptable sentence boundary tokens, but with
   * a default set of allowed boundary following tokens (based on English
   * and Penn Treebank encoding).
   * The allowed set of boundary followers is:
   * {")","]","\"","\'", "''", "-RRB-", "-RSB-", "-RCB-"}.
   *
   * @param boundaryTokens The set of boundary tokens
   */
  public WordToSentenceProcessor(Set<String> boundaryTokens) {
    this(boundaryTokens, Generics.newHashSet(Arrays.asList(")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-")));
  }

  /**
   * Flexibly set the set of acceptable sentence boundary tokens and
   * also the set of tokens commonly following sentence boundaries, using
   * the default set of discarded separator tokens.
   * The default set of discarded separator tokens is: {"\n"}.
   *
   * @param boundaryTokens The set of boundary tokens
   * @param boundaryFollowers The set of tokens that may follow a boundary
   *     and still belong to the preceding sentence
   */
  public WordToSentenceProcessor(Set<String> boundaryTokens, Set<String> boundaryFollowers) {
    this(boundaryTokens, boundaryFollowers, Collections.singleton("\n"));
  }


  /**
   * Flexibly set the set of acceptable sentence boundary tokens,
   * the set of tokens commonly following sentence boundaries, and also
   * the set of tokens that are sentences boundaries that should be
   * discarded.  No region patterns are used.
   *
   * @param boundaryTokens The set of boundary tokens
   * @param boundaryFollowers The set of tokens that may follow a boundary
   *     and still belong to the preceding sentence
   * @param boundaryToDiscard The set of separator tokens that end a
   *     sentence and are dropped
   */
  public WordToSentenceProcessor(Set<String> boundaryTokens, Set<String> boundaryFollowers,
                                 Set<String> boundaryToDiscard) {
    this(boundaryTokens, boundaryFollowers, boundaryToDiscard, null, null);
  }

  /**
   * Segment purely by region markers: all three token sets are empty, so
   * only the two patterns influence where sentences begin and end.
   *
   * @param regionBeginPattern Pattern matching the token that opens a
   *     sentence region; if null, no tokens are skipped
   * @param regionEndPattern Pattern matching the token that closes a
   *     sentence region; if null, sentences end only at end of input
   */
  public WordToSentenceProcessor(Pattern regionBeginPattern, Pattern regionEndPattern) {
    this(Collections.<String>emptySet(), Collections.<String>emptySet(),
         Collections.<String>emptySet(), regionBeginPattern, regionEndPattern);
  }

  /**
   * Flexibly set the set of acceptable sentence boundary tokens,
   * the set of tokens commonly following sentence boundaries, the set of
   * tokens that are sentences boundaries that should be discarded, and
   * the region begin/end patterns.
   * This is private because it is a dangerous constructor. It's not clear what the semantics
   * should be if there are both boundary token sets, and patterns to match.
   * The sets are stored as given (not copied).
   */
  private WordToSentenceProcessor(Set<String> boundaryTokens, Set<String> boundaryFollowers, Set<String> boundaryToDiscard, Pattern regionBeginPattern, Pattern regionEndPattern) {
    sentenceBoundaryTokens = boundaryTokens;
    sentenceBoundaryFollowers = boundaryFollowers;
    sentenceBoundaryToDiscard = boundaryToDiscard;
    sentenceRegionBeginPattern = regionBeginPattern;
    sentenceRegionEndPattern = regionEndPattern;
    if (DEBUG) {
      EncodingPrintWriter.err.println("WordToSentenceProcessor: boundaryTokens=" + boundaryTokens, "UTF-8");
      EncodingPrintWriter.err.println("  boundaryFollowers=" + boundaryFollowers, "UTF-8");
      EncodingPrintWriter.err.println("  boundaryToDiscard=" + boundaryToDiscard, "UTF-8");
    }
  }


  /**
   * This will print out as sentences some text.  It can be used to
   * test sentence division.  <br>
   * Usage: java edu.stanford.nlp.process.WordToSentenceProcessor fileOrUrl+
   * <p>
   * Arguments starting with "http://" are read as URLs and passed through
   * a StripTagsProcessor first; everything else is read as a file.
   *
   * @param args Command line argument: files or URLs
   */
  public static void main(String[] args) {
    if (args.length == 0) {
      System.out.println("usage: java edu.stanford.nlp.process.WordToSentenceProcessor fileOrUrl");
      System.exit(0);
    }
    try {
      for (String filename : args) {
        Document<?, Word, Word> d; // always initialized below
        if (filename.startsWith("http://")) {
          Document dpre = new BasicDocument().init(new URL(filename));
          DocumentProcessor notags = new StripTagsProcessor();
          d = notags.processDocument(dpre);
        } else {
          d = new BasicDocument().init(new File(filename));
        }
        WordToSentenceProcessor proc = new WordToSentenceProcessor();
        // process() builds each sentence as a plain ArrayList, so iterate
        // as Object rather than casting each element to Sentence: printing
        // goes through the same toString() either way, without risking a
        // ClassCastException from the raw-typed result.
        List<?> sentences = proc.processDocument(d);
        for (Object sent : sentences) {
          System.out.println(sent);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.