// PTBTokenizer.java :  » Natural-Language-Processing » Stanford-Chinese-Word-Segmenter » edu » stanford » nlp » process » Java Open Source
//
// Java Open Source » Natural Language Processing » Stanford Chinese Word Segmenter
// Stanford Chinese Word Segmenter » edu » stanford » nlp » process » PTBTokenizer.java
package edu.stanford.nlp.process;


import edu.stanford.nlp.util.Function;


import edu.stanford.nlp.util.Function;


import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.io.IOUtils;


/**
 * Tokenizer implementation that conforms to the Penn Treebank tokenization
 * conventions.
 * This tokenizer is a Java implementation of Professor Chris Manning's Flex
 * tokenizer, pgtt-treebank.l.  It reads raw text and outputs tokens in the
 * Penn Treebank format, as {@link edu.stanford.nlp.ling.Word} objects or, when
 * created through the invertible factory method, as {@link CoreLabel}s. It can
 * optionally return carriage returns as tokens.
 *
 * @author Tim Grow
 * @author Teg Grenager (grenager@stanford.edu)
 * @author Christopher Manning
 * @author Jenny Finkel (integrating in invertible PTB tokenizer)
 */
public class PTBTokenizer<T> extends AbstractTokenizer<T> {

  /** Whether carriage returns should be returned as tokens; handed to the lexer. */
  private final boolean tokenizeCRs;
  /** Selects which PTBLexer constructor {@link #setSource(Reader)} uses. */
  private final boolean invertible;
  /** Handed to the lexer in non-invertible mode; false unless set via the private constructor. */
  private final boolean suppressEscaping;

  /** The underlying lexer; replaced on every call to {@link #setSource(Reader)}. */
  private PTBLexer lexer;
  /** Factory handed to the lexer in non-invertible mode to build tokens. */
  private final LexedTokenFactory<T> tokenFactory;

  /**
   * Creates a tokenizer over <code>r</code> that produces {@link Word} tokens
   * and does not ask for carriage returns to be tokenized
   * (<code>tokenizeCRs</code> is <code>false</code>).
   *
   * @param r The Reader whose contents will be tokenized
   * @return a PTBTokenizer that tokenizes a stream to objects of type
   * {@link Word}
   */
  public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
    final boolean tokenizeCRs = false;
    return newPTBTokenizer(r, tokenizeCRs);
  }

  /**
   * Creates a tokenizer over <code>r</code> that produces {@link Word} tokens
   * built by a {@link WordTokenFactory}.
   *
   * @param r The Reader whose contents will be tokenized
   * @param tokenizeCRs Whether carriage returns should come back as their own
   *   token (documented elsewhere in this class as Words whose text is the
   *   value of <code>PTBLexer.cr</code>)
   * @return a PTBTokenizer that tokenizes a stream to {@link Word} objects
   */
  public static PTBTokenizer<Word> newPTBTokenizer(Reader r, boolean tokenizeCRs) {
    WordTokenFactory wordFactory = new WordTokenFactory();
    return new PTBTokenizer<Word>(r, tokenizeCRs, wordFactory);
  }


  /**
   * Creates a tokenizer over <code>r</code> that is typed to produce
   * {@link CoreLabel} tokens.
   *
   * @param r The Reader whose contents will be tokenized
   * @param tokenizeCRs Whether carriage returns should come back as their own
   *   token
   * @param invertible if set to true, then will produce CoreLabels which
   *   will have fields for the string before and after, and the character
   *   offsets
   * @return a PTBTokenizer that tokenizes a stream to {@link CoreLabel}
   *   objects
   */
  public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeCRs, boolean invertible) {
    CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
    return new PTBTokenizer<CoreLabel>(r, tokenizeCRs, invertible, labelFactory);
  }


  /**
   * Builds a non-invertible tokenizer that creates its tokens through a
   * caller-supplied LexedTokenFactory and leaves escaping enabled.
   *
   * @param r The Reader whose contents will be tokenized
   * @param tokenizeCRs Whether carriage returns should come back as their own
   *  token
   * @param tokenFactory The LexedTokenFactory to use to create
   *  tokens from the text.
   */
  public PTBTokenizer(Reader r, boolean tokenizeCRs,
      LexedTokenFactory<T> tokenFactory) {
    // same as going through the four-argument constructor with invertible == false
    this(r, tokenizeCRs, false, false, tokenFactory);
  }

  /** Like the five-argument constructor, with <code>suppressEscaping</code> fixed to false. */
  private PTBTokenizer(Reader r, boolean tokenizeCRs, boolean invertible,
                       LexedTokenFactory<T> tokenFactory) {
    this(r, tokenizeCRs, invertible, false, tokenFactory);
  }

  /**
   * Master constructor: records every option and then builds the lexer for
   * <code>r</code> by calling {@link #setSource(Reader)}.
   */
  private PTBTokenizer(Reader r, boolean tokenizeCRs, boolean invertible,
                       boolean suppressEscaping,
                       LexedTokenFactory<T> tokenFactory) {
    this.invertible = invertible;
    this.suppressEscaping = suppressEscaping;
    this.tokenizeCRs = tokenizeCRs;
    this.tokenFactory = tokenFactory;
    // must come last: setSource reads all four fields assigned above
    setSource(r);
  }


  /**
   * Internally fetches the next token from the lexer.
   * Any exception raised by the lexer is swallowed: <code>nextToken</code> is
   * reset to null and null is returned.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  protected T getNext() {
    // Note (cdm 2007): no CR filtering is done here; per the original comment,
    // PTBLexer decides for itself whether to return CRs based on the same flag.
    try {
      return (T) lexer.next();
    } catch (Exception e) {
      // deliberately best-effort: treat any lexer failure as end of stream
      nextToken = null;
      return null;
    }
  }

  /**
   * Points this Tokenizer at a new Reader by building a fresh lexer for it.
   * In invertible mode the lexer is created without the token factory and
   * without the suppressEscaping flag; otherwise both are passed along.
   *
   * @param r The Reader to tokenize from
   */
  public void setSource(Reader r) {
    lexer = invertible
        ? new PTBLexer(r, invertible, tokenizeCRs)
        : new PTBLexer(r, tokenFactory, tokenizeCRs, suppressEscaping);
  }

  /**
   * Returns a presentable version of the given PTB-tokenized text.
   * PTB tokenization splits up punctuation and does various other things
   * that make simply joining the tokens with spaces look bad. So join
   * the tokens with spaces and run the result through this method to produce
   * nicer looking text. It's not perfect, but it works pretty well.
   * <p/>
   * An IOException from the lexer is printed to standard error and whatever
   * was converted up to that point is returned.
   *
   * @param ptbText PTB-tokenized text, tokens separated by spaces
   * @return the concatenation of all pieces emitted by the PTB2TextLexer
   */
  public static String ptb2Text(String ptbText) {
    // input length is probably an overestimate of the output length
    StringBuilder result = new StringBuilder(ptbText.length());
    PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
    try {
      String piece = lexer.next();
      while (piece != null) {
        result.append(piece);
        piece = lexer.next();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return result.toString();
  }

  /**
   * Streaming variant of {@link #ptb2Text(String)}: reads PTB-tokenized text
   * from <code>ptbText</code>, runs it through a PTB2TextLexer and writes
   * every piece the lexer emits to <code>w</code>.
   * The writer is neither flushed nor closed by this method.
   *
   * @param ptbText Reader supplying the PTB-tokenized text
   * @param w Writer that receives the presentable text
   * @return the number of pieces written to <code>w</code>
   * @throws IOException if reading or writing fails
   */
  public static int ptb2Text(Reader ptbText, Writer w) throws IOException {
    PTB2TextLexer lexer = new PTB2TextLexer(ptbText);
    int numTokens = 0;
    String piece = lexer.next();
    while (piece != null) {
      numTokens++;
      w.write(piece);
      piece = lexer.next();
    }
    return numTokens;
  }

  /**
   * Untokenizes each input (or standard input when the input list is empty)
   * with {@link #ptb2Text(Reader, Writer)} and reports throughput on
   * standard error.
   *
   * @param inputFileList Names of the inputs to read; empty means read
   *   standard input
   * @param outputFileList Output file names parallel to
   *   <code>inputFileList</code>, or null to write to standard output
   * @param charset Character encoding used for reading inputs and for
   *   writing output files
   * @throws IOException If any file I/O problem
   */
  private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
    Timing t = new Timing();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      PrintWriter out = new PrintWriter(System.out, true);
      numTokens = ptb2Text(r, out);
      // ptb2Text uses write(), which autoflush does not cover, so flush explicitly
      out.flush();
    } else {
      for (int j = 0; j < sz; j++) {
        Reader r = IOUtils.readerFromStringName(inputFileList.get(j), charset);
        try {
          boolean toStdout = (outputFileList == null);
          PrintWriter out;
          if (toStdout) {
            out = new PrintWriter(System.out, true);
          } else {
            out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
          }
          try {
            numTokens += ptb2Text(r, out);
          } finally {
            if (toStdout) {
              // never close System.out: later inputs still need to write to it
              out.flush();
            } else {
              out.close();
            }
          }
        } finally {
          // release the input even when untokenizing fails
          r.close();
        }
      }
    }
    long millis = t.stop();
    double wordspersec = numTokens / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.err.println("PTBTokenizer untokenized " + numTokens + " tokens at " +
                       nf.format(wordspersec) + " tokens per second.");
  }

  /**
   * Returns a presentable version of the given PTB-tokenized words.
   * Pass in a List of Words or Strings, or a Document and this method will
   * join the words (via <code>StringUtils.join</code>) and call
   * {@link #ptb2Text(String) } on the output. Elements that are instances
   * of Word contribute their word() value, to prevent additional
   * text from creeping in (e.g., POS tags). All other elements are handed to
   * the join unchanged.
   * <p/>
   * The list passed in is only read; it is never modified, so unmodifiable
   * lists are accepted.
   *
   * <i>Implementation note:</i> At the moment, this can be called on either a
   * List of String or Word.  The typing should be cleaned up at some point.
   *
   * @param ptbWords The tokens to turn back into presentable text
   * @return the presentable text
   */
  public static String ptb2Text(List ptbWords) {
    // work on a copy so the caller's list keeps its original elements
    List<Object> words = new ArrayList<Object>(ptbWords.size());
    for (Object item : ptbWords) {
      if (item instanceof Word) {
        words.add(((Word) item).word());
      } else {
        // silently assume it is a String
        words.add(item);
      }
    }

    return ptb2Text(StringUtils.join(words));
  }


  /**
   * Tokenizes each input (or standard input when the input list is empty)
   * with {@link #tokReader} and reports throughput on standard error.
   *
   * @param inputFileList Names of the inputs to read; empty means read
   *   standard input
   * @param outputFileList Output file names parallel to
   *   <code>inputFileList</code>, or null to write to standard output
   * @param charset Character encoding used for reading inputs and for
   *   writing output files
   * @param parseInsideBegin Pattern whose match turns printing on, or null
   * @param parseInsideEnd Pattern whose match turns printing off, or null
   * @param tokenizeNL Whether newlines are tokenized
   * @param preserveLines Whether to print space-separated tokens that follow
   *   the line breaks of the input instead of one token per line
   * @param dump Whether to print each token's toString() rather than its word
   * @throws IOException If any file I/O problem
   */
  private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd, boolean tokenizeNL, boolean preserveLines, boolean dump) throws IOException {
    Timing t = new Timing();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      PrintWriter out = new PrintWriter(System.out, true);
      numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, tokenizeNL, preserveLines, dump);
      // print() is not covered by autoflush; push out a trailing partial line
      out.flush();
    } else {
      for (int j = 0; j < sz; j++) {
        Reader r = IOUtils.readerFromStringName(inputFileList.get(j), charset);
        try {
          boolean toStdout = (outputFileList == null);
          PrintWriter out;
          if (toStdout) {
            out = new PrintWriter(System.out, true);
          } else {
            out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
          }
          try {
            numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, tokenizeNL, preserveLines, dump);
          } finally {
            if (toStdout) {
              // System.out must stay open for the remaining inputs
              out.flush();
            } else {
              out.close();
            }
          }
        } finally {
          // release the input even when tokenizing fails
          r.close();
        }
      } // end for j going through inputFileList
    }
    long millis = t.stop();
    double wordspersec = numTokens / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.err.println("PTBTokenizer tokenized " + numTokens + " tokens at " +
                       nf.format(wordspersec) + " tokens per second.");
  }

  /**
   * Tokenizes everything from <code>r</code> with an invertible tokenizer and
   * prints the selected tokens to <code>out</code>.
   * <p/>
   * A token whose word matches <code>parseInsideBegin</code> switches printing
   * on and one matching <code>parseInsideEnd</code> switches it off; those
   * marker tokens themselves are not printed. Printing starts switched on
   * only when <code>parseInsideBegin</code> is null.
   * <p/>
   * Line-break detection under <code>preserveLines</code> always looks at the
   * token's word, so it also works together with <code>dump</code>.
   *
   * @param r The Reader to tokenize
   * @param out Where the tokens are printed
   * @param parseInsideBegin Pattern whose match turns printing on, or null
   * @param parseInsideEnd Pattern whose match turns printing off, or null
   * @param tokenizeNL Whether the tokenizer is asked to tokenize newlines
   * @param preserveLines If true, tokens are separated by single spaces and a
   *   "*CR*" token ends the current output line; otherwise one token per line
   * @param dump If true, print each token's toString() rather than its word
   * @return the number of tokens read, whether or not they were printed
   */
  private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, boolean tokenizeNL, boolean preserveLines, boolean dump) {
    int numTokens = 0;
    PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(r, tokenizeNL, true);
    boolean printing = parseInsideBegin == null; // start off printing, unless you're looking for a start entity
    boolean beginLine = true;
    while (tokenizer.hasNext()) {
      CoreLabel obj = tokenizer.next();
      String word = obj.word();
      numTokens++;

      if (parseInsideBegin != null && parseInsideBegin.matcher(word).matches()) {
        printing = true;
        continue;
      }
      if (parseInsideEnd != null && parseInsideEnd.matcher(word).matches()) {
        printing = false;
        continue;
      }
      if ( ! printing) {
        continue;
      }

      // tags and line breaks are recognised on the plain word; only the
      // printed form switches to the exhaustive representation when dumping
      String shown = dump ? obj.toString() : word;
      if ( ! preserveLines) {
        out.println(shown);
      } else if ("*CR*".equals(word)) {
        beginLine = true;
        out.println();
      } else {
        if ( ! beginLine) {
          out.print(" ");
        }
        beginLine = false;
        out.print(shown);
      }
    }
    return numTokens;
  }


  /**
   * @return a factory for {@link Word} tokenizers with
   *   <code>tokenizeCRs</code> set to false
   */
  public static TokenizerFactory<Word> factory() {
    return factory(false);
  }

  /**
   * @param tokenizeCRs Whether carriage returns should come back as tokens
   * @return a factory for {@link Word} tokenizers
   */
  public static TokenizerFactory<Word> factory(boolean tokenizeCRs) {
    TokenizerFactory<Word> wordTokenizers = PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeCRs);
    return wordTokenizers;
  }


  /**
   * @param tokenizeCRs Whether carriage returns should come back as tokens
   * @param factory The LexedTokenFactory used to create the tokens
   * @return a factory for tokenizers producing tokens of type T
   */
  public static <T>TokenizerFactory<T> factory(boolean tokenizeCRs, LexedTokenFactory<T> factory) {
    TokenizerFactory<T> customTokenizers = new PTBTokenizerFactory<T>(tokenizeCRs, factory);
    return customTokenizers;
  }

  /**
   * @param tokenizeCRs Whether carriage returns should come back as tokens
   * @param invertible Whether the tokenizers are created in invertible mode
   * @return a factory for {@link CoreLabel} tokenizers
   */
  public static TokenizerFactory<CoreLabel> factory(boolean tokenizeCRs, boolean invertible) {
    TokenizerFactory<CoreLabel> labelTokenizers = PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeCRs, invertible);
    return labelTokenizers;
  }

  /**
   * @param tokenizeCRs Whether carriage returns should come back as tokens
   * @param invertible Whether the tokenizers are created in invertible mode
   * @param suppressEscaping Passed through to the tokenizers that get created
   * @return a factory typed to produce {@link Word} tokenizers
   */
  public static TokenizerFactory<Word> factory(boolean tokenizeCRs, boolean invertible, boolean suppressEscaping) {
    TokenizerFactory<Word> wordTokenizers = PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeCRs, invertible, suppressEscaping);
    return wordTokenizers;
  }


  /**
   * TokenizerFactory that hands out {@link PTBTokenizer}s sharing one set of
   * options (CR handling, invertibility, escaping) and one token factory.
   */
  public static class PTBTokenizerFactory<T> implements TokenizerFactory<T> {

    /** Passed to every tokenizer created by this factory. */
    protected LexedTokenFactory<T> factory;
    protected boolean tokenizeCRs;
    protected boolean invertible;
    protected boolean suppressEscaping; // = false;

    // ---- static creation methods ----

    /**
     * Constructs a new PTBTokenizerFactory that treats carriage returns as
     * normal whitespace.
     */
    public static PTBTokenizerFactory<Word> newPTBTokenizerFactory() {
      final boolean tokenizeCRs = false;
      return newPTBTokenizerFactory(tokenizeCRs);
    }

    /**
     * Constructs a new PTBTokenizerFactory whose tokenizers optionally return
     * carriage returns as their own token.
     *
     * @param tokenizeCRs If true, CRs come back as Words whose text is
     *    the value of <code>PTBLexer.cr</code>.
     */
    public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean tokenizeCRs) {
      WordTokenFactory words = new WordTokenFactory();
      return new PTBTokenizerFactory<Word>(tokenizeCRs, words);
    }

    /**
     * Constructs a new PTBTokenizerFactory that is typed to produce
     * {@link CoreLabel} tokens.
     *
     * @param tokenizeCRs Whether carriage returns come back as tokens
     * @param invertible Whether tokenizers are created in invertible mode
     */
    public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeCRs, boolean invertible) {
      CoreLabelTokenFactory labels = new CoreLabelTokenFactory();
      return new PTBTokenizerFactory<CoreLabel>(tokenizeCRs, invertible, labels);
    }

    // I'm not sure what will happen
    // if you set both invertible and suppressEscaping to true.
    // -pichuan (Wed Jan 31 23:12:04 2007)
    public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean tokenizeCRs, boolean invertible, boolean suppressEscaping) {
      WordTokenFactory words = new WordTokenFactory();
      return new PTBTokenizerFactory<Word>(tokenizeCRs, invertible, suppressEscaping, words);
    }

    // ---- constructors ----

    /** Non-invertible factory with escaping left on and a custom token factory. */
    public PTBTokenizerFactory(boolean tokenizeCRs, LexedTokenFactory<T> factory) {
      this(tokenizeCRs, false, false, factory);
    }

    /** Same as the four-argument constructor with suppressEscaping == false. */
    private PTBTokenizerFactory(boolean tokenizeCRs, boolean invertible, LexedTokenFactory<T> factory) {
      this(tokenizeCRs, invertible, false, factory);
    }

    /** Master constructor: simply records all four settings. */
    private PTBTokenizerFactory(boolean tokenizeCRs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
      this.factory = factory;
      this.tokenizeCRs = tokenizeCRs;
      this.invertible = invertible;
      this.suppressEscaping = suppressEscaping;
    }

    // ---- TokenizerFactory methods ----

    /** Returns the tokenizer from {@link #getTokenizer(Reader)} as a plain Iterator. */
    public Iterator<T> getIterator(Reader r) {
      Tokenizer<T> tokenizer = getTokenizer(r);
      return tokenizer;
    }

    /** Creates a new PTBTokenizer over <code>r</code> using this factory's settings. */
    public Tokenizer<T> getTokenizer(Reader r) {
      return new PTBTokenizer<T>(r, tokenizeCRs, invertible, suppressEscaping, factory);
    }

  } // end static class PTBTokenizerFactory

  /**
   * Reads files named as arguments and print their tokens one per line.
   * This is mainly as a testing aid, but it can also be quite useful
   * standalone to turn a corpus into a one-token-per-line file of tokens.
   * This main method assumes that the input file is in utf-8 encoding,
   * unless it is specified. With no file names, standard input is read.
   * <p/>
   * Usage: <code>java edu.stanford.nlp.process.PTBTokenizer [-charset charset] [-nl] filename+
   * </code>
   * <p/>
   * Options:
   * <ul>
   * <li> -nl means to tokenize newlines
   * <li> -preserveLines means to do space-separated
   * tokens, except when the original had a line break, not one-token-per-line
   * (this also turns on -nl)
   * <li> -dump means to print each token's toString() instead of its word
   * <li> -charset specifies a character encoding
   * <li> -parseInside names an XML-style
   *      element to look inside for tokens (regex matching, not an XML parser)
   * <li> -ioFileList means remaining command-line arguments are files that
   * themselves contain lists of pairs of input-output filenames (2 column,
   * whitespace separated). Blank lines in those files are skipped.
   * <li> -untok means to untokenize the input instead of tokenizing it
   * <li>-h Print usage info and exit
   * </ul>
   *
   * @param args Command line arguments
   * @throws IOException If any file I/O problem, or if a file given with
   *   -ioFileList contains a line without two columns
   */
  public static void main(String[] args) throws IOException {
    int i = 0;
    String charset = "utf-8";
    Pattern parseInsideBegin = null;
    Pattern parseInsideEnd = null;
    boolean tokenizeNL = false;
    boolean preserveLines = false;
    boolean inputOutputFileList = false;
    boolean dump = false;
    boolean untok = false;

    // startsWith (unlike charAt(0)) is safe when an argument is the empty string
    while (i < args.length && args[i].startsWith("-")) {
      if ("-nl".equals(args[i])) {
        tokenizeNL = true;
      } else if ("-preserveLines".equals(args[i])) {
        preserveLines = true;
        tokenizeNL = true;
      } else if ("-dump".equals(args[i])) {
        dump = true;
      } else if ("-ioFileList".equals(args[i])) {
        inputOutputFileList = true;
      } else if ("-charset".equals(args[i]) && i < args.length - 1) {
        i++;
        charset = args[i];
      } else if ("-parseInside".equals(args[i]) && i < args.length - 1) {
        i++;
        try {
          parseInsideBegin = Pattern.compile("<(?:" + args[i] + ")[^>]*?>");
          parseInsideEnd = Pattern.compile("</(?:" + args[i] + ")[^>]*?>");
        } catch (java.util.regex.PatternSyntaxException e) {
          // keep going without the filter, but tell the user it was dropped
          System.err.println("Ignoring -parseInside, bad element pattern: " + e.getMessage());
          parseInsideBegin = null;
          parseInsideEnd = null;
        }
      } else if ("-untok".equals(args[i])) {
        untok = true;
      } else if ("-h".equals(args[i])) {
        System.err.println("usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
        System.err.println("  options: -nl|-preserveLines|-dump|-ioFileList|-charset|-parseInside|-h");
        // asking for help should not go on to tokenize (or block on stdin)
        return;
      } else {
        System.err.println("Unknown option: " + args[i]);
      }
      i++;
    }

    List<String> inputFileList = new ArrayList<String>();
    List<String> outputFileList = null;

    if (inputOutputFileList) {
      outputFileList = new ArrayList<String>();
      for (int j = i; j < args.length; j++) {
        readIoFileList(args[j], charset, inputFileList, outputFileList);
      }
    } else {
      for (int j = i; j < args.length; j++) {
        inputFileList.add(args[j]);
      }
    }

    if (untok) {
      untok(inputFileList, outputFileList, charset);
    } else {
      tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, tokenizeNL, preserveLines, dump);
    }
  } // end main

  /**
   * Reads one -ioFileList file and appends its input/output file name pairs
   * (first and second whitespace-separated column) to the two lists.
   */
  private static void readIoFileList(String listFile, String charset, List<String> inputFileList, List<String> outputFileList) throws IOException {
    BufferedReader r = new BufferedReader(
      new InputStreamReader(new FileInputStream(listFile), charset));
    try {
      for (String inLine; (inLine = r.readLine()) != null; ) {
        // trim so leading whitespace does not produce an empty first column
        String trimmed = inLine.trim();
        if (trimmed.length() == 0) {
          continue;
        }
        String[] fields = trimmed.split("\\s+");
        if (fields.length < 2) {
          throw new IOException("Malformed line in " + listFile +
                                " (expected input and output file names): " + inLine);
        }
        inputFileList.add(fields[0]);
        outputFileList.add(fields[1]);
      }
    } finally {
      r.close();
    }
  }

} // end PTBTokenizer
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.