// TTags.java : Natural-Language-Processing » Part-Of-Speech » edu » stanford » nlp » tagger » maxent » Java Open Source
//
// Java Open Source » Natural Language Processing » Part Of Speech
// Part Of Speech » edu » stanford » nlp » tagger » maxent » TTags.java
package edu.stanford.nlp.tagger.maxent;

import edu.stanford.nlp.io.InDataStreamFile;
import edu.stanford.nlp.io.OutDataStreamFile;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.HashIndex;

import java.io.IOException;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.util.*;

/**
 * This class holds the POS tags, assigns them unique ids, and knows which tags
 * are open versus closed class.
 * <p/>
 * Tags are kept in an {@link Index}; the id of a tag is its position in that
 * index. Closed-class tags are tracked in a separate set, unless the open-class
 * tags have been fixed with {@link #setOpenClassTags(String[])}, in which case
 * every tag outside that fixed set is reported as closed.
 * <p/>
 * Only the tag-expansion memoizer is accessed under a lock; the other mutable
 * fields of this class are not synchronized.
 * <p/>
 * Title:        StanfordMaxEnt<p>
 * Description:  A Maximum Entropy Toolkit<p>
 * Company:      Stanford University<p>
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class TTags {

  /** Closed-class tags preloaded for the language name "english". */
  private static final String[] ENGLISH_CLOSED_TAGS = {
    ".", ",", "``", "''", ":", "$", "EX", "(", ")", "#", "MD", "CC", "DT",
    "LS", "PDT", "POS", "PRP", "PRP$", "RP", "TO", "EOS", "UH", "WDT", "WP",
    "WP$", "WRB", "-LRB-", "-RRB-"
    // "IN" is not included (disabled entry, kept here for reference)
  };

  /** Closed-class tags preloaded for the language name "polish". */
  private static final String[] POLISH_CLOSED_TAGS = {
    ".", ",", "``", "''", ":", "$", "(", ")", "#", "POS", "EOS",
    "ppron12", "ppron3", "siebie", "qub", "conj"
  };

  /** Closed-class tags preloaded for the language name "chinese" (chinese treebank 5 tags). */
  private static final String[] CHINESE_CLOSED_TAGS = {
    "AS", "BA", "CC", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC",
    "IJ", "LB", "LC", "P", "PN", "PU", "SB", "SP", "VC", "VE"
  };

  /** Closed-class tags preloaded for the language name "arabic". */
  private static final String[] ARABIC_CLOSED_TAGS = {
    "PUNC", "CONJ"
    // maybe more should still be added ... cdm jun 2006
  };

  /** Closed-class tags preloaded for the language name "german". */
  private static final String[] GERMAN_CLOSED_TAGS = {
    "$,", "$.", "$"
  };

  /** Closed-class tags preloaded for the language name "medpost". */
  private static final String[] MEDPOST_CLOSED_TAGS = {
    ".", ",", "``", "''", ":", "$", "EX", "(", ")", "VM", "CC", "DD", "DB",
    "GE", "PND", "PNG", "TO", "EOS", "-LRB-", "-RRB-"
  };

  /** All known tags; a tag's id is its position in this index. */
  private Index<String> index = new HashIndex<String>();

  /** Tags explicitly marked as closed class. */
  private final HashSet<String> closed = new HashSet<String>();

  /** Either the fixed open-tag set (when openFixed is true) or a cache of
   *  index-closed that is rebuilt lazily by getOpenTags(); null when no
   *  cached value is available.
   */
  private Set<String> openTags = null;

  /** True only when constructed for "english"; enables tag expansion. */
  private boolean isEnglish = false; // for speed

  /** If true, then the open tags are fixed and we set closed tags based on
   *  index-openTags; otherwise, we set open tags based on index-closedTags.
   */
  private boolean openFixed = false;

  static final int CLOSED_TAG_THRESHOLD = 40;

  /** When making a decision based on the training data as to whether a
   *  tag is closed, this is the threshold for how many tokens can be in
   *  a closed class - purposely conservative.
   */
  private final int closedTagThreshold = CLOSED_TAG_THRESHOLD;

  /** If true, when a model is trained, all tags that had fewer tokens than
   *  closedTagThreshold will be considered closed.
   */
  private boolean learnClosedTags = false;

  /** Caches values returned by deterministicallyExpandTags.
   *  This assumes that each word comes always with the same tag (wasn't proven false so far).
   *  All access goes through a synchronized block on the map itself.
   */
  private final Map<String,String[]> deterministicExpansionMemoizer = new HashMap<String,String[]>();

  /** Creates an empty tag set with no predefined closed-class tags. */
  public TTags() {
  }

  /*
  public TTags(TaggerConfig config) {
    String[] closedArray = config.getClosedClassTags();
    String[] openArray = config.getOpenClassTags();
    if(closedArray.length > 0) {
      closed = new HashSet<String>(Arrays.asList(closedArray));
    } else if(openArray.length > 0) {
      openTags = new HashSet<String>(Arrays.asList(openArray));
    } else {
      learnClosedTags = config.getLearnClosedClassTags();
      closedTagThreshold = config.getClosedTagThreshold();
    }
  }
  */

  /**
   * Creates a tag set whose closed-class tags are preloaded for the given
   * language. The name is matched case-insensitively against "english",
   * "polish", "chinese", "arabic", "german" and "medpost"; the empty string
   * is accepted and preloads nothing.
   *
   * @param language Name of the language / tag set
   * @throws RuntimeException if the language name is not recognized
   */
  TTags(String language) {
    if (language.equalsIgnoreCase("english")) {
      Collections.addAll(closed, ENGLISH_CLOSED_TAGS);
      isEnglish = true;
    } else if (language.equalsIgnoreCase("polish")) {
      Collections.addAll(closed, POLISH_CLOSED_TAGS);
    } else if (language.equalsIgnoreCase("chinese")) {
      Collections.addAll(closed, CHINESE_CLOSED_TAGS);
    } else if (language.equalsIgnoreCase("arabic")) {
      Collections.addAll(closed, ARABIC_CLOSED_TAGS);
    } else if (language.equalsIgnoreCase("german")) {
      Collections.addAll(closed, GERMAN_CLOSED_TAGS);
    } else if (language.equalsIgnoreCase("medpost")) {
      Collections.addAll(closed, MEDPOST_CLOSED_TAGS);
    } else if (language.equalsIgnoreCase("")) {
      // empty language name: no predefined closed-class tags
    }
    /* add closed-class lists for other languages here */
    else {
      throw new RuntimeException("unknown language: " + language);
    }
  }


  /**
   * Returns the set of all open class tags. If the open tags were fixed with
   * {@link #setOpenClassTags(String[])}, that set is returned; otherwise the
   * result is every indexed tag that is not marked closed. The computed set
   * is cached and reused until the tag set changes. The returned object is
   * the internal set itself, not a copy.
   *
   * @return set of open tags
   */
  public Set<String> getOpenTags() {
    if (openTags == null) { /* cache miss: rebuild as index minus closed */
      Set<String> open = new HashSet<String>();
      for (String tag : index) {
        if ( ! closed.contains(tag)) {
          open.add(tag);
        }
      }
      openTags = open;
    }
    return openTags;
  }

  /**
   * Drops the cached open-tag set so that it is recomputed on next use.
   * Does nothing when the open tags are fixed, because in that case
   * openTags holds caller-supplied data rather than a derived cache.
   */
  private void invalidateOpenTagsCache() {
    if ( ! openFixed) {
      openTags = null;
    }
  }

  /**
   * Adds a tag to the index (if not already present) and returns its id.
   *
   * @param tag The tag to add
   * @return The id of the tag in the index
   */
  protected int add(String tag) {
    index.add(tag);
    // a new tag may belong in the open set, so a cached set could be stale
    invalidateOpenTagsCache();
    return index.indexOf(tag);
  }

  /**
   * Returns the tag with the given id.
   *
   * @param i Id of the tag
   * @return The tag stored at position i of the index
   */
  public String getTag(int i) {
    return index.get(i);
  }

  /**
   * Saves the tags to the named file; see
   * {@link #save(DataOutputStream, HashMap)} for the format. The stream is
   * closed even if writing fails.
   *
   * @param filename File to write to
   * @param tagTokens For each tag, the tokens seen with it; consulted only
   *                  when closed tags are being learned
   * @throws RuntimeIOException if opening, writing or closing the file fails
   */
  protected void save(String filename, 
                      HashMap<String, HashSet<String>> tagTokens) {
    try {
      DataOutputStream out = new OutDataStreamFile(filename);
      try {
        save(out, tagTokens);
      } finally {
        out.close();
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Writes the number of tags, followed by each tag as a UTF string and a
   * boolean saying whether it is closed. If closed tags are being learned,
   * each tag with fewer than closedTagThreshold tokens is first marked
   * closed (as a side effect on this object).
   *
   * @param file Stream to write to; it is not closed by this method
   * @param tagTokens For each tag, the tokens seen with it; consulted only
   *                  when closed tags are being learned
   * @throws RuntimeIOException if writing fails
   */
  protected void save(DataOutputStream file, 
                      HashMap<String, HashSet<String>> tagTokens) {
    try {
      file.writeInt(index.size());
      for (String item : index) {
        file.writeUTF(item);
        if (learnClosedTags) {
          // a tag with no recorded tokens counts as having zero tokens
          HashSet<String> tokens = tagTokens.get(item);
          int tokenCount = (tokens == null) ? 0 : tokens.size();
          if (tokenCount < closedTagThreshold) {
            markClosed(item);
          }
        }
        file.writeBoolean(isClosed(item));
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }


  /**
   * Reads the tags from the named file; see {@link #read(DataInputStream)}.
   * The stream is closed even if reading fails.
   *
   * @param filename File to read from
   * @throws RuntimeIOException if opening, reading or closing the file fails
   */
  protected void read(String filename) {
    try {
      InDataStreamFile in = new InDataStreamFile(filename);
      try {
        read(in);
      } finally {
        in.close();
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Reads tags in the format written by
   * {@link #save(DataOutputStream, HashMap)}: a count, then for each tag a
   * UTF string and a closed flag. The index is replaced by the tags read;
   * tags flagged closed are added to the existing closed set. Nothing is
   * changed on this object unless the whole tag list was read successfully.
   *
   * @param file Stream to read from; it is not closed by this method
   * @throws RuntimeIOException if reading fails
   */
  protected void read(DataInputStream file) {
    try {
      int size = file.readInt();
      // read into locals first so a failure midway leaves this object untouched
      Index<String> loadedIndex = new HashIndex<String>();
      List<String> loadedClosed = new ArrayList<String>();
      for (int i = 0; i < size; i++) {
        String tag = file.readUTF();
        boolean inClosed = file.readBoolean();
        loadedIndex.add(tag);
        if (inClosed) {
          loadedClosed.add(tag);
        }
      }
      index = loadedIndex;
      closed.addAll(loadedClosed);
      invalidateOpenTagsCache();
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }


  /**
   * Tells whether a tag is closed class. With fixed open tags, a tag is
   * closed exactly when it is not in the fixed open set; otherwise it is
   * closed exactly when it has been marked closed.
   *
   * @param tag The tag to test
   * @return true if the tag is closed class
   */
  protected boolean isClosed(String tag) {
    if (openFixed) {
      return !openTags.contains(tag);
    } else {
      return closed.contains(tag);
    }
  }

  /**
   * Marks a tag as closed class.
   *
   * @param tag The tag to mark
   */
  void markClosed(String tag) {
    closed.add(tag);
    // the tag may have been in a cached open set
    invalidateOpenTagsCache();
  }

  /**
   * Sets whether closed tags are learned from token counts when saving.
   *
   * @param learn true to mark low-count tags as closed on save
   */
  public void setLearnClosedTags(boolean learn) {
    learnClosedTags = learn;
  }

  /**
   * Fixes the set of open class tags. From then on every tag outside this
   * set is reported as closed by {@link #isClosed(String)}.
   *
   * @param openClassTags The open class tags
   */
  public void setOpenClassTags(String[] openClassTags) {
    openTags = new HashSet<String>(Arrays.asList(openClassTags));
    openFixed = true;
  }

  /**
   * Marks each of the given tags as closed class.
   *
   * @param closedClassTags The tags to mark closed
   */
  public void setClosedClassTags(String[] closedClassTags) {
    for (String tag : closedClassTags) {
      markClosed(tag);
    }
  }


  /**
   * Returns the id of a tag.
   *
   * @param tag The tag to look up
   * @return The value of the index's indexOf for the tag (this class treats
   *         a negative value as "tag not known")
   */
  int getIndex(String tag) {
    return index.indexOf(tag);
  }

  /**
   * Returns the number of tags.
   *
   * @return Size of the tag index
   */
  public int getSize() {
    return index.size();
  }

  /**
   * Deterministically adds other possible tags for words given observed tags.
   * For instance, for English with the Penn POS tag, a word with the VB
   * tag would also be expected to have the VBP tag.
   * (CDM May 2007: This was putting repeated values into the set of possible
   * tags, which was bad.  Now it doesn't, but the resulting code is a funny
   * mixture of trying to micro-optimize, and just using equals() inside a
   * List linear scan....)
   * <p/>
   * Expansion is done only for English: VBD and VBN imply each other, and so
   * do VB and VBP. Results are memoized per word, so a later call with the
   * same word returns the first result regardless of the tags passed.
   * For other languages, or when any of the four tags is missing from the
   * index, the input array itself is returned.
   *
   * @param tags Known possible tags for the word
   * @param word The word (used only as the memoization key)
   * @return A superset of tags
   */
  String[] deterministicallyExpandTags(String[] tags, String word) {
    if ( ! isEnglish) {
      // no tag expansion for other languages currently; the memoizer is only
      // ever filled for English, so there is nothing to look up either
      return tags;
    }

    String[] cached;
    synchronized (deterministicExpansionMemoizer) {
      cached = deterministicExpansionMemoizer.get(word);
    }
    if (cached != null) {
      return cached;
    }

    int yVBD = getIndex("VBD");
    int yVBN = getIndex("VBN");
    int yVBP = getIndex("VBP");
    int yVB = getIndex("VB");
    if (yVBD < 0 || yVBN < 0 || yVBP < 0 || yVB < 0) {
      System.err.println("Language erroneously set to 'english' when it isn't UPenn English tag set!!");
      return tags;
    }

    List<String> expanded = new ArrayList<String>(tags.length + 2);
    for (String tag : tags) {
      int y = getIndex(tag);
      assert (y >= 0);
      addIfAbsent(expanded, tag);
      if (y == yVBD) {
        addIfAbsent(expanded, "VBN");
      } else if (y == yVBN) {
        addIfAbsent(expanded, "VBD");
      } else if (y == yVB) {
        addIfAbsent(expanded, "VBP");
      } else if (y == yVBP) {
        addIfAbsent(expanded, "VB");
      }
    }

    String[] newtags = expanded.toArray(new String[expanded.size()]);
    synchronized (deterministicExpansionMemoizer) {
      deterministicExpansionMemoizer.put(word, newtags);
    }
    return newtags;
  }

  // TODO: this should be in some kind of list util
  /**
   * Appends the item to the list unless the list already contains it.
   *
   * @param list The list to append to
   * @param item The item to add
   */
  private static void addIfAbsent(List<String> list, String item) {
    if ( ! list.contains(item)) {
      list.add(item);
    }
  }

}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.