// Statement.java - MinorThird (Natural Language Processing), package edu.cmu.minorthird.text.mixup.
// Listing taken from the "Java Open Source" collection.
package edu.cmu.minorthird.text.mixup;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

/**
 * One parsed statement, built by reading tokens from a
 * {@link Mixup.MixupTokenizer}.
 *
 * <p>The constructor consumes the tokens that follow the statement keyword and
 * stores what it finds in fields that are exposed through read-only accessors.
 * Which fields are populated depends on the keyword; fields that do not apply
 * to a given statement keep their default value ({@code null}, {@code 0} or
 * {@code false}).
 */
public class Statement implements Serializable{

  static private final long serialVersionUID=20080303L;

  private static final Logger log=Logger.getLogger(Statement.class);

  /**
   * Statement type codes returned by {@link #getStatementType()}.  Statements
   * introduced by defDict, defLevel, onLevel, offLevel and importFromLevel do
   * not assign a code, so they report 0.
   */
  public static final int REGEX=1,MIXUP=2,FILTER=3,PROVIDE=4,REQUIRE=5,
      DECLARE=6,TRIE=7,ANNOTATE_WITH=8;

  //TODO: We should handle these properties better, possibly using a java properties object
  // encodes the statement properties
  private String keyword,property,type,startType,value;

  // set of words, for a dictionary
  private Set<String> wordSet=null;

  // whether to ignore case in a dictionary
  private boolean ignoreCase;

  // kind of level definition (one of defLevelType) and its pattern
  private String split,patt;

  // level named by an onLevel/offLevel statement
  private String level;

  // level and types named by an importFromLevel statement
  private String importLevel,importType,oldType;

  // one of the public type codes, or 0 when the statement has none
  private int statementType;

  // for statementType = MIXUP or FILTER; stays null when tok.advance() reports no further token
  private Mixup mixupExpr=null;

  // for statementType = TRIE
  private List<String> phraseList;

  // for statementType = REGEX
  private String regex=null;

  private int regexGroup;

  // for PROVIDE, REQUIRE, ANNOTATE_WITH and defDict
  private String annotationType,fileToLoad;

  private List<String> filesToLoad;

  // Reported by parseError.  Nothing in this class assigns either field, so
  // they always hold their defaults (0 and null).
  private int lastTokenStart;

  private String input;

  // tokens that introduce a generator: ':' (mixup), '~' (re/trie), '-' (filter)
  private static final Set<String> generatorStart=setOf(":","~","-");

  // statement keywords; not consulted anywhere in this class
  private static final Set<String> legalKeywords=setOf("defTokenProp",
      "defSpanProp","defSpanType","defDict","declareSpanType","provide",
      "require","defLevel","onLevel","offLevel","importFromLevel");

  // tokens accepted right after the property/type name
  private static final Set<String> colonEqualsOrCase=setOf(":","=","case");

  // kinds of level definition accepted after "defLevel NAME ="
  private static final Set<String> defLevelType=setOf("re","split","filter",
      "pseudotoken");

  /** Builds a mutable set holding the given strings. */
  private static Set<String> setOf(String... items){
    Set<String> result=new HashSet<String>();
    Collections.addAll(result,items);
    return result;
  }

  //
  // constructor and parser
  //

  /**
   * Parses one statement.
   *
   * @param tok tokenizer positioned just after the statement keyword
   * @param firstTok the statement keyword that was already read
   * @throws Mixup.ParseException if the tokens do not form a legal statement
   */
  Statement(Mixup.MixupTokenizer tok,String firstTok)
      throws Mixup.ParseException{
    keyword=firstTok;
    if("declareSpanType".equals(keyword)){
      statementType=DECLARE;
      type=tok.advance(null);
      tok.advance(null); // advance to end-of-statement marker
      return;
    }
    if("provide".equals(keyword)){
      statementType=PROVIDE;
      annotationType=unquote(expect(tok.advance(null),"an annotation type"));
      tok.advance(null); // consume the statement terminator
      return;
    }
    if("annotateWith".equals(keyword)){
      statementType=ANNOTATE_WITH;
      fileToLoad=unquote(expect(tok.advance(null),"a file name"));
      tok.advance(null);
      return;
    }
    if("require".equals(keyword)){
      statementType=REQUIRE;
      annotationType=unquote(expect(tok.advance(null),"an annotation type"));
      String marker=tok.advance(null);
      log.debug("marker: "+marker);
      // a non-null marker means a file name follows the annotation type
      if(marker!=null){
        fileToLoad=unquote(expect(tok.advance(null),"a file name"));
        tok.advance(null);
      }
      return;
    }
    if("onLevel".equals(keyword)||"offLevel".equals(keyword)){
      level=tok.advance(null);
      tok.advance(null);
      return;
    }
    if("importFromLevel".equals(keyword)){
      // syntax is "importFromLevel LEVEL NEWTYPE = OLDTYPE"
      importLevel=tok.advance(null);
      importType=tok.advance(null);
      tok.advance(Collections.singleton("="));
      oldType=tok.advance(null);
      tok.advance(null); // advance to end-of-statement marker
      return;
    }

    String propOrType=tok.advance(null); // read property or type
    String token=tok.advance(colonEqualsOrCase); // read ':', '=' or 'case'
    if(":".equals(token)){
      if(!"defSpanProp".equals(keyword)&&!"defTokenProp".equals(keyword)){
        parseError("can't define properties here");
      }
      property=propOrType;
      type=null;
      value=tok.advance(null);
      tok.advance(Collections.singleton("="));
    }else if("case".equals(token)){
      if(!"defDict".equals(keyword)){
        parseError("illegal keyword usage");
      }
    }else{
      // token should be '='
      if(!"defSpanType".equals(keyword)&&!"defDict".equals(keyword)&&
          !"defLevel".equals(keyword)){
        parseError("illegal keyword usage");
      }
      if(!"=".equals(token)){
        parseError("expected '='");
      }
      type=propOrType;
      property=null;
    }

    if("defDict".equals(keyword)){
      parseDictionary(tok,propOrType,token);
    }else if("defLevel".equals(keyword)){
      parseLevel(tok);
    }else{
      parseGenerator(tok);
    }
  }

  /**
   * Parses the remainder of a defDict statement.  The syntax is
   * "defDict [+case] dictName = entry, entry, ..." where an entry is either a
   * word or a file name enclosed in double-quote tokens.
   */
  private void parseDictionary(Mixup.MixupTokenizer tok,String propOrType,
      String token) throws Mixup.ParseException{
    // either propOrType is the dictionary name and token is '=',
    // or propOrType is '+' and token is 'case'
    ignoreCase=!"case".equals(token);
    if(ignoreCase){
      type=propOrType;
    }else{
      if(!"+".equals(propOrType)){
        parseError("illegal defDict");
      }
      type=tok.advance(null);
      tok.advance(Collections.singleton("="));
    }
    wordSet=new HashSet<String>();
    filesToLoad=new ArrayList<String>();
    while(true){
      String entry=expect(tok.advance(null),"a dictionary word or file name");
      if("\"".equals(entry)){
        // concatenate every token up to the closing double quote
        StringBuilder defFile=new StringBuilder();
        String piece=expect(tok.advance(null),"a closing '\"'");
        while(!"\"".equals(piece)){
          defFile.append(piece);
          piece=expect(tok.advance(null),"a closing '\"'");
        }
        fileToLoad=defFile.toString();
        filesToLoad.add(fileToLoad);
      }else{
        wordSet.add(ignoreCase?entry.toLowerCase():entry);
      }
      String sep=tok.advance(null);
      if(sep==null){
        break;
      }
      if(!",".equals(sep)){
        parseError("expected comma");
      }
    }
  }

  /** Parses the remainder of a defLevel statement: a level kind and its pattern. */
  private void parseLevel(Mixup.MixupTokenizer tok)
      throws Mixup.ParseException{
    split=tok.advance(defLevelType);
    patt=expect(tok.advance(null),"a level pattern");
    // strip the quotes only when the pattern is quoted at both ends
    if(patt.length()>=2&&patt.charAt(0)=='\''&&
        patt.charAt(patt.length()-1)=='\''){
      patt=patt.substring(1,patt.length()-1);
    }
    tok.advance(null);
  }

  /**
   * Parses a generator: an optional start type followed by ':' and a mixup
   * expression, '-' and a filter expression, or '~' and a 're' or 'trie'
   * clause.
   */
  private void parseGenerator(Mixup.MixupTokenizer tok)
      throws Mixup.ParseException{
    String token=tok.advance(null);
    if(generatorStart.contains(token)){
      // no explicit start type was given
      startType="top";
    }else{
      startType=token;
      token=tok.advance(generatorStart);
    }
    if(":".equals(token)){
      statementType=MIXUP;
      if(tok.advance()){
        mixupExpr=new Mixup(tok);
      }
    }else if("-".equals(token)){
      statementType=FILTER;
      if(tok.advance()){
        mixupExpr=new Mixup(tok);
      }
    }else if("~".equals(token)){
      String kind=tok.advance(null);
      if("re".equals(kind)){
        parseRegex(tok);
      }else if("trie".equals(kind)){
        parseTrie(tok);
      }else{
        parseError("expected 're' or 'trie'");
      }
    }else{
      throw new IllegalStateException("unexpected generatorStart '"+token+"'");
    }
  }

  /** Parses "REGEX , GROUP" following "~ re". */
  private void parseRegex(Mixup.MixupTokenizer tok)
      throws Mixup.ParseException{
    statementType=REGEX;
    regex=expect(tok.advance(null),"a regular expression");
    if(regex.length()>=2&&regex.startsWith("'")){
      regex=regex.substring(1,regex.length()-1);
      // a backslash-escaped quote inside the literal stands for a plain quote
      regex=regex.replace("\\'","'");
    }
    tok.advance(Collections.singleton(","));
    String groupToken=tok.advance(null);
    try{
      regexGroup=Integer.parseInt(groupToken);
    }catch(NumberFormatException e){
      parseError("expected a regex group number and saw "+groupToken);
    }
    tok.advance(null);
  }

  /**
   * Parses the comma-separated phrases following "~ trie".  The tokens of each
   * phrase are joined with single spaces.  The text after the last comma is
   * always added, even when it is empty.
   */
  private void parseTrie(Mixup.MixupTokenizer tok)
      throws Mixup.ParseException{
    statementType=TRIE;
    phraseList=new ArrayList<String>();
    StringBuilder phrase=new StringBuilder();
    for(String word=tok.advance(null);word!=null;word=tok.advance(null)){
      if(",".equals(word)){
        phraseList.add(phrase.toString().trim());
        phrase.setLength(0);
      }else{
        phrase.append(word).append(' ');
      }
    }
    phraseList.add(phrase.toString().trim());
  }

  /**
   * Removes the first and last character of a token that starts with a single
   * quote; any other token is returned unchanged.
   */
  private static String unquote(String token){
    if(token.length()>=2&&token.charAt(0)=='\''){
      return token.substring(1,token.length()-1);
    }
    return token;
  }

  /**
   * Returns the token, or reports a parse error when the tokenizer supplied
   * none.
   *
   * @param token the token just read, possibly null
   * @param what description of what was expected, used in the error message
   */
  private String expect(String token,String what) throws Mixup.ParseException{
    if(token==null){
      parseError("expected "+what+" but found no token");
    }
    return token;
  }

  /**
   * Always throws a parse exception carrying the given message; it never
   * returns.  The reported position and input come from fields that this class
   * never assigns, so they currently read as 0 and null.
   */
  private String parseError(String msg) throws Mixup.ParseException{
    throw new Mixup.ParseException("statement error at char "+lastTokenStart+
        ": "+msg+"\nin '"+input+"'");
  }

  /** Returns an abbreviated, human-readable rendering of this statement. */
  @Override
  public String toString(){
    if("defDict".equals(keyword)||"defLevel".equals(keyword)){
      return keyword+" "+type+" = ... ";
    }
    if("onLevel".equals(keyword)||"offLevel".equals(keyword)){
      return keyword+" "+level;
    }
    if("importFromLevel".equals(keyword)){
      return keyword+" "+importLevel+" "+importType+" = "+oldType;
    }
    if(statementType==DECLARE){
      return keyword+" "+type;
    }
    if(statementType==PROVIDE){
      return keyword+" "+annotationType;
    }
    if(statementType==REQUIRE){
      // the file name is optional in a require statement
      if(fileToLoad==null){
        return keyword+" "+annotationType;
      }
      return keyword+" "+annotationType+","+fileToLoad;
    }
    if(statementType==ANNOTATE_WITH){
      return keyword+" "+fileToLoad;
    }

    String genString="???";
    if(statementType==MIXUP){
      // concatenation tolerates a null expression
      genString=": "+mixupExpr;
    }else if(statementType==FILTER){
      genString="- "+mixupExpr;
    }else if(statementType==REGEX){
      genString="~ re '"+regex+"' ,"+regexGroup;
    }else if(statementType==TRIE){
      genString="~ trie ...";
    }
    if(type!=null){
      return keyword+" "+type+" ="+startType+genString;
    }
    return keyword+" "+property+":"+value+" ="+startType+genString;
  }

  //
  // From here down are public accessors to the properties of this Statement.  In the future
  // this should be changed to use a better data store for less cumbersome access
  //

  /**
   * Returns an integer representing the type of this Statement.  Valid types are:
   * DECLARE, PROVIDE, REQUIRE, ANNOTATE_WITH, MIXUP, FILTER, REGEX, and TRIE.
   * Statements that assign no type (defDict, defLevel, onLevel, offLevel,
   * importFromLevel) return 0.
   */
  public int getStatementType(){
    return statementType;
  }

  /**
   * Returns the keyword that defines what this Statement does.
   */
  public String getKeyword(){
    return keyword;
  }

  /**
   * Returns a list of the files that need to be loaded if this Statement
   * defines a dictionary; null for any other statement.
   */
  public List<String> getFilesToLoad(){
    return filesToLoad;
  }

  /**
   * Returns the file that needs to be loaded if this Statement is an
   * ANNOTATE_WITH or REQUIRE statement.  May be null for a REQUIRE statement
   * that names no file.
   */
  public String getFileToLoad(){
    return fileToLoad;
  }

  /**
   * Returns the type that this Statement matches.
   */
  public String getType(){
    return type;
  }

  /**
   * Returns the property that this statement matches.
   */
  public String getProperty(){
    return property;
  }

  /**
   * Returns the value that this statement will match.
   */
  public String getValue(){
    return value;
  }

  /**
   * Returns whether or not this statement will ignore case when defining a dictionary.
   */
  public boolean getIgnoreCase(){
    return ignoreCase;
  }

  /**
   * Returns the set of words defining a dictionary in the case that this statement
   * defines a dictionary inline.
   */
  public Set<String> getWordSet(){
    return wordSet;
  }

  /**
   * Returns the type of level to create when this Statement is defining a level.
   */
  public String getSplit(){
    return split;
  }

  /**
   * Returns the pattern that is used to create a new level when this statement
   * is defining a new level.
   */
  public String getPatt(){
    return patt;
  }

  /**
   * Returns the level name given in an onLevel or offLevel statement; null
   * for any other statement.
   */
  public String getLevel(){
    return level;
  }

  /**
   * Returns the type from the source level that should be imported when this statement
   * executes an importFromLevel call.
   */
  public String getOldType(){
    return oldType;
  }

  /**
   * Returns the type that imported spans should be called when this statement
   * executes an importFromLevel call.
   */
  public String getImportType(){
    return importType;
  }

  /**
   * Returns the level that this statement will import from in a call to importFromLevel.
   */
  public String getImportLevel(){
    return importLevel;
  }

  /**
   * Returns the type that this statement either provides or requires.
   */
  public String getAnnotationType(){
    return annotationType;
  }

  /**
   * Returns the starting type in the case that this statement is a generator statement.
   */
  public String getStartType(){
    return startType;
  }

  /**
   * Returns the mixup expression that this statement will execute; may be
   * null when no expression followed the ':' or '-' token.
   */
  public Mixup getMixupExpr(){
    return mixupExpr;
  }

  /**
   * Returns the phrase list for when this statement will define a trie.
   */
  public List<String> getPhraseList(){
    return phraseList;
  }

  /**
   * Returns the regex string that will be executed by this statement.
   */
  public String getRegex(){
    return regex;
  }

  /**
   * Returns the regex group that will be returned when this statement executes.
   */
  public int getRegexGroup(){
    return regexGroup;
  }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.