// MinorTagger.java :  » Natural-Language-Processing » MinorThird » edu » cmu » minorthird » text » Java Open Source
//
// Java Open Source » Natural Language Processing » MinorThird
// MinorThird » edu » cmu » minorthird » text » MinorTagger.java
package edu.cmu.minorthird.text;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.ServerSocket;
import java.net.Socket;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import montylingua.JMontyLingua;

import org.apache.log4j.Logger;

import edu.cmu.minorthird.text.mixup.MixupInterpreter;
import edu.cmu.minorthird.text.mixup.MixupProgram;

/**
 * Description:
 * An echo-like server that labels popular entities using XML tags in an input text
 *
 * Instruction:
 *
 * VERY IMPORTANT:
 * 1) Make sure you comment out the following line:
 * require 'pos';
 * in "lib/mixup/np.mixup" because this server already does POS tagging for you!
 * (otherwise MontyLingua will be invoked more than once)
 *
 * 2) If you encounter 'rcwangName.mixup' not found error, copy apps/names/lib/rcwangName.mixup
 * and apps/names/lib/newnames.txt to lib/mixup, or make sure rcwangName.mixup and newnames.txt
 * are in your classpath
 *
 * 3) To use Minorthird's createXMLmarkup function instead of mine, uncomment the line:
 * tagged = labelsLoader.createXMLmarkup(tempFile.getName(), labels);
 * and comment out the line:
 * tagged = createXML(in, labelsLoader.saveTypesAsXML(labels));
 * in the "tag" function. To switch back, do the opposite.
 *
 * 4) After connecting to the server, the client can send in any text. The server will
 * buffer the incoming text until it detects a line that ends with TWO consecutive
 * dollar signs ($$). The server will then process the text in the buffer and echo back
 * the resulting text without disconnecting the client. However, if a line ends
 * with THREE consecutive dollar signs ($$$), then after the server echoes back the
 * resulting text, it will close the connection automatically.
 *
 * @author Richard Wang <rcwang@cmu.edu>
 */
public class MinorTagger extends StringAnnotator{

  /** Log4j logger used by the tagging methods of this class for debug output. */
  static Logger log=Logger.getLogger(MinorTagger.class);

  /**
   * MontyLingua part-of-speech tagger. The field is static, so a single reference
   * is shared by the server entry point, the static helpers and every instance.
   */
  private static JMontyLingua montyLingua;

  /** Mixup program file evaluated first when a text is tagged (resolved relative to the working directory). */
  private static final File NAMEMIXUP=new File("rcwangName.mixup");

  /** Mixup program file evaluated second when a text is tagged (resolved relative to the working directory). */
  private static final File DATEMIXUP=new File("date.mixup");

//  private static final boolean DEBUG = false;
  /** Port the server listens on when no port number is given as the first command-line argument. */
  private static final int DEFAULT_PORT=9998;

  /**
   * Explains an annotation made by this annotator.
   * No real explanation is available, so a fixed placeholder is always returned.
   *
   * @param labels the labels the question refers to (not consulted)
   * @param documentSpan the document span the question refers to (not consulted)
   * @return the constant string {@code "no idea"}
   */
  public String explainAnnotation(TextLabels labels,Span documentSpan){
    final String explanation="no idea";
    return explanation;
  }

  /**
   * Starts the tagging server: loads the part-of-speech tagger, binds a
   * {@link ServerSocket} and hands every accepted connection to its own
   * {@code MinorTaggerThread}.
   *
   * @param args optional; if the first argument consists only of digits and is a
   *             valid TCP port (0-65535) it is used as the listening port,
   *             otherwise {@code DEFAULT_PORT} is used
   */
  public static void main(String args[]){
    int port=DEFAULT_PORT;

    System.out.println("Starting MinorTagger v0.02b...");
    if(args.length>0&&args[0].matches("\\d+")){
      // "\\d+" also matches numbers that overflow int or exceed the TCP port
      // range; fall back to the default instead of dying with an exception.
      try{
        int requested=Integer.parseInt(args[0]);
        if(requested<=65535)
          port=requested;
        else
          System.out.println("WARN: Port "+args[0]+
              " is out of range (0-65535), using default port!");
      }catch(NumberFormatException e){
        System.out.println("WARN: Port "+args[0]+
            " is out of range (0-65535), using default port!");
      }
    }else{
      System.out
          .println("WARN: No listening port specified, using default port!");
      System.out
          .println("WARN: To specify, use the port number as the first argument.");
    }
    System.out.println("Loading Part-of-Speech Tagger...");
    montyLingua=new JMontyLingua();
    System.out.println("MinorTagger Started Successfully!");
    System.out.println("Waiting for connection on port "+port+"...");

    ServerSocket echoServer=null;
    try{
      echoServer=new ServerSocket(port);
      while(true){
        Socket clientSocket=echoServer.accept();
        System.out.println("["+(new Date())+"] Connected from "+
            clientSocket.getRemoteSocketAddress());
        (new MinorTaggerThread(clientSocket)).start();
      }
    }catch(IOException e){
      System.out.println(e);
    }finally{
      // release the listening port once the accept loop has failed
      if(echoServer!=null){
        try{
          echoServer.close();
        }catch(IOException e){
          System.out.println(e);
        }
      }
    }
  }

  /**
   * Creates an annotator. The part-of-speech tagger is held in a static field
   * shared by the whole class, so it is only loaded here when nothing has
   * loaded it yet; constructing further instances reuses the existing tagger
   * instead of loading it again.
   */
  public MinorTagger(){
    if(montyLingua==null)
      montyLingua=new JMontyLingua();
  }

  /**
   * Part-of-speech tags {@code text} and returns one character annotation per
   * tagger token that could be located in the text.
   *
   * The tagger output is read as whitespace-separated {@code word/POS} tokens.
   * Tokens are located left to right: each search starts where the previous
   * annotation ended. Tokens that carry no {@code /} separator, or whose word
   * cannot be found in the text, are skipped.
   *
   * @param text the plain text to annotate
   * @return the annotations in token order; empty if nothing could be located
   */
  protected CharAnnotation[] annotateString(String text){
    List<CharAnnotation> charAnnList=new ArrayList<CharAnnotation>();
    String tagged=montyLingua.tag_text(text);
    log.debug("Tagged: "+tagged);
    StringTokenizer tokeTagged=new StringTokenizer(tagged,"\n ",false);

    int index=0;
    while(tokeTagged.hasMoreTokens()){
      String strToken=tokeTagged.nextToken();
      int sep=strToken.lastIndexOf("/");
      if(sep<0){
        // not a word/POS pair; substring(0,-1) would throw
        log.debug("Skipping token without POS separator: "+strToken);
        continue;
      }
      String word=strToken.substring(0,sep);
      log.debug("word: "+word);
      String pos=strToken.substring(sep+1);
      log.debug("POS: "+pos);
      if(pos.endsWith("$"))
        pos=pos.replace('$','S');

      // Only the text that has not been annotated yet is relevant for the
      // prefix checks; the guard keeps substring() safe if index overshoots.
      String remaining=(index<text.length())?text.substring(index):"";
      String remainingTrimmed=remaining.trim();
      String remainingLower=remainingTrimmed.toLowerCase();
      // look for alternative word
      if(!remainingLower.startsWith(word.toLowerCase()))
        word=findAlternative(remaining,word);
      // ensure case are the same
      if(remainingLower.startsWith(word.toLowerCase()))
        word=remainingTrimmed.substring(0,word.length());

      CharAnnotation charAnn=makeCharAnnotation(text,word,pos,index,false);
      if(charAnn==null){
        // word not present at or after index; keep the position and move on
        log.debug("Could not locate word in text: "+word);
        continue;
      }
      charAnnList.add(charAnn);
      index=charAnn.getOffset()+charAnn.getLength();
    }
    return charAnnList.toArray(new CharAnnotation[charAnnList.size()]);
  }

  /**
   * Tags text with part-of-speech XML tags (plain text only).
   *
   * Anything that looks like an XML tag is first removed from {@code text}.
   * The tagger output is then read as whitespace-separated {@code word/POS}
   * tokens; for each token the first occurrence of the word in the remaining
   * text is wrapped as {@code <POS>word</POS>}, everything up to and including
   * that end tag is moved to the result, and processing continues with the
   * rest. A {@code $} in a POS name is written as {@code S}. Processing stops
   * at the first token whose word cannot be found; text after the last tagged
   * word is not included in the result.
   *
   * @param text the text to tag
   * @return the text with each located word wrapped in its POS tag
   */
  public static String POSTag(String text){
    StringBuilder xmlTagged=new StringBuilder();
    text=text.replaceAll("<[^<>]+>","");
    String tagged=montyLingua.tag_text(text);
    log.debug("Tagged: "+tagged);
    StringTokenizer tokeTagged=new StringTokenizer(tagged,"\n ",false);

    while(tokeTagged.hasMoreTokens()){
      String strToken=tokeTagged.nextToken();
      int sep=strToken.lastIndexOf("/");
      if(sep<0){
        // not a word/POS pair; substring(0,-1) would throw
        log.debug("Skipping token without POS separator: "+strToken);
        continue;
      }
      String word=strToken.substring(0,sep);
      log.debug("word: "+word);
      String pos=strToken.substring(sep+1);
      log.debug("POS: "+pos);
      if(pos.endsWith("$"))
        pos=pos.replace('$','S');

      String remainingTrimmed=text.trim();
      String remainingLower=remainingTrimmed.toLowerCase();
      // look for alternative word
      if(!remainingLower.startsWith(word.toLowerCase()))
        word=findAlternative(text,word);
      // ensure case are the same
      if(remainingLower.startsWith(word.toLowerCase()))
        word=remainingTrimmed.substring(0,word.length());

      String endTag="</"+pos+">";
      text=substFirst(text,word,"<"+pos+">"+word+endTag,false);
      log.debug("WorkingString: "+text);
      int endPointer=text.lastIndexOf(endTag);
      if(endPointer!=-1)
        endPointer+=endTag.length();
      log.debug("EndPointer: "+endPointer);
      if(endPointer==-1)
        break; // word was not found, so no tag was inserted
      xmlTagged.append(text.substring(0,endPointer));
      text=text.substring(endPointer);
    }
    return xmlTagged.toString();
  }

  /**
   * Locates {@code word} in {@code text}, searching from {@code startIndex}, and
   * wraps the hit in a {@link CharAnnotation}.
   *
   * {@code String.indexOf(String,int)} returns a position measured from the
   * start of the string, not from {@code startIndex}, so the hit position is
   * used directly as the annotation offset.
   *
   * @param text the full text being annotated
   * @param word the word to look for
   * @param pos the part-of-speech label to attach
   * @param startIndex position in {@code text} at which the search starts
   * @param caseSensitive whether the search distinguishes upper and lower case
   * @return the annotation (constructor arguments assumed to be offset, length,
   *         type - TODO confirm against CharAnnotation), or {@code null} if the
   *         word does not occur at or after {@code startIndex}
   */
  private static CharAnnotation makeCharAnnotation(String text,String word,
      String pos,int startIndex,boolean caseSensitive){
    int index;
    if(caseSensitive)
      index=text.indexOf(word,startIndex);
    else
      index=text.toLowerCase().indexOf(word.toLowerCase(),startIndex);
    if(index<0)
      return null;
    return new CharAnnotation(index,word.length(),pos);
  }

  /**
   * Replaces the first occurrence of {@code find} in {@code in} with
   * {@code newStr}.
   *
   * @param in the text to search
   * @param find the text to look for
   * @param newStr the text that takes the place of the first hit
   * @param caseSensitive when false, both strings are lower-cased before searching
   * @return the text with the first hit replaced, or {@code in} itself when
   *         there is no hit
   */
  private static String substFirst(String in,String find,String newStr,
      boolean caseSensitive){
    final int matchAt=caseSensitive
        ?in.indexOf(find)
        :in.toLowerCase().indexOf(find.toLowerCase());
    if(matchAt<0)
      return in;

    StringBuffer result=new StringBuffer();
    // everything in front of the hit, then the replacement
    result.append(in.substring(0,matchAt));
    result.append(newStr);
    // whatever follows the hit, if anything
    final int tailStart=matchAt+find.length();
    if(tailStart<in.length())
      result.append(in.substring(tailStart));
    return result.toString();
  }

  /**
   * Maps a word reported by the tagger back to the spelling that actually
   * appears at the start of the text (contractions and colloquial forms).
   *
   * The lookup is case-insensitive on both arguments and ignores leading and
   * trailing whitespace of {@code in}. For a word that has entries in the
   * table, the first entry whose prefix starts the text wins; if none does, the
   * word is returned unchanged. For a word without any entry, an error is
   * written to standard error and the word is returned unchanged.
   *
   * @param in the text that should start with the word
   * @param word the word as reported by the tagger
   * @return the surface form to search for in the text
   */
  private static String findAlternative(String in,String word){
    // Each row: { tagger word, prefix expected in the text, surface form to use }.
    // Row order matters: rows of the same word are tried top to bottom.
    final String[][] variants={
        {"do","dunno","du"},
        {"not","n't","n't"},
        {"not","'t","'t"},
        {"not","nno","n"},
        {"know","no","no"},
        {"your","ur","ur"},
        {"am","'m","'m"},
        {"are","'re","'re"},
        {"is","'s","'s"},
        {"would","'d","'d"},
        {"had","'d","'d"},
        {"will","'ll","'ll"},
        {"will","wo","wo"},
        {"come","c'm","c'm"},
        {"you","y'","y'"},
        {"you","ya","ya"},
        {"it","'t","'t"},
        {"-","/","/"},
        {"want","wan","wan"},
        {"going","gon","gon"},
        {"have","haf","haf"},
        {"have","'ve","'ve"},
        {"to","na","na"},
        {"to","ta","ta"},
        {"give","gim","gim"},
        {"let","lem","lem"}};

    final String textStart=in.trim().toLowerCase();
    final String canonical=word.toLowerCase();

    boolean knownWord=false;
    for(int row=0;row<variants.length;row++){
      if(!variants[row][0].equals(canonical))
        continue;
      knownWord=true;
      if(textStart.startsWith(variants[row][1]))
        return variants[row][2];
    }

    if(!knownWord){
      System.err.println("POS Tagging error!");
      System.err.println("Current word:"+word);
      System.err.println("Working string:"+in);
    }
    return word;
  }

  /**
   * Produces the XML-tagged version of {@code in}.
   *
   * Steps: POS-tag the text, write the result to a temp file and load it as a
   * text base, evaluate the name and date mixup programs on the resulting
   * labels, insert the saved label types into the original text as XML tags,
   * and finally strip every tag that is not in the keep list below.
   * Failures while loading or evaluating are reported on standard error and
   * processing continues with whatever labels are available.
   *
   * @param in the raw text received from a client
   * @return the tagged text, or the empty string if {@code in} contains only
   *         whitespace
   */
  private static String tag(String in){
    if(in.replaceAll("\\s+","").length()==0)
      return "";
    String tagged=POSTag(in);

    // load text base
    TextBaseLoader baseLoader=
        new TextBaseLoader(TextBaseLoader.DOC_PER_FILE,true);
    File tempFile=createFile(tagged);
    if(tempFile==null){
      System.err.println("No temp file available; POS-tagged text was not loaded");
    }else{
      try{
        baseLoader.load(tempFile);
      }catch(Exception e){
        System.err.println(e);
      }finally{
        // The server runs indefinitely, so waiting for deleteOnExit would let
        // one temp file per request pile up. Assumes load() has read the whole
        // file by the time it returns - TODO confirm against TextBaseLoader.
        if(!tempFile.delete())
          log.debug("Could not delete temp file "+tempFile);
      }
    }

    // get XML labels
    MutableTextLabels labels=baseLoader.getLabels();

    // evaluate mixup programs in order; a failure in one does not stop the next
    File[] mixups={NAMEMIXUP,DATEMIXUP};
    for(int i=0;i<mixups.length;i++){
      try{
        MixupProgram p=new MixupProgram(mixups[i]);
        (new MixupInterpreter(p)).eval(labels);
      }catch(Exception e){
        System.err.println(e);
      }
    }

    TextLabelsLoader labelsLoader=new TextLabelsLoader();

    // Minorthird's version of marking up XML labels
    // tagged = labelsLoader.createXMLmarkup(tempFile.getName(), labels);

    // My own version of marking up XML labels; doesn't check for overlapping spans
    tagged=createXML(in,labelsLoader.saveTypesAsXML(labels));

    String keepXML[]=
        {"CC","CD","DT","EX","FW","IN","JJ","JJR","JJS","LS","MD","NN","NNS",
            "NNP","NNPS","PDT","POS","PRP","PRPS","RB","RBR","RBS","RP","TO",
            "UH","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP","WPS","WRB","S",
            "NP","name","extracted_date","extracted_time"};
    tagged=filterXML(tagged,keepXML);

    // fix the problem so that end-tag doesn't stick to the next begin-tag.
    // tagged = tagged.replaceAll("(<[^<>]+>[^<>]+?)(\\s+)(</[^<>]+>)", "$1$3$2");

    return tagged;
  }

  /**
   * Removes every XML start and end tag whose name is not in {@code keepXML}.
   * Only the tags are removed; the text between them stays.
   *
   * The "name" of a tag is everything between the angle brackets (after an
   * optional leading slash), so a tag carrying attributes is compared, and
   * removed, as a whole. Names are quoted with {@link Pattern#quote(String)}
   * before being used in a regular expression, so arbitrary characters in a
   * tag (including the sequence backslash-E) cannot break the pattern.
   *
   * @param tagged the text containing XML tags
   * @param keepXML the tag names to keep
   * @return the text with all other tags removed
   */
  private static String filterXML(String tagged,String[] keepXML){
    Set<String> keep=new HashSet<String>();
    for(int i=0;i<keepXML.length;i++)
      keep.add(keepXML[i]);

    // collect the distinct tag names that have to go
    Set<String> delXML=new HashSet<String>();
    Matcher m=Pattern.compile("</?([^<>]+)>").matcher(tagged);
    while(m.find()){
      String name=m.group(1);
      if(!keep.contains(name))
        delXML.add(name);
    }

    for(Iterator<String> i=delXML.iterator();i.hasNext();){
      String del=i.next();
      tagged=tagged.replaceAll("</?"+Pattern.quote(del)+">","");
    }
    return tagged;
  }

  /**
   * Inserts XML tags into {@code in} at the character positions listed in
   * {@code XMLIndex}.
   *
   * Every index entry has the form {@code <type lo=N hi=M>}, preceded by a line
   * break and two spaces. Entries are ordered by ascending {@code lo} and, for
   * equal {@code lo}, by descending {@code hi}, so that an enclosing span opens
   * before and closes after the spans it contains. Overlapping spans are not
   * detected.
   *
   * @param in the text to mark up
   * @param XMLIndex the span index listing type, start and end of every span
   * @return the text with a start tag at each {@code lo} and an end tag at each
   *         {@code hi}
   */
  private static String createXML(String in,String XMLIndex){
    // one parsed index entry
    class Span{
      final String type;
      final int lo;
      final int hi;
      Span(String type,int lo,int hi){
        this.type=type;
        this.lo=lo;
        this.hi=hi;
      }
    }

    final int textLength=in.length();

    // 1. parse the index
    List<Span> spans=new ArrayList<Span>();
    Pattern entry=Pattern.compile("[\r\n]  <(\\S+) lo=(\\d+) hi=(\\d+)>");
    for(Matcher m=entry.matcher(XMLIndex);m.find();){
      int lo=Integer.parseInt(m.group(2));
      int hi=Integer.parseInt(m.group(3));
      spans.add(new Span(m.group(1),lo,hi));
    }

    // 2. outer spans first: lo ascending, then hi descending
    Collections.sort(spans,new Comparator<Span>(){
      public int compare(Span a,Span b){
        if(a.lo!=b.lo)
          return (a.lo<b.lo)?-1:1;
        if(a.hi!=b.hi)
          return (a.hi>b.hi)?-1:1;
        return 0;
      }
    });

    // 3. collect the tags per position: start tags are appended (outer first),
    //    end tags are prepended (inner first)
    Map<Integer,String> tagsAt=new HashMap<Integer,String>();
    for(Span span:spans){
      Integer openAt=Integer.valueOf(span.lo);
      String existing=tagsAt.get(openAt);
      tagsAt.put(openAt,(existing==null?"":existing)+"<"+span.type+">");

      Integer closeAt=Integer.valueOf(span.hi);
      existing=tagsAt.get(closeAt);
      tagsAt.put(closeAt,"</"+span.type+">"+(existing==null?"":existing));
    }

    // 4. stitch text segments and tags together from left to right
    List<Integer> positions=new ArrayList<Integer>(tagsAt.keySet());
    Collections.sort(positions);
    StringBuffer out=new StringBuffer();
    int copied=0;
    for(Integer position:positions){
      int at=position.intValue();
      out.append(in.substring(copied,at));
      out.append(tagsAt.get(position));
      copied=at;
    }
    out.append(in.substring(copied,textLength));
    return out.toString();
  }

  /**
   * Creates a temp file holding {@code content}.
   *
   * The file is registered for deletion at JVM exit. The writer is closed in a
   * {@code finally} block so that the file handle is released even when writing
   * fails. I/O errors are reported on standard error rather than thrown.
   *
   * @param content the text to write (platform default encoding)
   * @return the temp file, or {@code null} if it could not be created; if only
   *         the write failed, the file is still returned and may be incomplete
   */
  private static File createFile(String content){
    File temp=null;
    BufferedWriter bWriter=null;
    try{
      temp=File.createTempFile("tmp","");
      temp.deleteOnExit();
      bWriter=new BufferedWriter(new FileWriter(temp));
      bWriter.write(content);
    }catch(IOException ioe){
      System.err.println("Error creating temp file: "+ioe);
    }finally{
      if(bWriter!=null){
        try{
          bWriter.close(); // also flushes the buffered content
        }catch(IOException ioe){
          System.err.println("Error creating temp file: "+ioe);
        }
      }
    }
    return temp;
  }

  /**
   * Serves one client connection.
   *
   * Incoming lines are buffered until a line ends with {@code $$} (tag the
   * buffer, send the result, keep the connection) or {@code $$$} (tag the
   * buffer, send the result, close the connection). The marker itself is not
   * part of the tagged text. The streams and the socket are always closed when
   * the thread finishes, whether the client asked for it, simply disconnected,
   * or an error occurred.
   */
  private static class MinorTaggerThread extends Thread{

    private final Socket socket;

    /**
     * @param socket the accepted client connection this thread will serve
     */
    public MinorTaggerThread(Socket socket){
      super("MinorTaggerThread");
      this.socket=socket;
    }

    /** Reads, tags and echoes text until the client is done, then cleans up. */
    public void run(){
      StringBuffer buf=new StringBuffer();
      BufferedReader br=null;
      PrintStream os=null;
      String line;

      try{
        br=new BufferedReader(new InputStreamReader(socket.getInputStream()));
        os=new PrintStream(socket.getOutputStream());

        while((line=br.readLine())!=null){
          boolean end_close=false;
          boolean end_continue=false;
          // "$$$" must be tested first because it also ends with "$$"
          if(line.endsWith("$$$")){
            end_close=true;
            line=line.substring(0,line.length()-3);
          }else if(line.endsWith("$$")){
            end_continue=true;
            line=line.substring(0,line.length()-2);
          }
          buf.append(line).append("\n");
          if(end_close||end_continue){
            os.println(tag(buf.toString()));
            buf.setLength(0);
          }
          if(end_close)
            break;
        }
      }catch(Exception e){
        System.err.println(e);
      }finally{
        // Release the connection on every exit path; previously it was only
        // closed after "$$$" and leaked on disconnects and errors.
        if(os!=null)
          os.close();
        if(br!=null){
          try{
            br.close();
          }catch(IOException e){
            System.err.println(e);
          }
        }
        try{
          socket.close();
        }catch(IOException e){
          System.err.println(e);
        }
        System.out.println("["+(new Date())+"] Disconnected from "+
            socket.getRemoteSocketAddress());
      }
    }
  }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.