edu.cmu.ark.nlp.question.QuestionUtil.java Source code

Introduction

Here is the source code for edu.cmu.ark.nlp.question.QuestionUtil.java
Source

// Question Generation via Overgenerating Transformations and Ranking
// Copyright (c) 2010 Carnegie Mellon University.  All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Michael Heilman
//     Carnegie Mellon University
//     mheilman@cmu.edu
//     http://www.cs.cmu.edu/~mheilman

package edu.cmu.ark.nlp.question;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.commons.lang3.StringUtils;

import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.data.POS;
import net.didion.jwnl.dictionary.Dictionary;
import edu.cmu.ark.nlp.parse.ParseResult;
import edu.cmu.ark.nlp.parse.TregexPatternFactory;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.parser.lexparser.*;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.*;
import edu.stanford.nlp.trees.tregex.tsurgeon.*;
import edu.stanford.nlp.util.Pair;

public class QuestionUtil {

    //private LexicalizedParser parser;
    //private static QuestionUtil instance;
    //private VerbConjugator conjugator;

    private static LabeledScoredTreeFactory tree_factory = null;
    private PennTreebankLanguagePack tlp;

    private QuestionUtil(Properties props) {
        //parser = null;

        //headfinder = new CollinsHeadFinder();
        tlp = new PennTreebankLanguagePack();
    }

    public static LabeledScoredTreeFactory getTreeFactory() {
        if (tree_factory == null) {
            tree_factory = new LabeledScoredTreeFactory();
        }
        return tree_factory;
    }

    public static void addPeriodIfNeeded(Tree input) {
        String tregexOpStr = "ROOT < (S=mainclause !< /\\./)";
        TregexPattern matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        TregexMatcher matcher = matchPattern.matcher(input);

        if (matcher.find()) {
            TsurgeonPattern p;
            List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
            List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();

            ps.add(Tsurgeon.parseOperation("insert (. .) >-1 mainclause"));
            p = Tsurgeon.collectOperations(ps);
            ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
            Tsurgeon.processPatternsOnTree(ops, input);
        }
    }

    public static int getNumberOfMatchesInTree(String tregexExpression, Tree t) {
        int res = 0;
        TregexMatcher m = TregexPatternFactory.getPattern(tregexExpression).matcher(t);
        while (m.find()) {
            res++;
        }
        return res;
    }

    public static List<String> getSentences(String document) {
        List<String> res = new ArrayList<String>();
        StringReader reader = new StringReader(preprocess(document));

        DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(reader);

        for (List<HasWord> sentence : docPreprocessor) {
            res.add(SentenceUtils.listToString(sentence, true));
        }

        return res;
    }

    public static String abbrevTree(Tree tree) {
        ArrayList<String> toks = new ArrayList<String>();
        for (Tree L : tree.getLeaves()) {
            toks.add(L.label().toString());
        }
        return tree.label().toString() + "[" + StringUtils.join(toks, " ") + "]";
    }

    public static void downcaseFirstToken(Tree inputTree) {
        Tree firstWordTree = inputTree.getLeaves().get(0);
        if (firstWordTree == null)
            return;
        Tree preterm = firstWordTree.parent(inputTree);
        String firstWord = firstWordTree.yield().toString();
        if (!preterm.label().toString().matches("^NNP.*") && !firstWord.equals("I")) {
            //if(firstWord.indexOf('-') == -1 && !firstWord.equals("I")){
            firstWord = firstWord.substring(0, 1).toLowerCase() + firstWord.substring(1);
            firstWordTree.label().setValue(firstWord);
        }

        //if(QuestionTransducer.DEBUG) System.err.println("downcaseFirstToken: "+inputTree.toString());
    }

    public static void upcaseFirstToken(Tree inputTree) {
        Tree firstWordTree = inputTree.getLeaves().get(0);
        if (firstWordTree == null)
            return;

        String firstWord = firstWordTree.yield().toString();
        firstWord = firstWord.substring(0, 1).toUpperCase() + firstWord.substring(1);
        firstWordTree.label().setValue(firstWord);

        //if(QuestionTransducer.DEBUG) System.err.println("upcaseFirstToken: "+inputTree.toString());
    }

    public static String preprocess(String sentence) {
        //remove trailing whitespace   
        sentence = sentence.trim();

        //remove single words in parentheses.  
        //the stanford parser api messed up on these
        //by removing the parentheses but not the word in them
        sentence = sentence.replaceAll("\\(\\S*\\)", "");
        sentence = sentence.replaceAll("\\(\\s*\\)", "");

        //some common unicode characters that the tokenizer throws out otherwise
        sentence = sentence.replaceAll("", "--");
        sentence = sentence.replaceAll("", "'");
        sentence = sentence.replaceAll("?", "\"");
        sentence = sentence.replaceAll("", "\"");
        sentence = sentence.replaceAll("|||", "e");
        sentence = sentence.replaceAll("|||", "E");
        sentence = sentence.replaceAll("|||", "i");
        sentence = sentence.replaceAll("|?||?", "I");
        sentence = sentence.replaceAll("||||||", "a");
        sentence = sentence.replaceAll("|?|||||", "A");
        sentence = sentence.replaceAll("||||", "o");
        sentence = sentence.replaceAll("||||", "O");
        sentence = sentence.replaceAll("|||", "u");
        sentence = sentence.replaceAll("|||", "U");
        sentence = sentence.replaceAll("", "n");

        //contractions
        sentence = sentence.replaceAll("can't", "can not");
        sentence = sentence.replaceAll("won't", "will not");
        sentence = sentence.replaceAll("n't", " not"); //aren't shouldn't don't isn't
        sentence = sentence.replaceAll("are n't", "are not");

        //simply remove other unicode characters
        //if not, the tokenizer replaces them with spaces, 
        //which wreaks havoc on the final parse sometimes
        for (int i = 0; i < sentence.length(); i++) {
            if (sentence.charAt(i) > 'z') {
                sentence = sentence.substring(0, i) + sentence.substring(i + 1);
            }
        }

        //add punctuation to the end if necessary
        /*Matcher matcher = Pattern.compile(".*\\.['\"\n ]*$", Pattern.DOTALL).matcher(sentence);
        if(!matcher.matches()){
           sentence += ".";
        }*/

        return sentence;
    }

    public static String preprocessTreeString(String sentence) {
        sentence = sentence.replaceAll(" n't", " not");
        sentence = sentence.replaceAll("\\(MD ca\\)", "(MD can)");
        sentence = sentence.replaceAll("\\(MD wo\\)", "(MD will)");
        sentence = sentence.replaceAll("\\(MD 'd\\)", "(MD would)");
        sentence = sentence.replaceAll("\\(VBD 'd\\)", "(VBD had)");
        sentence = sentence.replaceAll("\\(VBZ 's\\)", "(VBZ is)");
        sentence = sentence.replaceAll("\\(VBZ 's\\)", "(VBZ is)");
        sentence = sentence.replaceAll("\\(VBZ 's\\)", "(VBZ is)");
        sentence = sentence.replaceAll("\\(VBP 're\\)", "(VBP are)");

        return sentence;
    }

    /*
    public VerbConjugator getConjugator(){
       return conjugator;
    }
    */

    /*
    public CollinsHeadFinder getHeadFinder(){
       return headfinder;
    }
    */

    /*
    public static QuestionUtil getInstance(){
       if(instance == null){
     instance = new QuestionUtil();
       }
       return instance;
    }
    */

    /*
    public ParseResult parseSentence(String sentence) {
       String result = "";
       //System.err.println(sentence);
       //see if a parser socket server is available
    int port = new Integer(GlobalProperties.getProperties().getProperty("parserServerPort","5556"));
    String host = "127.0.0.1";
    Socket client;
    PrintWriter pw;
    BufferedReader br;
    String line;
    Tree parse = null;
    double parseScore = Double.MIN_VALUE;
        
       try{
     client = new Socket(host, port);
        
     pw = new PrintWriter(client.getOutputStream());
     br = new BufferedReader(new InputStreamReader(client.getInputStream()));
     pw.println(sentence);
     pw.flush(); //flush to complete the transmission
        
        while((line = br.readLine())!= null){
            //if(!line.matches(".*\\S.*")){
            //        System.out.println();
            //}
            if(br.ready()){
               line = line.replaceAll("\n", "");
                line = line.replaceAll("\\s+", " ");
               result += line + " ";
            }else{
               parseScore = new Double(line);
            }
        }
        
     br.close();
     pw.close();
     client.close();
         
     if(parse == null){
        parse = readTreeFromString("(ROOT (. .))");
        parseScore = -99999.0;
     }
           
     if(GlobalProperties.getDebug()) System.err.println("result (parse):"+result);
     parse = readTreeFromString(result);
     return new ParseResult(true, parse, parseScore);
         
       } catch (Exception ex) {
     if(GlobalProperties.getDebug()) System.err.println("Could not connect to parser server.");
     //ex.printStackTrace();
       }
        
       System.err.println("parsing:"+sentence);
           
       //if socket server not available, then use a local parser object
       if(parser == null){
     try {
        Options op = new Options();
        String serializedInputFileOrUrl = GlobalProperties.getProperties().getProperty("parserGrammarFile", "config"+File.separator+"englishFactored.ser.gz");
        parser = LexicalizedParser.getParserFromSerializedFile(serializedInputFileOrUrl);
        parser.setOptionFlags("-maxLength", GlobalProperties.getProperties().getProperty("parserMaxLength", "40"));
        parser.setOptionFlags("-outputFormat", "oneline");
     } catch (Exception e){
        e.printStackTrace();
     }
       }
        
       try{
     parse = parser.parse(sentence);
     parseScore = parse.score();
     StringWriter sb = new StringWriter();
     pw = new PrintWriter(sb);
     parse.pennPrint(pw);
     pw.flush();
     return new ParseResult(true, parse, parseScore);
       }catch(Exception e){
       }
        
       parse = readTreeFromString("(ROOT (. .))");
    parseScore = -99999.0;
    return new ParseResult(false, parse, parseScore);
    }
     */

    public static String getLemma(String word, String pos) {
        if (!(pos.startsWith("N") || pos.startsWith("V") || pos.startsWith("J") || pos.startsWith("R"))
                || pos.startsWith("NNP")) {
            return word.toLowerCase();
        }

        String res = word.toLowerCase();

        if (res.equals("is") || res.equals("are") || res.equals("were") || res.equals("was")) {
            res = "be";
        } else {
            try {
                IndexWord iw;
                if (pos.startsWith("V"))
                    iw = Dictionary.getInstance().getMorphologicalProcessor().lookupBaseForm(POS.VERB, res);
                else if (pos.startsWith("N"))
                    iw = Dictionary.getInstance().getMorphologicalProcessor().lookupBaseForm(POS.NOUN, res);
                else if (pos.startsWith("J"))
                    iw = Dictionary.getInstance().getMorphologicalProcessor().lookupBaseForm(POS.ADJECTIVE, res);
                else
                    iw = Dictionary.getInstance().getMorphologicalProcessor().lookupBaseForm(POS.ADVERB, res);

                if (iw == null)
                    return res;
                res = iw.getLemma();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        return res;
    }

    /**
     * Remove traces and non-terminal decorations (e.g., "-SUBJ" in "NP-SUBJ") from a Penn Treebank-style tree.
     * 
     * @param inputTree
     */
    public void normalizeTree(Tree inputTree) {
        inputTree.label().setFromString("ROOT");

        List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
        List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
        String tregexOpStr;
        TregexPattern matchPattern;
        TsurgeonPattern p;
        TregexMatcher matcher;

        tregexOpStr = "/\\-NONE\\-/=emptynode";
        matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        matcher = matchPattern.matcher(inputTree);
        ps.add(Tsurgeon.parseOperation("prune emptynode"));
        matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        p = Tsurgeon.collectOperations(ps);
        ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
        Tsurgeon.processPatternsOnTree(ops, inputTree);

        Label nonterminalLabel;

        tregexOpStr = "/.+\\-.+/=nonterminal < __";
        matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        matcher = matchPattern.matcher(inputTree);
        while (matcher.find()) {
            nonterminalLabel = matcher.getNode("nonterminal");
            if (nonterminalLabel == null)
                continue;
            nonterminalLabel.setFromString(tlp.basicCategory(nonterminalLabel.value()));
        }

    }

    /**
     * remove extra quotation marks
     * (a hack due to annoying PTB conventions by which quote marks aren't in the same consituent)
     *  
     * @param input
     */
    public static void removeExtraQuotes(Tree input) {
        List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
        String tregexOpStr;
        TregexPattern matchPattern;
        TsurgeonPattern p;
        List<TsurgeonPattern> ps;

        ps = new ArrayList<TsurgeonPattern>();
        tregexOpStr = "ROOT [ << (``=quote < `` !.. ('' < '')) | << (''=quote < '' !,, (`` < ``)) ] ";
        matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
        ps.add(Tsurgeon.parseOperation("prune quote"));
        p = Tsurgeon.collectOperations(ps);
        ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
        Tsurgeon.processPatternsOnTree(ops, input);

    }

    public static String getCleanedUpYield(Tree inputTree) {
        Tree copyTree = inputTree.deepCopy();

        //if(GlobalProperties.getDebug()) System.err.println("yield:"+copyTree.toString());

        return cleanUpSentenceString(copyTree.yield().toString());
    }

    public static String cleanUpSentenceString(String s) {
        String res = s;
        //if(res.length() > 1){
        //   res = res.substring(0,1).toUpperCase() + res.substring(1);
        //}

        res = res.replaceAll("\\s([\\.,!\\?\\-;:])", "$1");
        res = res.replaceAll("(\\$)\\s", "$1");
        res = res.replaceAll("can not", "cannot");
        res = res.replaceAll("\\s*-LRB-\\s*", " (");
        res = res.replaceAll("\\s*-RRB-\\s*", ") ");
        res = res.replaceAll("\\s*([\\.,?!])\\s*", "$1 ");
        res = res.replaceAll("\\s+''", "''");
        //res = res.replaceAll("\"", "");
        res = res.replaceAll("``\\s+", "``");
        res = res.replaceAll("\\-[LR]CB\\-", ""); //brackets, e.g., [sic]
        res = res.replaceAll("\\. \\?", ".?");
        res = res.replaceAll(" 's(\\W)", "'s$1");
        res = res.replaceAll("(\\d,) (\\d)", "$1$2"); //e.g., "5, 000, 000" -> "5,000,000"
        res = res.replaceAll("``''", "");

        //remove extra spaces
        res = res.replaceAll("\\s\\s+", " ");
        res = res.trim();

        return res;
    }

    public static boolean cCommands(Tree root, Tree n1, Tree n2) {
        if (n1.dominates(n2))
            return false;

        Tree n1Parent = n1.parent(root);
        while (n1Parent != null && n1Parent.numChildren() == 1) {
            n1Parent = n1Parent.parent(root);
        }

        if (n1Parent != null && n1Parent.dominates(n2))
            return true;

        return false;
    }

    public static Tree readTreeFromString(String parseStr) {
        //read in the input into a Tree data structure
        TreeReader treeReader = new PennTreeReader(new StringReader(parseStr), QuestionUtil.getTreeFactory());
        Tree inputTree = null;
        try {
            inputTree = treeReader.readTree();

        } catch (IOException e) {
            e.printStackTrace();
        }
        return inputTree;
    }

    public static boolean filterOutSentenceByPunctuation(String sentence) {
        //return (sentence.indexOf("\"") != -1 
        //|| sentence.indexOf("''") != -1 
        //|| sentence.indexOf("``") != -1
        //|| sentence.indexOf("*") != -1);
        if (sentence.indexOf("*") != -1) {
            return true;
        }

        //if(sentence.matches("[^\\w\\-\\/\\?\\.,;:\\$\\#\\&\\(\\) ]")){
        //   return true;
        //}

        return false;
    }

    /*
    public String getSurfaceForm(String lemma, String pos) {
       return conjugator.getSurfaceForm(lemma, pos);
    }
    */
}