qmul.util.parse.CreateTreeFromClarkCurranCcgGrs.java Source code

Java tutorial

Introduction

Here is the source code for qmul.util.parse.CreateTreeFromClarkCurranCcgGrs.java

Source

/*******************************************************************************
 * Copyright (c) 2009, 2013, 2014 Matthew Purver, Queen Mary University of London.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 ******************************************************************************/
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package qmul.util.parse;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.util.Pair;

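/**
 * Create Stanford {@link Tree}s from Clark &amp; Curran CCG parser output for the BNC. Input is read from
 * <code>&lt;c&gt;</code> lines whose whitespace-separated chunks have six <code>|</code>-separated fields (word, stem,
 * POS tag, two further tags, CCG supertag); the supertags are combined bottom-up by a small shift-reduce procedure.
 * 
 * @author mpurver
 */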
public class CreateTreeFromClarkCurranCcgGrs {

    /** Factory used to create every leaf and internal node of the trees built by this class. */
    private static TreeFactory tf = new LabeledScoredTreeFactory();

    /**
     * Option table keyed by the integer constants below; created lazily and filled by {@link #setDefaultOptions()}.
     * NOTE(review): none of the parsing code visible in this file reads these flags - they are only stored and
     * returned by {@link #setOption(int, boolean)} / {@link #getOption(int)}.
     */
    private static HashMap<Integer, Boolean> options = null;
    /** Option key: set to true to remove all square-bracketed features (the only option that defaults to false). */
    public static final int INCLUDE_NO_SQUARE_BRACKET_SUBCATS = 0;
    /** Option key: set to true to remove punctuation nodes. */
    public static final int INCLUDE_NO_PUNCTUATION = 7;

    /** Option key: set to true to remove pauses. */
    public static final int INCLUDE_NO_PAUSE = 1;
    /** Option key: set to true to remove self-repair markers, i.e. [X + Y] becomes X Y. */
    public static final int INCLUDE_NO_SELFREPAIR_BRACKETS = 2;
    /**
     * Option key: set to true to remove INTJ items - backchannels, openings, closings, filled pauses, e.g. uh,
     * uh-huh, oh, yeah, okay, well, like, right, huh, really, no, sure, hi, adios, bye
     */
    public static final int INCLUDE_NO_INTJ = 3; // 
    /** Option key: set to true to remove unclear/indeterminate material. */
    public static final int INCLUDE_NO_UNCLEAR = 4;
    /** Option key: set to true to remove E_S end-of-S-unit markers. */
    public static final int INCLUDE_NO_E_S = 5;
    /** Option key: set to true to remove syntactic traces such as *T*-2 and 0. */
    public static final int INCLUDE_NO_TRACES = 6;
    /** Option key: set to true to replace a self-repair [ X + Y ] with Y. */
    public static final int REPAIR_SELFREPAIRS = 8;
    /** Option key: set to true to use the category (second item of FUNCTION,CAT) rather than the function. */
    public static final int CATEGORIES_NOT_FUNCTIONS = 9;

    /** NOTE(review): not referenced anywhere in the code visible in this file - confirm whether it is still needed. */
    private static final String IGNORE_MARKER = "ooo";

    /**
     * Reset the option table to its defaults, creating the table first if it does not exist yet. Every option
     * defaults to <code>true</code> except {@link #INCLUDE_NO_SQUARE_BRACKET_SUBCATS}, which defaults to
     * <code>false</code>.
     */
    public static void setDefaultOptions() {
        if (options != null) {
            // discard any values set previously
            options.clear();
        } else {
            options = new HashMap<Integer, Boolean>();
        }
        // the only default that is switched off: true would remove all bracketed features
        options.put(INCLUDE_NO_SQUARE_BRACKET_SUBCATS, false);
        // true: remove pauses
        options.put(INCLUDE_NO_PAUSE, true);
        // true: remove repair markers [X + Y] -> X Y
        options.put(INCLUDE_NO_SELFREPAIR_BRACKETS, true);
        // true: remove INTJs (filled pauses, backchannels, closings)
        options.put(INCLUDE_NO_INTJ, true);
        // true: remove indet,?
        options.put(INCLUDE_NO_UNCLEAR, true);
        // true: remove E_S end-S-unit markers
        options.put(INCLUDE_NO_E_S, true);
        // true: remove *T*-2, 0 syn traces
        options.put(INCLUDE_NO_TRACES, true);
        // true: remove punctuation nodes
        options.put(INCLUDE_NO_PUNCTUATION, true);
        // true: replace [ X + Y ] with Y
        options.put(REPAIR_SELFREPAIRS, true);
        // true: cats are second caps item: FUNCTION,CAT
        options.put(CATEGORIES_NOT_FUNCTIONS, true);
    }

    /**
     * Store a value for one option, initialising the option table with its defaults first if necessary.
     * 
     * @param option
     *            one of the option key constants of this class
     * @param value
     *            the new value for that option
     */
    public static void setOption(int option, boolean value) {
        if (options == null) {
            // setDefaultOptions() allocates the table itself before filling it
            setDefaultOptions();
        }
        options.put(Integer.valueOf(option), Boolean.valueOf(value));
    }

    /**
     * Look up the current value of one option, initialising the option table with its defaults first if necessary.
     * 
     * @param option
     *            one of the option key constants of this class
     * @return the value currently stored for that option
     * @throws IllegalArgumentException
     *             if no value is stored for <code>option</code> (previously this surfaced as a bare
     *             {@link NullPointerException} from auto-unboxing)
     */
    public static boolean getOption(int option) {
        if (options == null) {
            // setDefaultOptions() allocates the table itself before filling it
            setDefaultOptions();
        }
        Boolean value = options.get(option);
        if (value == null) {
            throw new IllegalArgumentException("unknown option " + option);
        }
        return value.booleanValue();
    }

    /**
     * Testing convenience: build a single tree from the hard-coded default file.
     * 
     * @return the Stanford {@link Tree}, or whatever {@link #makeTree(File)} returns for that file
     */
    public static Tree makeTree() {
        // alternative location used at some point: D://Chattool//Tree.txt
        File defaultFile = new File("C://Tree1.txt");
        return makeTree(defaultFile);
    }

    /**
     * Testing convenience: build all trees from the hard-coded default file.
     * 
     * @return the {@link List} of Stanford {@link Tree}s, or whatever {@link #makeTrees(File)} returns for that file
     */
    public static List<Tree> makeTrees() {
        File defaultFile = new File("C://Tree1.txt");
        return makeTrees(defaultFile);
    }

    /**
     * Build a single tree from parser output held in memory.
     * 
     * @param string
     *            a string representation of a CCG parse (one or more <code>&lt;c&gt;</code> lines)
     * @return the Stanford {@link Tree}
     */
    public static Tree makeTree(String string) {
        BufferedReader reader = new BufferedReader(new StringReader(string));
        return makeTree(reader);
    }

    /**
     * Build all trees from parser output held in memory.
     * 
     * @param string
     *            a string representation of CCG trees
     * @return the {@link List} of Stanford {@link Tree}s
     */
    public static List<Tree> makeTrees(String string) {
        BufferedReader reader = new BufferedReader(new StringReader(string));
        return makeTrees(reader);
    }

    /**
     * Build a single tree from the first sentence found in a file. The file is always closed again before this
     * method returns.
     * 
     * @param file
     *            a file containing a CCG tree
     * @return the Stanford {@link Tree}, or null if the file cannot be opened or contains no tree
     */
    public static Tree makeTree(File file) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        } catch (FileNotFoundException fnfe) {
            System.err.println("FileNotFoundException: " + fnfe.getMessage());
            return null;
        }
        try {
            return makeTree(reader);
        } finally {
            // release the file handle whether parsing succeeded or threw
            try {
                reader.close();
            } catch (IOException ioe) {
                System.err.println("IOException closing " + file + ": " + ioe.getMessage());
            }
        }
    }

    /**
     * Build the trees for every sentence found in a file. The file is always closed again before this method
     * returns.
     * 
     * @param file
     *            a file containing CCG trees
     * @return the {@link List} of Stanford {@link Tree}s, or null if the file cannot be opened
     */
    public static List<Tree> makeTrees(File file) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        } catch (FileNotFoundException fnfe) {
            System.err.println("FileNotFoundException: " + fnfe.getMessage());
            return null;
        }
        try {
            return makeTrees(reader);
        } finally {
            // release the file handle whether parsing succeeded or threw
            try {
                reader.close();
            } catch (IOException ioe) {
                System.err.println("IOException closing " + file + ": " + ioe.getMessage());
            }
        }
    }

    /**
     * Read trees from the reader one after another until {@link #makeTree(BufferedReader)} returns null.
     * 
     * @param reader
     *            a {@link BufferedReader}
     * @return the {@link List} of Stanford {@link Tree}s, in the order they were read (possibly empty)
     */
    public static List<Tree> makeTrees(BufferedReader reader) {
        ArrayList<Tree> result = new ArrayList<Tree>();
        // a null tree signals that the input is exhausted
        for (Tree next = makeTree(reader); next != null; next = makeTree(reader)) {
            result.add(next);
        }
        return result;
    }

    // looks like we're going to need a shift-reduce parser to parse the parses ...
    // The two lists below are parallel stacks (top = last element): treeStack holds the partial trees built so far,
    // tagStack the CCG category of each of them. Both are static and are cleared at the start of every
    // makeTree(BufferedReader) call, so they are shared by all callers of this class.
    private static List<Tree> treeStack = new ArrayList<Tree>();
    private static List<String> tagStack = new ArrayList<String>();

    /**
     * Read one sentence from the reader and build its tree. Lines are skipped until one starting with
     * <code>&lt;c&gt;</code> is found; each whitespace-separated chunk of such a line must have six
     * <code>|</code>-separated fields, of which the first (word) and last (CCG supertag) are used. Chunks whose
     * supertag consists only of punctuation characters are ignored. Every remaining word is shifted onto the stack
     * as a supertag node over a leaf and the stack is then reduced as far as possible. Reading stops at the first
     * empty line after at least one node has been built, or at end of input.
     * 
     * @param reader
     *            a {@link BufferedReader}
     * @return the Stanford {@link Tree}: null if no nodes were built, the single remaining node if the sentence
     *         reduced completely, otherwise a node labelled ROOT over all remaining partial trees
     * @throws IllegalArgumentException
     *             if a chunk does not have exactly six fields
     * @throws IllegalStateException
     *             if reading from the reader fails (wraps the {@link IOException}); this replaces the former
     *             <code>System.exit(0)</code>, which terminated the JVM with a success status
     */
    public static Tree makeTree(BufferedReader reader) {
        if (options == null) {
            setDefaultOptions();
        }

        treeStack.clear();
        tagStack.clear();
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                // empty line is the gap between sentences - stop and return what we've built so far
                if (line.isEmpty() && !treeStack.isEmpty()) {
                    break;
                }
                // wait for <c> line
                if (!line.startsWith("<c>")) {
                    continue;
                }
                for (String chunk : line.split("\\s+")) {
                    if (chunk.equals("<c>")) {
                        continue;
                    }
                    String[] fields = chunk.split("\\|");
                    if (fields.length != 6) {
                        throw new IllegalArgumentException("strange chunk " + chunk + " " + fields.length);
                    }
                    // fields 1-4 (stem, POS tag and two further tags) are not needed for tree building
                    String word = fields[0];
                    String supertag = fields[5];
                    // punctuation gets its own symbol as supertag - leave it out of the tree
                    if (supertag.matches("^[,.:;!?]+$")) {
                        continue;
                    }
                    // each word gets a lexical leaf node plus a non-terminal supertag node
                    Tree synNode = tf.newTreeNode(supertag, new ArrayList<Tree>());
                    synNode.addChild(tf.newLeaf(word));
                    shift(synNode, supertag);
                    reduce();
                }
            }
        } catch (IOException e) {
            // report the failure to the caller instead of silently killing the JVM
            throw new IllegalStateException("I/O error while reading CCG parse", e);
        }
        reduce();
        if (treeStack.isEmpty()) {
            System.out.println("No nodes, return null");
            return null;
        } else if (treeStack.size() == 1) {
            System.out.println("1 nodes, return " + treeStack.get(0));
            return treeStack.get(0);
        } else {
            // hand over a copy: treeStack itself is cleared and reused by the next call
            Tree tree = tf.newTreeNode("ROOT", new ArrayList<Tree>(treeStack));
            System.out.println(treeStack.size() + " nodes, return " + tree);
            return tree;
        }
    }

    /**
     * Push a partial tree and its category onto the two parallel stacks, then print the tag stack.
     * 
     * @param node
     *            the tree to push
     * @param tag
     *            the CCG category of that tree
     */
    private static void shift(Tree node, String tag) {
        tagStack.add(tag);
        treeStack.add(node);
        System.out.println("shift " + tagStack);
    }

    /**
     * Repeatedly combine the two topmost stack entries for as long as their categories can be combined, trying
     * {@link #matchRight(String, String)} first and {@link #matchLeft(String, String)} second. Each successful
     * combination replaces the two entries by one new node labelled with the resulting category. Stops as soon as
     * fewer than two entries remain or the top two do not combine.
     */
    private static void reduce() {
        while (true) {
            int top = tagStack.size() - 1;
            if (top < 1) {
                // fewer than two entries: nothing to combine
                return;
            }
            System.out.println("reduce " + tagStack);
            String leftTag = tagStack.get(top - 1);
            String rightTag = tagStack.get(top);
            String parentTag = matchRight(leftTag, rightTag);
            if (parentTag == null) {
                parentTag = matchLeft(leftTag, rightTag);
            }
            if (parentTag == null) {
                return;
            }
            // replace the two topmost categories by the combined one
            tagStack.remove(top);
            tagStack.remove(top - 1);
            tagStack.add(parentTag);
            // and do the same for the trees, keeping left-to-right child order
            Tree parent = tf.newTreeNode(parentTag, new ArrayList<Tree>());
            parent.addChild(treeStack.get(top - 1));
            parent.addChild(treeStack.get(top));
            treeStack.remove(top);
            treeStack.remove(top - 1);
            treeStack.add(parent);
        }
    }

    /**
     * Try to combine two adjacent categories with a forward rule.
     * 
     * @param tag1
     *            the category on the left
     * @param tag2
     *            the category on the right
     * @return the new parent tag: X if tag1 = X/Y and tag2 = Y; X/Z if tag1 = X/Y and tag2 = Y/Z; null if neither
     *         rule applies. In the composed result, X and Z are put back in brackets when they are themselves
     *         complex categories, so that the result can be split correctly by later reductions.
     */
    private static String matchRight(String tag1, String tag2) {
        Pair<String, String> halves1 = split(tag1, '/');
        if (halves1 == null) {
            return null;
        }
        System.out.println("MR " + halves1.first + " " + halves1.second);
        // standard forward functional application
        if (halves1.second().equals(tag2)) {
            return halves1.first();
        }
        // forward composition with B
        Pair<String, String> halves2 = split(tag2, '/');
        if (halves2 == null) {
            return null;
        }
        if (halves1.second().equals(halves2.first())) {
            // split() strips the outer brackets of each half; restore them for complex categories, otherwise
            // e.g. (S\NP)/(S\NP) + (S\NP)/NP would yield the ambiguous S\NP/NP instead of (S\NP)/NP
            String result = halves1.first();
            if (result.indexOf('/') >= 0 || result.indexOf('\\') >= 0) {
                result = "(" + result + ")";
            }
            String argument = halves2.second();
            if (argument.indexOf('/') >= 0 || argument.indexOf('\\') >= 0) {
                argument = "(" + argument + ")";
            }
            return result + "/" + argument;
        }
        return null;
    }

    /**
     * Try to combine two adjacent categories with a backward rule.
     * 
     * @param tag1
     *            the category on the left
     * @param tag2
     *            the category on the right
     * @return the new parent tag: X if tag1 = Y and tag2 = X\Y; X\Z if tag1 = Y\Z and tag2 = X\Y; null if neither
     *         rule applies. In the composed result, X and Z are put back in brackets when they are themselves
     *         complex categories, so that the result can be split correctly by later reductions.
     */
    private static String matchLeft(String tag1, String tag2) {
        Pair<String, String> halves2 = split(tag2, '\\');
        if (halves2 == null) {
            return null;
        }
        System.out.println("ML " + halves2.first + " " + halves2.second);
        // standard backward functional application
        if (tag1.equals(halves2.second())) {
            return halves2.first();
        }
        // backward composition with B
        Pair<String, String> halves1 = split(tag1, '\\');
        if (halves1 == null) {
            return null;
        }
        if (halves1.first().equals(halves2.second())) {
            // split() strips the outer brackets of each half; restore them for complex categories, otherwise
            // e.g. (S\NP)\NP + (S\NP)\(S\NP) would yield S\NP\NP, which later splits as S and NP\NP
            String result = halves2.first();
            if (result.indexOf('/') >= 0 || result.indexOf('\\') >= 0) {
                result = "(" + result + ")";
            }
            String argument = halves1.second();
            if (argument.indexOf('/') >= 0 || argument.indexOf('\\') >= 0) {
                argument = "(" + argument + ")";
            }
            return result + "\\" + argument;
        }
        return null;
    }

    /**
     * Split a category at the first occurrence of the separator that is not nested inside () brackets. One pair of
     * brackets enclosing a whole half is removed from that half.
     * 
     * @param str
     *            the category to split
     * @param separator
     *            the slash character to split at
     * @return the halves of str either side of separator, with grouping () brackets taken into consideration; or null
     *         if str does not contain separator outside brackets
     */
    private static Pair<String, String> split(String str, char separator) {
        int depth = 0;
        final int length = str.length();
        for (int pos = 0; pos < length; pos++) {
            char c = str.charAt(pos);
            if (c == '(') {
                depth++;
                continue;
            }
            if (c == ')') {
                depth--;
                continue;
            }
            if (depth != 0 || c != separator) {
                // nested inside brackets, or not the separator: keep scanning
                continue;
            }
            String before = str.substring(0, pos).replaceFirst("^\\((.*)\\)$", "$1");
            String after = str.substring(pos + 1).replaceFirst("^\\((.*)\\)$", "$1");
            return new Pair<String, String>(before, after);
        }
        return null;
    }

    /**
     * Manual test driver: print the first tree of a parse file, then report how many trees the file contains.
     * 
     * @param args
     *            optionally the path of the parse file as first argument; without it the original hard-coded path
     *            is used
     */
    public static void main(String[] args) {
        // Example of the expected input format:
        // "<c> Right|right|RB|I-ADVP|I-TIM|S/S ,|,|,|O|O|, hello|hello|UH|I-INTJ|O|S/S ,|,|,|O|O|, yeah|yeah|UH|I-INTJ|O|S/S ,|,|,|O|O|, we|we|PRP|I-NP|O|NP 're|'re|VBP|I-VP|O|(S[dcl]\\NP)/(S[adj]\\NP) back|back|RB|I-ADVP|O|S[adj]\\NP .|.|.|O|O|.";
        File test;
        if (args != null && args.length > 0) {
            test = new File(args[0]);
        } else {
            test = new File("c:/Documents and Settings/mpurver/My Documents/dyndial/align/parsedno_idnewclean_ks_ksw");
        }
        // makeTree returns null when the file is missing or holds no tree - do not dereference it blindly
        Tree t = makeTree(test);
        if (t == null) {
            System.err.println("no tree could be read from " + test);
        } else {
            t.indentedListPrint();
        }
        // makeTrees returns null when the file cannot be opened
        List<Tree> l = makeTrees(test);
        if (l == null) {
            System.err.println("no trees could be read from " + test);
            return;
        }
        System.out.println("found " + l.size() + " trees");
    }
}