qmul.corpus.SwitchboardCorpus.java Source code

Java tutorial

Introduction

Here is the source code for qmul.corpus.SwitchboardCorpus.java

Source

/*******************************************************************************
 * Copyright (c) 2013, 2014 Matthew Purver, Queen Mary University of London.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *     Matthew Purver, Queen Mary University of London - initial API and implementation
 ******************************************************************************/
package qmul.corpus;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import qmul.util.FilenameToolkit;
import qmul.util.parse.CreateTreeFromSWBD;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.util.Filter;

/**
 * A {@link DialogueCorpus} implementation for SWBD as in the Penn Treebank: 1,126 dialogues with parse trees
 * 
 * @author mpurver
 */
public class SwitchboardCorpus extends DialogueCorpus {

    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = 1556878031093741728L;

    /** Corpus identifier handed to the {@link DialogueCorpus} constructor. */
    private static final String ID = "SWBD";

    /** Base directory used by the constructors that do not take a base directory argument. */
    private static final String BASE_DIR = "/import/imc-corpora/corpora/kcl/ldc/treebank_3";
    /** Location of the data files, resolved against the base directory. */
    private static final String DATA_DIR = "parsed/mrg/swbd";
    /** Location of the metadata directory, resolved against the corpus directory returned by getDir(). */
    private static final String METADATA_DIR = "../"; // rel to DATA_DIR

    /** Genre label assigned to every dialogue read by this class. */
    private static final String DEFAULT_GENRE = "SWBD_DEFAULT";

    /**
     * Create a SWBD corpus using the default (unix) base directory {@link #BASE_DIR}.
     * 
     * Delegates to {@link #SwitchboardCorpus(String)}.
     */
    public SwitchboardCorpus() {
        this(BASE_DIR);
    }

    /**
     * Create a SWBD corpus whose corpus directory is {@link #DATA_DIR} resolved against the given base directory.
     * 
     * The corpus id {@link #ID} and that directory are passed to the superclass constructor.
     * 
     * @param baseDir
     *            override the default (unix) path with your own
     */
    public SwitchboardCorpus(String baseDir) {
        super(ID, new File(baseDir, DATA_DIR));
    }

    /**
     * Create a SWBD corpus using the default (unix) base directory {@link #BASE_DIR}, with filtering limits.
     * 
     * Delegates to {@link #SwitchboardCorpus(String, int, int, int, int)}.
     * 
     * @param minSpeakers
     *            discard any dialogue with fewer than this number of speakers (0 to allow all)
     * @param maxSpeakers
     *            discard any dialogue with more than this number of speakers (0 to allow all)
     * @param minGenreCount
     *            discard any dialogue whose genre appears in fewer than this number of dialogues (0 to allow all)
     * @param maxDialogues
     *            only read in at most this number of dialogues (0 to allow all)
     */
    public SwitchboardCorpus(int minSpeakers, int maxSpeakers, int minGenreCount, int maxDialogues) {
        this(BASE_DIR, minSpeakers, maxSpeakers, minGenreCount, maxDialogues);
    }

    /**
     * Create a SWBD corpus whose corpus directory is {@link #DATA_DIR} resolved against the given base directory, with
     * filtering limits.
     * 
     * The corpus id {@link #ID}, that directory and the four limits are passed unchanged to the superclass constructor,
     * which is where the limits are interpreted.
     * 
     * @param baseDir
     *            override the default (unix) path with your own
     * @param minSpeakers
     *            discard any dialogue with fewer than this number of speakers (0 to allow all)
     * @param maxSpeakers
     *            discard any dialogue with more than this number of speakers (0 to allow all)
     * @param minGenreCount
     *            discard any dialogue whose genre appears in fewer than this number of dialogues (0 to allow all)
     * @param maxDialogues
     *            only read in at most this number of dialogues (0 to allow all)
     */
    public SwitchboardCorpus(String baseDir, int minSpeakers, int maxSpeakers, int minGenreCount,
            int maxDialogues) {
        super(ID, new File(baseDir, DATA_DIR), minSpeakers, maxSpeakers, minGenreCount, maxDialogues);
    }

    /**
     * Check that the metadata directory is accessible and register the default genre.
     * 
     * The metadata directory is {@link #METADATA_DIR} resolved against the corpus directory. No per-dialogue speaker
     * or genre information is read here: the only genre registered is {@link #DEFAULT_GENRE}, whose count is set to
     * {@link Integer#MAX_VALUE}.
     * 
     * @throws RuntimeException
     *             if the metadata directory does not exist or cannot be read
     */
    private void getMetaData() {
        File dir = new File(getDir(), METADATA_DIR);
        boolean readable = dir.exists() && dir.canRead();
        if (!readable) {
            throw new RuntimeException("Error reading metadata dir " + dir);
        }
        getGenreCounts().put(DEFAULT_GENRE, Integer.MAX_VALUE);
    }

    /**
     * Read all corpus files found in the immediate subdirectories of the corpus directory.
     * 
     * Subdirectories, and the files within each of them, are sorted with
     * {@code FilenameToolkit.sortByFileNameIgnoreCase} before processing. Reading stops at the first file for which
     * {@code processFile} returns false. Entries of the corpus directory whose contents cannot be listed (for example
     * plain files) are skipped with a warning. If {@code sanityCheck()} fails afterwards, a stack trace is printed and
     * the JVM exits with a non-zero status.
     * 
     * @return true if no file failed, or if reading stopped only because a positive dialogue limit had been reached;
     *         false otherwise
     * @throws RuntimeException
     *             if the metadata directory or the corpus directory cannot be read
     * @see qmul.corpus.DialogueCorpus#setupCorpus()
     */
    @Override
    public boolean setupCorpus() {
        getMetaData();
        File[] subdirs = getDir().listFiles();
        if (subdirs == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs
            throw new RuntimeException("Error reading corpus dir " + getDir());
        }
        FilenameToolkit.sortByFileNameIgnoreCase(subdirs);
        boolean success = true;
        System.out.println("Limiting number of dialogues: " + getMaxDialogues());
        DIR: for (File subdir : subdirs) {
            File[] files = subdir.listFiles();
            if (files == null) {
                // a plain file (or unreadable directory) next to the data subdirectories: nothing to read
                System.out.println("WARNING: NOT processing non-directory entry " + subdir);
                continue;
            }
            FilenameToolkit.sortByFileNameIgnoreCase(files);
            System.out.println("Subdir " + subdir + ", found " + files.length + " corpus files ...");
            for (File file : files) {
                if (!processFile(file)) {
                    // failure below this may be due to hitting the dialogue limit; a limit of 0 means
                    // "allow all", in which case the limit cannot be the reason and this is a real failure
                    int limit = getMaxDialogues();
                    success = (limit > 0) && (numDialogues() >= limit);
                    break DIR;
                }
            }
        }
        if (!sanityCheck()) {
            new RuntimeException("Failed sanity check!").printStackTrace();
            // non-zero status so that calling scripts can detect the failure
            System.exit(1);
        }
        return success;
    }

    /** Matches file names ending in ".mrg" (case-insensitive); group 1 is the name without the extension. */
    private static final Pattern MRG_FILE_NAME = Pattern.compile("(?i)(.+)\\.mrg");

    /**
     * Read one corpus file and add its contents to the corpus.
     * 
     * Files whose name does not end in ".mrg" are skipped with a warning and count as success. For a matching file,
     * the dialogue name is the upper-cased file name without its extension, the genre is {@link #DEFAULT_GENRE}, and
     * the trees are handed to {@code getSentences}. The tree reader (and with it the underlying file stream) is closed
     * before returning.
     * 
     * @param file
     *            the candidate corpus file
     * @return whether to carry on or not: false if the file could not be opened or {@code getSentences} returned
     *         false, true otherwise
     */
    private boolean processFile(File file) {
        Matcher m = MRG_FILE_NAME.matcher(file.getName());
        if (!m.matches()) {
            System.out.println("WARNING: NOT processing non-matching corpus file " + file);
            return true;
        }
        String dialogueName = m.group(1).toUpperCase();
        // String genre = getGenreMap().get(dialogueName);
        String genre = DEFAULT_GENRE; // TODO genre information in SWBD?
        if (genre == null) {
            // cannot happen while the default genre is used; kept for when per-dialogue genres are looked up
            throw new RuntimeException("No metadata for dialogue " + dialogueName);
        }
        PennTreeReader reader;
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            reader = new PennTreeReader(br, new LabeledScoredTreeFactory());
        } catch (FileNotFoundException e) {
            System.err.println("Error reading SWBD corpus file " + file + ": " + e.getMessage());
            return false;
        }
        System.out.println("Reading SWBD corpus file " + file + " ...");
        try {
            return getSentences(dialogueName, genre, reader);
        } finally {
            // release the file handle whether or not reading succeeded
            try {
                reader.close();
            } catch (IOException e) {
                System.err.println("Error closing SWBD corpus file " + file + ": " + e.getMessage());
            }
        }
    }

    /**
     * Load a single dialogue from its ".mrg" file.
     * 
     * The file {@code name + ".mrg"} is first looked for directly in the corpus directory. If it is not there, the
     * immediate subdirectories of the corpus directory are searched for a file with that name, ignoring case. This
     * fallback mirrors {@link #setupCorpus()}, which reads its files from those subdirectories, and
     * {@code processFile}, which upper-cases dialogue names. If no file is found, the direct path is passed on
     * unchanged.
     * 
     * @param name
     *            the dialogue name, i.e. the corpus file name without the ".mrg" extension
     * @return the result of {@code processFile} for the located file
     * @see qmul.corpus.DialogueCorpus#loadDialogue(java.lang.String)
     */
    @Override
    public boolean loadDialogue(String name) {
        String fileName = name + ".mrg";
        File file = new File(getDir(), fileName);
        if (!file.exists()) {
            File found = findInSubdirs(fileName);
            if (found != null) {
                file = found;
            }
        }
        return processFile(file);
    }

    /**
     * Search the immediate subdirectories of the corpus directory for a file with the given name, ignoring case.
     * 
     * @param fileName
     *            the file name to look for
     * @return the first matching file, or null if there is none
     */
    private File findInSubdirs(String fileName) {
        File[] subdirs = getDir().listFiles();
        if (subdirs == null) {
            return null;
        }
        for (File subdir : subdirs) {
            File[] candidates = subdir.listFiles();
            if (candidates == null) {
                // not a directory, or not listable
                continue;
            }
            for (File candidate : candidates) {
                if (candidate.getName().equalsIgnoreCase(fileName)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Matches a speaker code such as "(CODE (SYM SpeakerA1" in the string form of a tree; group 1 is the speaker
     * letters, group 2 the turn number.
     */
    private static final Pattern SPEAKER_CODE = Pattern
            .compile("\\(CODE\\s+(?:\\([A-Z]+\\s+)?Speaker([A-Za-z]+)(\\d+)");

    /**
     * Read all trees from the reader and add them to the corpus as one dialogue.
     * 
     * A tree whose string form contains a speaker code sets the current turn number and speaker; the first such tree
     * also creates the dialogue. Every other tree is treated as a sentence: an unlabelled single-child root is
     * unwrapped, the tree is pruned with a {@link NodeFilter}, and, if anything is left, the space-separated leaf
     * labels become the transcription. A new turn is started whenever the speaker differs from the speaker of the
     * previous sentence. Sentence trees that appear before any speaker code are skipped with a warning, because there
     * is no dialogue or speaker to attach them to.
     * 
     * @param dialogueName
     *            name under which the dialogue is added
     * @param genre
     *            genre recorded for the dialogue
     * @param reader
     *            source of the trees; not closed by this method
     * @return whether to carry on or not: false if a dialogue check fails or reading throws an
     *         {@link IOException}, otherwise the result of {@code checkDialogue} for the dialogue
     */
    private boolean getSentences(String dialogueName, String genre, TreeReader reader) {
        try {
            Dialogue dialogue = null;
            DialogueSpeaker speaker = null;
            DialogueSpeaker lastSpeaker = null;
            DialogueTurn currentTurn = null;
            int currentSubdialogue = -1;
            int turnNum = -1;
            Filter<Tree> nodeFilter = new NodeFilter();
            for (Tree tree = reader.readTree(); tree != null; tree = reader.readTree()) {
                Matcher m = SPEAKER_CODE.matcher(tree.toString());
                if (m.find()) {
                    // get the metadata
                    turnNum = Integer.parseInt(m.group(2));
                    int subDialogue = 0; // apparently no subdialogues in SWBD ...
                    String spk = m.group(1).toUpperCase();
                    // start new dialogue if subdialogue changed
                    if (subDialogue != currentSubdialogue) {
                        if ((dialogue != null) && !checkDialogue(dialogue)) {
                            return false;
                        }
                        // dialogue = addDialogue(dialogueName + ":" + subDialogue, genre);
                        dialogue = addDialogue(dialogueName, genre);
                        // TODO genre in SWBD?
                        getGenreMap().put(dialogueName, genre);
                    }
                    currentSubdialogue = subDialogue;
                    // set up speaker
                    String spkId = dialogue.getId() + ":" + spk;
                    if (!getSpeakerMap().containsKey(spkId)) {
                        // TODO speaker info in SWBD?
                        getSpeakerMap().put(spkId, new DialogueSpeaker(spkId, "", "", "", "", ""));
                    }
                    speaker = getSpeakerMap().get(spkId);
                    continue;
                }
                if ((dialogue == null) || (speaker == null)) {
                    // a sentence before the first speaker code cannot be assigned to a turn
                    System.err.println("WARNING: skipping tree found before any speaker code in " + dialogueName);
                    continue;
                }
                // SWBD embeds trees within an extra unlabelled level ((S etc))
                Tree sentence = tree;
                if (((sentence.label() == null) || (sentence.label().value() == null))
                        && (sentence.children().length == 1)) {
                    sentence = sentence.getChild(0);
                }
                if (sentence == null) {
                    continue;
                }
                sentence = sentence.prune(nodeFilter);
                if (sentence == null) {
                    // nothing survived the filter
                    continue;
                }
                // transcription = leaf labels separated by single spaces
                StringBuilder trans = new StringBuilder();
                for (Tree leaf : sentence.getLeaves()) {
                    trans.append(leaf.label()).append(' ');
                }
                if (trans.length() > 0) {
                    trans.setLength(trans.length() - 1); // drop the trailing space
                }
                // start new turn if speaker has changed
                if ((lastSpeaker == null) || !speaker.equals(lastSpeaker) || (currentTurn == null)) {
                    currentTurn = dialogue.addTurn(turnNum, speaker);
                    lastSpeaker = speaker;
                }
                // add sentence
                dialogue.addSent(-1, currentTurn, trans.toString(), sentence);
            }
            return checkDialogue(dialogue);
        } catch (IOException e) {
            System.err.println("Error reading sentence line: " + e.getMessage());
            return false;
        }
    }

    /**
     * Return the fixed set of ten syntactic productions used for this corpus, each written as the parent category
     * followed by its child categories, separated by colons.
     * 
     * @return a new, modifiable set containing the ten productions
     * @see qmul.corpus.DialogueCorpus#topTenSynProductions()
     */
    @Override
    public HashSet<String> topTenSynProductions() {
        String[] productions = { "NP:PRP", "S:NP:VP", "INTJ:UH", "PP:IN:NP", "ADVP:RB", "NP:DT:NN", "VP:VB:NP",
                "VP:VB", "S:VP", "NP:NN" };
        HashSet<String> topTen = new HashSet<String>();
        for (String production : productions) {
            topTen.add(production);
        }
        return topTen;
    }

    /**
     * Build the corpus and write it to the file "swbd.corpus.gz" in the working directory (just for testing).
     * 
     * @param args
     *            optionally, your local corpus base dir as the first argument; if it is absent or equal to "dummy",
     *            the default base dir is used
     */
    public static void main(String[] args) {
        boolean customBaseDir = (args.length > 0) && !args[0].equals("dummy");
        SwitchboardCorpus corpus;
        if (!customBaseDir) {
            corpus = new SwitchboardCorpus();
        } else {
            System.out.println("Found arg, using non-default base dir " + args[0]);
            corpus = new SwitchboardCorpus(args[0]);
        }
        File target = new File("swbd.corpus.gz");
        corpus.writeToFile(target);
    }

    /**
     * For removing certain kinds of node based on options
     * 
     * @author mpurver
     */
    private class NodeFilter implements Filter<Tree> {

        private static final long serialVersionUID = -2118859608177695349L;

        /*
         * (non-Javadoc)
         * 
         * @see edu.stanford.nlp.util.Filter#accept(java.lang.Object)
         */
        @Override
        public boolean accept(Tree obj) {
            if (obj == null) {
                return true;
            }
            if (obj.label() == null) {
                return true;
            }
            if (obj.label().value() == null) {
                return true;
            }
            if (obj.label().value().matches("^(E|N)_S$") && getOption(CreateTreeFromSWBD.INCLUDE_NO_E_S)) {
                return false;
            }
            if (obj.label().value().equals("-NONE-") && getOption(CreateTreeFromSWBD.INCLUDE_NO_TRACES)) {
                return false;
            }
            if (obj.label().value().equals("0") && getOption(CreateTreeFromSWBD.INCLUDE_NO_TRACES)) {
                return false;
            }
            if (obj.label().value().matches("^\\*((T\\*)?-\\d+)?$")
                    && getOption(CreateTreeFromSWBD.INCLUDE_NO_TRACES)) {
                return false;
            }
            if (obj.label().value().matches("^[,.:?!;]$") && getOption(CreateTreeFromSWBD.INCLUDE_NO_PUNCTUATION)) {
                return false;
            }
            if (obj.label().value().matches("^\\\\[\\[\\+\\]]$")
                    && getOption(CreateTreeFromSWBD.INCLUDE_NO_SELFREPAIR_BRACKETS)) {
                return false;
            }
            if (!obj.isLeaf() && obj.label().value().matches("^INTJ$")
                    && getOption(CreateTreeFromSWBD.INCLUDE_NO_INTJ)) {
                return false;
            }
            if (!obj.isLeaf() && obj.label().value().matches("^EDITED$")
                    && getOption(CreateTreeFromSWBD.REPAIR_SELFREPAIRS)) {
                return false;
            }
            // naughty - modifying values when we're really just supposed to be filtering ...
            if (getOption(CreateTreeFromSWBD.SIMPLIFY_CATEGORIES) && !obj.isLeaf()) {
                // remove detailed cats NP=1, NP-TMP=2, NP-SBJ-1, ADVP-LOC etc etc
                if (obj.label().value().matches(".+(-|=).+")) {
                    obj.label().setValue(obj.label().value().replaceFirst("(-|=).+", ""));
                }
                // remove ^ marker for second half of compounds
                if (obj.label().value().matches("\\^.+")) {
                    obj.label().setValue(obj.label().value().replaceFirst("\\^(.+)", "$1"));
                }
                // any verb PoS tag -> VB
                if (obj.label().value().matches("VB(Z|G|D|P|N)")) {
                    obj.label().setValue(obj.label().value().replaceFirst("(VB)(Z|G|D|P|N)", "$1"));
                }
                // the BES tag for "'s" contracted verb "is" -> VB (other "'s" are separately tagged e.g. POS)
                if (obj.label().value().matches("BES")) {
                    obj.label().setValue(obj.label().value().replaceFirst("BES", "VB"));
                }
                // plural NN(P)S same as sing NN(P)
                if (obj.label().value().matches("NNP?(S)")) {
                    obj.label().setValue(obj.label().value().replaceFirst("(NNP?)(S)", "$1"));
                }
                // comp/sup adjectives/adverbs same as normal adj/adv
                if (obj.label().value().matches("(JJ|RB)(R|S)")) {
                    obj.label().setValue(obj.label().value().replaceFirst("(JJ|RB)(R|S)", "$1"));
                }
                // possessive pronouns same as normal pronouns
                if (obj.label().value().matches("PRP(\\$)")) {
                    obj.label().setValue(obj.label().value().replaceFirst("(PRP)(\\$)", "$1"));
                }
            }
            return true;
        }

        private boolean getOption(int o) {
            return CreateTreeFromSWBD.getOption(o);
        }

    }

}