ims.cs.parc.ParcUtils.java Source code

Introduction

Here is the source code for ims.cs.parc.ParcUtils.java
Source

/*
 * This file is part of QSample.
 * QSample is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * QSample is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with QSample.  If not, see <http://www.gnu.org/licenses/>.
 */

package ims.cs.parc;

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.GrammaticalRelation;
import ims.cs.corenlp.Helper;
import ims.cs.lingdata.ByteCount;
import ims.cs.lingdata.Document;
import ims.cs.lingdata.Sentence;
import ims.cs.lingdata.Token;
import edu.stanford.nlp.trees.Tree;
import org.jgrapht.alg.FloydWarshallShortestPaths;
import org.jgrapht.graph.SimpleDirectedGraph;

import java.util.Iterator;
import java.util.List;

/**
 * Collection of Utility functions
 */
public abstract class ParcUtils {

    /**
     * Find all head verbs in the corpus. The algorithm is taken from Pareti (2015).
     * @param sentence
     */
    public static void markHeadVerbs(Sentence sentence) {

        for (Tree tree : sentence.tree.preOrderNodeList()) {
            if (tree.label().value().equals("VP")) {
                boolean valid = true;
                for (Tree child : tree.children()) {
                    if (child.label().value().equals("VP")) {
                        valid = false;
                        break;
                    }
                }

                if (valid) {
                    for (Tree child : tree.children()) {
                        if (child.firstChild().isLeaf() && child.label().value().startsWith("V")) {
                            Token token = sentence.treeLookup.get(child.firstChild());
                            if (token != null)
                                token.isHeadVerb = true;
                        }
                    }
                }
            }
        }
    }

    /**
     * Annotates paragraph-continuing quotation marks. doParagraphAnnotation() needs to be called before this.
     * @param document
     */
    public static void markParagraphQuotes(Document document) {
        int quoteIndex = 1;

        for (Token token : document.tokenList) {
            if (Helper.isQuote(token)) {
                // ignore even quotes at paragraph begins
                if (token.paragraphBegins && quoteIndex % 2 == 0)
                    token.ignoreQuote = true;
                else
                    quoteIndex++;
            }
        }
    }

    /**
     * Annotates for each token whether it starts a paragraph by its raw text
     * @param document
     */
    public static void doParagraphAnnotation(Document document) {
        String documentText = document.text;
        Iterator<Token> tokenIter = document.tokenList.iterator();

        if (!tokenIter.hasNext()) {
            System.err.println("Skipping paragraph annotation empty document: " + document.docId);
            return;
        }

        Token token = tokenIter.next();
        ByteCount bc = token.goldByteCount;

        // iterate over all character positions in the text
        char prevC = 0;

        for (int i = 0; i < documentText.length(); i++) {
            if (i > bc.getEnd()) {
                if (!tokenIter.hasNext())
                    break; /* reached the last token */

                token = tokenIter.next();
                bc = token.goldByteCount;
            }

            char c = documentText.charAt(i);

            // two consecutive newlines indicate a paragraph
            if (prevC == '\n' && c == '\n') {
                token.paragraphBegins = true;
            }

            prevC = c;
        }
    }

    /**
     * Anonymizes certain named entities in the text
     * @param document
     */
    public static void anonymizeNamedEntities(Document document) {
        for (Token token : document.getTokenList()) {
            if (token.predNer.startsWith("ORGANIZATION") || token.predNer.startsWith("PERSON")) {
                String substText = "[NE]";
                token.originalPredText = token.predText;

                token.predLemma = substText;
                token.predText = substText;
                token.goldLemma = substText;
                token.goldText = substText;
            }
        }
    }

    /**
     * CoreNLP tries to predict opening and closing quotation marks.
     * This method maps the variation back to one symbol.
     * @param document
     */
    public static void sanitizeQuotationMarks(Document document) {
        for (Token token : document.getTokenList()) {
            // double quotes
            if (token.predLemma.equals("``") || token.predLemma.equals("\"") || token.predLemma.equals("''")) {
                token.predLemma = "\"";
                token.predPosTag = "\"";
                token.predText = "\"";
                token.goldPosTag = "\"";
                token.goldLemma = "\"";
                token.goldText = "\"";
            }

            // single quotes
            if (token.predLemma.equals("`") || token.predLemma.equals("''")) {
                token.predLemma = "'";
                token.predPosTag = "'";
                token.predText = "'";
                token.goldLemma = "'";
                token.goldPosTag = "'";
                token.goldText = "'";
            }

        }
    }

    /**
     * The FW implementation needs distinct objects as edges, which this class accomplishes.
     * CoreNLP seems to optimize storage by caching strings, so different edges have identical label strings.
     */
    public static class IndexedEdge {
        public GrammaticalRelation rel;
        public int index;

        public IndexedEdge(GrammaticalRelation rel, int index) {
            this.rel = rel;
            this.index = index;
        }
    }

    /**
     * Compute cached dependency paths using Floyd Warshall
     * @param dependencies
     * @return
     */
    public static FloydWarshallShortestPaths computeFloydWarshallSGE(List<SemanticGraphEdge> dependencies) {
        SimpleDirectedGraph<IndexedWord, IndexedEdge> graph = new SimpleDirectedGraph<IndexedWord, IndexedEdge>(
                IndexedEdge.class);
        int edgeId = 0;
        for (SemanticGraphEdge dep : dependencies) {
            graph.addVertex(dep.getGovernor());
            graph.addVertex(dep.getDependent());
            graph.addEdge(dep.getGovernor(), dep.getDependent(), new IndexedEdge(dep.getRelation(), edgeId));
        }
        return new FloydWarshallShortestPaths(graph);
    }

}