ConstituencyParse.java Source code

Java tutorial

Introduction

Below is the complete source code for ConstituencyParse.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.HashMap;
import java.util.Properties;
import java.util.Scanner;

/**
 * Reads one sentence per line from stdin, parses each with the Stanford
 * lexicalized parser, and writes a parent-pointer representation of each tree
 * (one line of space-separated integers, 1-indexed, 0 = root) to a file.
 * Optionally also writes the tokenized sentences to a second file.
 *
 * <p>Usage: {@code java ConstituencyParse [-deps] [-tokenize] [-tokpath <p>] -parentpath <p>}
 */
public class ConstituencyParse {

    // If true, input lines are run through the PTB tokenizer; otherwise they
    // are assumed to be pre-tokenized and are split on single spaces.
    private boolean tokenize;
    private BufferedWriter tokWriter, parentWriter;
    private LexicalizedParser parser;
    private TreeBinarizer binarizer;
    private CollapseUnaryTransformer transformer;
    private GrammaticalStructureFactory gsf;

    // Resolved from the Stanford models jar on the classpath.
    private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

    /**
     * Sets up the parsing pipeline and opens the output files.
     *
     * @param tokPath    file for tokenized sentences, or {@code null} to skip token output
     * @param parentPath file for parent-pointer trees (required)
     * @param tokenize   whether input lines need PTB tokenization
     * @throws IOException if either output file cannot be opened
     */
    public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException {
        this.tokenize = tokenize;
        if (tokPath != null) {
            tokWriter = new BufferedWriter(new FileWriter(tokPath));
        }
        parentWriter = new BufferedWriter(new FileWriter(parentPath));
        parser = LexicalizedParser.loadModel(PCFG_PATH);
        binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
                parser.treebankLanguagePack());
        transformer = new CollapseUnaryTransformer();

        // set up to produce dependency representations from constituency trees
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        gsf = tlp.grammaticalStructureFactory();
    }

    /**
     * Converts an input line to a token list, either via the PTB tokenizer or
     * by splitting on spaces (see {@link #tokenize}).
     *
     * @param line one raw input sentence
     * @return the sentence as a list of tokens (empty for an empty line when tokenizing)
     */
    public List<HasWord> sentenceToTokens(String line) {
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            // parameterized type + plain while loop (the original declared an
            // unused loop variable and used a raw PTBTokenizer)
            PTBTokenizer<Word> tokenizer =
                    new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }

        return tokens;
    }

    /**
     * Runs the constituency parser over a token list.
     *
     * @param tokens the sentence tokens
     * @return the best parse tree
     */
    public Tree parse(List<HasWord> tokens) {
        return parser.apply(tokens);
    }

    /**
     * Converts a constituency tree to a parent-pointer array over the
     * binarized, unary-collapsed tree. Entry i holds the 1-based index of node
     * i's parent; 0 marks the root. Leaves occupy indices 0..numLeaves-1,
     * internal nodes are numbered on first visit starting at numLeaves.
     *
     * @param tree a constituency parse from {@link #parse}
     * @return parent indices for every non-leaf-token node of the binarized tree
     */
    public int[] constTreeParents(Tree tree) {
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        List<Tree> leaves = collapsedUnary.getLeaves();
        // one slot per preterminal/internal node (tree size minus leaf tokens)
        int size = collapsedUnary.size() - leaves.size();
        int[] parents = new int[size];
        // maps a node's tree-wide nodeNumber to its assigned slot in `parents`
        HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

        int idx = leaves.size();
        int leafIdx = 0;
        for (Tree leaf : leaves) {
            Tree cur = leaf.parent(collapsedUnary); // go to preterminal
            int curIdx = leafIdx++;
            boolean done = false;
            while (!done) {
                Tree parent = cur.parent(collapsedUnary);
                if (parent == null) {
                    // reached the root: mark it with parent pointer 0
                    parents[curIdx] = 0;
                    break;
                }

                int parentIdx;
                int parentNumber = parent.nodeNumber(collapsedUnary);
                if (!index.containsKey(parentNumber)) {
                    // first time we see this ancestor: assign it the next slot
                    parentIdx = idx++;
                    index.put(parentNumber, parentIdx);
                } else {
                    // ancestor already numbered by an earlier leaf's walk; the
                    // rest of the path to the root is already recorded
                    parentIdx = index.get(parentNumber);
                    done = true;
                }

                parents[curIdx] = parentIdx + 1;
                cur = parent;
                curIdx = parentIdx;
            }
        }

        return parents;
    }

    /**
     * Converts a constituency parse to a dependency representation and returns
     * the parent-pointer array (1-based governor index per token; 0 = root,
     * -1 = token never appeared as a dependent).
     *
     * @param tree   a constituency parse from {@link #parse}
     * @param tokens the sentence tokens the tree was built from
     * @return one parent index per token
     */
    public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> tdl = gs.typedDependencies();
        int len = tokens.size();
        int[] parents = new int[len];
        for (int i = 0; i < len; i++) {
            // if a node has a parent of -1 at the end of parsing, then the node
            // has no parent.
            parents[i] = -1;
        }

        for (TypedDependency td : tdl) {
            // TypedDependency indices are 1-based; the root governor has index 0
            int child = td.dep().index();
            int parent = td.gov().index();
            parents[child - 1] = parent;
        }

        return parents;
    }

    /**
     * Writes the space-joined tokens as one line to the token file.
     * Safe on an empty token list (writes a blank line); the original threw
     * {@code IndexOutOfBoundsException} on empty input.
     *
     * @param tokens the sentence tokens
     * @throws IOException on write failure
     */
    public void printTokens(List<HasWord> tokens) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < tokens.size(); i++) {
            if (i > 0) {
                sb.append(' ');
            }
            String word = tokens.get(i).word();
            // un-escape PTB token text (e.g. -LRB- -> "(") when we tokenized
            sb.append(tokenize ? PTBTokenizer.ptbToken2Text(word) : word);
        }
        sb.append('\n');
        tokWriter.write(sb.toString());
    }

    /**
     * Writes the parent-pointer array as one space-separated line.
     * Safe on an empty array (writes a blank line); the original indexed
     * {@code parents[-1]} on empty input.
     *
     * @param parents parent indices from {@link #constTreeParents} or {@link #depTreeParents}
     * @throws IOException on write failure
     */
    public void printParents(int[] parents) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < parents.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(parents[i]);
        }
        sb.append('\n');
        parentWriter.write(sb.toString());
    }

    /**
     * Flushes and closes both output writers.
     *
     * @throws IOException on close failure
     */
    public void close() throws IOException {
        if (tokWriter != null) {
            tokWriter.close();
        }
        parentWriter.close();
    }

    /**
     * Reads sentences from stdin and writes parent-pointer trees (and,
     * optionally, tokens) to the configured output files. Writers are now
     * closed in a finally block so buffered output is flushed even if a
     * sentence fails to parse, and the per-line timing no longer divides by
     * zero when no input was read.
     */
    public static void main(String[] args) throws Exception {
        Properties props = StringUtils.argsToProperties(args);
        if (!props.containsKey("parentpath")) {
            System.err.println(
                    "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
            System.exit(1);
        }

        // whether to tokenize input sentences
        boolean tokenize = props.containsKey("tokenize");

        // whether to produce dependency trees from the constituency parse
        boolean deps = props.containsKey("deps");

        String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
        String parentPath = props.getProperty("parentpath");
        ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize);

        int count = 0;
        long start = System.currentTimeMillis();
        try (Scanner stdin = new Scanner(System.in)) {
            while (stdin.hasNextLine()) {
                String line = stdin.nextLine();
                List<HasWord> tokens = processor.sentenceToTokens(line);
                Tree parse = processor.parse(tokens);

                // produce parent pointer representation
                int[] parents = deps ? processor.depTreeParents(parse, tokens)
                                     : processor.constTreeParents(parse);

                // print
                if (tokPath != null) {
                    processor.printTokens(tokens);
                }
                processor.printParents(parents);

                count++;
                if (count % 1000 == 0) {
                    double elapsed = (System.currentTimeMillis() - start) / 1000.0;
                    System.err.printf("Parsed %d lines (%.2fs)%n", count, elapsed);
                }
            }
        } finally {
            // always flush/close so partial output is not lost on failure
            processor.close();
        }

        long totalTimeMillis = System.currentTimeMillis() - start;
        if (count > 0) {
            System.err.printf("Done: %d lines in %.2fs (%.1fms per line)%n", count,
                    totalTimeMillis / 1000.0, totalTimeMillis / (double) count);
        } else {
            System.err.println("Done: no input lines read");
        }
    }
}