org.ets.research.nlp.stanford_thrift.parser.StanfordParserThrift.java Source code

Java tutorial

Introduction

Here is the source code for org.ets.research.nlp.stanford_thrift.parser.StanfordParserThrift.java

Source

/*
  Apache Thrift Server for Stanford CoreNLP (stanford-thrift)
  Copyright (C) 2013 Diane M. Napolitano, Educational Testing Service
    
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation, version 2
  of the License.
    
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
    
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.ets.research.nlp.stanford_thrift.parser;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.Function;

import org.apache.thrift.TApplicationException;
import org.ets.research.nlp.stanford_thrift.general.CoreNLPThriftUtil;

import CoreNLP.ParseTree;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.pipeline.DefaultPaths;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFunctions;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.Trees;

/**
 * Thrift-facing wrapper around a Stanford {@link LexicalizedParser}.
 *
 * <p>Each public method configures a {@link TreePrint} from the caller-supplied output format
 * options, runs the parser (or, for {@link #lexicalize_parse_tree(String)}, a tree transform) and
 * returns the printed tree. Any failure is reported to the client as a
 * {@link TApplicationException} of type {@code INTERNAL_ERROR}.
 *
 * <p>This class keeps no per-request state in fields; the tree printer is created locally on each
 * call. NOTE(review): whether the underlying {@link LexicalizedParser} may be shared between
 * threads is not established here -- confirm before serving requests concurrently.
 */
public class StanfordParserThrift {

    private final LexicalizedParser parser;
    private final TreebankLanguagePack tlp;

    /**
     * Creates a parser service backed by the given model.
     *
     * @param modelFile path of the parser model to load; {@code null} or the empty string selects
     *                  {@link DefaultPaths#DEFAULT_PARSER_MODEL}
     */
    public StanfordParserThrift(String modelFile) {
        parser = loadModel(modelFile);
        tlp = new PennTreebankLanguagePack();
    }

    /**
     * Loads the parser model, falling back to the default model when no path is given.
     *
     * @param modelFile model path, possibly {@code null} or empty
     * @return the loaded parser
     */
    private static LexicalizedParser loadModel(String modelFile) {
        // The null test must come first; calling a method on modelFile before it would throw
        // a NullPointerException instead of falling back to the default model.
        String modelPath = (modelFile == null || modelFile.isEmpty())
                ? DefaultPaths.DEFAULT_PARSER_MODEL
                : modelFile;
        return LexicalizedParser.loadModel(modelPath, new String[] {});
    }

    /**
     * Wraps an arbitrary failure in the exception type declared by the service methods.
     *
     * <p>The original exception is attached as the cause so the server-side stack trace is kept,
     * and the exception's string form is used when it carries no message, so the client never
     * receives an empty error description.
     *
     * @param e the failure to report
     * @return an {@code INTERNAL_ERROR} exception describing {@code e}
     */
    private static TApplicationException toThriftException(Exception e) {
        String message = e.getMessage();
        if (message == null) {
            message = e.toString();
        }
        TApplicationException wrapped =
                new TApplicationException(TApplicationException.INTERNAL_ERROR, message);
        wrapped.initCause(e);
        return wrapped;
    }

    /**
     * Splits raw text into sentences and parses each one.
     *
     * @param text         untokenized text; it is run through a {@link DocumentPreprocessor}
     *                     constructed with its default settings
     * @param outputFormat output format options handed to {@code ParserUtil.setOptions}
     * @return one ParseTree (printed tree plus the tree's score) per sentence found, in order
     * @throws TApplicationException if option handling, preprocessing or parsing fails
     */
    public List<ParseTree> parse_text(String text, List<String> outputFormat) throws TApplicationException {
        List<ParseTree> results = new ArrayList<>();

        try {
            TreePrint treePrinter = ParserUtil.setOptions(outputFormat, tlp);

            // assume no tokenization was done; use Stanford's default tokenizer
            DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(text));
            for (List<HasWord> sentence : preprocess) {
                Tree parseTree = parser.apply(sentence);
                results.add(
                        new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score()));
            }
        } catch (Exception e) {
            throw toThriftException(e);
        }

        return results;
    }

    /**
     * Parses one pre-tokenized sentence.
     *
     * @param tokens       One sentence worth of tokens at a time.
     * @param outputFormat output format options handed to {@code ParserUtil.setOptions}
     * @return A ParseTree object of the String representation of the tree, plus its score.
     * @throws TApplicationException if option handling or parsing fails
     */
    public ParseTree parse_tokens(List<String> tokens, List<String> outputFormat) throws TApplicationException {
        try {
            TreePrint treePrinter = ParserUtil.setOptions(outputFormat, tlp);

            // a single sentence worth of tokens, converted to the CoreLabel list the parser accepts
            String[] tokenArray = tokens.toArray(new String[0]);
            List<CoreLabel> coreLabels = Sentence.toCoreLabelList(tokenArray);
            Tree parseTree = parser.apply(coreLabels);
            return new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score());
        } catch (Exception e) {
            throw toThriftException(e);
        }
    }

    /**
     * Parses one sentence that has already been tokenized and tagged.
     *
     * @param taggedSentence a single sentence of tagged text; it is expected to be properly
     *                       tokenized already
     * @param outputFormat   output format options handed to {@code ParserUtil.setOptions}
     * @param divider        passed to
     *                       {@code CoreNLPThriftUtil.getListOfTaggedWordsFromTaggedSentence};
     *                       presumably the separator between a word and its tag -- confirm
     *                       against that helper
     * @return A ParseTree object of the String representation of the tree, plus its score.
     * @throws TApplicationException if option handling, tag splitting or parsing fails
     */
    public ParseTree parse_tagged_sentence(String taggedSentence, List<String> outputFormat, String divider)
            throws TApplicationException {
        try {
            TreePrint treePrinter = ParserUtil.setOptions(outputFormat, tlp);

            Tree parseTree = parser
                    .apply(CoreNLPThriftUtil.getListOfTaggedWordsFromTaggedSentence(taggedSentence, divider));
            return new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score());
        } catch (Exception e) {
            throw toThriftException(e);
        }
    }

    /**
     * If one were to call any of these other methods to get a parse tree for some input sentence
     * with the -outputFormatOptions flag of "lexicalize", they would receive their parse tree,
     * in the -outputFormat of their choice, with every leaf marked with its head word.
     * This function does exactly that on an existing parse tree.
     * NOTE that this WILL re-lexicalize a pre-lexicalized tree, so don't pass in a tree that
     * has been lexicalized and expect to get back the same thing as what you passed in.
     *
     * @param tree String form of a parse tree, as accepted by {@link Tree#valueOf(String)}
     * @return the lexicalized tree, printed with the default output options
     * @throws TApplicationException if the tree cannot be read, lexicalized or printed
     */
    public String lexicalize_parse_tree(String tree) throws TApplicationException {
        try {
            Tree parseTree = Tree.valueOf(tree);
            Tree lexicalizedTree = Trees.lexicalize(parseTree, tlp.headFinder());
            TreePrint treePrinter = ParserUtil.setOptions(null, tlp); // use defaults
            Function<Tree, Tree> toDescriptiveLabels =
                    TreeFunctions.getLabeledToDescriptiveCoreLabelTreeFunction();
            lexicalizedTree = toDescriptiveLabels.apply(lexicalizedTree);
            return ParserUtil.TreeObjectToString(lexicalizedTree, treePrinter);
        } catch (Exception e) {
            throw toThriftException(e);
        }
    }
}