opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor.java Source code

Introduction

Here is the source code for opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.opinion_processor;

import java.io.IOException;
import java.util.List;

import edu.stanford.nlp.util.logging.Redwood;

import java.util.Iterator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.logging.Logger;

import org.ejml.simple.SimpleMatrix;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordLemmaTag;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
import edu.stanford.nlp.sentiment.SentimentUtils;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;

public class DefaultSentimentProcessor {
    /**
     * Logger for this class, registered under the fully qualified class name.
     * Used to report unknown command-line arguments and to print the
     * interactive prompts shown when reading from standard input.
     */
    private static final Logger log = Logger
            .getLogger("opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor");

    /**
     * Number format with four fractional digits, used when printing the
     * individual entries of node vectors and prediction vectors.
     */
    private static final NumberFormat NF = new DecimalFormat("0.0000");

    /**
     * Output styles understood by {@code outputTree}. Several styles may be
     * requested at once; they are emitted in the order they were requested.
     * <ul>
     * <li>{@code PENNTREES} - a copy of the tree whose non-leaf labels are
     * replaced by the predicted sentiment class</li>
     * <li>{@code VECTORS} - a copy of the tree with node indices as labels,
     * followed by the node vector of every non-leaf node</li>
     * <li>{@code ROOT} - the sentiment class stored on the sentence</li>
     * <li>{@code PROBABILITIES} - a copy of the tree with node indices as
     * labels, followed by the prediction vector of every non-leaf node</li>
     * </ul>
     */
    enum Output {
        PENNTREES, VECTORS, ROOT, PROBABILITIES
    }

    /**
     * Input formats understood by {@code getAnnotations}.
     * <ul>
     * <li>{@code TEXT} - the file is read as plain text and run through the
     * tokenizer pipeline; one annotation is produced per detected sentence</li>
     * <li>{@code TREES} - the file is read as a collection of trees; one
     * annotation is produced per tree</li>
     * </ul>
     */
    enum Input {
        TEXT, TREES
    }

    /**
     * Replaces the label value of every non-leaf node with that node's
     * predicted sentiment class, written as a decimal integer, so the
     * predictions show up when the tree is printed. Leaves are not modified.
     * Descendants are relabelled before the node they hang off.
     *
     * @param tree
     *            root of the (sub)tree to relabel; modified in place
     * @throws IllegalArgumentException
     *             if a non-leaf node carries a label that is not a
     *             {@link CoreLabel}
     */
    static void setSentimentLabels(Tree tree) {
        if (tree.isLeaf()) {
            return;
        }

        // Handle the whole subtree first, then this node.
        Tree[] kids = tree.children();
        for (int k = 0; k < kids.length; k++) {
            setSentimentLabels(kids[k]);
        }

        Label nodeLabel = tree.label();
        if (nodeLabel instanceof CoreLabel) {
            CoreLabel coreLabel = (CoreLabel) nodeLabel;
            int predictedClass = RNNCoreAnnotations.getPredictedClass(tree);
            coreLabel.setValue(Integer.toString(predictedClass));
            return;
        }
        throw new IllegalArgumentException("Required a tree with CoreLabels");
    }

    /**
     * Replaces the label value of every non-leaf node with a running index.
     * A node receives its index before any of its children are visited
     * (pre-order), and children are visited left to right. Leaves keep their
     * labels and do not consume an index.
     *
     * @param tree
     *            root of the (sub)tree to relabel; modified in place
     * @param index
     *            index to assign to {@code tree} if it is not a leaf
     * @return the next unused index after this subtree has been numbered
     */
    static int setIndexLabels(Tree tree, int index) {
        if (tree.isLeaf()) {
            return index;
        }

        tree.label().setValue(Integer.toString(index));

        int nextIndex = index + 1;
        Tree[] kids = tree.children();
        for (int k = 0; k < kids.length; k++) {
            nextIndex = setIndexLabels(kids[k], nextIndex);
        }
        return nextIndex;
    }

    /**
     * Prints one line per non-leaf node containing the node's index followed
     * by the entries of its node vector, each formatted with four decimals.
     * Nodes are numbered in the same order as {@code setIndexLabels} numbers
     * them: a node first, then its children from left to right.
     *
     * @param out
     *            stream to print to
     * @param tree
     *            root of the (sub)tree to print
     * @param index
     *            index of {@code tree} if it is not a leaf
     * @return the next unused index after this subtree has been printed
     */
    static int outputTreeVectors(PrintStream out, Tree tree, int index) {
        if (tree.isLeaf()) {
            return index;
        }

        out.print("  " + index + ":");
        SimpleMatrix nodeVector = RNNCoreAnnotations.getNodeVector(tree);
        final int size = nodeVector.getNumElements();
        for (int pos = 0; pos < size; pos++) {
            out.print("  " + NF.format(nodeVector.get(pos)));
        }
        out.println();

        int nextIndex = index + 1;
        Tree[] kids = tree.children();
        for (int k = 0; k < kids.length; k++) {
            nextIndex = outputTreeVectors(out, kids[k], nextIndex);
        }
        return nextIndex;
    }

    /**
     * Prints one line per non-leaf node containing the node's index followed
     * by the entries of its prediction vector, each formatted with four
     * decimals. Nodes are numbered in the same order as
     * {@code setIndexLabels} numbers them: a node first, then its children
     * from left to right.
     *
     * @param out
     *            stream to print to
     * @param tree
     *            root of the (sub)tree to print
     * @param index
     *            index of {@code tree} if it is not a leaf
     * @return the next unused index after this subtree has been printed
     */
    static int outputTreeScores(PrintStream out, Tree tree, int index) {
        if (tree.isLeaf()) {
            return index;
        }

        out.print("  " + index + ":");
        SimpleMatrix predictions = RNNCoreAnnotations.getPredictions(tree);
        final int size = predictions.getNumElements();
        for (int pos = 0; pos < size; pos++) {
            out.print("  " + NF.format(predictions.get(pos)));
        }
        out.println();

        int nextIndex = index + 1;
        Tree[] kids = tree.children();
        for (int k = 0; k < kids.length; k++) {
            nextIndex = outputTreeScores(out, kids[k], nextIndex);
        }
        return nextIndex;
    }

    /**
     * Converts a single word-like object to a string without a custom
     * word/tag separator. Delegates to the three-argument overload with a
     * {@code null} separator.
     *
     * @param o
     *            the object to render
     * @param justValue
     *            passed through unchanged to the three-argument overload
     * @return the string form of {@code o}
     */
    public static <T> String wordToString(T o, final boolean justValue) {
        return wordToString(o, justValue, null);
    }

    /**
     * Converts a single word-like object to a string.
     * <ul>
     * <li>A {@link CoreLabel} is rendered from its {@code value()}, falling
     * back to {@code word()} when the value is {@code null}. Unless
     * {@code justValue} is set, a non-null tag is appended after the
     * separator ({@code CoreLabel.TAG_SEPARATOR} when {@code separator} is
     * {@code null}).</li>
     * <li>Any other {@link Label} is rendered as its {@code value()} when
     * {@code justValue} is set.</li>
     * <li>Otherwise, when a separator is given, {@link TaggedWord},
     * {@link LabeledWord}, {@link WordLemmaTag} and {@link WordTag} are
     * rendered with that separator.</li>
     * <li>Everything else is rendered with {@code toString()}.</li>
     * </ul>
     *
     * @param o
     *            the object to render
     * @param justValue
     *            if {@code true} and {@code o} is a {@code Label}, return
     *            only its value
     * @param separator
     *            string placed between word and tag, or {@code null} for the
     *            type's default
     * @return the string form of {@code o}; may be {@code null} when a label
     *         has neither value nor word
     * @throws NullPointerException
     *             if {@code o} is {@code null}
     */
    public static <T> String wordToString(T o, final boolean justValue, final String separator) {
        if (o instanceof CoreLabel) {
            CoreLabel coreLabel = (CoreLabel) o;
            String text = coreLabel.value();
            if (text == null) {
                text = coreLabel.word();
            }
            String tag = coreLabel.tag();
            if (justValue || tag == null) {
                return text;
            }
            String tagSeparator = (separator == null) ? CoreLabel.TAG_SEPARATOR : separator;
            return text + tagSeparator + tag;
        }

        if (justValue && o instanceof Label) {
            return ((Label) o).value();
        }

        // An interface that covered these four cases would be nice, but the
        // underlying library is moving away from these data types anyway.
        if (separator != null) {
            if (o instanceof TaggedWord) {
                return ((TaggedWord) o).toString(separator);
            }
            if (o instanceof LabeledWord) {
                // Previously this branch called toString() and silently
                // dropped the requested separator.
                return ((LabeledWord) o).toString(separator);
            }
            if (o instanceof WordLemmaTag) {
                return ((WordLemmaTag) o).toString(separator);
            }
            if (o instanceof WordTag) {
                return ((WordTag) o).toString(separator);
            }
        }
        return o.toString();
    }

    /**
     * Returns the sentence as a string with a space between words. It prints
     * out the {@code value()} of each item - this will give the expected answer
     * for a short form representation of the "sentence" over a range of cases.
     * It is equivalent to calling {@code listToString(list, true)}.
     *
     * TODO: Sentence used to be a subclass of ArrayList, with this method as
     * the toString. Therefore, there may be instances of ArrayList being
     * printed that expect this method to be used.
     *
     * @param list
     *            The tokenized sentence to print out
     * @return The tokenized sentence as a String
     */
    public static <T> String listToString(List<T> list) {
        return listToString(list, true);
    }

    /**
     * Returns the sentence as a string with a space between words. Designed to
     * work robustly, even if the elements stored in the 'Sentence' are not of
     * type Label.
     *
     * This overload passes a {@code null} separator to the three-argument
     * overload, so any word type that uses a separator, such as TaggedWord,
     * is rendered with its default one.
     *
     * @param list
     *            The tokenized sentence to print out
     * @param justValue
     *            If {@code true} and the elements are of type {@code Label},
     *            return just the {@code value()} of the {@code Label} of each
     *            word; otherwise, call the {@code toString()} method on each
     *            item.
     * @return The sentence in String form
     */
    public static <T> String listToString(List<T> list, final boolean justValue) {
        return listToString(list, justValue, null);
    }

    /**
     * Returns the sentence as a string with a single space between words.
     * Each element is rendered with {@code wordToString(element, justValue,
     * separator)}; if the separator is not null, objects such as TaggedWord
     * are rendered with it between word and tag.
     *
     * @param list
     *            The tokenized sentence to print out
     * @param justValue
     *            If {@code true} and an element is a {@code Label}, only its
     *            {@code value()} is used
     * @param separator
     *            The string used to separate Word and Tag in TaggedWord, etc
     * @return The sentence in String form; empty for an empty list
     */
    public static <T> String listToString(List<T> list, final boolean justValue, final String separator) {
        StringBuilder joined = new StringBuilder();
        boolean needsSpace = false;
        for (T element : list) {
            if (needsSpace) {
                joined.append(' ');
            }
            joined.append(wordToString(element, justValue, separator));
            needsSpace = true;
        }
        return joined.toString();
    }

    /**
     * Prints the sentiment tree of a sentence in each of the requested
     * styles, in the order they appear in {@code outputFormats}. The tree
     * stored on the sentence is never modified; relabelled output is produced
     * from a deep copy.
     *
     * @param out
     *            stream to print to
     * @param sentence
     *            sentence carrying the sentiment-annotated tree (and, for
     *            {@code ROOT}, the sentiment class)
     * @param outputFormats
     *            styles to print
     * @throws IllegalArgumentException
     *             if a style is not one of the known {@code Output} values
     */
    static void outputTree(PrintStream out, CoreMap sentence, List<Output> outputFormats) {
        final Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
        for (Output format : outputFormats) {
            switch (format) {
            case PENNTREES: {
                Tree labelled = sentimentTree.deepCopy();
                setSentimentLabels(labelled);
                out.println(labelled);
                break;
            }
            case ROOT: {
                out.println("  " + sentence.get(SentimentCoreAnnotations.SentimentClass.class));
                break;
            }
            case VECTORS:
            case PROBABILITIES: {
                // Both styles start with the index-labelled tree and differ
                // only in which per-node values follow it.
                Tree indexed = sentimentTree.deepCopy();
                setIndexLabels(indexed, 0);
                out.println(indexed);
                if (format == Output.VECTORS) {
                    outputTreeVectors(out, sentimentTree, 0);
                } else {
                    outputTreeScores(out, sentimentTree, 0);
                }
                break;
            }
            default:
                throw new IllegalArgumentException("Unknown output format " + format);
            }
        }
    }

    /**
     * Reads the given file and turns it into a list of annotations, each
     * holding exactly one sentence.
     *
     * @param tokenizer
     *            pipeline used to tokenize and sentence-split text input; not
     *            used for {@code TREES} input
     * @param inputFormat
     *            how the file should be interpreted
     * @param filename
     *            path of the file to read
     * @param filterUnknown
     *            for {@code TREES} input, read the trees with gold labels and
     *            pass them through {@code SentimentUtils.filterUnknownRoots}
     * @return one annotation per sentence (text input) or per tree (tree
     *         input)
     * @throws IllegalArgumentException
     *             if the input format is not a known {@code Input} value
     */
    public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename,
            boolean filterUnknown) {
        switch (inputFormat) {
        case TEXT:
            return readTextAnnotations(tokenizer, filename);
        case TREES:
            return readTreeAnnotations(filename, filterUnknown);
        default:
            throw new IllegalArgumentException("Unknown format " + inputFormat);
        }
    }

    /** Tokenizes the whole file and wraps every detected sentence in its own annotation. */
    private static List<Annotation> readTextAnnotations(StanfordCoreNLP tokenizer, String filename) {
        String fileText = IOUtils.slurpFileNoExceptions(filename);
        Annotation document = new Annotation(fileText);
        tokenizer.annotate(document);

        List<Annotation> result = Generics.newArrayList();
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            Annotation single = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
            single.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
            result.add(single);
        }
        return result;
    }

    /** Loads trees from the file and wraps every tree in its own single-sentence annotation. */
    private static List<Annotation> readTreeAnnotations(String filename, boolean filterUnknown) {
        List<Tree> trees;
        if (!filterUnknown) {
            trees = Generics.newArrayList();
            MemoryTreebank treebank = new MemoryTreebank("utf-8");
            treebank.loadPath(filename, null);
            for (Tree loaded : treebank) {
                trees.add(loaded);
            }
        } else {
            trees = SentimentUtils.readTreesWithGoldLabels(filename);
            trees = SentimentUtils.filterUnknownRoots(trees);
        }

        List<Annotation> result = Generics.newArrayList();
        for (Tree tree : trees) {
            // The sentence text is rebuilt from the leaves of the tree.
            CoreMap sentence = new Annotation(listToString(tree.yield()));
            sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
            List<CoreMap> sentences = Collections.singletonList(sentence);
            Annotation wrapper = new Annotation("");
            wrapper.set(CoreAnnotations.SentencesAnnotation.class, sentences);
            result.add(wrapper);
        }
        return result;
    }

    /**
     * Runs the tree-based sentiment model on input selected by command-line
     * style arguments. Option names are matched case-insensitively:
     * <ul>
     * <li>{@code -sentimentModel VALUE} - sets the {@code sentiment.model}
     * pipeline property</li>
     * <li>{@code -parserModel VALUE} - sets the {@code parse.model} pipeline
     * property</li>
     * <li>{@code -file PATH} - process one file; results go to
     * {@code System.out}</li>
     * <li>{@code -fileList A,B,...} - process each comma-separated file;
     * results for a file go to that file's name with {@code .out} appended</li>
     * <li>{@code -stdin} - read standard input, one sentence per line; only
     * available for text input</li>
     * <li>{@code -input TEXT|TREES} - input format, default {@code TEXT}</li>
     * <li>{@code -output F1,F2,...} - comma-separated {@code Output} names,
     * default {@code ROOT}</li>
     * <li>{@code -filterUnknown} - forwarded to {@code getAnnotations}</li>
     * <li>{@code -tlppClass CLASS} - value of the {@code binarizer.tlppClass}
     * property used for tree input</li>
     * <li>{@code -help} - terminates the JVM with exit status 0</li>
     * </ul>
     * Exactly one of {@code -file}, {@code -fileList} and {@code -stdin} must
     * be given.
     *
     * @param args
     *            the options described above
     * @throws IllegalArgumentException
     *             on an unknown option, an option missing its value, an
     *             unknown input/output format name, a wrong number of input
     *             sources, or {@code -stdin} combined with tree input
     * @throws IOException
     *             if reading standard input or opening an output file fails
     */
    public void processTextWithArgs(String[] args) throws IOException {
        String parserModel = null;
        String sentimentModel = null;

        String filename = null;
        String fileList = null;
        boolean stdin = false;

        boolean filterUnknown = false;

        List<Output> outputFormats = Collections.singletonList(Output.ROOT);
        Input inputFormat = Input.TEXT;

        // The default used to be the literal text "DEFAULT_TLPP_CLASS", which
        // is not a class name. NOTE(review): this is believed to be the
        // default CoreNLP itself uses for the binarizer - confirm against the
        // CoreNLP version on the classpath.
        String tlppClass = "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams";

        for (int argIndex = 0; argIndex < args.length;) {
            final String option = args[argIndex];
            if (option.equalsIgnoreCase("-sentimentModel")) {
                sentimentModel = requireOptionValue(args, argIndex);
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-parserModel")) {
                parserModel = requireOptionValue(args, argIndex);
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-file")) {
                filename = requireOptionValue(args, argIndex);
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-fileList")) {
                fileList = requireOptionValue(args, argIndex);
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-stdin")) {
                stdin = true;
                argIndex++;
            } else if (option.equalsIgnoreCase("-input")) {
                // Locale.ROOT keeps the upper-casing independent of the
                // default locale so enum names always match.
                inputFormat = Input.valueOf(requireOptionValue(args, argIndex).toUpperCase(java.util.Locale.ROOT));
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-output")) {
                String[] formats = requireOptionValue(args, argIndex).split(",");
                outputFormats = new ArrayList<>();
                for (String format : formats) {
                    outputFormats.add(Output.valueOf(format.toUpperCase(java.util.Locale.ROOT)));
                }
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-filterUnknown")) {
                filterUnknown = true;
                argIndex++;
            } else if (option.equalsIgnoreCase("-tlppClass")) {
                tlppClass = requireOptionValue(args, argIndex);
                argIndex += 2;
            } else if (option.equalsIgnoreCase("-help")) {
                System.exit(0);
            } else {
                // Report the offending option itself; the old code reported
                // (and could index past the array for) the one after it.
                log.info("Unknown argument " + option);
                throw new IllegalArgumentException("Unknown argument " + option);
            }
        }

        // We construct two pipelines. One handles tokenization, if
        // necessary. The other takes tokenized sentences and converts
        // them to sentiment trees.
        Properties pipelineProps = new Properties();
        Properties tokenizerProps = null;
        if (sentimentModel != null) {
            pipelineProps.setProperty("sentiment.model", sentimentModel);
        }
        if (parserModel != null) {
            pipelineProps.setProperty("parse.model", parserModel);
        }
        if (inputFormat == Input.TREES) {
            pipelineProps.setProperty("annotators", "binarizer, sentiment");
            pipelineProps.setProperty("customAnnotatorClass.binarizer",
                    "edu.stanford.nlp.pipeline.BinarizerAnnotator");
            pipelineProps.setProperty("binarizer.tlppClass", tlppClass);
            pipelineProps.setProperty("enforceRequirements", "false");
        } else {
            pipelineProps.setProperty("annotators", "parse, sentiment");
            pipelineProps.setProperty("enforceRequirements", "false");
            tokenizerProps = new Properties();
            tokenizerProps.setProperty("annotators", "tokenize, ssplit");
        }

        if (stdin && tokenizerProps != null) {
            tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true");
        }

        int count = 0;
        if (filename != null) {
            count++;
        }
        if (fileList != null) {
            count++;
        }
        if (stdin) {
            count++;
        }
        if (count > 1) {
            throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin");
        }
        if (count == 0) {
            throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin");
        }
        if (stdin && tokenizerProps == null) {
            // Reading stdin needs the tokenizer pipeline, which is only
            // created for text input; fail clearly instead of hitting a
            // NullPointerException on the first input line.
            throw new IllegalArgumentException("-stdin is only supported with -input TEXT");
        }

        StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
        StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

        if (filename != null) {
            // Process a file. The pipeline will do tokenization, which
            // means it will split it into sentences as best as possible
            // with the tokenizer.
            List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown);
            for (Annotation annotation : annotations) {
                pipeline.annotate(annotation);

                for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                    System.out.println(sentence);
                    outputTree(System.out, sentence, outputFormats);
                }
            }
        } else if (fileList != null) {
            // Process multiple files. The pipeline will do tokenization,
            // which means it will split it into sentences as best as
            // possible with the tokenizer. Output will go to filename.out
            // for each file.
            for (String file : fileList.split(",")) {
                List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown);
                // try-with-resources flushes and closes the output file even
                // when annotating or printing throws.
                try (PrintStream pout = new PrintStream(new FileOutputStream(file + ".out"))) {
                    for (Annotation annotation : annotations) {
                        pipeline.annotate(annotation);

                        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                            pout.println(sentence);
                            outputTree(pout, sentence, outputFormats);
                        }
                    }
                }
            }
        } else {
            // Process stdin. Each line will be treated as a single sentence.
            log.info("Reading in text from stdin.");
            log.info("Please enter one sentence per line.");
            log.info("Processing will end when EOF is reached.");
            BufferedReader reader = IOUtils.readerFromStdin("utf-8");

            for (String line; (line = reader.readLine()) != null;) {
                line = line.trim();
                if (!line.isEmpty()) {
                    Annotation annotation = tokenizer.process(line);
                    pipeline.annotate(annotation);
                    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                        outputTree(System.out, sentence, outputFormats);
                    }
                } else {
                    // Output blank lines for blank lines so the tool can be
                    // used for line-by-line text processing
                    System.out.println();
                }
            }

        }
    }

    /**
     * Returns the value that follows the option at {@code optionIndex}, or
     * throws an {@link IllegalArgumentException} when the option is the last
     * argument and therefore has no value.
     */
    private static String requireOptionValue(String[] args, int optionIndex) {
        if (optionIndex + 1 >= args.length) {
            throw new IllegalArgumentException("Missing value for argument " + args[optionIndex]);
        }
        return args[optionIndex + 1];
    }

    /**
     * Pipeline shared by all calls to {@code getNumericSentimentValue};
     * created on first use. Guarded by the class lock in
     * {@code sharedSentimentPipeline()}.
     */
    private static StanfordCoreNLP cachedSentimentPipeline;

    /**
     * Returns the shared tokenize/ssplit/parse/sentiment pipeline, building it
     * on the first call. If construction throws, nothing is cached and the
     * next call tries again.
     */
    private static synchronized StanfordCoreNLP sharedSentimentPipeline() {
        if (cachedSentimentPipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
            cachedSentimentPipeline = new StanfordCoreNLP(props);
        }
        return cachedSentimentPipeline;
    }

    /**
     * Computes a single sentiment value for a piece of text: the predicted
     * sentiment class of its longest sentence, where length is measured on
     * the sentence's {@code toString()} form. When several sentences share
     * the greatest length, the first one wins.
     * <p>
     * The pipeline is built once and reused; previously every call
     * constructed a new pipeline, even for {@code null} or empty input.
     * NOTE(review): reuse assumes concurrent {@code process} calls on one
     * pipeline are safe - confirm for the CoreNLP version in use.
     *
     * @param expression
     *            text to analyse; may be {@code null} or empty
     * @return the predicted class of the longest sentence, or {@code 0} when
     *         the text is {@code null}, empty, or yields no sentence
     */
    public float getNumericSentimentValue(String expression) {
        int mainSentiment = 0;
        if (expression == null || expression.isEmpty()) {
            return mainSentiment;
        }

        int longest = 0;
        Annotation annotation = sharedSentimentPipeline().process(expression);
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Tree tree = sentence.get(SentimentAnnotatedTree.class);
            int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
            int length = sentence.toString().length();
            if (length > longest) {
                mainSentiment = sentiment;
                longest = length;
            }
        }
        return mainSentiment;
    }
}