Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.parse_thicket.opinion_processor; import java.io.IOException; import java.util.List; import edu.stanford.nlp.util.logging.Redwood; import java.util.Iterator; import java.io.BufferedReader; import java.io.IOException; import java.io.FileOutputStream; import java.io.PrintStream; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Properties; import java.util.logging.Logger; import org.ejml.simple.SimpleMatrix; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.Label; import edu.stanford.nlp.ling.LabeledWord; import edu.stanford.nlp.ling.TaggedWord; import edu.stanford.nlp.ling.WordLemmaTag; import edu.stanford.nlp.ling.WordTag; import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree; import edu.stanford.nlp.sentiment.SentimentUtils; import edu.stanford.nlp.trees.MemoryTreebank; import 
edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
import java.util.Locale;

/**
 * Helpers and a command-line style driver for running the Stanford CoreNLP
 * tree-based sentiment model over text or pre-parsed trees, plus a small
 * convenience method that reduces a piece of text to one sentiment class.
 */
public class DefaultSentimentProcessor {
  /** A logger for this class */
  private static final Logger log = Logger
      .getLogger("opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor");

  /** Format used when printing vector / score components. */
  private static final NumberFormat NF = new DecimalFormat("0.0000");

  /**
   * Parser-params class handed to the binarizer when {@code -tlppClass} is
   * not given. NOTE(review): value taken from the CoreNLP English defaults;
   * confirm against the CoreNLP version on the classpath.
   */
  private static final String DEFAULT_TLPP_CLASS =
      "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams";

  /** One-line usage summary logged for {@code -help}. */
  private static final String USAGE =
      "Usage: [-sentimentModel <path>] [-parserModel <path>]"
      + " (-file <name> | -fileList <a,b,...> | -stdin)"
      + " [-input TEXT|TREES] [-output PENNTREES,VECTORS,ROOT,PROBABILITIES]"
      + " [-filterUnknown] [-tlppClass <class>]";

  /**
   * Pipeline used by {@link #getNumericSentimentValue(String)}; built on
   * first use and then reused. Initialization is not synchronized, so
   * concurrent first calls may each build a pipeline.
   */
  private StanfordCoreNLP sentimentPipeline;

  /** Output styles understood by {@link #outputTree}. */
  enum Output {
    PENNTREES, VECTORS, ROOT, PROBABILITIES
  }

  /** Input styles understood by {@link #getAnnotations}. */
  enum Input {
    TEXT, TREES
  }

  /**
   * Sets the labels on the tree (except the leaves) to be the integer value
   * of the sentiment prediction. Makes it easy to print out with
   * Tree.toString()
   *
   * @throws IllegalArgumentException
   *             if a non-leaf node's label is not a {@code CoreLabel}
   */
  static void setSentimentLabels(Tree tree) {
    if (tree.isLeaf()) {
      return;
    }
    // Children first, then this node.
    for (Tree child : tree.children()) {
      setSentimentLabels(child);
    }
    Label label = tree.label();
    if (!(label instanceof CoreLabel)) {
      throw new IllegalArgumentException("Required a tree with CoreLabels");
    }
    CoreLabel cl = (CoreLabel) label;
    cl.setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree)));
  }

  /**
   * Sets the labels on the tree to be the indices of the nodes. Starts
   * counting at the root and numbers each non-leaf node before its children
   * (a preorder traversal); leaves are skipped and do not consume an index.
   *
   * @param tree
   *            the subtree to label
   * @param index
   *            the index to assign to {@code tree} if it is not a leaf
   * @return the next unused index
   */
  static int setIndexLabels(Tree tree, int index) {
    if (tree.isLeaf()) {
      return index;
    }
    tree.label().setValue(Integer.toString(index));
    index++;
    for (Tree child : tree.children()) {
      index = setIndexLabels(child, index);
    }
    return index;
  }

  /**
   * Outputs the vectors from the tree. Counts the tree nodes the same as
   * setIndexLabels.
   *
   * @return the next unused index
   */
  static int outputTreeVectors(PrintStream out, Tree tree, int index) {
    if (tree.isLeaf()) {
      return index;
    }
    printIndexedVector(out, index, RNNCoreAnnotations.getNodeVector(tree));
    index++;
    for (Tree child : tree.children()) {
      index = outputTreeVectors(out, child, index);
    }
    return index;
  }

  /**
   * Outputs the scores from the tree. Counts the tree nodes the same as
   * setIndexLabels.
   *
   * @return the next unused index
   */
  static int outputTreeScores(PrintStream out, Tree tree, int index) {
    if (tree.isLeaf()) {
      return index;
    }
    printIndexedVector(out, index, RNNCoreAnnotations.getPredictions(tree));
    index++;
    for (Tree child : tree.children()) {
      index = outputTreeScores(out, child, index);
    }
    return index;
  }

  /** Prints one line of the form {@code " <index>: v0 v1 ..."} using {@link #NF}. */
  private static void printIndexedVector(PrintStream out, int index, SimpleMatrix vector) {
    out.print(" " + index + ":");
    for (int i = 0; i < vector.getNumElements(); ++i) {
      out.print(" " + NF.format(vector.get(i)));
    }
    out.println();
  }

  /**
   * Renders a single word-like object, using the default tag separator.
   *
   * @see #wordToString(Object, boolean, String)
   */
  public static <T> String wordToString(T o, final boolean justValue) {
    return wordToString(o, justValue, null);
  }

  /**
   * Renders a single word-like object as a string.
   *
   * @param o
   *            the object to render
   * @param justValue
   *            if {@code true} and {@code o} is a {@code Label}, return only
   *            its {@code value()} (falling back to {@code word()} for a
   *            {@code CoreLabel} whose value is {@code null})
   * @param separator
   *            string placed between word and tag; when {@code null} a
   *            {@code CoreLabel} uses {@code CoreLabel.TAG_SEPARATOR} and the
   *            other word types fall back to their plain {@code toString()}
   * @return the string form of {@code o}
   */
  public static <T> String wordToString(T o, final boolean justValue, final String separator) {
    if (justValue && o instanceof Label) {
      if (o instanceof CoreLabel) {
        return valueOrWord((CoreLabel) o);
      }
      return ((Label) o).value();
    }
    if (o instanceof CoreLabel) {
      CoreLabel l = (CoreLabel) o;
      String w = valueOrWord(l);
      if (l.tag() != null) {
        String sep = (separator == null) ? CoreLabel.TAG_SEPARATOR : separator;
        return w + sep + l.tag();
      }
      return w;
    }
    // an interface that covered these next four cases would be
    // nice, but we're moving away from these data types anyway
    if (separator != null) {
      if (o instanceof TaggedWord) {
        return ((TaggedWord) o).toString(separator);
      }
      if (o instanceof LabeledWord) {
        // NOTE(review): unlike the sibling branches this ignores the
        // separator; kept as in the original - confirm it is intended.
        return ((LabeledWord) o).toString();
      }
      if (o instanceof WordLemmaTag) {
        return ((WordLemmaTag) o).toString(separator);
      }
      if (o instanceof WordTag) {
        return ((WordTag) o).toString(separator);
      }
    }
    return o.toString();
  }

  /** Returns the label's {@code value()}, or its {@code word()} when the value is {@code null}. */
  private static String valueOrWord(CoreLabel l) {
    String w = l.value();
    return (w != null) ? w : l.word();
  }

  /**
   * Returns the sentence as a string with a space between words. It prints
   * out the {@code value()} of each item - this will give the expected answer
   * for a short form representation of the "sentence" over a range of cases.
   * It is equivalent to calling {@code listToString(list, true)}.
   *
   * TODO: Sentence used to be a subclass of ArrayList, with this method as
   * the toString. Therefore, there may be instances of ArrayList being
   * printed that expect this method to be used.
   *
   * @param list
   *            The tokenized sentence to print out
   * @return The tokenized sentence as a String
   */
  public static <T> String listToString(List<T> list) {
    return listToString(list, true);
  }

  /**
   * Returns the sentence as a string with a space between words. Designed to
   * work robustly, even if the elements stored in the 'Sentence' are not of
   * type Label.
   *
   * This one uses the default separators for any word type that uses
   * separators, such as TaggedWord.
   *
   * @param list
   *            The tokenized sentence to print out
   * @param justValue
   *            If {@code true} and the elements are of type {@code Label},
   *            return just the {@code value()} of the {@code Label} of each
   *            word; otherwise, call the {@code toString()} method on each
   *            item.
   * @return The sentence in String form
   */
  public static <T> String listToString(List<T> list, final boolean justValue) {
    return listToString(list, justValue, null);
  }

  /**
   * As already described, but if separator is not null, then objects such as
   * TaggedWord are rendered with that separator between word and tag.
   *
   * @param list
   *            The tokenized sentence to print out
   * @param justValue
   *            see {@link #listToString(List, boolean)}
   * @param separator
   *            The string used to separate Word and Tag in TaggedWord, etc
   * @return The sentence in String form; empty for an empty list
   */
  public static <T> String listToString(List<T> list, final boolean justValue,
      final String separator) {
    StringBuilder s = new StringBuilder();
    for (Iterator<T> wordIterator = list.iterator(); wordIterator.hasNext();) {
      T o = wordIterator.next();
      s.append(wordToString(o, justValue, separator));
      if (wordIterator.hasNext()) {
        s.append(' ');
      }
    }
    return s.toString();
  }

  /**
   * Outputs a tree using the output style requested. Each requested format
   * is written in turn, in list order.
   *
   * @throws IllegalArgumentException
   *             if an unknown output format is encountered
   */
  static void outputTree(PrintStream out, CoreMap sentence, List<Output> outputFormats) {
    Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
    for (Output output : outputFormats) {
      switch (output) {
      case PENNTREES: {
        // Relabel a copy so the annotated tree itself is left untouched.
        Tree copy = tree.deepCopy();
        setSentimentLabels(copy);
        out.println(copy);
        break;
      }
      case VECTORS: {
        Tree copy = tree.deepCopy();
        setIndexLabels(copy, 0);
        out.println(copy);
        outputTreeVectors(out, tree, 0);
        break;
      }
      case ROOT: {
        out.println(" " + sentence.get(SentimentCoreAnnotations.SentimentClass.class));
        break;
      }
      case PROBABILITIES: {
        Tree copy = tree.deepCopy();
        setIndexLabels(copy, 0);
        out.println(copy);
        outputTreeScores(out, tree, 0);
        break;
      }
      default:
        throw new IllegalArgumentException("Unknown output format " + output);
      }
    }
  }

  /**
   * Reads annotations from the given filename using the requested input.
   * <ul>
   * <li>{@code TEXT}: the file content is run through {@code tokenizer} and
   * one {@code Annotation} is returned per sentence found.</li>
   * <li>{@code TREES}: trees are read from the file and each is wrapped in
   * its own single-sentence {@code Annotation}. With {@code filterUnknown}
   * they are read via {@code SentimentUtils.readTreesWithGoldLabels} and
   * passed through {@code SentimentUtils.filterUnknownRoots}; otherwise they
   * are loaded with a utf-8 {@code MemoryTreebank}.</li>
   * </ul>
   *
   * @param tokenizer
   *            pipeline used for {@code TEXT} input; not used for
   *            {@code TREES}
   * @throws IllegalArgumentException
   *             if the input format is unknown
   */
  public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat,
      String filename, boolean filterUnknown) {
    switch (inputFormat) {
    case TEXT: {
      String text = IOUtils.slurpFileNoExceptions(filename);
      Annotation annotation = new Annotation(text);
      tokenizer.annotate(annotation);
      List<Annotation> annotations = Generics.newArrayList();
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Annotation nextAnnotation =
            new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
        nextAnnotation.set(CoreAnnotations.SentencesAnnotation.class,
            Collections.singletonList(sentence));
        annotations.add(nextAnnotation);
      }
      return annotations;
    }
    case TREES: {
      List<Tree> trees;
      if (filterUnknown) {
        trees = SentimentUtils.readTreesWithGoldLabels(filename);
        trees = SentimentUtils.filterUnknownRoots(trees);
      } else {
        trees = Generics.newArrayList();
        MemoryTreebank treebank = new MemoryTreebank("utf-8");
        treebank.loadPath(filename, null);
        for (Tree tree : treebank) {
          trees.add(tree);
        }
      }
      List<Annotation> annotations = Generics.newArrayList();
      for (Tree tree : trees) {
        CoreMap sentence = new Annotation(listToString(tree.yield()));
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        List<CoreMap> sentences = Collections.singletonList(sentence);
        Annotation annotation = new Annotation("");
        annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
        annotations.add(annotation);
      }
      return annotations;
    }
    default:
      throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
  }

  /**
   * Returns the value following the flag at {@code flagIndex}.
   *
   * @throws IllegalArgumentException
   *             if the flag is the last argument and so has no value
   */
  private static String requireValue(String[] args, int flagIndex) {
    if (flagIndex + 1 >= args.length) {
      throw new IllegalArgumentException("Missing value for argument " + args[flagIndex]);
    }
    return args[flagIndex + 1];
  }

  /**
   * Runs {@code pipeline} over one annotation and writes every sentence in
   * it to {@code out}, optionally preceded by the sentence itself.
   */
  private static void annotateAndOutput(StanfordCoreNLP pipeline, Annotation annotation,
      PrintStream out, List<Output> outputFormats, boolean echoSentence) {
    pipeline.annotate(annotation);
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      if (echoSentence) {
        out.println(sentence);
      }
      outputTree(out, sentence, outputFormats);
    }
  }

  /**
   * Runs the tree-based sentiment model on some text.
   * <p>
   * Recognized arguments (flag names are case-insensitive):
   * {@code -sentimentModel <path>}, {@code -parserModel <path>},
   * {@code -file <name>}, {@code -fileList <a,b,...>}, {@code -stdin},
   * {@code -input TEXT|TREES}, {@code -output <comma separated formats>},
   * {@code -filterUnknown}, {@code -tlppClass <class>} and {@code -help}
   * (which logs a usage line and calls {@code System.exit(0)}).
   * Exactly one of {@code -file}, {@code -fileList} or {@code -stdin} must
   * be given. Results for {@code -file} and {@code -stdin} go to
   * {@code System.out}; for {@code -fileList} each file's results go to
   * {@code <file>.out}.
   *
   * @param args
   *            the command-line style arguments
   * @throws IOException
   *             if reading stdin or writing an output file fails
   * @throws IllegalArgumentException
   *             on an unknown argument, a flag without its value, an unknown
   *             input/output format name, zero or several input sources, or
   *             {@code -stdin} combined with {@code -input TREES}
   */
  public void processTextWithArgs(String[] args) throws IOException {
    String parserModel = null;
    String sentimentModel = null;
    String filename = null;
    String fileList = null;
    boolean stdin = false;
    boolean filterUnknown = false;
    List<Output> outputFormats = Collections.singletonList(Output.ROOT);
    Input inputFormat = Input.TEXT;
    String tlppClass = DEFAULT_TLPP_CLASS;

    for (int argIndex = 0; argIndex < args.length;) {
      String arg = args[argIndex];
      if (arg.equalsIgnoreCase("-sentimentModel")) {
        sentimentModel = requireValue(args, argIndex);
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-parserModel")) {
        parserModel = requireValue(args, argIndex);
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-file")) {
        filename = requireValue(args, argIndex);
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-fileList")) {
        fileList = requireValue(args, argIndex);
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-stdin")) {
        stdin = true;
        argIndex++;
      } else if (arg.equalsIgnoreCase("-input")) {
        // Locale.ROOT keeps enum-name matching independent of the default locale.
        inputFormat = Input.valueOf(requireValue(args, argIndex).toUpperCase(Locale.ROOT));
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-output")) {
        String[] formats = requireValue(args, argIndex).split(",");
        outputFormats = new ArrayList<>();
        for (String format : formats) {
          outputFormats.add(Output.valueOf(format.toUpperCase(Locale.ROOT)));
        }
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-filterUnknown")) {
        filterUnknown = true;
        argIndex++;
      } else if (arg.equalsIgnoreCase("-tlppClass")) {
        tlppClass = requireValue(args, argIndex);
        argIndex += 2;
      } else if (arg.equalsIgnoreCase("-help")) {
        log.info(USAGE);
        System.exit(0);
      } else {
        // Report the offending argument itself, not the one after it.
        log.info("Unknown argument " + arg);
        throw new IllegalArgumentException("Unknown argument " + arg);
      }
    }

    // Validate the input source before any (expensive) pipeline is built.
    int count = 0;
    if (filename != null) {
      count++;
    }
    if (fileList != null) {
      count++;
    }
    if (stdin) {
      count++;
    }
    if (count > 1) {
      throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin");
    }
    if (count == 0) {
      throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin");
    }
    if (stdin && inputFormat == Input.TREES) {
      // No tokenizer is built for tree input, so stdin lines could not be processed.
      throw new IllegalArgumentException("-stdin is only supported with -input TEXT");
    }

    // We construct two pipelines. One handles tokenization, if
    // necessary. The other takes tokenized sentences and converts
    // them to sentiment trees.
    Properties pipelineProps = new Properties();
    Properties tokenizerProps = null;
    if (sentimentModel != null) {
      pipelineProps.setProperty("sentiment.model", sentimentModel);
    }
    if (parserModel != null) {
      pipelineProps.setProperty("parse.model", parserModel);
    }
    if (inputFormat == Input.TREES) {
      pipelineProps.setProperty("annotators", "binarizer, sentiment");
      pipelineProps.setProperty("customAnnotatorClass.binarizer",
          "edu.stanford.nlp.pipeline.BinarizerAnnotator");
      pipelineProps.setProperty("binarizer.tlppClass", tlppClass);
      pipelineProps.setProperty("enforceRequirements", "false");
    } else {
      pipelineProps.setProperty("annotators", "parse, sentiment");
      pipelineProps.setProperty("enforceRequirements", "false");
      tokenizerProps = new Properties();
      tokenizerProps.setProperty("annotators", "tokenize, ssplit");
    }
    if (stdin && tokenizerProps != null) {
      tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true");
    }

    StanfordCoreNLP tokenizer =
        (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

    if (filename != null) {
      // Process a file. The pipeline will do tokenization, which
      // means it will split it into sentences as best as possible
      // with the tokenizer.
      List<Annotation> annotations =
          getAnnotations(tokenizer, inputFormat, filename, filterUnknown);
      for (Annotation annotation : annotations) {
        annotateAndOutput(pipeline, annotation, System.out, outputFormats, true);
      }
    } else if (fileList != null) {
      // Process multiple files. The pipeline will do tokenization,
      // which means it will split it into sentences as best as
      // possible with the tokenizer. Output will go to filename.out
      // for each file.
      for (String file : fileList.split(",")) {
        List<Annotation> annotations =
            getAnnotations(tokenizer, inputFormat, file, filterUnknown);
        // try-with-resources: the output file is flushed and closed even
        // if annotating one of the sentences throws.
        try (PrintStream pout = new PrintStream(new FileOutputStream(file + ".out"))) {
          for (Annotation annotation : annotations) {
            annotateAndOutput(pipeline, annotation, pout, outputFormats, true);
          }
        }
      }
    } else {
      // Process stdin. Each line will be treated as a single sentence.
      log.info("Reading in text from stdin.");
      log.info("Please enter one sentence per line.");
      log.info("Processing will end when EOF is reached.");
      BufferedReader reader = IOUtils.readerFromStdin("utf-8");
      for (String line; (line = reader.readLine()) != null;) {
        line = line.trim();
        if (!line.isEmpty()) {
          Annotation annotation = tokenizer.process(line);
          annotateAndOutput(pipeline, annotation, System.out, outputFormats, false);
        } else {
          // Output blank lines for blank lines so the tool can be
          // used for line-by-line text processing
          System.out.println();
        }
      }
    }
  }

  /**
   * Returns the predicted sentiment class of the longest sentence in
   * {@code expression}, where length is that of the sentence's
   * {@code toString()} and the first sentence wins ties.
   * <p>
   * The pipeline ({@code tokenize, ssplit, parse, sentiment}) is built on
   * the first call that needs it and reused afterwards instead of being
   * rebuilt for every call.
   *
   * @param expression
   *            the text to score; may be {@code null} or empty
   * @return the predicted class as returned by
   *         {@code RNNCoreAnnotations.getPredictedClass}, or {@code 0} when
   *         {@code expression} is {@code null}, empty, or yields no
   *         non-empty sentence
   */
  public float getNumericSentimentValue(String expression) {
    int mainSentiment = 0;
    if (expression == null || expression.isEmpty()) {
      // Nothing to score; avoid building the pipeline at all.
      return mainSentiment;
    }
    if (sentimentPipeline == null) {
      Properties props = new Properties();
      props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
      sentimentPipeline = new StanfordCoreNLP(props);
    }
    int longest = 0;
    Annotation annotation = sentimentPipeline.process(expression);
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      Tree tree = sentence.get(SentimentAnnotatedTree.class);
      int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
      String partText = sentence.toString();
      if (partText.length() > longest) {
        mainSentiment = sentiment;
        longest = partText.length();
      }
    }
    return mainSentiment;
  }
}