BuildBinarizedDataset.java Source code

Introduction

Here is the source code for BuildBinarizedDataset.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 *
 * @author rahuliyer
 */
import java.io.StringReader;
import java.util.List;
import java.util.Map;
import java.io.FileWriter;
import java.io.IOException;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.sentiment.CollapseUnaryTransformer;
import edu.stanford.nlp.sentiment.SentimentCostAndGradient;
import edu.stanford.nlp.sentiment.SentimentModel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
 * Command-line utility that turns labeled sentences into binarized,
 * unary-collapsed parse trees whose non-leaf labels are sentiment classes.
 * All methods are static; see {@link #main(String[])} for the input format
 * and the supported flags.
 *
 * @author John Bauer
 * @author Richard Socher
 */
public class BuildBinarizedDataset {

    /** Parser model used when {@code -parserModel} is not given. */
    private static final String DEFAULT_PARSER_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

    /** Input file used when {@code -input} is not given (kept so a no-argument run behaves as before). */
    private static final String DEFAULT_INPUT_PATH = "D:\\dataset\\good.txt";

    /** Sentiment model used for prelabeling when {@code -sentimentModel} is not given. */
    private static final String DEFAULT_SENTIMENT_MODEL = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";

    private BuildBinarizedDataset() {
    } // static methods only

    /**
     * Sets the label of every non-leaf node of the tree to the given default
     * value. Leaves are left untouched.
     *
     * @param tree         root of the (sub)tree to relabel
     * @param defaultLabel value written, via {@code toString()}, to each non-leaf label
     */
    public static void setUnknownLabels(Tree tree, Integer defaultLabel) {
        if (tree.isLeaf()) {
            return;
        }

        for (Tree child : tree.children()) {
            setUnknownLabels(child, defaultLabel);
        }

        tree.label().setValue(defaultLabel.toString());
    }

    /**
     * Replaces the label of every non-leaf node with the predicted class
     * read from that node through {@link RNNCoreAnnotations#getPredictedClass(Tree)}.
     * Leaves are left untouched.
     *
     * @param tree root of the (sub)tree to relabel
     */
    public static void setPredictedLabels(Tree tree) {
        if (tree.isLeaf()) {
            return;
        }

        for (Tree child : tree.children()) {
            setPredictedLabels(child);
        }

        tree.label().setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree)));
    }

    /**
     * Parses one "label token token ..." line and records, for every place
     * the token sequence occurs in {@code tokens}, a mapping from the
     * half-open token span {@code (begin, end)} to the label.
     * <br>
     * Blank lines are ignored. Phrases that do not occur in {@code tokens}
     * add nothing to the map.
     *
     * @param spanToLabels map that receives the span-to-label entries
     * @param tokens       tokens of the sentence (without the leading sentence label)
     * @param line         a label followed by the whitespace-separated words it applies to
     * @throws RuntimeException if the line holds a label but no words
     */
    public static void extractLabels(Map<Pair<Integer, Integer>, String> spanToLabels, List<HasWord> tokens,
            String line) {
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            // Must be checked before splitting: "".split("\\s+") yields [""],
            // so a length test on the split result never detects a blank line.
            return;
        }
        String[] pieces = trimmed.split("\\s+");
        if (pieces.length == 1) {
            String error = "Found line with label " + line + " but no tokens to associate with that line";
            throw new RuntimeException(error);
        }

        // pieces[0] is the label; the remaining entries are the phrase to locate.
        int phraseLength = pieces.length - 1;

        //TODO: BUG: The pieces are tokenized differently than the splitting, e.g., on possessive markers as in "actors' expenses"
        for (int start = 0; start + phraseLength <= tokens.size(); ++start) {
            boolean found = true;
            for (int offset = 0; offset < phraseLength; ++offset) {
                if (!tokens.get(start + offset).word().equals(pieces[offset + 1])) {
                    found = false;
                    break;
                }
            }
            if (found) {
                spanToLabels.put(new Pair<>(start, start + phraseLength), pieces[0]);
            }
        }
    }

    /**
     * Finds the first node, in pre-order, whose begin/end index annotations
     * equal the given span and sets its label value.
     * <br>
     * Subtrees that cannot contain the span are skipped. This assumes child
     * spans are nested inside their parent's span, which is what
     * {@code Tree.indexSpans()} produces.
     *
     * @param tree  root of the (sub)tree to search; every label must be a {@link CoreLabel}
     * @param span  begin and end token index to look for
     * @param value label value to assign to the matching node
     * @return {@code true} if a node with exactly this span was found and relabeled
     * @throws AssertionError if a visited node does not carry a {@link CoreLabel}
     */
    public static boolean setSpanLabel(Tree tree, Pair<Integer, Integer> span, String value) {
        if (!(tree.label() instanceof CoreLabel)) {
            throw new AssertionError("Expected CoreLabels");
        }
        CoreLabel label = (CoreLabel) tree.label();
        Integer begin = label.get(CoreAnnotations.BeginIndexAnnotation.class);
        Integer end = label.get(CoreAnnotations.EndIndexAnnotation.class);
        if (begin.equals(span.first) && end.equals(span.second)) {
            label.setValue(value);
            return true;
        }
        // A node that starts after the span begins, or ends before the span
        // ends, cannot contain the span, and neither can any of its descendants.
        if (begin > span.first || end < span.second) {
            return false;
        }
        for (Tree child : tree.children()) {
            if (setSpanLabel(child, span, value)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Turns a text file into trees for use in a RNTN classifier such as
     * the treebank used in the Sentiment project.
     * <br>
     * The expected input file is one sentence per line, with sentences
     * separated by blank lines. The first line has the main label of the
     * sentence together with the full sentence. Lines after the first
     * sentence line but before the blank line will be treated as labeled
     * sub-phrases. Each of them starts with the label and then contains the
     * list of tokens the label applies to.
     * For example:
     * <br>
     * <code>
     * 1 Today is not a good day.<br>
     * 3 good<br>
     * 3 good day <br>
     * 3 a good day <br>
     * <br>
     * (next block starts here) <br>
     * </code>
     * Supported flags, each followed by a value:
     * <ul>
     * <li><code>-input</code>: the input file (default <code>D:\dataset\good.txt</code>)</li>
     * <li><code>-parserModel</code>: the parser model (default englishPCFG)</li>
     * <li><code>-sentimentModel</code>: the sentiment model used to prelabel
     * the sentences (default the bundled sentiment model)</li>
     * </ul>
     * Every non-leaf node is first labeled with the class predicted by the
     * sentiment model. Any spans with given labels are then used to adjust
     * those labels. Each resulting tree is printed on its own line to
     * standard output with parentheses replaced by square brackets.
     *
     * @param arg command-line flags as described above
     * @throws IOException              if the input file cannot be read
     * @throws IllegalArgumentException on an unknown flag, a flag without a value,
     *                                  or a sentence that does not start with an integer label
     */
    public static void main(String[] arg) throws IOException {
        String parserModel = DEFAULT_PARSER_MODEL;
        String inputPath = DEFAULT_INPUT_PATH;
        String sentimentModelPath = DEFAULT_SENTIMENT_MODEL;

        for (int argIndex = 0; argIndex < arg.length; argIndex += 2) {
            String flag = arg[argIndex];
            if (flag.equalsIgnoreCase("-input")) {
                inputPath = optionValue(arg, argIndex);
            } else if (flag.equalsIgnoreCase("-parserModel")) {
                parserModel = optionValue(arg, argIndex);
            } else if (flag.equalsIgnoreCase("-sentimentModel")) {
                sentimentModelPath = optionValue(arg, argIndex);
            } else {
                throw new IllegalArgumentException("Unknown argument " + flag);
            }
        }

        CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
        LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
        TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
                parser.treebankLanguagePack());
        SentimentModel sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);

        String text = IOUtils.slurpFileNoExceptions(inputPath);
        if (text == null) {
            // Guard for library versions that report a read failure by returning null.
            throw new IOException("Unable to read input file " + inputPath);
        }
        String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

        for (String chunk : chunks) {
            if (chunk.trim().isEmpty()) {
                continue;
            }
            Tree labeled = buildLabeledTree(chunk, parser, binarizer, transformer, sentimentModel);
            String x = labeled.toString();
            x = x.replace("(", "[");
            x = x.replace(")", "]");
            System.out.println(x);
        }
    } // end main

    /** Returns the value that follows the flag at {@code flagIndex}, or fails if the flag is the last argument. */
    private static String optionValue(String[] arg, int flagIndex) {
        if (flagIndex + 1 >= arg.length) {
            throw new IllegalArgumentException("Missing value for argument " + arg[flagIndex]);
        }
        return arg[flagIndex + 1];
    }

    /**
     * Builds the labeled tree for one input chunk: parses the sentence on the
     * first line, binarizes it, collapses unary chains, labels the nodes and
     * applies the sub-phrase labels from the remaining lines.
     * When {@code sentimentModel} is null, every non-leaf node starts with the
     * main sentence label instead of a predicted class.
     */
    private static Tree buildLabeledTree(String chunk, LexicalizedParser parser, TreeBinarizer binarizer,
            CollapseUnaryTransformer transformer, SentimentModel sentimentModel) {
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];

        // Tokenize the first line as one sentence.
        DocumentPreprocessor document = new DocumentPreprocessor(new StringReader(sentence));
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();

        // The first token is the main sentence label; the rest is the sentence itself.
        Integer mainLabel;
        try {
            mainLabel = Integer.valueOf(tokens.get(0).word());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Sentence must start with an integer label: " + sentence, e);
        }
        tokens = tokens.subList(1, tokens.size());

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
        }

        // setSpanLabel needs CoreLabels carrying begin/end index annotations.
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> spanLabel : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, spanLabel.getKey(), spanLabel.getValue());
        }
        return collapsedUnary;
    }

}