Example usage for edu.stanford.nlp.process DocumentPreprocessor setSentenceFinalPuncWords

Introduction

This page collects example usages of edu.stanford.nlp.process.DocumentPreprocessor#setSentenceFinalPuncWords.

Prototype

public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) 

Document

Sets the end-of-sentence delimiters.
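
For orientation, here is a minimal, self-contained sketch of how the method is typically called (DocumentPreprocessor and setSentenceFinalPuncWords are the Stanford NLP API documented above; the input string and the choice of delimiters are only illustrative). Construct a DocumentPreprocessor over a Reader, replace the default sentence-final punctuation words (tokens such as ".", "?" and "!"), and iterate over the resulting sentences:

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

import java.io.StringReader;
import java.util.List;

public class SentenceSplitSketch {
    public static void main(String[] args) {
        DocumentPreprocessor dp = new DocumentPreprocessor(
                new StringReader("Really? Mr. Smith agrees! Good."));
        // With only '!' and '?' as delimiters, '.' no longer ends a sentence.
        dp.setSentenceFinalPuncWords(new String[] { "!", "?" });
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence);
        }
    }
}

Both examples below turn on the same idea: BuildBinarizedDataset sets "\n" as the only delimiter so that each input line comes back as one sentence, while RECKLexicalizedParser restores the delimiters defined by the treebank language pack.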

Usage

From source file: BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file has one block per sentence, with blocks
 * separated by blank lines.  The first line of a block holds the main
 * label of the sentence together with the full sentence.  Lines after
 * the first line but before the blank line are treated as labeled
 * sub-phrases; each starts with a label followed by the list of tokens
 * the label applies to.  All phrases that do not have their own label
 * take on the main sentence label.  For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with <code>-sentimentModel</code>, that
 * model will be used to prelabel the sentences.  Any spans with labels
 * given in the input will then override the predicted labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    // Paths and model locations are hardcoded in this example; the
    // original command-line parsing loop is left commented out below.
    String[] args = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";

    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /* for (int argIndex = 0; argIndex < args.length; ) {
       if (args[argIndex].equalsIgnoreCase("-input")) {
         inputPath = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
         parserModel = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
         sentimentModelPath = args[argIndex + 1];
         argIndex += 2;
       } else {
         System.err.println("Unknown argument " + args[argIndex]);
         System.exit(2);
       }
     }*/

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
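        // Treat the newline as the only sentence-final "word" so the
        // whole line is consumed as a single sentence.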
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        // The first token on the line is the sentence's main label.
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root"+collapsedUnary.getNodeNumber(1));
        }

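        // Re-index the spans so the user-given span labels can be
        // attached to the matching subtrees below.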
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
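        // Print the tree using square brackets instead of parentheses.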
        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n"); 
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}

From source file: reck.parser.lexparser.RECKLexicalizedParser.java

License:Open Source License

/** Parses the sentences of {@code document} (extracted from the
 *  {@code content} of the file {@code filename}), starting from
 *  {@code startSentence}, and returns a list of RECKParseTreeImpl results.
 */
public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized,
        TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document,
        DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper,
        int tagDelimiter) {
    ArrayList treeList = new ArrayList();
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    RECKTreePrint treePrint = getRECKTreePrint(op);
    int numWords = 0;
    int numSents = 0;
    int numUnparsable = 0;
    int numNoMemory = 0;
    int numFallback = 0;
    int numSkipped = 0;
    Timing timer = new Timing();
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
    // set the tokenizer
    if (tokenized) {
        tokenizerFactory = WhitespaceTokenizer.factory();
    }
    if (tokenizerFactory == null) {
        tokenizerFactory = tlp.getTokenizerFactory();
    }
    if (Test.verbose) {
        System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords()));
        System.err.println("File encoding is: " + op.tlpParams.getInputEncoding());
    }
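    // Configure the preprocessor: tokenizer, the language pack's
    // sentence-final punctuation words, and the input encoding.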
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding());
    boolean saidMemMessage = false;

    // evaluation setup
    boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages"));
    boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary"));
    AbstractEval.ScoreEval pcfgLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    AbstractEval.ScoreEval depLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    AbstractEval.ScoreEval factLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }

    /** Hide for performance
    timer.start();
            
    System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/
    PrintWriter pwo = pwOut;

    int num = 0, docIndex = startSentence;
    for (List sentence : document) {
        // System.out.println(sentence.toString());
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;

        Tree ansTree = null;
        try {
            if (!parse(sentence)) {
                pwErr.print("Sentence couldn't be parsed by grammar.");
                if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
                    pwErr.println("... falling back to PCFG parse.");
                    ansTree = getBestPCFGParse();
                    numFallback++;
                } else {
                    pwErr.println();
                    numUnparsable++;
                }
            } else {
                // System.out.println("Score: " + lp.pparser.bestScore);
                ansTree = getBestParse();
            }
            if (pcfgLL != null && pparser != null) {
                pcfgLL.recordScore(pparser, pwErr);
            }
            if (depLL != null && dparser != null) {
                depLL.recordScore(dparser, pwErr);
            }
            if (factLL != null && bparser != null) {
                factLL.recordScore(bparser, pwErr);
            }
        } catch (OutOfMemoryError e) {
            if (Test.maxLength != -0xDEADBEEF) {
                // this means they explicitly asked for a length they cannot handle. Throw exception.
                pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                throw e;
            } else {
                if (!saidMemMessage) {
                    printOutOfMemory(pwErr);
                    saidMemMessage = true;
                }
                if (pparser.hasParse() && fallbackToPCFG) {
                    try {
                        String what = "dependency";
                        if (dparser.hasParse()) {
                            what = "factored";
                        }
                        pwErr.println(
                                "Sentence too long for " + what + " parser.  Falling back to PCFG parse...");
                        ansTree = getBestPCFGParse();
                        numFallback++;
                    } catch (OutOfMemoryError oome) {
                        oome.printStackTrace();
                        numNoMemory++;
                        pwErr.println("No memory to gather PCFG parse. Skipping...");
                        pwo.println("Sentence skipped:  no PCFG fallback.");
                        pparser.nudgeDownArraySize();
                    }
                } else {
                    pwErr.println(
                            "Sentence has no parse using PCFG grammar (or no PCFG fallback).  Skipping...");
                    pwo.println("Sentence skipped: no PCFG fallback.");
                    numSkipped++;
                }
            }
        } catch (UnsupportedOperationException uEx) {
            pwErr.println("Sentence too long (or zero words).");
            pwo.println("Sentence skipped: too long (or zero words).");
            numWords -= len;
            numSkipped++;
        }

        if (ansTree != null) {
            computePosition(docIndex, (Sentence) sentence, content);
            TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
            if (TDs.size() > 0)
                TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
            RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                    sentencePosition);
            DPTree = this.splitHyphen_Dependency(DPTree);
            DPTree = this.splitPoint_Dependency(DPTree);
            RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
            CTTree = this.splitHyphen_Constituent(CTTree);
            CTTree = this.splitPoint_Constituent(CTTree);
            RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
            treeList.add(rpTree);
        }
        // crude addition of k-best tree printing
        if (Test.printPCFGkBest > 0 && pparser.hasParse()) {
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }

        } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) {
            // DZ: debug n best trees
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }
        }

        docIndex = sentencePosition.getEnd().intValue();

    } // for sentence : document

    if (Test.writeOutputFiles) {
        pwo.close();
    }
    System.out.println("Parsed file: " + filename + " [" + num + " sentences].");

    /** Hide for performance
    long millis = timer.stop();
            
    if (summary) {
    if (pcfgLL != null) pcfgLL.display(false, pwErr);
    if (depLL != null) depLL.display(false, pwErr);
    if (factLL != null) factLL.display(false, pwErr);
    }*/

    if (saidMemMessage) {
        printOutOfMemory(pwErr);
    }
    /** Hide for performance
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
            
    System.out.println("Parsed " + numWords + " words in " + numSents +
        " sentences (" + nf.format(wordspersec) + " wds/sec; " +
        nf.format(sentspersec) + " sents/sec).");
     */
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength);
        }
    }

    return treeList;
}