Example usage for edu.stanford.nlp.process WhitespaceTokenizer factory

List of usage examples for edu.stanford.nlp.process WhitespaceTokenizer factory

Introduction

This page shows example usage of the factory() method of edu.stanford.nlp.process.WhitespaceTokenizer.

Prototype

public static TokenizerFactory<Word> factory() 

Source Link

Usage

From source file: reck.parser.lexparser.RECKLexicalizedParser.java

License:Open Source License

/** Parse the files with names given in the String array args elements from
 *  index argIndex on./* w ww .  java2 s . com*/
 */
public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized,
        TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document,
        DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper,
        int tagDelimiter) {
    ArrayList treeList = new ArrayList();
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    RECKTreePrint treePrint = getRECKTreePrint(op);
    int numWords = 0;
    int numSents = 0;
    int numUnparsable = 0;
    int numNoMemory = 0;
    int numFallback = 0;
    int numSkipped = 0;
    Timing timer = new Timing();
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
    // set the tokenizer
    if (tokenized) {
        tokenizerFactory = WhitespaceTokenizer.factory();
    }
    if (tokenizerFactory == null) {
        tokenizerFactory = tlp.getTokenizerFactory();
    }
    if (Test.verbose) {
        System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords()));
        System.err.println("File encoding is: " + op.tlpParams.getInputEncoding());
    }
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding());
    boolean saidMemMessage = false;

    // evaluation setup
    boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages"));
    boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary"));
    AbstractEval.ScoreEval pcfgLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    AbstractEval.ScoreEval depLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    AbstractEval.ScoreEval factLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }

    /** Hide for performance
    timer.start();
            
    System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/
    PrintWriter pwo = pwOut;

    int num = 0, docIndex = startSentence;
    for (List sentence : document) {
        // System.out.println(sentence.toString());
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;

        Tree ansTree = null;
        try {
            if (!parse(sentence)) {
                pwErr.print("Sentence couldn't be parsed by grammar.");
                if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
                    pwErr.println("... falling back to PCFG parse.");
                    ansTree = getBestPCFGParse();
                    numFallback++;
                } else {
                    pwErr.println();
                    numUnparsable++;
                }
            } else {
                // System.out.println("Score: " + lp.pparser.bestScore);
                ansTree = getBestParse();
            }
            if (pcfgLL != null && pparser != null) {
                pcfgLL.recordScore(pparser, pwErr);
            }
            if (depLL != null && dparser != null) {
                depLL.recordScore(dparser, pwErr);
            }
            if (factLL != null && bparser != null) {
                factLL.recordScore(bparser, pwErr);
            }
        } catch (OutOfMemoryError e) {
            if (Test.maxLength != -0xDEADBEEF) {
                // this means they explicitly asked for a length they cannot handle. Throw exception.
                pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                throw e;
            } else {
                if (!saidMemMessage) {
                    printOutOfMemory(pwErr);
                    saidMemMessage = true;
                }
                if (pparser.hasParse() && fallbackToPCFG) {
                    try {
                        String what = "dependency";
                        if (dparser.hasParse()) {
                            what = "factored";
                        }
                        pwErr.println(
                                "Sentence too long for " + what + " parser.  Falling back to PCFG parse...");
                        ansTree = getBestPCFGParse();
                        numFallback++;
                    } catch (OutOfMemoryError oome) {
                        oome.printStackTrace();
                        numNoMemory++;
                        pwErr.println("No memory to gather PCFG parse. Skipping...");
                        pwo.println("Sentence skipped:  no PCFG fallback.");
                        pparser.nudgeDownArraySize();
                    }
                } else {
                    pwErr.println(
                            "Sentence has no parse using PCFG grammar (or no PCFG fallback).  Skipping...");
                    pwo.println("Sentence skipped: no PCFG fallback.");
                    numSkipped++;
                }
            }
        } catch (UnsupportedOperationException uEx) {
            pwErr.println("Sentence too long (or zero words).");
            pwo.println("Sentence skipped: too long (or zero words).");
            numWords -= len;
            numSkipped++;
        }

        if (ansTree != null) {
            computePosition(docIndex, (Sentence) sentence, content);
            TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
            if (TDs.size() > 0)
                TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
            RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                    sentencePosition);
            DPTree = this.splitHyphen_Dependency(DPTree);
            DPTree = this.splitPoint_Dependency(DPTree);
            RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
            CTTree = this.splitHyphen_Constituent(CTTree);
            CTTree = this.splitPoint_Constituent(CTTree);
            RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
            treeList.add(rpTree);
        }
        // crude addition of k-best tree printing
        if (Test.printPCFGkBest > 0 && pparser.hasParse()) {
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }

        } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) {
            // DZ: debug n best trees
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }
        }

        docIndex = sentencePosition.getEnd().intValue();

    } // for sentence : document

    if (Test.writeOutputFiles) {
        pwo.close();
    }
    System.out.println("Parsed file: " + filename + " [" + num + " sentences].");

    /** Hide for performance
    long millis = timer.stop();
            
    if (summary) {
    if (pcfgLL != null) pcfgLL.display(false, pwErr);
    if (depLL != null) depLL.display(false, pwErr);
    if (factLL != null) factLL.display(false, pwErr);
    }*/

    if (saidMemMessage) {
        printOutOfMemory(pwErr);
    }
    /** Hide for performance
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
            
    System.out.println("Parsed " + numWords + " words in " + numSents +
        " sentences (" + nf.format(wordspersec) + " wds/sec; " +
        nf.format(sentspersec) + " sents/sec).");
     */
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength);
        }
    }

    return treeList;
}