Example usage for edu.stanford.nlp.trees TreebankLanguagePack getTokenizerFactory

List of usage examples for edu.stanford.nlp.trees TreebankLanguagePack getTokenizerFactory

Introduction

On this page you can find an example usage for edu.stanford.nlp.trees TreebankLanguagePack getTokenizerFactory.

Prototype

TokenizerFactory<? extends HasWord> getTokenizerFactory();

Source Link

Document

Return a tokenizer factory which might be suitable for tokenizing text that will be used with this Treebank/Language pair.

Usage

From source file:reck.parser.lexparser.RECKLexicalizedParser.java

License:Open Source License

/**
 * Parses every sentence of a single (already pre-processed) document and builds a
 * RECK parse tree for each successfully parsed sentence.
 *
 * <p>For each sentence the method attempts a full parse; on failure (or on
 * {@link OutOfMemoryError}) it falls back to a PCFG-only parse when possible,
 * otherwise the sentence is skipped. Parse/skip/fallback counts are reported to
 * the error writer at the end.
 *
 * @param filename              name of the file being parsed (used only for logging)
 * @param content               raw text of the document, used to compute character positions
 * @param startSentence         character offset in {@code content} at which the first sentence starts
 * @param tokenized             if true, input is pre-tokenized and a whitespace tokenizer is forced
 * @param tokenizerFactory      tokenizer to use; if null, the treebank language pack's default is used
 * @param document              the document as a list of sentences (each a list of words)
 * @param documentPreprocessor  preprocessor whose tokenizer/punctuation/encoding settings are configured here
 * @param escaper               unused in this method body; kept for interface compatibility
 * @param tagDelimiter          unused in this method body; kept for interface compatibility
 * @return an {@link ArrayList} of {@code RECKParseTreeImpl}, one per parsed sentence
 *         (possibly with duplicates when k-best printing options are enabled — see note below)
 */
public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized,
        TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document,
        DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper,
        int tagDelimiter) {
    ArrayList treeList = new ArrayList();
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    RECKTreePrint treePrint = getRECKTreePrint(op);
    int numWords = 0;
    int numSents = 0;
    int numUnparsable = 0;
    int numNoMemory = 0;
    int numFallback = 0;
    int numSkipped = 0;
    Timing timer = new Timing();
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    // Choose the tokenizer: pre-tokenized input forces whitespace splitting;
    // otherwise fall back to the language pack's default when none was supplied.
    if (tokenized) {
        tokenizerFactory = WhitespaceTokenizer.factory();
    }
    if (tokenizerFactory == null) {
        tokenizerFactory = tlp.getTokenizerFactory();
    }
    if (Test.verbose) {
        System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords()));
        System.err.println("File encoding is: " + op.tlpParams.getInputEncoding());
    }
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding());
    boolean saidMemMessage = false;

    // Evaluation setup: likelihood scorers are only instantiated when enabled
    // via the corresponding Test.evals properties.
    boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages"));
    boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary"));
    AbstractEval.ScoreEval pcfgLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    AbstractEval.ScoreEval depLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    AbstractEval.ScoreEval factLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }

    /** Hide for performance
    timer.start();

    System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/
    PrintWriter pwo = pwOut;

    int num = 0, docIndex = startSentence;
    for (List sentence : document) {
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;

        Tree ansTree = null;
        try {
            if (!parse(sentence)) {
                pwErr.print("Sentence couldn't be parsed by grammar.");
                if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
                    pwErr.println("... falling back to PCFG parse.");
                    ansTree = getBestPCFGParse();
                    numFallback++;
                } else {
                    pwErr.println();
                    numUnparsable++;
                }
            } else {
                ansTree = getBestParse();
            }
            // Record likelihood scores for whichever evaluators are enabled.
            if (pcfgLL != null && pparser != null) {
                pcfgLL.recordScore(pparser, pwErr);
            }
            if (depLL != null && dparser != null) {
                depLL.recordScore(dparser, pwErr);
            }
            if (factLL != null && bparser != null) {
                factLL.recordScore(bparser, pwErr);
            }
        } catch (OutOfMemoryError e) {
            // -0xDEADBEEF is the sentinel for "maxLength never explicitly set";
            // any other value means the caller asked for a length we cannot handle.
            if (Test.maxLength != -0xDEADBEEF) {
                pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                throw e;
            } else {
                if (!saidMemMessage) {
                    printOutOfMemory(pwErr);
                    saidMemMessage = true;
                }
                if (pparser.hasParse() && fallbackToPCFG) {
                    try {
                        String what = "dependency";
                        if (dparser.hasParse()) {
                            what = "factored";
                        }
                        pwErr.println(
                                "Sentence too long for " + what + " parser.  Falling back to PCFG parse...");
                        ansTree = getBestPCFGParse();
                        numFallback++;
                    } catch (OutOfMemoryError oome) {
                        oome.printStackTrace();
                        numNoMemory++;
                        pwErr.println("No memory to gather PCFG parse. Skipping...");
                        pwo.println("Sentence skipped:  no PCFG fallback.");
                        // Shrink parser arrays so subsequent sentences have a chance.
                        pparser.nudgeDownArraySize();
                    }
                } else {
                    pwErr.println(
                            "Sentence has no parse using PCFG grammar (or no PCFG fallback).  Skipping...");
                    pwo.println("Sentence skipped: no PCFG fallback.");
                    numSkipped++;
                }
            }
        } catch (UnsupportedOperationException uEx) {
            pwErr.println("Sentence too long (or zero words).");
            pwo.println("Sentence skipped: too long (or zero words).");
            numWords -= len;  // skipped words don't count toward the total
            numSkipped++;
        }

        if (ansTree != null) {
            addRECKParseTree(ansTree, docIndex, sentence, content, treePrint, treeList);
        }
        // crude addition of k-best tree printing
        if (Test.printPCFGkBest > 0 && pparser.hasParse()) {
            if (ansTree != null) {
                // NOTE(review): this re-builds and re-adds the same tree already added
                // above, so treeList gets a duplicate entry when printPCFGkBest > 0.
                // Preserved from the original code — confirm whether this is intended.
                addRECKParseTree(ansTree, docIndex, sentence, content, treePrint, treeList);
            }
        } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) {
            if (ansTree != null) {
                // NOTE(review): same duplication as the printPCFGkBest branch above.
                addRECKParseTree(ansTree, docIndex, sentence, content, treePrint, treeList);
            }
        }

        // Advance the running character offset past the sentence just processed.
        // sentencePosition is set as a side effect of computePosition() inside
        // addRECKParseTree(); if ansTree was null it retains its previous value.
        docIndex = sentencePosition.getEnd().intValue();

    } // for sentence : document

    if (Test.writeOutputFiles) {
        pwo.close();
    }
    System.out.println("Parsed file: " + filename + " [" + num + " sentences].");

    /** Hide for performance
    long millis = timer.stop();

    if (summary) {
    if (pcfgLL != null) pcfgLL.display(false, pwErr);
    if (depLL != null) depLL.display(false, pwErr);
    if (factLL != null) factLL.display(false, pwErr);
    }*/

    if (saidMemMessage) {
        printOutOfMemory(pwErr);
    }
    /** Hide for performance
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!

    System.out.println("Parsed " + numWords + " words in " + numSents +
        " sentences (" + nf.format(wordspersec) + " wds/sec; " +
        nf.format(sentspersec) + " sents/sec).");
     */
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength);
        }
    }

    return treeList;
}

/**
 * Builds a {@code RECKParseTreeImpl} for one parsed sentence and appends it to
 * {@code treeList}. Extracted from three identical inline copies in
 * {@link #parseFile}; the statement order is preserved exactly.
 *
 * <p>Side effects: calls {@code computePosition}, which (presumably) updates the
 * {@code sentencePosition} field read by the caller, and overwrites the
 * {@code TDs} field with this sentence's dependencies — TODO confirm against the
 * field declarations, which are outside this view.
 *
 * @param ansTree   the parse tree produced for the sentence
 * @param docIndex  character offset of the sentence within {@code content}
 * @param sentence  the sentence's word list (cast to {@code Sentence} for positioning)
 * @param content   raw document text used to compute character positions
 * @param treePrint printer/converter used to derive dependency and constituent trees
 * @param treeList  output list receiving the new {@code RECKParseTreeImpl}
 */
private void addRECKParseTree(Tree ansTree, int docIndex, List sentence, String content,
        RECKTreePrint treePrint, List treeList) {
    computePosition(docIndex, (Sentence) sentence, content);
    TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
    if (TDs.size() > 0) {
        TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
    }
    // Dependency tree, then hyphen/point splitting passes.
    RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition);
    DPTree = this.splitHyphen_Dependency(DPTree);
    DPTree = this.splitPoint_Dependency(DPTree);
    // Constituent tree, with the same splitting passes.
    RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
    CTTree = this.splitHyphen_Constituent(CTTree);
    CTTree = this.splitPoint_Constituent(CTTree);
    treeList.add(new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree));
}