List of usage examples for edu.stanford.nlp.trees.TreebankLanguagePack.getTokenizerFactory()
// Signature of the method these examples demonstrate. In the example below it is
// called on a TreebankLanguagePack to obtain a default tokenizer factory when the
// caller supplied none.
TokenizerFactory<? extends HasWord> getTokenizerFactory();
From source file: reck.parser.lexparser.RECKLexicalizedParser.java
License: Open Source License
/** Parse the files with names given in the String array args elements from * index argIndex on.//from w ww. j a va 2s . c o m */ public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized, TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document, DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper, int tagDelimiter) { ArrayList treeList = new ArrayList(); PrintWriter pwOut = op.tlpParams.pw(); PrintWriter pwErr = op.tlpParams.pw(System.err); RECKTreePrint treePrint = getRECKTreePrint(op); int numWords = 0; int numSents = 0; int numUnparsable = 0; int numNoMemory = 0; int numFallback = 0; int numSkipped = 0; Timing timer = new Timing(); TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack(); // set the tokenizer if (tokenized) { tokenizerFactory = WhitespaceTokenizer.factory(); } if (tokenizerFactory == null) { tokenizerFactory = tlp.getTokenizerFactory(); } if (Test.verbose) { System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory); System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords())); System.err.println("File encoding is: " + op.tlpParams.getInputEncoding()); } documentPreprocessor.setTokenizerFactory(tokenizerFactory); documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords()); documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding()); boolean saidMemMessage = false; // evaluation setup boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages")); boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary")); AbstractEval.ScoreEval pcfgLL = null; if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) { pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages); } AbstractEval.ScoreEval depLL = null; if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) { depLL = new AbstractEval.ScoreEval("depLL", 
runningAverages); } AbstractEval.ScoreEval factLL = null; if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) { factLL = new AbstractEval.ScoreEval("factLL", runningAverages); } /** Hide for performance timer.start(); System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/ PrintWriter pwo = pwOut; int num = 0, docIndex = startSentence; for (List sentence : document) { // System.out.println(sentence.toString()); num++; numSents++; int len = sentence.size(); numWords += len; Tree ansTree = null; try { if (!parse(sentence)) { pwErr.print("Sentence couldn't be parsed by grammar."); if (pparser != null && pparser.hasParse() && fallbackToPCFG) { pwErr.println("... falling back to PCFG parse."); ansTree = getBestPCFGParse(); numFallback++; } else { pwErr.println(); numUnparsable++; } } else { // System.out.println("Score: " + lp.pparser.bestScore); ansTree = getBestParse(); } if (pcfgLL != null && pparser != null) { pcfgLL.recordScore(pparser, pwErr); } if (depLL != null && dparser != null) { depLL.recordScore(dparser, pwErr); } if (factLL != null && bparser != null) { factLL.recordScore(bparser, pwErr); } } catch (OutOfMemoryError e) { if (Test.maxLength != -0xDEADBEEF) { // this means they explicitly asked for a length they cannot handle. Throw exception. pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength); pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength); throw e; } else { if (!saidMemMessage) { printOutOfMemory(pwErr); saidMemMessage = true; } if (pparser.hasParse() && fallbackToPCFG) { try { String what = "dependency"; if (dparser.hasParse()) { what = "factored"; } pwErr.println( "Sentence too long for " + what + " parser. Falling back to PCFG parse..."); ansTree = getBestPCFGParse(); numFallback++; } catch (OutOfMemoryError oome) { oome.printStackTrace(); numNoMemory++; pwErr.println("No memory to gather PCFG parse. 
Skipping..."); pwo.println("Sentence skipped: no PCFG fallback."); pparser.nudgeDownArraySize(); } } else { pwErr.println( "Sentence has no parse using PCFG grammar (or no PCFG fallback). Skipping..."); pwo.println("Sentence skipped: no PCFG fallback."); numSkipped++; } } } catch (UnsupportedOperationException uEx) { pwErr.println("Sentence too long (or zero words)."); pwo.println("Sentence skipped: too long (or zero words)."); numWords -= len; numSkipped++; } if (ansTree != null) { computePosition(docIndex, (Sentence) sentence, content); TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition); if (TDs.size() > 0) TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size()); RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition); DPTree = this.splitHyphen_Dependency(DPTree); DPTree = this.splitPoint_Dependency(DPTree); RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content); CTTree = this.splitHyphen_Constituent(CTTree); CTTree = this.splitPoint_Constituent(CTTree); RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree); treeList.add(rpTree); } // crude addition of k-best tree printing if (Test.printPCFGkBest > 0 && pparser.hasParse()) { if (ansTree != null) { computePosition(docIndex, (Sentence) sentence, content); TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition); if (TDs.size() > 0) TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size()); RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition); DPTree = this.splitHyphen_Dependency(DPTree); DPTree = this.splitPoint_Dependency(DPTree); RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content); CTTree = this.splitHyphen_Constituent(CTTree); CTTree = this.splitPoint_Constituent(CTTree); RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree); 
treeList.add(rpTree); } } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) { // DZ: debug n best trees if (ansTree != null) { computePosition(docIndex, (Sentence) sentence, content); TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition); if (TDs.size() > 0) TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size()); RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition); DPTree = this.splitHyphen_Dependency(DPTree); DPTree = this.splitPoint_Dependency(DPTree); RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content); CTTree = this.splitHyphen_Constituent(CTTree); CTTree = this.splitPoint_Constituent(CTTree); RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree); treeList.add(rpTree); } } docIndex = sentencePosition.getEnd().intValue(); } // for sentence : document if (Test.writeOutputFiles) { pwo.close(); } System.out.println("Parsed file: " + filename + " [" + num + " sentences]."); /** Hide for performance long millis = timer.stop(); if (summary) { if (pcfgLL != null) pcfgLL.display(false, pwErr); if (depLL != null) depLL.display(false, pwErr); if (factLL != null) factLL.display(false, pwErr); }*/ if (saidMemMessage) { printOutOfMemory(pwErr); } /** Hide for performance double wordspersec = numWords / (((double) millis) / 1000); double sentspersec = numSents / (((double) millis) / 1000); NumberFormat nf = new DecimalFormat("0.00"); // easier way! 
System.out.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec)."); */ if (numFallback > 0) { pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG."); } if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) { pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:"); if (numUnparsable > 0) { pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability."); } if (numNoMemory > 0) { pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory."); } if (numSkipped > 0) { pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength); } } return treeList; }