List of usage examples for edu.stanford.nlp.process DocumentPreprocessor setSentenceFinalPuncWords
public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords)
From source file: BuildBinarizedDataset.java
/** * Turns a text file into trees for use in a RNTN classifier such as * the treebank used in the Sentiment project. * <br>/*w w w . ja va2s .c o m*/ * The expected input file is one sentence per line, with sentences * separated by blank lines. The first line has the main label of the sentence together with the full sentence. * Lines after the first sentence line but before * the blank line will be treated as labeled sub-phrases. The * labels should start with the label and then contain a list of * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label! * For example: * <br> * <code> * 1 Today is not a good day.<br> * 3 good<br> * 3 good day <br> * 3 a good day <br> * <br> * (next block starts here) <br> * </code> * By default the englishPCFG parser is used. This can be changed * with the <code>-parserModel</code> flag. Specify an input file * with <code>-input</code>. * <br> * If a sentiment model is provided with -sentimentModel, that model * will be used to prelabel the sentences. Any spans with given * labels will then be used to adjust those labels. 
*/ public static void main(String[] arg) throws IOException { CollapseUnaryTransformer transformer = new CollapseUnaryTransformer(); // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true); String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; String args[] = { "-input", "D:\\parse.txt", "-sentimentModel", "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" }; String inputPath = "D:\\dataset\\good.txt"; String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz"; SentimentModel sentimentModel = null; /* for (int argIndex = 0; argIndex < args.length; ) { if (args[argIndex].equalsIgnoreCase("-input")) { inputPath = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { parserModel = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { sentimentModelPath = args[argIndex + 1]; argIndex += 2; } else { System.err.println("Unknown argument " + args[argIndex]); System.exit(2); } }*/ if (inputPath == null) { throw new IllegalArgumentException("Must specify input file with -input"); } LexicalizedParser parser = LexicalizedParser.loadModel(parserModel); TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); if (sentimentModelPath != null) { sentimentModel = SentimentModel.loadSerialized(sentimentModelPath); } String text = IOUtils.slurpFileNoExceptions(inputPath); String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk for (String chunk : chunks) { if (chunk.trim().isEmpty()) { continue; } // The expected format is that line 0 will be the text of the // sentence, and each subsequence line, if any, will be a value // followed by the sequence of tokens that get that value. // Here we take the first line and tokenize it as one sentence. 
String[] lines = chunk.trim().split("\\n"); String sentence = lines[0]; StringReader sin = new StringReader(sentence); DocumentPreprocessor document = new DocumentPreprocessor(sin); document.setSentenceFinalPuncWords(new String[] { "\n" }); List<HasWord> tokens = document.iterator().next(); Integer mainLabel = new Integer(tokens.get(0).word()); //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; "); tokens = tokens.subList(1, tokens.size()); //System.err.println(tokens); Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap(); for (int i = 1; i < lines.length; ++i) { extractLabels(spanToLabels, tokens, lines[i]); } // TODO: add an option which treats the spans as constraints when parsing Tree tree = parser.apply(tokens); Tree binarized = binarizer.transformTree(tree); Tree collapsedUnary = transformer.transformTree(binarized); // if there is a sentiment model for use in prelabeling, we // label here and then use the user given labels to adjust if (sentimentModel != null) { Trees.convertToCoreLabels(collapsedUnary); SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null); scorer.forwardPropagateTree(collapsedUnary); setPredictedLabels(collapsedUnary); } else { setUnknownLabels(collapsedUnary, mainLabel); //collapsedUnary.label().setValue(mainLabel.toString()); //System.out.println("Root"+collapsedUnary.getNodeNumber(1)); } Trees.convertToCoreLabels(collapsedUnary); collapsedUnary.indexSpans(); for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) { setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue()); } String x = collapsedUnary.toString(); //x.replaceAll("\\s",""); x = x.replace("(", "["); x = x.replace(")", "]"); //writer.write(x); //writer.write("\r\n"); System.out.println(x); //System.out.println(); } //writer.close(); }
From source file: reck.parser.lexparser.RECKLexicalizedParser.java
License: Open Source License
/**
 * Parses each pre-segmented sentence of one document and converts every
 * successful parse into a {@code RECKParseTreeImpl} holding typed
 * dependencies plus dependency- and constituent-tree views.
 *
 * @param filename              name used only in log output
 * @param content              raw document text, used to compute character positions
 * @param startSentence        initial value of the running character index {@code docIndex}
 * @param tokenized            if true, forces a whitespace tokenizer
 * @param tokenizerFactory     tokenizer to install on the preprocessor (may be null;
 *                             then the language pack's default is used)
 * @param document             sentences to parse, one token list per sentence
 * @param documentPreprocessor preprocessor whose tokenizer/punctuation/encoding are configured here
 * @param escaper              NOTE(review): unused in this body — confirm before removing
 * @param tagDelimiter         NOTE(review): unused in this body — confirm before removing
 * @return raw {@code ArrayList} of {@code RECKParseTreeImpl}, one per parsed sentence
 */
public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized,
        TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document,
        DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper,
        int tagDelimiter) {
    ArrayList treeList = new ArrayList();
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    RECKTreePrint treePrint = getRECKTreePrint(op);
    // Per-run counters reported at the end of the method.
    int numWords = 0;
    int numSents = 0;
    int numUnparsable = 0;
    int numNoMemory = 0;
    int numFallback = 0;
    int numSkipped = 0;
    Timing timer = new Timing(); // only referenced by the "Hide for performance" comments below
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
    // set the tokenizer: -tokenized input forces whitespace splitting,
    // otherwise fall back to the language pack's factory when none was given
    if (tokenized) {
        tokenizerFactory = WhitespaceTokenizer.factory();
    }
    if (tokenizerFactory == null) {
        tokenizerFactory = tlp.getTokenizerFactory();
    }
    if (Test.verbose) {
        System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords()));
        System.err.println("File encoding is: " + op.tlpParams.getInputEncoding());
    }
    // Configure the preprocessor from the language pack / parser options.
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding());
    boolean saidMemMessage = false;
    // evaluation setup: optional likelihood scorers driven by Test.evals properties
    boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages"));
    boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary")); // only used in commented-out code below
    AbstractEval.ScoreEval pcfgLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    AbstractEval.ScoreEval depLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    AbstractEval.ScoreEval factLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }
    /** Hide for performance
    timer.start();
    System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/
    PrintWriter pwo = pwOut;
    // docIndex is a running character offset into content, advanced per sentence.
    int num = 0, docIndex = startSentence;
    for (List sentence : document) {
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;
        Tree ansTree = null;
        try {
            // Full (factored) parse first; on failure optionally fall back to PCFG.
            if (!parse(sentence)) {
                pwErr.print("Sentence couldn't be parsed by grammar.");
                if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
                    pwErr.println("... falling back to PCFG parse.");
                    ansTree = getBestPCFGParse();
                    numFallback++;
                } else {
                    pwErr.println();
                    numUnparsable++;
                }
            } else {
                ansTree = getBestParse();
            }
            // Record likelihood scores for whichever component parsers are active.
            if (pcfgLL != null && pparser != null) {
                pcfgLL.recordScore(pparser, pwErr);
            }
            if (depLL != null && dparser != null) {
                depLL.recordScore(dparser, pwErr);
            }
            if (factLL != null && bparser != null) {
                factLL.recordScore(bparser, pwErr);
            }
        } catch (OutOfMemoryError e) {
            if (Test.maxLength != -0xDEADBEEF) {
                // this means they explicitly asked for a length they cannot handle. Throw exception.
                pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                throw e;
            } else {
                if (!saidMemMessage) {
                    printOutOfMemory(pwErr);
                    saidMemMessage = true;
                }
                if (pparser.hasParse() && fallbackToPCFG) {
                    try {
                        String what = "dependency";
                        if (dparser.hasParse()) {
                            what = "factored";
                        }
                        pwErr.println("Sentence too long for " + what + " parser. Falling back to PCFG parse...");
                        ansTree = getBestPCFGParse();
                        numFallback++;
                    } catch (OutOfMemoryError oome) {
                        // Even the PCFG fallback ran out of memory; shrink arrays and skip.
                        oome.printStackTrace();
                        numNoMemory++;
                        pwErr.println("No memory to gather PCFG parse. Skipping...");
                        pwo.println("Sentence skipped: no PCFG fallback.");
                        pparser.nudgeDownArraySize();
                    }
                } else {
                    pwErr.println("Sentence has no parse using PCFG grammar (or no PCFG fallback). Skipping...");
                    pwo.println("Sentence skipped: no PCFG fallback.");
                    numSkipped++;
                }
            }
        } catch (UnsupportedOperationException uEx) {
            pwErr.println("Sentence too long (or zero words).");
            pwo.println("Sentence skipped: too long (or zero words).");
            numWords -= len;
            numSkipped++;
        }
        if (ansTree != null) {
            // Build RECK dependency and constituent views of the parse.
            // NOTE(review): TDs, reckTreeList and sentencePosition are fields
            // declared elsewhere in this class; computePosition presumably
            // updates sentencePosition as a side effect — confirm.
            computePosition(docIndex, (Sentence) sentence, content);
            TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
            if (TDs.size() > 0)
                TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
            RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition);
            DPTree = this.splitHyphen_Dependency(DPTree);
            DPTree = this.splitPoint_Dependency(DPTree);
            RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
            CTTree = this.splitHyphen_Constituent(CTTree);
            CTTree = this.splitPoint_Constituent(CTTree);
            RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
            treeList.add(rpTree);
        }
        // crude addition of k-best tree printing
        // NOTE(review): both k-best branches duplicate the block above verbatim,
        // so a non-null ansTree can be added to treeList a second time here.
        if (Test.printPCFGkBest > 0 && pparser.hasParse()) {
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
                treeList.add(rpTree);
            }
        } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) {
            // DZ: debug n best trees
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList, sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
                treeList.add(rpTree);
            }
        }
        // Advance the character index to the end of the sentence just processed.
        docIndex = sentencePosition.getEnd().intValue();
    } // for sentence : document
    if (Test.writeOutputFiles) {
        pwo.close();
    }
    System.out.println("Parsed file: " + filename + " [" + num + " sentences].");
    /** Hide for performance
    long millis = timer.stop();
    if (summary) {
        if (pcfgLL != null) pcfgLL.display(false, pwErr);
        if (depLL != null) depLL.display(false, pwErr);
        if (factLL != null) factLL.display(false, pwErr);
    }*/
    if (saidMemMessage) {
        printOutOfMemory(pwErr);
    }
    /** Hide for performance
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.out.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec)
            + " wds/sec; " + nf.format(sentspersec) + " sents/sec)."); */
    // Final per-run summary of fallbacks and skipped sentences.
    if (numFallback > 0) {
        pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength);
        }
    }
    return treeList;
}