List of usage examples for edu.stanford.nlp.trees.tregex TregexMatcher getNode
public Tree getNode(String name)
From source file:com.github.kutschkem.Qgen.annotators.SimpleSentenceDecompositionAnnotator.java
License:Open Source License
/**
 * Decomposes a document into simplified sentences of the form "NP VP .".
 *
 * <p>Every sentence returned by {@code AnalysisUtilities.getSentences} is parsed, and for each
 * match of an NP that has a following sister VP anywhere under ROOT, a new sentence is built
 * from a copy of that NP and the VP. Before the sentence is assembled, clause nodes (labels
 * starting with "S") dominated by the NP copy are pruned together with an optional comma
 * directly after and an optional comma directly before them.
 *
 * @param documentText raw text of the document to decompose
 * @return one string per NP/VP match, in the order the matches were found
 */
private List<String> decompose(String documentText) {
    List<Tree> trees = new ArrayList<Tree>();
    for (String sentence : AnalysisUtilities.getSentences(documentText)) {
        trees.add(AnalysisUtilities.getInstance().parseSentence(sentence).parse);
    }

    // Both patterns are constant, so compile them once instead of once per tree / per match.
    TregexPattern clausePattern = TregexPattern.compile("ROOT << (NP=np $++ VP=vp) ");
    TregexPattern modifierPattern = TregexPattern.compile("NP << (/^S.*/=sbarq ?. /,/=c1 ?, /,/=c2)");

    List<String> result = new ArrayList<String>();
    for (Tree t : trees) {
        TregexMatcher m = clausePattern.matcher(t);
        while (m.find()) {
            Tree vp = m.getNode("vp");
            // Work on a copy so that pruning never alters the tree the matcher is iterating over.
            Tree np = m.getNode("np").deepCopy();

            List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
            ps.add(Tsurgeon.parseOperation("prune sbarq"));
            ps.add(Tsurgeon.parseOperation("prune c1"));
            ps.add(Tsurgeon.parseOperation("prune c2"));
            Tsurgeon.processPattern(modifierPattern, Tsurgeon.collectOperations(ps), np);

            Tree newTree = Tree.valueOf("(S " + np + vp + "(. .))");
            result.add(AnalysisUtilities.orginialSentence(newTree.yield()));
        }
    }
    return result;
}
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
/**
 * Remove traces and non-terminal decorations (e.g., "-SUBJ" in "NP-SUBJ") from a Penn Treebank-style tree.
 *
 * <p>The tree is modified in place: its root label is set to "ROOT", every node matching
 * {@code -NONE-} is pruned, and every non-leaf node whose label contains a hyphen is
 * relabelled with {@code tlp.basicCategory(...)} of its current value.
 *
 * @param inputTree the tree to normalize in place
 */
public void normalizeTree(Tree inputTree) {
    inputTree.label().setFromString("ROOT");

    // Step 1: prune empty (-NONE-) nodes.
    TregexPattern emptyNodePattern = TregexPatternFactory.getPattern("/\\-NONE\\-/=emptynode");
    List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
    ps.add(Tsurgeon.parseOperation("prune emptynode"));
    List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
    ops.add(new Pair<TregexPattern, TsurgeonPattern>(emptyNodePattern, Tsurgeon.collectOperations(ps)));
    Tsurgeon.processPatternsOnTree(ops, inputTree);

    // Step 2: strip decorations from non-terminals ("< __" requires the node to have a child).
    TregexPattern decoratedPattern = TregexPatternFactory.getPattern("/.+\\-.+/=nonterminal < __");
    TregexMatcher matcher = decoratedPattern.matcher(inputTree);
    while (matcher.find()) {
        Label nonterminalLabel = matcher.getNode("nonterminal");
        if (nonterminalLabel == null) {
            continue;
        }
        nonterminalLabel.setFromString(tlp.basicCategory(nonterminalLabel.value()));
    }
}
From source file:edu.cmu.ark.nlp.question.Question.java
License:Open Source License
public List<Tree> findLogicalWordsAboveIntermediateTree() { List<Tree> res = new ArrayList<Tree>(); Tree pred = intermediateTree.getChild(0).headPreTerminal(this.hf); String lemma = QuestionUtil.getLemma(pred.yield().toString(), pred.label().toString()); String tregexOpStr;// w ww.ja v a 2 s .com TregexPattern matchPattern; TregexMatcher matcher; Tree sourcePred = null; for (Tree leaf : sourceTree.getLeaves()) { Tree tmp = leaf.parent(sourceTree); String sourceLemma = QuestionUtil.getLemma(leaf.label().toString(), tmp.label().toString()); if (sourceLemma.equals(lemma)) { sourcePred = tmp; break; } } tregexOpStr = "RB|VB|VBD|VBP|VBZ|IN|MD|WRB|WDT|CC=command"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(sourceTree); Tree command; while (matcher.find() && sourcePred != null) { command = matcher.getNode("command"); if (QuestionUtil.cCommands(sourceTree, command, sourcePred) && command.parent(sourceTree) != sourcePred.parent(sourceTree)) { res.add(command); } } return res; }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * e.g., John and James like Susan. -> John likes Susan. * //from w w w . jav a2 s . co m */ private void extractConjoinedNPs(Collection<Question> extracted, Question input) { String tregexOpStr; TregexPattern matchPattern; Tree conjoinedNode; Tree parent; TregexMatcher matcher; Question newQuestion; //only extract conjoined NPs that are arguments or adjuncts of the main verb // in the tree, this means the closest S will be the one under the root tregexOpStr = "NP=parent < (CONJP|CC !< or|nor [ " + " $+ /^(N.*|PRP|SBAR)$/=child $-- /^(N.*|PRP|SBAR)$/ | " //there must be a noun on each side of the conjunction + " $-- /^(N.*|PRP|SBAR)$/=child $+ /^(N.*|PRP|SBAR)$/ ] ) " //this avoids extracting from flat NPs such as "the smaller and darker form" + " !>> (/.*/ $ (CC|CONJP !< or|nor)) " //this cannot be nested within a larger conjunction or followed by a conjunction (we recur later to catch this) + " !$ (CC|CONJP !< or|nor)" + " !.. (CC|CONJP !< or|nor > NP|PP|S|SBAR|VP) !>> SBAR "; //+ " >> (ROOT !< (S <+(VP) (/^VB.*$/ < are|were|be|seem|appear))) " ; //don't break plural predicate nominatives (e.g., "John and Mary are two of my best friends.") matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); List<Integer> nodeIndexes = new ArrayList<Integer>(); List<Integer> parentIDs = new ArrayList<Integer>(); while (matcher.find()) { //store the parents' IDs (in the tree) parent = matcher.getNode("parent"); parentIDs.add(parent.nodeNumber(input.getIntermediateTree())); conjoinedNode = matcher.getNode("child"); //store the conjoined nodes' index into their parent's list of children int idx = parent.objectIndexOf(conjoinedNode); if (!nodeIndexes.contains(idx)) nodeIndexes.add(idx); } //for each of the conjoined children, //create a new tree by removing all the nodes they are conjoined with Collections.sort(nodeIndexes);//sort, just to keep them in the original order for (int i = 0; i < nodeIndexes.size(); 
i++) { newQuestion = input.deeperCopy(); Tree t = newQuestion.getIntermediateTree(); parent = t.getNodeNumber(parentIDs.get(i)); Tree gparent = parent.parent(t); conjoinedNode = parent.getChild(nodeIndexes.get(i)); String siblingLabel; //Remove all the nodes that are conjoined //with the selected noun (or are conjunctions, commas). //These can have labels NP, NN, ..., PRP for pronouns, CC, "," for commas, ":" for semi-colons for (int j = 0; j < parent.numChildren(); j++) { if (parent.getChild(j) == conjoinedNode) continue; siblingLabel = parent.getChild(j).label().toString(); if (siblingLabel.matches("^[NCP,:S].*")) { parent.removeChild(j); j--; } } //if there is an trivial unary "NP -> NP", //remove the parent and put the child in its place if (parent.numChildren() == 1 && parent.getChild(0).label().equals("NP")) { int tmpIndex = gparent.objectIndexOf(parent); gparent.removeChild(tmpIndex); gparent.addChild(tmpIndex, parent.getChild(0)); } correctTense(conjoinedNode, gparent); addQuotationMarksIfNeeded(newQuestion.getIntermediateTree()); //if(GlobalProperties.getDebug()) System.err.println("extractConjoinedNPs: "+newQuestion.getIntermediateTree().toString()); if (this.getComputeFeatures) newQuestion.setFeatureValue("extractedFromConjoinedPhrases", 1.0); //old feature name if (this.getComputeFeatures) newQuestion.setFeatureValue("extractedFromConjoinedNPs", 1.0); extracted.add(newQuestion); } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * e.g., John ran and Bill walked. -> John ran. Bill walked. * /* w ww. ja v a2s. c om*/ */ private void extractConjoinedPhrases(Collection<Question> extracted, Question input) { String tregexOpStr; TregexPattern matchPattern; Tree conjoinedNode; TregexMatcher matcher; //Tree newTree = copy.getIntermediateTree(); Tree newTree; int nodeindex; tregexOpStr = "__ " + " [ < (VP < (/VB.*/=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) " + //get the first conjunction, to avoid spurious duplicate matches " | < (VP < (VP=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) " + // verb phrases may be conjoined by commas and adverbs (e.g., "John ran, then walked.") " | < (S|SINV < (S|SINV=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) " + " | < (S|SINV < (S|SINV=child $ (/:/ < /;/ !$++ /:/))) " + //" | < (ADJP < (JJ|JJR|ADJP=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) " + //" | < (ADVP < (RB|RBR|ADVP=child $ RB|RBR|ADVP=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) "+ //" | < (PP < (PP=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) " + " | < (SBAR < (SBAR=child $ (CC|CONJP !< or|nor !$++ CC|CONJP))) ] " + " !$ (CC|CONJP !< or|nor)" + //this cannot be nested within a larger conjunction or followed by a conjunction (we recur later to catch this) " !< (CC|CONJP !< or|nor) " + " !.. 
(CC|CONJP !< or|nor > NP|PP|S|SBAR|VP) !>> SBAR"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); while (matcher.find()) { conjoinedNode = matcher.getNode("child"); nodeindex = conjoinedNode.nodeNumber(input.getIntermediateTree()); //make a copy of the input for this iteration newTree = input.getIntermediateTree().deepCopy(); removeConjoinedSiblingsHelper(newTree, nodeindex); //for conjoined main clauses, add punctuation if necessary QuestionUtil.addPeriodIfNeeded(newTree); //make a new Question object and add it addQuotationMarksIfNeeded(newTree); Question newTreeWithFeatures = input.deeperCopy(); newTreeWithFeatures.setIntermediateTree(newTree); //if(GlobalProperties.getDebug()) System.err.println("extractConjoinedPhrases: "+newTree.toString()); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromConjoinedPhrases", 1.0); //old feature name if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromConjoined", 1.0); addIfNovel(extracted, newTreeWithFeatures); } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * //from w w w . j a v a2s .c o m * John studied, hoping to get a good grade. -> John hoped to get a good grade. * * @param extracted * @param input */ private void extractVerbParticipialModifiers(Collection<Question> extracted, Question input) { String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; tregexOpStr = "S=sub $- /,/ !< NP < (VP=participial < VBG=verb) " + " >+(VP) (S|SINV < NP=subj) " + " >> (ROOT <<# /VB.*/=tense) "; //tense determined by top-most verb matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); while (matcher.find()) { String verbPOS = findTense(matcher.getNode("tense")); Tree p = matcher.getNode("participial").deepCopy(); Tree verb = matcher.getNode("verb"); String verbLemma = QuestionUtil.getLemma(verb.getChild(0).label().toString(), verb.label().toString()); String newVerb = this.conjugator.getSurfaceForm(verbLemma, verbPOS); int verbIndex = p.objectIndexOf(verb); p.removeChild(verbIndex); p.addChild(verbIndex, QuestionUtil.readTreeFromString("(" + verbPOS + " " + newVerb + ")")); String treeStr = "(ROOT (S " + matcher.getNode("subj").toString() + " " + p.toString() + " (. .)))"; Tree newTree = QuestionUtil.readTreeFromString(treeStr); correctTense(newTree.getChild(0).getChild(0), newTree.getChild(0)); addQuotationMarksIfNeeded(newTree); Question newTreeWithFeatures = input.deeperCopy(); newTreeWithFeatures.setIntermediateTree(newTree); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromParticipial", 1.0); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromVerbParticipial", 1.0); if (this.getComputeFeatures) System.err.println("extractVerbParticipialModifiers: " + newTree.toString()); addIfNovel(extracted, newTreeWithFeatures); } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * e.g., As John slept, I studied. -> John slept. * //w ww .j a v a 2s . c om */ private void extractSubordinateClauses(Collection<Question> extracted, Question input) { Tree subord; String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; tregexOpStr = " SBAR [ > VP < IN | > S|SINV ] " + //not a complement " !< (IN < if|unless|that)" + //not a conditional antecedent " < (S=sub !< (VP < VBG)) " + //+ //not a participial phrase " >S|SINV|VP "; //not part of a noun phrase or PP (other methods for those) matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); while (matcher.find()) { Tree newTree = factory.newTreeNode("ROOT", new ArrayList<Tree>()); subord = matcher.getNode("sub"); newTree.addChild(subord.deepCopy()); QuestionUtil.addPeriodIfNeeded(newTree); addQuotationMarksIfNeeded(newTree); Question newTreeWithFeatures = input.deeperCopy(); newTreeWithFeatures.setIntermediateTree(newTree); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromFiniteClause", 1.0); //old feature name if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromSubordinateClause", 1.0); //if(GlobalProperties.getDebug()) System.err.println("extractSubordinateClauses: "+newTree.toString()); addIfNovel(extracted, newTreeWithFeatures); } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
private void extractComplementClauses(Collection<Question> extracted, Question input) { Tree subord;/*ww w .j a v a2 s. c o m*/ String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; //TODO should also address infinitive complements tregexOpStr = "SBAR " + " < (S=sub !< (VP < VBG)) " + //+ //not a participial phrase " !> NP|PP " + //not part of a noun phrase or PP (other methods for those) " [ $- /^VB.*/=verb | >+(SBAR) (SBAR $- /^VB.*/=verb) ] "; //complement of a VP (follows the verb) matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); while (matcher.find()) { Tree newTree = factory.newTreeNode("ROOT", new ArrayList<Tree>()); subord = matcher.getNode("sub"); Tree verb = matcher.getNode("verb"); String verbLemma = QuestionUtil.getLemma(verb.yield().toString(), verb.label().toString()); if (!verbImpliesComplement(verbLemma)) { continue; } newTree.addChild(subord.deepCopy()); QuestionUtil.addPeriodIfNeeded(newTree); addQuotationMarksIfNeeded(newTree); Question newTreeWithFeatures = input.deeperCopy(); newTreeWithFeatures.setIntermediateTree(newTree); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromFiniteClause", 1.0); //old feature name if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromComplementClause", 1.0); //if(GlobalProperties.getDebug()) System.err.println("extractComplementClauses: "+newTree.toString()); addIfNovel(extracted, newTreeWithFeatures); } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * e.g., Lincoln, the 16th president, was tall. -> Lincoln was the 16th president. * The meeting, in 1984, was important. -> The meeting was in 1984. *///w ww.j a va2 s. c om private void extractAppositives(Collection<Question> extracted, Question input) { String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; tregexOpStr = "NP < (NP=noun !$-- NP $+ (/,/ $++ NP|PP=appositive !$ CC|CONJP)) " + " >> (ROOT <<# /^VB.*/=mainverb) "; //extract the main verb to capture the verb tense matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); while (matcher.find()) { Tree verbtree = matcher.getNode("mainverb"); Tree nountree = matcher.getNode("noun").deepCopy(); Tree appositivetree = matcher.getNode("appositive"); makeDeterminerDefinite(nountree); //if both are proper nouns, do not extract because this is not an appositive(e.g., "Pittsburgh, PA") /*if(nountree.headPreTerminal(this.hf).label().toString().equals("NNP") && appositivetree.headPreTerminal(this.hf).label().toString().equals("NNP")) { continue; }*/ //make a new tree for a copula sentence with the noun and appositive String pos = verbtree.label().toString(); String copula; if (pos.equals("VBD")) { if (isPlural(nountree)) { copula = "(VBD were)"; } else { copula = "(VBD was)"; } } else { if (isPlural(nountree)) { copula = "(VBD are)"; } else { copula = "(VBD is)"; } } Tree newTree = QuestionUtil.readTreeFromString( "(ROOT (S " + nountree + " (VP " + copula + " " + appositivetree + ") (. .)))"); addQuotationMarksIfNeeded(newTree); //if(GlobalProperties.getDebug()) System.err.println("extractAppositives: "+ newTree.toString()); Question newTreeWithFeatures = input.deeperCopy(); newTreeWithFeatures.setIntermediateTree(newTree); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromAppositive", 1.0); addIfNovel(extracted, newTreeWithFeatures); } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * e.g., John, who is a friend of mine, likes Susan. -> John is a friend of mine. * //w ww .j a va 2 s . co m */ private void extractNonRestrictiveRelativeClauses(Collection<Question> extracted, Question input) { String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; TregexMatcher matcherclause; tregexOpStr = "NP=np < (SBAR=sbar [ < (WHADVP=wherecomp < (WRB < where)) " + " | < (WHNP !< /WP\\$/) " + " | < (WHNP=possessive < /WP\\$/)" //John, whose car was + " | < (WHPP < IN|TO=preposition) ] $-- NP $- /,/ " + " < S=relclause !< WHADJP)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(input.getIntermediateTree()); //iterate over all the relative clauses in the input //and create an output sentence for each one. while (matcher.find()) { Tree missingArgumentTree = matcher.getNode("np"); Tree relclause = matcher.getNode("relclause"); if (missingArgumentTree == null || relclause == null) continue; missingArgumentTree = missingArgumentTree.deepCopy(); relclause = relclause.deepCopy(); Tree possessive = matcher.getNode("possessive"); Tree sbar = matcher.getNode("sbar").deepCopy(); makeDeterminerDefinite(missingArgumentTree); if (possessive != null) { possessive = possessive.deepCopy(); possessive.removeChild(0); String newTree = "(NP (NP " + missingArgumentTree.toString() + " (POS 's))"; for (int i = 0; i < possessive.numChildren(); i++) newTree += possessive.getChild(i).toString() + " "; newTree += ")"; missingArgumentTree = QuestionUtil.readTreeFromString(newTree); } //remove the relative clause and the commas surrounding it from the missing argument tree for (int i = 0; i < missingArgumentTree.numChildren(); i++) { if (missingArgumentTree.getChild(i).equals(sbar)) { //remove the relative clause missingArgumentTree.removeChild(i); //remove the comma after the relative clause if (i < missingArgumentTree.numChildren() && missingArgumentTree.getChild(i).label().toString().equals(",")) { 
missingArgumentTree.removeChild(i); } //remove the comma before the relative clause if (i > 0 && missingArgumentTree.getChild(i - 1).label().toString().equals(",")) { missingArgumentTree.removeChild(i - 1); i--; } i--; } } //put the noun in the clause at the topmost place with an opening for a noun. //Note that this may mess up if there are noun phrase adjuncts like "The man I met Tuesday". //specifically: //the parent of the noun can be either a clause (S) as in "The man who met me" //or a verb phrase as in "The man who I met". //for verb phrases, add the noun to the end since it will be an object. //for clauses, add the noun to the beginning since it will be the subject. tregexOpStr = "S|VP=newparent !< NP < (VP=verb !< TO !$ TO)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcherclause = matchPattern.matcher(relclause); boolean subjectMovement = true; if (!matcherclause.find()) { tregexOpStr = "VP=newparent !< VP < /VB.*/=verb !>> (S !< NP) !<< (VP !< VP !< NP)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcherclause = matchPattern.matcher(relclause); subjectMovement = false; } //reset (so the first match isn't skipped) matcherclause = matchPattern.matcher(relclause); if (matcherclause.find()) { Tree newparenttree = matcherclause.getNode("newparent"); Tree verbtree = matcherclause.getNode("verb"); boolean ppRelativeClause = false; if (matcher.getNode("wherecomp") != null) { String tmp = "(PP (IN at) " + missingArgumentTree.toString() + ")"; missingArgumentTree = QuestionUtil.readTreeFromString(tmp); ppRelativeClause = true; subjectMovement = false; } else if (matcher.getNode("preposition") != null) { String tmp = "(PP (IN " + matcher.getNode("preposition").yield().toString() + ") " + missingArgumentTree.toString() + ")"; missingArgumentTree = QuestionUtil.readTreeFromString(tmp); ppRelativeClause = true; } if (subjectMovement) { //subject newparenttree.addChild(newparenttree.objectIndexOf(verbtree), missingArgumentTree); } 
else { // newparentlabel is VP if (ppRelativeClause) newparenttree.addChild(newparenttree.numChildren(), missingArgumentTree); else newparenttree.addChild(newparenttree.objectIndexOf(verbtree) + 1, missingArgumentTree); } //create a new tree with punctuation Tree newTree = factory.newTreeNode("ROOT", new ArrayList<Tree>()); newTree.addChild(relclause); QuestionUtil.addPeriodIfNeeded(newTree); //if(GlobalProperties.getDebug()) System.err.println("extractRelativeClauses: "+ newTree.toString()); addQuotationMarksIfNeeded(newTree); Question newTreeWithFeatures = input.deeperCopy(); newTreeWithFeatures.setIntermediateTree(newTree); if (this.getComputeFeatures) newTreeWithFeatures.setFeatureValue("extractedFromRelativeClause", 1.0); addIfNovel(extracted, newTreeWithFeatures); } } }