List of usage examples for the edu.stanford.nlp.util.Pair constructor
public Pair(T1 first, T2 second)
From source file:BuildBinarizedDataset.java
public static void extractLabels(Map<Pair<Integer, Integer>, String> spanToLabels, List<HasWord> tokens, String line) {//from w w w .ja v a2s.c om String[] pieces = line.trim().split("\\s+"); if (pieces.length == 0) { return; } if (pieces.length == 1) { String error = "Found line with label " + line + " but no tokens to associate with that line"; throw new RuntimeException(error); } //TODO: BUG: The pieces are tokenized differently than the splitting, e.g., on possessive markers as in "actors' expenses" for (int i = 0; i < tokens.size() - pieces.length + 2; ++i) { boolean found = true; for (int j = 1; j < pieces.length; ++j) { if (!tokens.get(i + j - 1).word().equals(pieces[j])) { found = false; break; } } if (found) { spanToLabels.put(new Pair<>(i, i + pieces.length - 1), pieces[0]); } } }
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
public static void addPeriodIfNeeded(Tree input) { String tregexOpStr = "ROOT < (S=mainclause !< /\\./)"; TregexPattern matchPattern = TregexPatternFactory.getPattern(tregexOpStr); TregexMatcher matcher = matchPattern.matcher(input); if (matcher.find()) { TsurgeonPattern p;//from ww w . ja v a2s . c o m List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); ps.add(Tsurgeon.parseOperation("insert (. .) >-1 mainclause")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, input); } }
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
/**
 * Remove traces and non-terminal decorations (e.g., "-SUBJ" in "NP-SUBJ") from a Penn
 * Treebank-style tree.
 *
 * <p>The method works in place and in three steps: the root label is set to "ROOT", every node
 * matching {@code -NONE-} is pruned, and every node that has a child and whose label contains a
 * hyphen between other characters is relabeled with {@code tlp.basicCategory(...)} of its
 * current value.
 *
 * @param inputTree the tree to normalize; it is modified in place
 */
public void normalizeTree(Tree inputTree) {
    inputTree.label().setFromString("ROOT");

    // Step 1: prune empty (-NONE-) nodes.
    TregexPattern emptyNodePattern = TregexPatternFactory.getPattern("/\\-NONE\\-/=emptynode");
    List<TsurgeonPattern> pruneSteps = new ArrayList<TsurgeonPattern>();
    pruneSteps.add(Tsurgeon.parseOperation("prune emptynode"));

    List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
    ops.add(new Pair<TregexPattern, TsurgeonPattern>(emptyNodePattern, Tsurgeon.collectOperations(pruneSteps)));
    Tsurgeon.processPatternsOnTree(ops, inputTree);

    // Step 2: reduce decorated non-terminal labels to their basic category.
    TregexPattern decoratedPattern = TregexPatternFactory.getPattern("/.+\\-.+/=nonterminal < __");
    TregexMatcher matcher = decoratedPattern.matcher(inputTree);
    while (matcher.find()) {
        Label nonterminalLabel = matcher.getNode("nonterminal");
        if (nonterminalLabel == null) {
            continue;
        }
        nonterminalLabel.setFromString(tlp.basicCategory(nonterminalLabel.value()));
    }
}
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
/**
 * Prunes unbalanced quotation-mark nodes from a tree: an opening {@code ``} with no closing
 * {@code ''} after it, or a closing {@code ''} with no opening {@code ``} before it.
 *
 * <p>This is a workaround for the PTB convention by which the two quote marks of a quotation
 * are not necessarily in the same constituent. The tree is modified in place.
 *
 * @param input the tree to clean up
 */
public static void removeExtraQuotes(Tree input) {
    String unmatchedQuoteExpr = "ROOT [ << (``=quote < `` !.. ('' < '')) | << (''=quote < '' !,, (`` < ``)) ] ";
    TregexPattern unmatchedQuote = TregexPatternFactory.getPattern(unmatchedQuoteExpr);

    List<TsurgeonPattern> edits = new ArrayList<TsurgeonPattern>();
    edits.add(Tsurgeon.parseOperation("prune quote"));
    TsurgeonPattern surgery = Tsurgeon.collectOperations(edits);

    List<Pair<TregexPattern, TsurgeonPattern>> operations = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
    operations.add(new Pair<TregexPattern, TsurgeonPattern>(unmatchedQuote, surgery));
    Tsurgeon.processPatternsOnTree(operations, input);
}
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * /*from w w w . j a va2 s. c om*/ * e.g., However, John did not study. -> John did not study. * * @param q * @return */ private boolean removeClauseLevelModifiers(Question q) { List<Pair<TregexPattern, TsurgeonPattern>> ops; String tregexOpStr; TregexPattern matchPattern; TsurgeonPattern p; List<TsurgeonPattern> ps; boolean modified = false; //remove subordinate clauses and various phrases //leave conditional antecedents (i.e., with "if" or "unless" as complementizers. punt on "even if") tregexOpStr = "ROOT=root < (S=mainclause < (/SBAR|ADVP|ADJP|CC|PP|S|NP/=fronted !< (IN < if|unless) !$ `` $++ NP=subject))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); TregexMatcher matcher = matchPattern.matcher(q.getIntermediateTree()); if (matcher.find()) { ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "ROOT=root < (S=mainclause < (/[,:]/=comma $ (/SBAR|ADVP|ADJP|CC|PP|S|NP/=fronted !< (IN < if|unless) $++ NP=subject)))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("prune comma")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "ROOT=root < (S=mainclause < (/SBAR|ADVP|ADJP|CC|PP|S|NP/=fronted !< (IN < if|unless) $++ NP=subject))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("prune fronted")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); addQuotationMarksIfNeeded(q.getIntermediateTree()); if (this.getComputeFeatures) q.setFeatureValue("removedClauseLevelModifiers", 1.0); modified = true; } return modified; }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * /* w ww.ja va2s . co m*/ * e.g., John studied, hoping to get a good grade. -> John studied. * * @param input * @return whether or not a change was made */ private boolean removeVerbalModifiersAfterCommas(Question q) { List<Pair<TregexPattern, TsurgeonPattern>> ops; String tregexOpStr; TregexPattern matchPattern; TsurgeonPattern p; List<TsurgeonPattern> ps; ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); tregexOpStr = "ROOT=root << (VP !< VP < (/,/=comma $+ /[^`].*/=modifier))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); //remove modifiers ps = new ArrayList<TsurgeonPattern>(); if (matchPattern.matcher(q.getIntermediateTree()).find()) { ps.add(Tsurgeon.parseOperation("prune modifier")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); //now remove the comma ops.clear(); ps.clear(); tregexOpStr = "ROOT=root << (VP !< VP < /,/=comma)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("prune comma")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); addQuotationMarksIfNeeded(q.getIntermediateTree()); if (this.getComputeFeatures) q.setFeatureValue("removedVerbalModifiersAfterCommas", 1.0); return true; } else { return false; } }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/**
 * Inserts a closing-quote node {@code ('' '')} as the last child of any node that has an
 * opening-quote child with no closing quote after it.
 *
 * <p>The tree is modified in place; nothing happens when no such node is found.
 *
 * @param input the tree to inspect and, if needed, modify
 */
private void addQuotationMarksIfNeeded(Tree input) {
    TregexPattern danglingOpenQuote = TregexPatternFactory.getPattern("__=parent < (/`/ !.. /'/)");
    if (!danglingOpenQuote.matcher(input).find()) {
        return;
    }

    List<TsurgeonPattern> edits = new ArrayList<TsurgeonPattern>();
    edits.add(Tsurgeon.parseOperation("insert ('' '') >-1 parent"));

    List<Pair<TregexPattern, TsurgeonPattern>> operations = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
    operations.add(new Pair<TregexPattern, TsurgeonPattern>(danglingOpenQuote, Tsurgeon.collectOperations(edits)));
    Tsurgeon.processPatternsOnTree(operations, input);
}
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * Convert a non-definite determiner to "the". * Used when extracting from noun modifiers such as relative clauses. * E.g., "A tall man, who was named Bob, entered the store." * -> "A tall man was named Bob."/*from w w w . j av a 2 s. c o m*/ * -> "THE tall man was named Bob." * * @param np */ private void makeDeterminerDefinite(Tree np) { String tregexOpStr = "NP !> __ <+(NP) (DT=det !< the)"; TregexPattern matchPattern = TregexPatternFactory.getPattern(tregexOpStr); TsurgeonPattern p; List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); ps.add(Tsurgeon.parseOperation("replace det (DT the)")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, np); }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * e.g., John, who hoped to get a good grade, studied. -> John studied. * /* www . ja va2 s .c om*/ */ private boolean removeNonRestrRelClausesAndParticipials(Question q) { List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); String tregexOpStr; TregexPattern matchPattern; TsurgeonPattern p; List<TsurgeonPattern> ps; ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "NP < (VP|SBAR=mod $- /,/=punc !$+ /,/ !$ CC|CONJP)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); boolean modified = false; if (matchPattern.matcher(q.getIntermediateTree()).find()) { ps.add(Tsurgeon.parseOperation("prune punc")); ps.add(Tsurgeon.parseOperation("prune mod")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); if (this.getComputeFeatures) q.setFeatureValue("removedNonRestrRelClausesAndParticipials", 1.0); modified = true; } ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "NP < (VP|SBAR=mod $- /,/=punc $+ /,/=punc2 !$ CC|CONJP)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (matchPattern.matcher(q.getIntermediateTree()).find()) { ps.add(Tsurgeon.parseOperation("prune punc")); ps.add(Tsurgeon.parseOperation("prune mod")); ps.add(Tsurgeon.parseOperation("prune punc2")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); if (this.getComputeFeatures) q.setFeatureValue("removedNonRestrRelClausesAndParticipials", 1.0); modified = true; } return modified; }
From source file:edu.cmu.ark.nlp.sent.SentenceSimplifier.java
License:Open Source License
/** * /* w w w.j a v a 2 s .co m*/ * e.g., John Smith (1931-1992) was a fireman. -> John Smith was a Fireman. * * @return whether or not a change was made */ private boolean removeParentheticals(Question q) { List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); String tregexOpStr; TregexPattern matchPattern; TsurgeonPattern p; List<TsurgeonPattern> ps; boolean res = false; ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "__=parenthetical [ $- /-LRB-/=leadingpunc $+ /-RRB-/=trailingpunc " + " | $+ /,/=leadingpunc $- /,/=trailingpunc !$ CC|CONJP " + " | $+ (/:/=leadingpunc < --) $- (/:/=trailingpunc < /--/) ]"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (matchPattern.matcher(q.getIntermediateTree()).find()) { ps.add(Tsurgeon.parseOperation("prune leadingpunc")); ps.add(Tsurgeon.parseOperation("prune parenthetical")); ps.add(Tsurgeon.parseOperation("prune trailingpunc")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); if (res) addQuotationMarksIfNeeded(q.getIntermediateTree()); if (this.getComputeFeatures) q.setFeatureValue("removedParentheticals", 1.0); res = true; } ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "PRN=parenthetical"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (matchPattern.matcher(q.getIntermediateTree()).find()) { ps.add(Tsurgeon.parseOperation("prune parenthetical")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, q.getIntermediateTree()); if (this.getComputeFeatures) q.setFeatureValue("removedParentheticals", 1.0); res = true; } return res; }