List of usage examples for edu.stanford.nlp.trees Tree toString
@Override
public String toString()
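Before the project-specific examples below, here is a minimal, self-contained sketch of the method itself: Tree.toString() serializes a tree back to its Penn Treebank bracketing. The class name is invented for illustration, and it assumes a Stanford CoreNLP jar on the classpath; Tree.valueOf builds the tree inline, so no parser model is needed.

import edu.stanford.nlp.trees.Tree;

public class TreeToStringDemo {
    public static void main(String[] args) {
        // build a tree directly from its bracketed form...
        Tree tree = Tree.valueOf("(ROOT (S (NP (DT The) (NN cat)) (VP (VBD sat))))");
        // ...and toString() reproduces that bracketing
        System.out.println(tree.toString());
    }
}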
From source file:edu.cmu.ark.SentenceSimplifier.java
License:Open Source License
/**
 * e.g., In January, John wore his winter coat. -> John wore his winter coat in January.
 *
 * @param input
 */
private void moveLeadingPPsAndQuotes(Question input) {
    String tregexOpStr;
    TregexPattern matchPattern;
    TregexMatcher matcher;
    Tree mainvp = null;
    Tree subj = null;

    tregexOpStr = "ROOT < (S|SINV=mainclause < (NP|SBAR=subj !$++ /,/) < VP=mainvp "
            + " [ < (PP=modifier < NP) " //must be a PP with an NP object
            + "| < (S=modifier < SBAR|NP <<# VB|VBD|VBP|VBZ) ] ) "; //OR: a quote, which is an S clause with a subject and finite main verb
    //the modifiers to move must be immediately followed by commas

    matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
    matcher = matchPattern.matcher(input.getIntermediateTree());
    List<Tree> modifiers = new ArrayList<Tree>();
    while (matcher.find()) {
        if (mainvp == null) {
            mainvp = matcher.getNode("mainvp").deeperCopy();
            subj = matcher.getNode("subj").deeperCopy();
        }
        Tree mainclause = matcher.getNode("mainclause");
        Tree modifier = matcher.getNode("modifier").deeperCopy();
        int idx = mainclause.indexOf(modifier);
        if (modifiers.contains(modifier))
            continue; //just in case the tregex expression catches duplicates

        //add commas and quotation marks if they appeared in the original
        if (idx > 0 && mainclause.getChild(idx - 1).label().toString().equals("``")) {
            modifiers.add(AnalysisUtilities.getInstance().readTreeFromString("(, ,)"));
            modifiers.add(AnalysisUtilities.getInstance().readTreeFromString("(`` ``)"));
            Tree sbar = factory.newTreeNode("SBAR", new ArrayList<Tree>());
            sbar.addChild(modifier);
            modifiers.add(sbar);
            modifiers.add(AnalysisUtilities.getInstance().readTreeFromString("('' '')"));
        } else {
            modifiers.add(modifier);
        }
    }

    if (mainvp != null) { //any matches?
        for (Tree m : modifiers) {
            mainvp.addChild(m);
        }
        Tree newTree = factory.newTreeNode("ROOT", new ArrayList<Tree>());
        Tree clause = factory.newTreeNode("S", new ArrayList<Tree>());
        newTree.addChild(clause);
        clause.addChild(subj);
        clause.addChild(mainvp);

        AnalysisUtilities.addPeriodIfNeeded(newTree);
        addQuotationMarksIfNeeded(newTree);
        if (GlobalProperties.getDebug())
            System.err.println("moveLeadingModifiers: " + newTree.toString());
        input.setIntermediateTree(newTree);
        if (GlobalProperties.getComputeFeatures())
            input.setFeatureValue("movedLeadingPPs", 1.0);
    }
}
From source file:edu.cmu.ark.SentenceSimplifier.java
License:Open Source License
/**
 * e.g., John, hoping to get a good grade, studied. -> John hoped to get a good grade.
 *       Walking to the store, John saw Susan. -> John was walking to the store.
 *
 * NOTE: This method produces false positives for sentences like,
 * "Broadly speaking, the project was successful."
 * where the participial phrase does not modify the subject.
 *
 * @param extracted
 * @param input
 */
private void extractNounParticipialModifiers(Collection<Question> extracted, Question input) {
    String tregexOpStr;
    TregexPattern matchPattern;
    TregexMatcher matcher;

    tregexOpStr = "ROOT < (S "
            + " [ << (NP < (NP=subj $++ (/,/ $+ (VP=modifier <# VBN|VBG|VP=tense )))) " //modifiers that appear after nouns
            + " | < (S !< NP|SBAR < (VP=modifier <# VBN|VBG|VP=tense) $+ (/,/ $+ NP=subj)) " //modifiers before the subject. e.g., Founded by John, the company...
            + " | < (SBAR < (S !< NP|SBAR < (VP=modifier <# VBN|VBG=tense)) $+ (/,/ $+ NP=subj)) " //e.g., While walking to the store, John saw Susan.
            + " | < (PP=modifier !< NP <# VBG=tense $+ (/,/ $+ NP=subj)) ] ) " //e.g., Walking to the store, John saw Susan.
            + " <<# /^VB.*$/=maintense "; //tense determined by top-most verb

    matchPattern = TregexPatternFactory.getPattern(tregexOpStr);
    matcher = matchPattern.matcher(input.getIntermediateTree());
    while (matcher.find()) {
        Tree nountree = matcher.getNode("subj").deeperCopy();
        Tree vptree = matcher.getNode("modifier");
        Tree verb = matcher.getNode("tense");
        if (vptree == null || nountree == null) //check before first use
            return;
        makeDeterminerDefinite(nountree);

        if (vptree.label().toString().equals("PP"))
            vptree.label().setValue("VP");
        String verbPOS = findTense(matcher.getNode("maintense"));

        String newTreeStr;
        if (verb.label().toString().equals("VBG")) {
            //for present participials, change the tense to the tense of the main verb
            //e.g., walking to the store -> walked to the store
            String verbLemma = AnalysisUtilities.getInstance().getLemma(verb.getChild(0).label().toString(),
                    verb.label().toString());
            String newVerb = AnalysisUtilities.getInstance().getSurfaceForm(verbLemma, verbPOS);
            int verbIndex = vptree.indexOf(verb);
            vptree = vptree.deeperCopy();
            vptree.removeChild(verbIndex);
            vptree.addChild(verbIndex,
                    AnalysisUtilities.getInstance().readTreeFromString("(" + verbPOS + " " + newVerb + ")"));
            newTreeStr = "(ROOT (S " + matcher.getNode("subj").toString() + " " + vptree.toString() + " (. .)))";
        } else {
            //for past participials, add a copula
            //e.g., John, exhausted, -> John was exhausted
            //(or for conjunctions, just add the copula---kind of a hack to make the moby dick sentence work out)
            String auxiliary;
            if (verbPOS.equals("VBP") || verbPOS.equals("VBD")) {
                if (isPlural(nountree))
                    auxiliary = "(VBD were)";
                else
                    auxiliary = "(VBD was)";
            } else {
                if (isPlural(nountree))
                    auxiliary = "(VB are)";
                else
                    auxiliary = "(VBZ is)";
            }
            newTreeStr = "(ROOT (S " + nountree + " (VP " + auxiliary + " " + vptree + ") (. .)))";
        }

        Tree newTree = AnalysisUtilities.getInstance().readTreeFromString(newTreeStr);
        correctTense(newTree.getChild(0).getChild(0), newTree.getChild(0));
        addQuotationMarksIfNeeded(newTree);

        if (GlobalProperties.getDebug())
            System.err.println("extractNounParticipialModifiers: " + newTree.toString());
        Question newTreeWithFeatures = input.deeperCopy();
        newTreeWithFeatures.setIntermediateTree(newTree);
        if (GlobalProperties.getComputeFeatures())
            newTreeWithFeatures.setFeatureValue("extractedFromParticipial", 1.0); //old feature name
        if (GlobalProperties.getComputeFeatures())
            newTreeWithFeatures.setFeatureValue("extractedFromNounParticipial", 1.0);
        extracted.add(newTreeWithFeatures);
    }
}
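Both SentenceSimplifier methods above follow the same pattern: match a Tregex expression against the tree, pull out named nodes, and use toString() to splice or debug-print their bracketings. A stripped-down sketch of that pattern (hypothetical class name; it calls TregexPattern.compile directly rather than the project's TregexPatternFactory cache):

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

public class TregexToStringDemo {
    public static void main(String[] args) {
        Tree tree = Tree.valueOf("(ROOT (S (NP (NNP John)) (VP (VBD slept))))");
        // match an NP that has a VP as a later sister
        TregexPattern pattern = TregexPattern.compile("NP=subj $++ VP=pred");
        TregexMatcher matcher = pattern.matcher(tree);
        while (matcher.find()) {
            // toString() on a matched node yields its bracketed subtree
            System.out.println("subj: " + matcher.getNode("subj").toString());
            System.out.println("pred: " + matcher.getNode("pred").toString());
        }
    }
}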
From source file:edu.cmu.deiis.annotator.StanfordCoreNLPAnnotator.java
License:Open Source License
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

        // create the token annotation
        int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
        int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
        String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
        String lemma = tokenAnn.get(LemmaAnnotation.class);
        Token token = new Token(jCas, begin, end);
        token.setPos(pos);
        token.setLemma(lemma);
        token.addToIndexes();

        // hackery to convert token-level named entity tag into phrase-level tag
        String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
        if (neTag.equals("O") && !lastNETag.equals("O")) {
            NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
            ne.setMentionType(lastNETag);
            ne.addToIndexes();
        } else {
            if (lastNETag.equals("O")) {
                lastNEBegin = begin;
            } else if (lastNETag.equals(neTag)) {
                // do nothing - begin was already set
            } else {
                NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
                ne.setMentionType(lastNETag);
                ne.addToIndexes();
                lastNEBegin = begin;
            }
            lastNEEnd = end;
        }
        lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

        // add the sentence annotation
        int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
        int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
        Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
        sentence.addToIndexes();

        // add the syntactic tree annotation
        List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
        Tree tree = sentenceAnn.get(TreeAnnotation.class);
        if (tree.children().length != 1) {
            throw new RuntimeException("Expected single root node, found " + tree);
        }
        tree = tree.firstChild();
        tree.indexSpans(0);
        TopTreebankNode root = new TopTreebankNode(jCas);
        root.setTreebankParse(tree.toString());
        // TODO: root.setTerminals(v)
        this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

        // get the dependencies
        SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

        // convert Stanford nodes to UIMA annotations
        List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
        Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
        for (IndexedWord stanfordNode : dependencies.vertexSet()) {
            int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
            int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
            int tokenBegin = tokens.get(indexBegin).getBegin();
            int tokenEnd = tokens.get(indexEnd - 1).getEnd();
            DependencyNode node;
            if (dependencies.getRoots().contains(stanfordNode)) {
                node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
            } else {
                node = new DependencyNode(jCas, tokenBegin, tokenEnd);
            }
            stanfordToUima.put(stanfordNode, node);
        }

        // create relation annotations for each Stanford dependency
        ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
        ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
        for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
            DependencyRelation relation = new DependencyRelation(jCas);
            DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
            DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
            String relationType = stanfordEdge.getRelation().toString();
            if (head == null || child == null || relationType == null) {
                throw new RuntimeException(String.format(
                        "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation,
                        child, head));
            }
            relation.setHead(head);
            relation.setChild(child);
            relation.setRelation(relationType);
            relation.addToIndexes();
            headRelations.put(child, relation);
            childRelations.put(head, relation);
        }

        // set the relations for each node annotation
        for (DependencyNode node : stanfordToUima.values()) {
            List<DependencyRelation> heads = headRelations.get(node);
            node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
            if (heads != null) {
                FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
            }
            List<DependencyRelation> children = childRelations.get(node);
            node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
            if (children != null) {
                FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
            }
            node.addToIndexes();
        }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
        List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
        for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

            // figure out the character span of the token
            List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
            int begin = tokens.get(corefMention.startIndex - 1).getBegin();
            int end = tokens.get(corefMention.endIndex - 2).getEnd();

            // use an existing named entity mention when possible; otherwise create a new one
            NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
            if (mention == null) {
                mention = new NamedEntityMention(jCas, begin, end);
                mention.addToIndexes();
            }
            mentions.add(mention);
        }

        // create an entity for the mentions
        Collections.sort(mentions, new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
                return m1.getBegin() - m2.getBegin();
            }
        });

        // create mentions and add them to entity
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, mentions.size()));
        int index = 0;
        for (NamedEntityMention mention : mentions) {
            mention.setMentionedEntity(entity);
            entity.setMentions(index, mention);
            index += 1;
        }
        entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
        if (mention.getMentionedEntity() == null) {
            NamedEntity entity = new NamedEntity(jCas);
            entity.setMentions(new FSArray(jCas, 1));
            entity.setMentions(0, mention);
            mention.setMentionedEntity(entity);
            entity.getMentions();
            entities.add(entity);
        }
    }

    // sort entities by document order
    Collections.sort(entities, new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(),
                    NamedEntityMention.class)) {
                if (mention.getBegin() < min) {
                    min = mention.getBegin();
                }
            }
            return min;
        }
    });

    // add entities to document
    for (NamedEntity entity : entities) {
        entity.addToIndexes();
    }
}
From source file:edu.cornell.law.entitylinking.utils.Utility.java
public static List<String> getInnerNounPhrases(String paragraph) {
    List<String> nounPhrases = new ArrayList<String>();
    try {
        StringTokenizer tokenizer = new StringTokenizer(paragraph, "\\.;?,:");
        while (tokenizer.hasMoreTokens()) {
            Annotation document = new Annotation(tokenizer.nextToken());
            pipeline.annotate(document);
            Tree tree = null;
            // these are all the sentences in this document
            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
            List<CoreMap> sentences = document.get(SentencesAnnotation.class);
            for (CoreMap sentence : sentences) {
                // the parse tree of the current sentence
                tree = sentence.get(TreeAnnotation.class);
                List<Tree> phraseList = new ArrayList<Tree>();
                for (Tree subtree : tree) {
                    if (subtree.label().value().equals("NP") || subtree.label().value().equals("WHNP")) {
                        phraseList.add(subtree);
                    }
                }
                for (Tree subList : phraseList) {
                    // keep only the innermost NPs/WHNPs: skip any phrase whose
                    // bracketing still contains a nested NP or WHNP
                    boolean skipPhrase = false;
                    StringBuilder phraseString = new StringBuilder();
                    String phrase = subList.toString();
                    String[] tokens = phrase.split(" ");
                    for (String token : tokens) {
                        if (token.contains("(")) {
                            if (token.contains("(NP")) {
                                // check whether there are more NPs or WHNPs in it
                                String subPhrase = phrase.replaceFirst("\\(NP", "");
                                if (subPhrase.contains("(NP") || subPhrase.contains("(WHNP")) {
                                    skipPhrase = true;
                                    break;
                                }
                            } else if (token.contains("(WHNP")) {
                                // check whether there are more NPs or WHNPs in it
                                String subPhrase = phrase.replaceFirst("\\(WHNP", "");
                                if (subPhrase.contains("(NP") || subPhrase.contains("(WHNP")) {
                                    skipPhrase = true;
                                    break;
                                }
                            }
                            // otherwise just drop the phrase-label token
                        } else {
                            phraseString.append(token.replace(")", "")).append(' ');
                        }
                    }
                    if (!skipPhrase) {
                        String temp = phraseString.toString().trim();
                        // strip a leading article, case-insensitively
                        if (temp.matches("(?i)the .*"))
                            temp = temp.replaceFirst("(?i)^the ", "");
                        else if (temp.matches("(?i)an .*"))
                            temp = temp.replaceFirst("(?i)^an ", "");
                        else if (temp.matches("(?i)a .*"))
                            temp = temp.replaceFirst("(?i)^a ", "");
                        if (temp.contains(" or ")) {
                            for (String s : temp.split(" or ")) {
                                nounPhrases.add(s);
                            }
                        } else {
                            nounPhrases.add(temp);
                        }
                    }
                }
            }
        }
    } catch (OutOfMemoryError e) {
        System.out.println("Result too long to read into memory");
    }
    return nounPhrases;
}
From source file:edu.rpi.tw.linkipedia.search.nlp.NaturalLanguageProcessor.java
License:Open Source License
private List<String> getNounPhraseFromParseTree(Tree parse) {
    List<String> phraseList = new ArrayList<String>();
    for (Tree subtree : parse) {
        if (subtree.label().value().equals("NP")) {
            String subtreeString = subtree.toString();
            // skip NPs that contain a nested NP: toString() gives the bracketing,
            // so a second "(NP" means this is not an innermost noun phrase
            if (subtreeString.lastIndexOf("(NP") != subtreeString.indexOf("(NP"))
                continue;
            List<LabeledWord> words = subtree.labeledYield();
            String currentPhrase = "";
            for (LabeledWord word : words) {
                currentPhrase += word.word() + "|" + word.tag() + " ";
            }
            currentPhrase = currentPhrase.trim();
            phraseList.add(currentPhrase);
        }
    }
    return phraseList;
}
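The innermost-NP test above works because toString() emits the full bracketing of the subtree, so a second "(NP" can only come from a nested noun phrase. A minimal sketch of the same check in isolation (class name invented; the tree is written inline so no parser is needed):

import edu.stanford.nlp.trees.Tree;

public class InnermostNpDemo {
    public static void main(String[] args) {
        Tree parse = Tree.valueOf(
                "(ROOT (S (NP (NP (DT the) (NN dog)) (PP (IN in) (NP (DT the) (NN yard)))) (VP (VBD barked))))");
        // iterating a Tree visits every subtree in preorder
        for (Tree subtree : parse) {
            if (subtree.label().value().equals("NP")) {
                String s = subtree.toString();
                boolean innermost = s.indexOf("(NP") == s.lastIndexOf("(NP");
                System.out.println((innermost ? "innermost: " : "nested:    ") + s);
            }
        }
    }
}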
From source file:elkfed.expletives.ExpletiveInstance.java
License:Apache License
public ExpletiveInstance(Tree root, Tree pronoun, String id) {
    _root = root;
    _pronoun = pronoun;
    _id = id;
    List<Tree> wordsT = root.getLeaves();
    List<Label> posT = root.preTerminalYield();
    // get words and POS into an array so that
    // we get an idea of the pronoun's surrounding
    String[] words = new String[wordsT.size()];
    String[] pos = new String[wordsT.size()];
    if (!root.dominates(pronoun)) {
        System.err.format("%s does not dominate %s. WTF?", root, pronoun);
    }
    for (int here = 0; here < wordsT.size(); here++) {
        Tree w1 = wordsT.get(here);
        Label p1 = posT.get(here);
        words[here] = w1.toString();
        pos[here] = p1.value();
        if (w1 == pronoun) {
            _idx = here;
        } else if (pronoun.dominates(w1)) {
            _idx = here;
            pronoun = w1;
        }
    }
    assert _idx >= 0 : String.format("wanted %s in %s", pronoun, root);
    assert pos[_idx].equals("PRP") : String.format("wanted PRP got '%s'", pos[_idx]);
    _words = words;
    _pos = pos;
}
From source file:elkfed.expletives.TrainingData.java
License:Apache License
public static void extractExamples(String file, Set<String> anaphoricPronouns,
        List<ExpletiveInstance> instances) throws FileNotFoundException, IOException {
    TreeReader tr = new PennTreeReader(new FileReader(file), new LabeledScoredTreeFactory(),
            new BobChrisTreeNormalizer());
    Tree t;
    String file_id = file.substring(file.length() - 8, file.length() - 4);
    int sent_idx = 1;
    while ((t = tr.readTree()) != null) {
        int word_idx = 1;
        for (Tree t1 : t.getLeaves()) {
            String s = t1.toString();
            if ("it".equals(s) || "It".equals(s)) {
                String id = String.format("%s:S%d:%d-%d", file_id, sent_idx, word_idx, word_idx);
                ExpletiveInstance inst = new ExpletiveInstance(t, t1, id);
                boolean is_positive = anaphoricPronouns.contains(id);
                inst.setFeature(PairInstance.FD_POSITIVE, !is_positive);
                instances.add(inst);
                String cls = is_positive ? "+1" : "-1";
                System.out.format("%s\t%s\t(%s)\n", s, id, cls);
            }
            word_idx++;
        }
        sent_idx++;
    }
}
From source file:elkfed.mmax.importer.DetermineMinSpan.java
License:Apache License
/** adds min_ids and min_span attributes so that
 *  BART's chunk-based coref resolution works
 */
public static void addMinSpan(int start, Tree tree, IMarkable tag, List<String> tokens) {
    List<Tree> leaves = tree.getLeaves();
    Tree startNode;
    Tree endNode;
    try {
        startNode = leaves.get(tag.getLeftmostDiscoursePosition() - start);
        endNode = leaves.get(tag.getRightmostDiscoursePosition() - start);
        if (".".equals(endNode.parent(tree).value())) {
            // drop a sentence-final dot from the span
            endNode = leaves.get(tag.getRightmostDiscoursePosition() - start - 1);
        }
    } catch (IndexOutOfBoundsException ex) {
        System.out.format("indices not found: %d,%d in %s [wanted: %s] [ctx: %s]",
                tag.getLeftmostDiscoursePosition() - start, tag.getRightmostDiscoursePosition() - start,
                leaves,
                tokens.subList(tag.getLeftmostDiscoursePosition(), tag.getRightmostDiscoursePosition() + 1),
                tokens.subList(start, tag.getLeftmostDiscoursePosition()));
        throw ex;
    }
    Tree parentNode = startNode;
    while (parentNode != null && !parentNode.dominates(endNode)) {
        parentNode = parentNode.parent(tree);
    }
    if (parentNode == null) {
        System.err.println("Could not match tree (1)");
        return;
    }
    if (startNode.leftCharEdge(tree) != parentNode.leftCharEdge(tree)
            || endNode.rightCharEdge(tree) != parentNode.rightCharEdge(tree)) {
        System.err.println("Could not match tree (2)");
        return;
    }
    Tree oldParent = parentNode;
    ModCollinsHeadFinder hf = new ModCollinsHeadFinder();

    // use the head finder to narrow down the span.
    // stop if (a) the head is no longer an NP or
    // (b) the NP is a conjunction
    go_up: while (true) {
        for (Tree t : parentNode.getChildrenAsList()) {
            if (t.value().equals("CC")) {
                break go_up;
            }
        }
        Tree headDtr = hf.determineHead(parentNode);
        if (headDtr == null || !headDtr.value().equals("NP")) {
            break;
        }
        parentNode = headDtr;
    }

    if (parentNode != oldParent) {
        List<Tree> newLeaves = parentNode.getLeaves();
        int newStart = start + find_same(leaves, newLeaves.get(0));
        int newEnd = newStart + newLeaves.size() - 1;
        if (newStart <= tag.getLeftmostDiscoursePosition()) {
            if (tag.getLeftmostDiscoursePosition() - newStart > 1) {
                System.err.println("NP node is too big:" + parentNode.toString() + " wanted:"
                        + tokens.subList(tag.getLeftmostDiscoursePosition(),
                                tag.getRightmostDiscoursePosition() + 1)
                        + " in: " + tree);
                return;
            }
            for (int i = newStart - start; i < tag.getLeftmostDiscoursePosition() - start; i++) {
                System.err.println("additional prefix in syntax:" + leaves.get(i));
            }
            // switch NP boundary and tag boundary
            // (even [Connie Cheung]) => min_words="Connie Cheung"
            int tmp = tag.getLeftmostDiscoursePosition();
            tag.adjustSpan(newStart, tag.getRightmostDiscoursePosition());
            newStart = tmp;
        }
        assert newEnd <= tag.getRightmostDiscoursePosition();
        // this relies on MiniDiscourse's default word numbering,
        // which is ugly but should generally work...
        if (newStart == newEnd) {
            tag.setAttributeValue("min_ids", "word_" + (newStart + 1));
        } else {
            tag.setAttributeValue("min_ids", String.format("word_%d..word_%d", newStart + 1, newEnd + 1));
        }
        StringBuffer buf = new StringBuffer();
        for (Tree t : newLeaves) {
            buf.append(t.toString().toLowerCase());
            buf.append(' ');
        }
        buf.setLength(buf.length() - 1);
        tag.setAttributeValue("min_words", buf.toString());
    }
}
From source file:elkfed.mmax.importer.ImportOntonotes.java
License:Apache License
/** adds pos and chunk information */
private void addParseInfo(int start, Tree tree) {
    /** Retrieve chunk tags from the parse tree and add chunk markables */
    boolean inNP = false;
    int startNP = -1;
    int wordLoc = 0;
    int depth = 0;
    for (String tok : tree.toString().replaceAll("\\)", ") ").split("\\s+")) {
        if (tok.matches("\\(NP")) {
            inNP = true;
            startNP = wordLoc;
            depth = 0;
        }
        if (inNP && tok.matches(".*\\)")) {
            depth--;
        }
        if (inNP && tok.matches("\\(.*")) {
            depth++;
        }
        if (tok.matches(".+\\)")) {
            wordLoc++;
        }
        if (depth == 0 && inNP) {
            inNP = false;
            Tag t = new Tag();
            t.tag = DEFAULT_CHUNK_LEVEL;
            t.attrs.put("tag", "np");
            t.start = start + startNP;
            t.end = start + wordLoc - 1;
            tags.add(t);
        }
    }
    /** Retrieve POS tags from the parse tree */
    List<Label> taggedSent = new ArrayList<Label>(tree.preTerminalYield());
    for (int i = 0; i < taggedSent.size(); i++) {
        Tag t = new Tag();
        t.tag = DEFAULT_POS_LEVEL;
        t.start = t.end = start + i;
        String tag = taggedSent.get(i).value();
        t.attrs.put("tag", tag.toLowerCase());
        tags.add(t);
    }
}