List of usage examples for the edu.stanford.nlp.trees.Tree method taggedYield()
public ArrayList<TaggedWord> taggedYield()
From source file:DependencyParser.RunStanfordParser.java
public RunStanfordParser(String filename) throws FileNotFoundException, IOException { // input format: data directory, and output directory String fileToParse = filename; LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); //lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want // Call parser on files, and tokenize the contents FileInputStream fstream = new FileInputStream(fileToParse); DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream BufferedReader br = new BufferedReader(new InputStreamReader(in)); StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage PTBTokenizer tkzr; // tokenizer object WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object // Read File Line By Line String strLine;/*ww w .j a v a 2 s . co m*/ while ((strLine = br.readLine()) != null) { System.out.println("Tokenizing and Parsing: " + strLine); // print current line to console // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(strLine); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Output Option 1: Printing out various data by accessing it programmatically // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { if (tw.tag().startsWith("N") || tw.tag().startsWith("J")) { words.add(tw.word()); tags.add(tw.tag()); } } System.out.println("Noun and Ajective words: " + words); System.out.println("POStags: " + tags); // Get stems 
ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWordsAndTags: " + stems); System.out.println("typedDependencies: " + tdl); // getAspect_OpinionWord(tdl.toString(),words,tags); TreePrint tp = new TreePrint("words,penn"); //TreePrint tn = new TreePrint("words,typedDependenciesCollapsed"); //TreePrint to = new TreePrint("rootLabelOnlyFormat,penn"); //System.out.println("Tree print"+tp.); tp.printTree(parse); //tn.printTree(parse); System.out.println("Noun Phrases are: -------"); //(NP (DT a) (JJ temporary) (NN searcher)) String reg = "(\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\)) | (\\(DT \\w*\\) \\((JJ||JJR||JJS) \\w*\\) \\((NN||NNS||NNP) \\w*\\)) | (\\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\)) | (\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\))"; Pattern patt = Pattern.compile(reg); System.out.println(" Noun Phrase List:.."); dfs(parse, parse, patt); //for (Tree subtree: parse) //{ /* if(subtree.label().value().equals("NP")) { String a=subtree.toString(); //System.out.println(a); Matcher match = patt.matcher(a.trim()); while(match.find()) { System.out.println("NP: "+match.group()); } }*/ /*for(Tree np:subtree) { if(np.label().value().equals("NP")) { for(Tree n:np) { if(np.label().value().equals("\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\)")) { System.out.println("NP tag Tags: "+np); System.out.println(Sentence.listToString(np.yield())); } else if(np.label().value().equals("\\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\)")) { System.out.println("NP tag Tags: "+np); 
System.out.println(Sentence.listToString(np.yield())); } else if(np.label().value().equals("\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\)")) { System.out.println("NP tag Tags: "+np); System.out.println(Sentence.listToString(np.yield())); } else{ if(n.label().value().equals("NP")) { System.out.println("N tag Tags: "+n); System.out.println(Sentence.listToString(n.yield())); } } } } }*/ //} //} System.out.println(); // separate output lines*/ } }
From source file:edu.cmu.cs.in.hoop.visualizers.HoopParseTreeViewer.java
License:Open Source License
/**
 * Parses a sentence, prints its words and tags, logs every node of the resulting tree and
 * shows the tree in the tree panel.
 *
 * @param aSentence the raw sentence handed to the lexicalized parser
 */
private void processInput(String aSentence) {
    debug("processInput (" + aSentence + ")");

    Tree thisTree = theLexicalizedParser.apply(aSentence);
    debug("We have a tree!");

    PrintWriter pw = new PrintWriter(System.out, true);
    TreePrint posPrinter = new TreePrint("wordsAndTags");
    posPrinter.printTree(thisTree, pw);

    debug(thisTree.taggedYield().toString());

    for (Tree subtree : thisTree) {
        // Classify the node being visited. The previous code tested thisTree (the root)
        // here, so every node was reported with the root's leaf status.
        if (subtree.isLeaf()) {
            debug("Tree leaf: " + subtree.label().value());
        } else {
            debug("Tree node: " + subtree.label().value());
        }
    }

    treePanel.setTree(thisTree);
}
From source file:gov.llnl.ontology.text.parse.StanfordParser.java
License:Open Source License
private List<SimpleDependencyTreeNode> parseTokens(String header, List<HasWord> sentence) { List<SimpleDependencyTreeNode> nodes = Lists.newArrayList(); // Parse the sentence. If the sentence has no tokens or the // parser fails, simply return an empty string. if (sentence.size() == 0 || sentence.size() > 100 || !parser.parse(sentence)) return nodes; // Get the parse tree and tagged words for the sentence. Tree tree = parser.getBestParse(); List<TaggedWord> taggedSent = tree.taggedYield(); // Convert the tree to a collection of dependency links. GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection<TypedDependency> dep = gs.typedDependencies(); List<Link> links = Lists.newArrayList(); for (TypedDependency dependency : dep) { int nodeIndex = dependency.dep().index(); int parentIndex = dependency.gov().index(); String relation = dependency.reln().toString(); String token = taggedSent.get(nodeIndex - 1).word(); String pos = taggedSent.get(nodeIndex - 1).tag(); nodes.add(new SimpleDependencyTreeNode(token, pos, nodeIndex)); links.add(new Link(nodeIndex, relation, parentIndex)); }/*www . j a v a 2 s . c om*/ Link.addLinksToTree(nodes, links); return nodes; }
From source file:nlpOperations.RunStanfordParser.java
public static String tagOperations(String sent) { String resultStr = ""; StringReader sr;//w w w. ja v a 2 s.co m PTBTokenizer tkzr; WordStemmer ls = new WordStemmer(); // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(sent); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); resultStr += "tokens: " + toks + "\n\n"; Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWords: " + stems); System.out.println("typedDependencies: " + tdl); resultStr += "words: " + words + "\n\n"; resultStr += "POStags: " + tags + "\n\n"; resultStr += "stemmedWords: " + stems + "\n\n"; resultStr += "typedDependencies: " + tdl + "\n\n"; TreePrint tp1 = new TreePrint("wordsAndTags,latexTree"); tp1.printTree(parse); System.out.println(); // separate output lines return resultStr; }
From source file:nlpOperations.RunStanfordParser.java
public static String sentStemming(Map sent) { String nounsStr = ""; Iterator iter = sent.keySet().iterator(); while (iter.hasNext()) { nounsStr += " " + (String) iter.next(); }// ww w . ja v a2s .co m String outputStr = ""; StringReader sr; PTBTokenizer tkzr; WordStemmer ls = new WordStemmer(); // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(nounsStr); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } for (int i = 0; i < stems.size(); i++) { outputStr += stems.get(i) + " "; } return outputStr; }
From source file:nlpOperations.RunStanfordParser.java
public static String sentStemming(String sent) { String outputStr = ""; StringReader sr;//from w w w .j a va 2s . c o m PTBTokenizer tkzr; WordStemmer ls = new WordStemmer(); // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(sent); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } for (int i = 0; i < stems.size(); i++) { outputStr += stems.get(i) + " "; } return outputStr; }
From source file:nlpOperations.RunStanfordParser.java
public static Vector taggingStemming(String sent) { Vector resVector = new Vector(); String resultStr = ""; StringReader sr;/*from w w w. j a v a 2 s .c o m*/ PTBTokenizer tkzr; WordStemmer ls = new WordStemmer(); // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(sent); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); resultStr += "tokens: " + toks + "\n\n"; Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } for (int i = 0; i < toks.size(); i++) { ExpandedTerm expandedTerm = new ExpandedTerm(); expandedTerm.setTermOriginWord(toks.get(i).toString()); expandedTerm.setTermStemmedWord(stems.get(i)); expandedTerm.setTermTag(tags.get(i)); expandedTerm.setIsStopWord(StopWordList.isStopWord(stems.get(i))); resVector.add(expandedTerm); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! 
System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWords: " + stems); System.out.println("typedDependencies: " + tdl); resultStr += "words: " + words + "\n\n"; resultStr += "POStags: " + tags + "\n\n"; resultStr += "stemmedWordsAndTags: " + stems + "\n\n"; resultStr += "typedDependencies: " + tdl + "\n\n"; TreePrint tp1 = new TreePrint("wordsAndTags,latexTree"); tp1.printTree(parse); System.out.println(); // separate output lines return resVector; }
From source file:nlpOperations.RunStanfordParser.java
public static Map getNouns(String sent) { String resultStr = ""; StringReader sr;//from w w w. j a va 2 s . com PTBTokenizer tkzr; Map nouns = new HashMap(); WordStemmer ls = new WordStemmer(); ArrayList<String> stems = new ArrayList(); // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(sent); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get stems ls.visitTree(parse); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { if (tw.tag().startsWith("N")) { int nounIndex = sent.indexOf(tw.word()); resultStr += tw.word() + ":" + nounIndex + "#"; nouns.put(tw.word(), nounIndex); } if (tw.tag().startsWith("JJ")) { int adjIndex = sent.indexOf(tw.word()); resultStr += tw.word() + ":" + adjIndex + "#"; nouns.put(tw.word(), adjIndex); } } return nouns; }
From source file:nlpOperations.RunStanfordParser.java
public static void main(String[] args) throws Exception { String fileToParse = "E:\\OWL\\test.txt"; String englishDataUrl = "E:\\phd-project-tools\\q-system\\stanford-parser-full-2014-06-16\\stanford-parser-full-2014-06-16\\englishPCFG.ser.gz"; LexicalizedParser lp = LexicalizedParser.loadModel(englishDataUrl, "-maxLength", "80", "-retainTmpSubcategories"); // Call parser on files, and tokenize the contents FileInputStream fstream = new FileInputStream(fileToParse); DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream BufferedReader br = new BufferedReader(new InputStreamReader(in)); StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage PTBTokenizer tkzr; // tokenizer object WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object // Read File Line By Line String strLine;//from w ww .jav a2 s.co m while ((strLine = br.readLine()) != null) { System.out.println("Tokenizing and Parsing: " + strLine); // print current line to console // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(strLine); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Output Option 1: Printing out various data by accessing it programmatically // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = 
tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWordsAndTags: " + stems); System.out.println("typedDependencies: " + tdl); // Output Option 2: Printing out various data using TreePrint // Various TreePrint options // "penn", // constituency parse // "oneline", // rootLabelOnlyFormat, // "words", // "wordsAndTags", // unstemmed words and pos tags // "dependencies", // unlabeled dependency parse // "typedDependencies", // dependency parse // "typedDependenciesCollapsed", // "latexTree", // "collocations", // "semanticGraph" // Print using TreePrint with various options // TreePrint tp = new TreePrint("wordsAndTags,semanticGraph"); // tp.printTree(parse); // System.out.println(); // separate output lines TreePrint tp1 = new TreePrint("wordsAndTags,latexTree"); tp1.printTree(parse); System.out.println(); // separate output lines // TreePrint tp2 = new TreePrint("wordsAndTags,collocations"); // tp2.printTree(parse); // System.out.println(); // separate output lines // // TreePrint tp3 = new TreePrint("wordsAndTags,dependencies"); // tp3.printTree(parse); // System.out.println(); // separate output lines } }
From source file:nlpOperations.RunStanfordParser.java
public static String getPhrases(String sent) { StringReader sr;/* www.ja va 2 s. c o m*/ PTBTokenizer tkzr; WordStemmer ls = new WordStemmer(); // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(sent); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! 
System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWordsAndTags: " + stems); System.out.println("typedDependencies: " + tdl); /* dependecy mainpulation */ // remove [ ] //tokenization Object[] wordRelationsArr = tdl.toArray(); //get nn,amod relations String requiredRelations = ""; for (int i = 0; i < wordRelationsArr.length; i++) { String oneRelation = wordRelationsArr[i].toString(); if (oneRelation.trim().startsWith("nn") || (oneRelation.trim().startsWith("amod"))) { requiredRelations += oneRelation + "#"; } } String phrases = ""; //get nn words String[] requiredRelationsArr = requiredRelations.split("#"); for (int i = 0; i < requiredRelationsArr.length; i++) { String oneRelation = requiredRelationsArr[i]; if (oneRelation.trim().startsWith("nn")) { oneRelation = oneRelation.replace("(", ""); oneRelation = oneRelation.replace(")", ""); oneRelation = oneRelation.replace("nn", ""); String[] oneRelationArr = oneRelation.split(","); String w1 = oneRelationArr[0].split("-")[0]; String w2 = oneRelationArr[1].split("-")[0]; int phraseIndex = sent.indexOf(w2.trim() + " " + w1.trim()); phrases += w2.trim() + " " + w1.trim() + ":" + phraseIndex + "#"; } } //get amod words String[] requiredRelationsArr2 = requiredRelations.split("#"); for (int i = 0; i < requiredRelationsArr2.length; i++) { String oneRelation = requiredRelationsArr2[i]; if (oneRelation.trim().startsWith("amod")) { oneRelation = oneRelation.replace("(", ""); oneRelation = oneRelation.replace(")", ""); oneRelation = oneRelation.replace("amod", ""); String[] oneRelationArr = oneRelation.split(","); String w1 = oneRelationArr[0].split("-")[0]; String w2 = oneRelationArr[1].split("-")[0]; int phraseIndex = sent.indexOf(w2.trim() + " " + w1.trim()); phrases += w2.trim() + " " + w1.trim() + ":" + phraseIndex + "#"; } } System.out.println("phrases are " + phrases); TreePrint tp1 = new TreePrint("wordsAndTags,latexTree"); tp1.printTree(parse); 
System.out.println(); // separate output lines return phrases; }