List of usage examples for edu.stanford.nlp.parser.lexparser LexicalizedParser apply
@Override public Tree apply(List<? extends HasWord> words)
From source file:BuildBinarizedDataset.java
/** * Turns a text file into trees for use in a RNTN classifier such as * the treebank used in the Sentiment project. * <br>// w w w . ja va 2 s. co m * The expected input file is one sentence per line, with sentences * separated by blank lines. The first line has the main label of the sentence together with the full sentence. * Lines after the first sentence line but before * the blank line will be treated as labeled sub-phrases. The * labels should start with the label and then contain a list of * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label! * For example: * <br> * <code> * 1 Today is not a good day.<br> * 3 good<br> * 3 good day <br> * 3 a good day <br> * <br> * (next block starts here) <br> * </code> * By default the englishPCFG parser is used. This can be changed * with the <code>-parserModel</code> flag. Specify an input file * with <code>-input</code>. * <br> * If a sentiment model is provided with -sentimentModel, that model * will be used to prelabel the sentences. Any spans with given * labels will then be used to adjust those labels. 
*/ public static void main(String[] arg) throws IOException { CollapseUnaryTransformer transformer = new CollapseUnaryTransformer(); // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true); String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; String args[] = { "-input", "D:\\parse.txt", "-sentimentModel", "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" }; String inputPath = "D:\\dataset\\good.txt"; String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz"; SentimentModel sentimentModel = null; /* for (int argIndex = 0; argIndex < args.length; ) { if (args[argIndex].equalsIgnoreCase("-input")) { inputPath = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { parserModel = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { sentimentModelPath = args[argIndex + 1]; argIndex += 2; } else { System.err.println("Unknown argument " + args[argIndex]); System.exit(2); } }*/ if (inputPath == null) { throw new IllegalArgumentException("Must specify input file with -input"); } LexicalizedParser parser = LexicalizedParser.loadModel(parserModel); TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); if (sentimentModelPath != null) { sentimentModel = SentimentModel.loadSerialized(sentimentModelPath); } String text = IOUtils.slurpFileNoExceptions(inputPath); String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk for (String chunk : chunks) { if (chunk.trim().isEmpty()) { continue; } // The expected format is that line 0 will be the text of the // sentence, and each subsequence line, if any, will be a value // followed by the sequence of tokens that get that value. // Here we take the first line and tokenize it as one sentence. 
String[] lines = chunk.trim().split("\\n"); String sentence = lines[0]; StringReader sin = new StringReader(sentence); DocumentPreprocessor document = new DocumentPreprocessor(sin); document.setSentenceFinalPuncWords(new String[] { "\n" }); List<HasWord> tokens = document.iterator().next(); Integer mainLabel = new Integer(tokens.get(0).word()); //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; "); tokens = tokens.subList(1, tokens.size()); //System.err.println(tokens); Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap(); for (int i = 1; i < lines.length; ++i) { extractLabels(spanToLabels, tokens, lines[i]); } // TODO: add an option which treats the spans as constraints when parsing Tree tree = parser.apply(tokens); Tree binarized = binarizer.transformTree(tree); Tree collapsedUnary = transformer.transformTree(binarized); // if there is a sentiment model for use in prelabeling, we // label here and then use the user given labels to adjust if (sentimentModel != null) { Trees.convertToCoreLabels(collapsedUnary); SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null); scorer.forwardPropagateTree(collapsedUnary); setPredictedLabels(collapsedUnary); } else { setUnknownLabels(collapsedUnary, mainLabel); //collapsedUnary.label().setValue(mainLabel.toString()); //System.out.println("Root"+collapsedUnary.getNodeNumber(1)); } Trees.convertToCoreLabels(collapsedUnary); collapsedUnary.indexSpans(); for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) { setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue()); } String x = collapsedUnary.toString(); //x.replaceAll("\\s",""); x = x.replace("(", "["); x = x.replace(")", "]"); //writer.write(x); //writer.write("\r\n"); System.out.println(x); //System.out.println(); } //writer.close(); }
From source file:Anaphora_Resolution.ParseAllXMLDocuments.java
public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException, TransformerException { // File dataFolder = new File("DataToPort"); // File[] documents; String grammar = "grammar/englishPCFG.ser.gz"; String[] options = { "-maxLength", "100", "-retainTmpSubcategories" }; //LexicalizedParser lp = new LexicalizedParser(grammar, options); LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); ////www . j a v a 2 s .co m // if (dataFolder.isDirectory()) { // documents = dataFolder.listFiles(); // } else { // documents = new File[] {dataFolder}; // } // int currfile = 0; // int totfiles = documents.length; // for (File paper : documents) { // currfile++; // if (paper.getName().equals(".DS_Store")||paper.getName().equals(".xml")) { // currfile--; // totfiles--; // continue; // } // System.out.println("Working on "+paper.getName()+" (file "+currfile+" out of "+totfiles+")."); // // DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); // This is for XML // DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); // Document doc = docBuilder.parse(paper.getAbsolutePath()); // // NodeList textlist = doc.getElementsByTagName("text"); // for(int i=0; i < textlist.getLength(); i++) { // Node currentnode = textlist.item(i); // String wholetext = textlist.item(i).getTextContent(); String wholetext = "How about other changes for example the ways of doing the work and \n" + "\n" + "Iwould say security has , there 's more pressure put on people now than there used to be because obviously , especially after Locherbie , they tightened up on security and there 's a lot more pressure now especially from the ETR and stuff like that \n" + "People do n't feel valued any more , they feel I do n't know I think they feel that nobody cares about them really anyway \n"; //System.out.println(wholetext); //Iterable<List<? 
extends HasWord>> sentences; ArrayList<Tree> parseTrees = new ArrayList<Tree>(); String asd = ""; int j = 0; StringReader stringreader = new StringReader(wholetext); DocumentPreprocessor dp = new DocumentPreprocessor(stringreader); @SuppressWarnings("rawtypes") ArrayList<List> sentences = preprocess(dp); for (List sentence : sentences) { parseTrees.add(lp.apply(sentence)); // Parsing a new sentence and adding it to the parsed tree ArrayList<Tree> PronounsList = findPronouns(parseTrees.get(j)); // Locating all pronouns to resolve in the sentence Tree corefedTree; for (Tree pronounTree : PronounsList) { parseTrees.set(parseTrees.size() - 1, HobbsResolve(pronounTree, parseTrees)); // Resolving the coref and modifying the tree for each pronoun } StringWriter strwr = new StringWriter(); PrintWriter prwr = new PrintWriter(strwr); TreePrint tp = new TreePrint("penn"); tp.printTree(parseTrees.get(j), prwr); prwr.flush(); asd += strwr.toString(); j++; } String armando = ""; for (Tree sentence : parseTrees) { for (Tree leaf : Trees.leaves(sentence)) armando += leaf + " "; } System.out.println(wholetext); System.out.println(); System.out.println("......"); System.out.println(armando); System.out.println("All done."); // currentnode.setTextContent(asd); // } // TransformerFactory transformerFactory = TransformerFactory.newInstance(); // Transformer transformer = transformerFactory.newTransformer(); // DOMSource source = new DOMSource(doc); // StreamResult result = new StreamResult(paper); // transformer.transform(source, result); // // System.out.println("Done"); // } }
From source file:com.parse.Dependency.java
public static void main(String[] args) { LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); lp.setOptionFlags(new String[] { "-maxLength", "80", "-retainTmpSubcategories", }); String[] sent = { "This", "is", "an", "easy", "sentence", "." }; List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); parse.pennPrint();/* w ww . j av a2 s . c o m*/ System.out.println(); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); //System.out.println(); //TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); // tp.printTree(parse); String sentence = "which movies were directed by Christopher Nolan"; Tree t2 = lp.parse(sentence); System.out.println(t2.firstChild().toString()); gs = gsf.newGrammaticalStructure(t2); tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(tdl.get(0).dep().nodeString()); }
From source file:DependencyParser.Parser.java
public void CallParser(String text) // start of the main method { try {/*from w ww . ja v a2 s .co m*/ TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); LexicalizedParser lp = LexicalizedParser .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" }); TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize(); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true); System.out.println(tdl); PrintWriter pw = new PrintWriter("H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt"); TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(tree, pw); pw.close(); Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3); assert (new File("image.png").exists()); } catch (FileNotFoundException f) { } catch (Exception ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:DependencyParser.RunStanfordParser.java
public RunStanfordParser(String filename) throws FileNotFoundException, IOException { // input format: data directory, and output directory String fileToParse = filename; LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); //lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want // Call parser on files, and tokenize the contents FileInputStream fstream = new FileInputStream(fileToParse); DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream BufferedReader br = new BufferedReader(new InputStreamReader(in)); StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage PTBTokenizer tkzr; // tokenizer object WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object // Read File Line By Line String strLine;/*from ww w. ja va 2s. c om*/ while ((strLine = br.readLine()) != null) { System.out.println("Tokenizing and Parsing: " + strLine); // print current line to console // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(strLine); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Output Option 1: Printing out various data by accessing it programmatically // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { if (tw.tag().startsWith("N") || tw.tag().startsWith("J")) { words.add(tw.word()); tags.add(tw.tag()); } } System.out.println("Noun and Ajective words: " + words); System.out.println("POStags: " + tags); // Get stems 
ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWordsAndTags: " + stems); System.out.println("typedDependencies: " + tdl); // getAspect_OpinionWord(tdl.toString(),words,tags); TreePrint tp = new TreePrint("words,penn"); //TreePrint tn = new TreePrint("words,typedDependenciesCollapsed"); //TreePrint to = new TreePrint("rootLabelOnlyFormat,penn"); //System.out.println("Tree print"+tp.); tp.printTree(parse); //tn.printTree(parse); System.out.println("Noun Phrases are: -------"); //(NP (DT a) (JJ temporary) (NN searcher)) String reg = "(\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\)) | (\\(DT \\w*\\) \\((JJ||JJR||JJS) \\w*\\) \\((NN||NNS||NNP) \\w*\\)) | (\\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\)) | (\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\))"; Pattern patt = Pattern.compile(reg); System.out.println(" Noun Phrase List:.."); dfs(parse, parse, patt); //for (Tree subtree: parse) //{ /* if(subtree.label().value().equals("NP")) { String a=subtree.toString(); //System.out.println(a); Matcher match = patt.matcher(a.trim()); while(match.find()) { System.out.println("NP: "+match.group()); } }*/ /*for(Tree np:subtree) { if(np.label().value().equals("NP")) { for(Tree n:np) { if(np.label().value().equals("\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\)")) { System.out.println("NP tag Tags: "+np); System.out.println(Sentence.listToString(np.yield())); } else if(np.label().value().equals("\\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\)")) { System.out.println("NP tag Tags: "+np); 
System.out.println(Sentence.listToString(np.yield())); } else if(np.label().value().equals("\\(DT \\w*\\) \\((NN||NNS||NNP) \\w*\\) \\((NN||NNS||NNP) \\w*\\)")) { System.out.println("NP tag Tags: "+np); System.out.println(Sentence.listToString(np.yield())); } else{ if(n.label().value().equals("NP")) { System.out.println("N tag Tags: "+n); System.out.println(Sentence.listToString(n.yield())); } } } } }*/ //} //} System.out.println(); // separate output lines*/ } }
From source file:Engines.Test.StanfordParser.TreeHandling.java
License:Open Source License
/**
 * Smoke test: tokenizes {@code text}, parses it with the English PCFG model
 * and computes its CC-processed typed dependencies. Nothing is returned or
 * printed; the method only exercises the pipeline.
 *
 * @param text raw text to tokenize and parse
 */
public static void test(String text) {
    GrammaticalStructureFactory structureFactory = new PennTreebankLanguagePack().grammaticalStructureFactory();

    String[] parserFlags = { "-maxLength", "500", "-retainTmpSubcategories" };
    LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(parserFlags);

    TokenizerFactory<CoreLabel> ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    StringReader textReader = new StringReader(text);
    List<CoreLabel> tokens = ptbFactory.getTokenizer(textReader).tokenize();

    Tree parseTree = parser.apply(tokens);
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    // The result is intentionally unused; the call itself is what is exercised.
    Collection<TypedDependency> dependencies = structure.typedDependenciesCCprocessed(true);
}
From source file:englishparser.EnglishParser.java
/**
 * Splits the file {@code filename} into sentences, parses each sentence with
 * {@code lp}, and passes every parse tree to {@code extractNP},
 * {@code extractNN} and {@code extractNNP}. Three result savers are opened
 * before parsing and all of them are closed afterwards.
 *
 * @param lp       the parser used for every sentence
 * @param filename path of the document to read
 * @throws FileNotFoundException declared for callers; raised by the underlying file handling
 * @throws IOException           declared for callers; raised by the underlying file handling
 */
public static void demoDP(LexicalizedParser lp, String filename) throws FileNotFoundException, IOException {
    printer_NP = new ResultSaver("/home/bigstone/Documents/medicine_NP.txt");
    printer_NN = new ResultSaver("/home/bigstone/Documents/medicine_NN.txt");
    printer_NNP = new ResultSaver("/home/bigstone/Documents/medicine_NNP.txt");

    try {
        for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
            Tree parse = lp.apply(sentence);
            extractNP(parse);
            extractNN(parse);
            extractNNP(parse);
        }
    } finally {
        // Close every saver opened above, even if parsing fails part-way or
        // one of the close() calls itself throws.
        try {
            printer_NP.close();
        } finally {
            try {
                printer_NN.close();
            } finally {
                printer_NNP.close();
            }
        }
    }
}
From source file:englishparser.EnglishParser.java
/** * demoAPI demonstrates other ways of calling the parser with already * tokenized text, or in some cases, raw text that needs to be tokenized as * a single sentence. Output is handled with a TreePrint object. Note that * the options used when creating the TreePrint can determine what results * to print out. Once again, one can capture the output by passing a * PrintWriter to TreePrint.printTree./*from w w w. j a v a2 s.com*/ */ public static void demoAPI(LexicalizedParser lp) { // This option shows parsing a list of correctly tokenized words String[] sent = { "This", "is", "an", "easy", "sentence", "." }; List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); parse.pennPrint(); System.out.println(); // This option shows loading and using an explicit tokenizer String sent2 = "This is another sentence."; TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2)); List<CoreLabel> rawWords2 = tok.tokenize(); parse = lp.apply(rawWords2); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); // You can also use a TreePrint object to print trees and dependencies TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(parse); }
From source file:nlpOperations.RunStanfordParser.java
public static void main(String[] args) throws Exception { String fileToParse = "E:\\OWL\\test.txt"; String englishDataUrl = "E:\\phd-project-tools\\q-system\\stanford-parser-full-2014-06-16\\stanford-parser-full-2014-06-16\\englishPCFG.ser.gz"; LexicalizedParser lp = LexicalizedParser.loadModel(englishDataUrl, "-maxLength", "80", "-retainTmpSubcategories"); // Call parser on files, and tokenize the contents FileInputStream fstream = new FileInputStream(fileToParse); DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream BufferedReader br = new BufferedReader(new InputStreamReader(in)); StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage PTBTokenizer tkzr; // tokenizer object WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object // Read File Line By Line String strLine;// w w w.j a v a 2 s .com while ((strLine = br.readLine()) != null) { System.out.println("Tokenizing and Parsing: " + strLine); // print current line to console // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(strLine); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); System.out.println("tokens: " + toks); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Output Option 1: Printing out various data by accessing it programmatically // Get words, stemmed words and POS tags ArrayList<String> words = new ArrayList(); ArrayList<String> stems = new ArrayList(); ArrayList<String> tags = new ArrayList(); // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { words.add(tw.word()); tags.add(tw.tag()); } // Get stems ls.visitTree(parse); // apply the stemmer to the tree for (TaggedWord tw : parse.taggedYield()) { stems.add(tw.word()); } // Get dependency tree TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = 
tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection tdl = gs.typedDependenciesCollapsed(); // And print! System.out.println("words: " + words); System.out.println("POStags: " + tags); System.out.println("stemmedWordsAndTags: " + stems); System.out.println("typedDependencies: " + tdl); // Output Option 2: Printing out various data using TreePrint // Various TreePrint options // "penn", // constituency parse // "oneline", // rootLabelOnlyFormat, // "words", // "wordsAndTags", // unstemmed words and pos tags // "dependencies", // unlabeled dependency parse // "typedDependencies", // dependency parse // "typedDependenciesCollapsed", // "latexTree", // "collocations", // "semanticGraph" // Print using TreePrint with various options // TreePrint tp = new TreePrint("wordsAndTags,semanticGraph"); // tp.printTree(parse); // System.out.println(); // separate output lines TreePrint tp1 = new TreePrint("wordsAndTags,latexTree"); tp1.printTree(parse); System.out.println(); // separate output lines // TreePrint tp2 = new TreePrint("wordsAndTags,collocations"); // tp2.printTree(parse); // System.out.println(); // separate output lines // // TreePrint tp3 = new TreePrint("wordsAndTags,dependencies"); // tp3.printTree(parse); // System.out.println(); // separate output lines } }
From source file:nlpOperations.RunStanfordParser.java
public static String getVerbs(String oneSent) { String resultStr = ""; String englishDataUrl = "D:\\owl-ontology\\englishPCFG.ser.gz"; LexicalizedParser lp = LexicalizedParser.loadModel(englishDataUrl, "-maxLength", "80", "-retainTmpSubcategories"); StringReader sr;// www. j a v a2s .c om PTBTokenizer tkzr; // do all the standard java over-complication to use the stanford parser tokenizer sr = new StringReader(oneSent); tkzr = PTBTokenizer.newPTBTokenizer(sr); List toks = tkzr.tokenize(); Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something // Get words and Tags for (TaggedWord tw : parse.taggedYield()) { if (tw.tag().startsWith("V")) { resultStr += tw.word() + "#"; } } return resultStr; }