List of usage examples for edu.stanford.nlp.trees TreeReader readTree
public Tree readTree() throws IOException;
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeUtils.java
License:Open Source License
/**
 * Parses a bracketed Penn Treebank string into a tree.
 *
 * @param pennString
 *            a Penn Treebank-style string, such as the output of the Stanford parser
 * @return the tree read from {@code pennString}, built with a LabeledScoredTreeFactory
 * @throws IllegalStateException
 *             if reading the string fails with an IOException
 */
public static Tree pennString2Tree(String pennString) {
    TreeReader reader = null;
    try {
        reader = new PennTreeReader(new StringReader(pennString), new LabeledScoredTreeFactory());
        Tree parsed = reader.readTree();
        return parsed;
    } catch (IOException ioe) {
        // Surface the I/O failure as an unchecked exception, keeping it as the cause.
        throw new IllegalStateException(ioe);
    } finally {
        // Runs on both the success and the failure path.
        closeQuietly(reader);
    }
}
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
public Tree readTreeFromString(String parseStr) { //read in the input into a Tree data structure TreeReader treeReader = new PennTreeReader(new StringReader(parseStr), tree_factory); Tree inputTree = null;//from w ww . ja va 2 s . com try { inputTree = treeReader.readTree(); } catch (IOException e) { e.printStackTrace(); } return inputTree; }
From source file:edu.cmu.ark.nlp.question.QuestionUtil.java
License:Open Source License
public static Tree readTreeFromString(String parseStr) { //read in the input into a Tree data structure TreeReader treeReader = new PennTreeReader(new StringReader(parseStr), QuestionUtil.getTreeFactory()); Tree inputTree = null;//from ww w . j a va 2 s . com try { inputTree = treeReader.readTree(); } catch (IOException e) { e.printStackTrace(); } return inputTree; }
From source file:edu.jhu.agiga.StanfordAgigaSentence.java
License:Open Source License
public Tree getStanfordContituencyTree() { TreeFactory tf = new LabeledScoredTreeFactory(); StringReader r = new StringReader(getParseText()); TreeReader tr = new PennTreeReader(r, tf); try {//from ww w. j a v a2s. co m return tr.readTree(); } catch (IOException e) { throw new RuntimeException("Error: IOException should not be thrown by StringReader"); } }
From source file:elkfed.expletives.TrainingData.java
License:Apache License
public static void extractExamples(String file, Set<String> anaphoricPronouns, List<ExpletiveInstance> instances) throws FileNotFoundException, IOException { TreeReader tr = new PennTreeReader(new FileReader(file), new LabeledScoredTreeFactory(), new BobChrisTreeNormalizer()); Tree t;/*from w w w . j a va2 s .c o m*/ String file_id = file.substring(file.length() - 8, file.length() - 4); int sent_idx = 1; while ((t = tr.readTree()) != null) { //t.pennPrint(); int word_idx = 1; for (Tree t1 : t.getLeaves()) { String s = t1.toString(); if ("it".equals(s) || "It".equals(s)) { String id = String.format("%s:S%d:%d-%d", file_id, sent_idx, word_idx, word_idx); ExpletiveInstance inst = new ExpletiveInstance(t, t1, id); boolean is_positive = anaphoricPronouns.contains(id); inst.setFeature(PairInstance.FD_POSITIVE, !is_positive); instances.add(inst); String cls = is_positive ? "+1" : "-1"; System.out.format("%s\t%s\t(%s)\n", s, id, cls); } word_idx++; } //System.out.println(); //System.out.println(t); sent_idx++; } }
From source file:elkfed.mmax.importer.ImportOntonotes.java
License:Apache License
/**
 * Imports one document from three parallel files derived from {@code fname}:
 * {@code fname + ".coref"}, {@code fname + ".name"} and {@code fname + ".parse"}.
 *
 * <p>The coref reader drives the main loop. The names reader is advanced alongside it (up to
 * the next token, newline, DOCNO end tag or TURN start tag), and one parse tree is read for
 * every NEWLINE event that is seen once {@code in_text} has been set. Tags and tokens are
 * accumulated through {@code push_tag}, {@code pop_tag}, {@code add_token}, {@code tags} and
 * {@code tokens}; the result is whatever {@code create()} returns.
 *
 * <p>NOTE(review): none of the three readers is closed in this method - confirm whether
 * they are released elsewhere.
 *
 * @param fname common path prefix of the three input files
 * @return the result of {@code create()} after the coref reader reports END_DOCUMENT
 * @throws RuntimeException wrapping any IOException raised while reading
 * @throws IllegalStateException if the names file yields an event this method does not expect
 * @throws UnsupportedOperationException if a TURN tag is not mirrored in the names file or is
 *             not followed by a newline in both files
 */
public MiniDiscourse importFile(String fname) {
    try {
        // NOTE(review): had_space is assigned below but never read in this method.
        boolean had_space = true;
        // Opt-in workaround, enabled when the system property matches y, yes or true.
        boolean need_bugfix = System.getProperty("elkfed.BuggyOntonotes", "no").matches("y|yes|true");
        // Separate stack for ENAMEX tags coming from the names file.
        List<Tag> names_stack = new ArrayList<Tag>();
        Alphabet<String> sets = new Alphabet<String>();
        // Register a placeholder entry before any real coref set id is looked up.
        sets.lookupIndex("*DUMMY*");
        int sent_id = 0;
        Tag sentence_tag = null;
        OntonotesReader reader = new OntonotesReader(new File(fname + ".coref"));
        OntonotesReader readerNE = new OntonotesReader(new File(fname + ".name"));
        TreeReader tr = new PennTreeReader(new FileReader(fname + ".parse"), new LabeledScoredTreeFactory(),
                new BobChrisTreeNormalizer());
        // Parse tree of the sentence currently being read; null until the first readTree() call.
        Tree tree = null;
        int eventType = reader.getNextEvent();
        // Becomes true once the closing DOCNO tag has been seen in the coref file.
        boolean in_text = false;
        do {
            if (eventType == OntonotesReader.START_TAG && "COREF".equals(reader.getName())) {
                // Opening COREF tag: push a "coref" tag, via the workaround variant if enabled.
                Tag t;
                if (need_bugfix) {
                    t = buggy_push_tag("coref", tag_stack);
                } else {
                    t = push_tag("coref");
                }
                // Only TYPE="IDENT" tags are given a coref_set attribute.
                if ("IDENT".equals(reader.getAttribute("TYPE"))) {
                    t.attrs.put("coref_set", "set_" + sets.lookupIndex(reader.getAttribute("ID")));
                }
                had_space = true;
            } else if (eventType == OntonotesReader.END_TAG && "COREF".equals(reader.getName())) {
                Tag t = pop_tag("coref");
                // NOTE(review): sentence_tag is still null if a COREF closes before the first
                // in-text NEWLINE event; this line would then throw a NullPointerException.
                DetermineMinSpan.addMinSpan(sentence_tag.start, tree, t, tokens);
                had_space = true;
            } else if (in_text && eventType == OntonotesReader.TOKEN) {
                // Trace tokens are skipped entirely; the names reader is not advanced for them.
                if (!reader.isTrace()) {
                    // process up to the next token in the names part
                    int names_event = readerNE.getNextEvent();
                    while (names_event != OntonotesReader.TOKEN) {
                        if (names_event == OntonotesReader.START_TAG && "ENAMEX".equals(readerNE.getName())) {
                            Tag t = push_tag("enamex", names_stack);
                            t.attrs.put("tag", readerNE.getAttribute("TYPE"));
                        } else if (names_event == OntonotesReader.END_TAG && "ENAMEX".equals(readerNE.getName())) {
                            Tag t = pop_tag("enamex", names_stack);
                        } else {
                            throw new IllegalStateException("Unexpected event:" + names_event);
                        }
                        names_event = readerNE.getNextEvent();
                    }
                    // Only checked when assertions are enabled (-ea).
                    assert (reader.getToken().equals(readerNE.getToken()));
                    String tok = reader.getToken();
                    // Map the bracket placeholders to the literal bracket characters.
                    if (tok.equals("-LRB-")) tok = "(";
                    if (tok.equals("-RRB-")) tok = ")";
                    if (tok.equals("-LSB-")) tok = "[";
                    if (tok.equals("-RSB-")) tok = "]";
                    if (tok.equals("-LCB-")) tok = "{";
                    if (tok.equals("-RCB-")) tok = "}";
                    add_token(tok);
                }
            } else if (in_text && eventType == OntonotesReader.NEWLINE) {
                //System.out.println("sentence break");
                // Close the sentence that just ended, if there is one and it is non-empty.
                if (sentence_tag != null) {
                    sentence_tag.end = tokens.size() - 1;
                    if (sentence_tag.end >= sentence_tag.start) {
                        tags.add(sentence_tag);
                        if (tree != null) {
                            // Attach the parse tree as a "parse" tag over the same token span.
                            Tag parse_tag = new Tag();
                            parse_tag.tag = "parse";
                            parse_tag.start = sentence_tag.start;
                            parse_tag.end = sentence_tag.end;
                            parse_tag.attrs.put("tag", tree.toString());
                            tags.add(parse_tag);
                            // Only checked when assertions are enabled (-ea).
                            assert sentence_tag.end - sentence_tag.start + 1 == tree.yield().size() : String
                                    .format("%s / %s", tokens.subList(sentence_tag.start, sentence_tag.end + 1),
                                            tree.yield());
                            addParseInfo(sentence_tag.start, tree);
                        }
                    }
                }
                // process up to end of sentence in names annotation
                int names_event = readerNE.getNextEvent();
                while (names_event != OntonotesReader.NEWLINE) {
                    if (names_event == OntonotesReader.START_TAG && "ENAMEX".equals(readerNE.getName())) {
                        Tag t = push_tag("enamex", names_stack);
                        t.attrs.put("tag", readerNE.getAttribute("TYPE"));
                    } else if (names_event == OntonotesReader.END_TAG && "ENAMEX".equals(readerNE.getName())) {
                        Tag t = pop_tag("enamex", names_stack);
                    } else if (names_event == OntonotesReader.END_TAG && "DOC".equals(readerNE.getName())) {
                        // ignore
                    } else {
                        throw new IllegalStateException(
                                "Unexpected event:" + readerNE.describeEvent(names_event));
                    }
                    names_event = readerNE.getNextEvent();
                }
                // prepare new parse and sentence
                sentence_tag = new Tag();
                sentence_tag.start = tokens.size();
                sentence_tag.tag = "sentence";
                sentence_tag.attrs.put("orderid", "" + sent_id++);
                tree = tr.readTree();
            } else if (eventType == OntonotesReader.END_TAG && "DOCNO".equals(reader.getName())) {
                in_text = true;
                // go to the end of the DOCNO part in name doc
                int names_event = readerNE.getNextEvent();
                // NOTE(review): the name test below uses reader (coref), not readerNE (names).
                // If reader.getName() still returns "DOCNO" here, the loop stops at the first
                // END_TAG of the names file regardless of its name - confirm the intent.
                while (names_event != OntonotesReader.END_TAG || !"DOCNO".equals(reader.getName())) {
                    names_event = readerNE.getNextEvent();
                }
            } else if (eventType == OntonotesReader.START_TAG && "TURN".equals(reader.getName())) {
                int names_event = readerNE.getNextEvent();
                if (names_event != OntonotesReader.START_TAG || !"TURN".equals(readerNE.getName())) {
                    throw new UnsupportedOperationException("TURN in coref but not in names");
                }
                // parse level seems to be inconsistent... so don't check here :-|
                // NOTE(review): tree is null if no tree has been read yet or the parse file
                // is exhausted; tree.toString() would then throw a NullPointerException.
                System.err.println("TURN parse:" + tree.toString());
                tree = tr.readTree();
                // Both files must follow the TURN tag with a newline.
                eventType = reader.getNextEvent();
                names_event = readerNE.getNextEvent();
                if (eventType != OntonotesReader.NEWLINE || names_event != OntonotesReader.NEWLINE) {
                    throw new UnsupportedOperationException("No Newline after TURN");
                }
            }
            eventType = reader.getNextEvent();
        } while (eventType != OntonotesReader.END_DOCUMENT);
        return create();
    } catch (IOException ex) {
        throw new RuntimeException("Cannot read file", ex);
    }
}
From source file:qmul.corpus.SwitchboardCorpus.java
License:Open Source License
/** * @param dialogueName/* w w w . java 2 s . c om*/ * @param genre * @param reader * @return whether to carry on or not */ private boolean getSentences(String dialogueName, String genre, TreeReader reader) { Pattern p = Pattern.compile("\\(CODE\\s+(?:\\([A-Z]+\\s+)?Speaker([A-Za-z]+)(\\d+)"); try { Dialogue dialogue = null; DialogueSpeaker speaker = null; DialogueSpeaker lastSpeaker = null; DialogueTurn currentTurn = null; int currentSubdialogue = -1; int turnNum = -1; Tree tree = reader.readTree(); Filter<Tree> nodeFilter = new NodeFilter(); while (tree != null) { Matcher m = p.matcher(tree.toString()); if (m.find()) { // get the metadata turnNum = Integer.parseInt(m.group(2)); int subDialogue = 0; // apparently no subdialogues in SWBD ... String spk = m.group(1).toUpperCase(); // start new dialogue if subdialogue changed if (subDialogue != currentSubdialogue) { if (dialogue != null) { if (!checkDialogue(dialogue)) { return false; } } // dialogue = addDialogue(dialogueName + ":" + subDialogue, genre); dialogue = addDialogue(dialogueName, genre); // TODO genre in SWBD? getGenreMap().put(dialogueName, genre); } currentSubdialogue = subDialogue; // set up speaker String spkId = dialogue.getId() + ":" + spk; if (!getSpeakerMap().containsKey(spkId)) { // TODO speaker info in SWBD? 
getSpeakerMap().put(spkId, new DialogueSpeaker(spkId, "", "", "", "", "")); // System.out.println("added new speaker " + spkId); } speaker = getSpeakerMap().get(spkId); } else { // get the tree and extract the transcription String trans = ""; // SWBD embeds trees within an extra unlabelled level ((S etc)) if (((tree.label() == null) || (tree.label().value() == null)) && (tree.children().length == 1)) { tree = tree.getChild(0); } if (tree != null) { tree = tree.prune(nodeFilter); if (tree != null) { for (Tree leaf : tree.getLeaves()) { trans += leaf.label() + " "; } trans = trans.substring(0, trans.length() - 1); // start new turn if speaker has changed if ((lastSpeaker == null) || !speaker.equals(lastSpeaker) || (currentTurn == null)) { currentTurn = dialogue.addTurn(turnNum, speaker); // System.out.println("new turn " + turnNum + ", " + speaker + " " + currentTurn); lastSpeaker = speaker; } // add sentence dialogue.addSent(-1, currentTurn, trans, tree); // DialogueSentence s = dialogue.addSent(-1, currentTurn, trans, tree); // System.out.println("new sent " + s); // System.out.println(s.getSyntax().pennString()); } } } tree = reader.readTree(); } return checkDialogue(dialogue); } catch (IOException e) { System.err.println("Error reading sentence line" + e.getMessage()); return false; } }
From source file:sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
private static void sharedTaskSpanGen(File treeFile) throws IOException { TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), Util.ENCODING)); TreeReader tr = new PennTreeReader(r, tf); Tree root = tr.readTree(); String article = treeFile.getName().substring(0, 8); String outFileName = treeFile.toString(); outFileName = outFileName.substring(0, outFileName.lastIndexOf('.')); BufferedReader reader = Util.reader(outFileName + ".tkn"); PrintWriter printer = new PrintWriter(outFileName + ".csv"); int treeNumber = 0; while (root != null) { String lineRead = reader.readLine(); if (root.children().length > 0) { List<Tree> leaves = root.getLeaves(); HashMap<String, String[]> tokens = sharedTaskTokens(lineRead); for (Tree leaf : leaves) { int nodeNumber = leaf.nodeNumber(root); String word = leaf.toString(); String wordKey = word.replaceAll("/", "\\\\/"); wordKey = wordKey.replaceAll("\\*", "\\\\*"); String[] spanLine = tokens.get(wordKey); String key = article + "," + treeNumber + "," + nodeNumber; word = word.trim().replaceAll("\\s+", ""); word = word.replaceAll(",", "COMMA"); printer.println(key + "," + spanLine[1] + "," + word + "," + spanLine[2]); }//w w w . j a v a 2 s . c om } root = tr.readTree(); printer.flush(); ++treeNumber; } printer.close(); tr.close(); }
From source file:sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
public static void anyTextToSpanGen(File treeFile, File inputFile) throws IOException { log.info("Generating the spans of each node in the parse trees."); String orgText = Util.readFile(inputFile); orgText = orgText.replaceAll("`", "'").replaceAll("", "\"").replaceAll("", "'"); ;//from w w w . j a v a 2 s . c om PrintWriter pw = new PrintWriter(treeFile + ".csv"); TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), Util.ENCODING)); TreeReader tr = new PennTreeReader(r, tf); int index = 0; Tree root = tr.readTree(); int treeNumber = 0; while (root != null) { List<Tree> leaves = root.getLeaves(); for (Tree leaf : leaves) { int nodeNumber = leaf.nodeNumber(root); String parentValue = leaf.parent(root).value(); if (parentValue.equals("-NONE-")) { continue; } String word = nodeToString(leaf).trim(); word = word.replaceAll("`", "'"); word = word.replaceAll("", "'"); word = word.replaceAll("\\.\\.\\.", ". . ."); int span = orgText.indexOf(word, index); if (span == -1) { continue; } index = span + word.length() - 1; String spanString = (span + ".." + (span + word.length())); String key = treeFile.getName() + "," + treeNumber + "," + nodeNumber; word = word.trim().replaceAll("\\s+", ""); word = word.replaceAll(",", "COMMA"); pw.println(key + "," + spanString + "," + word); } root = tr.readTree(); pw.flush(); ++treeNumber; } pw.close(); tr.close(); log.info("Done."); }
From source file:sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
/**
 * Generate the spans of each node in the auto parse trees.
 *
 * <p>For every {@code wsj_*.mrg} file in {@code treePath + "23/"}, the raw text with the same
 * base name is read from {@code rawTextPath + "23/"} and each leaf of each tree is located in
 * it. The rows {@code fileName,treeNumber,nodeNumber,start..end,word} are written to a
 * {@code .csv} file next to the tree file. Leaves under a {@code -NONE-} parent and leaves
 * whose text is exactly "." are skipped; leading and trailing punctuation is stripped from
 * the remaining words before they are searched for.
 *
 * <p>A leaf whose text cannot be found at or after the current search position is skipped.
 * (The original retried the same search in a loop that could never terminate.)
 *
 * @param treePath directory prefix containing the {@code 23/} folder of parse trees
 * @param rawTextPath directory prefix containing the {@code 23/} folder of raw text files
 * @throws IOException if the tree folder cannot be listed or a file cannot be read,
 *             written or closed
 */
public static void textToSpanGenAuto(String treePath, String rawTextPath) throws IOException {
    log.info("Generating the spans of each node in the auto parse trees.");
    String folder = "23/";
    File[] files = new File(treePath + folder).listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.startsWith("wsj_") && name.endsWith(".mrg");
        }
    });
    if (files == null) {
        // listFiles() returns null when the path is not a readable directory.
        throw new IOException("Cannot list parse tree files in " + treePath + folder);
    }

    for (File file : files) {
        String fileName = file.getName().replaceAll("\\.mrg", "");
        String orgText = Util.readFile(rawTextPath + folder + fileName);
        orgText = orgText.replaceAll("`", "'");

        PrintWriter pw = new PrintWriter(treePath + folder + fileName + ".csv");
        TreeReader tr = null;
        try {
            TreeFactory tf = new LabeledScoredTreeFactory();
            Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
            tr = new PennTreeReader(r, tf);

            // Search position in orgText.
            // NOTE(review): the start value 9 presumably skips a fixed-size header at the
            // top of each raw file - confirm against the corpus.
            int index = 9;
            int treeNumber = 0;

            for (Tree root = tr.readTree(); root != null; root = tr.readTree()) {
                StringBuilder tmp = new StringBuilder();

                for (Tree leaf : root.getLeaves()) {
                    int nodeNumber = leaf.nodeNumber(root);
                    String parentValue = leaf.parent(root).value();
                    if (parentValue.equals("-NONE-")) {
                        continue;
                    }
                    String word = Corpus.nodeToString(leaf).trim();
                    if (word.equals(".")) {
                        continue;
                    }
                    word = word.replaceAll("`", "'");
                    word = word.replaceAll("^\\p{Punct}*", "");
                    word = word.replaceAll("\\p{Punct}*$", "");
                    word = correctAutoParseToken(fileName, index, word);

                    int span = orgText.indexOf(word, index);
                    if (span == -1) {
                        // Repeating the identical search can never succeed; skip the leaf,
                        // as anyTextToSpanGen does.
                        continue;
                    }
                    index = span + word.length();

                    String spanString = (span + ".." + (span + word.length()));
                    String key = fileName + "," + treeNumber + "," + nodeNumber;
                    word = word.trim().replaceAll("\\s+", "");
                    word = word.replaceAll(",", "COMMA");
                    tmp.append(key + "," + spanString + "," + word);
                    tmp.append('\n');
                }

                pw.print(tmp);
                ++treeNumber;
            }
        } finally {
            // Close both resources on every path, not only after a clean run.
            pw.close();
            if (tr != null) {
                tr.close();
            }
        }
    }
    log.info("Done.");
}

/** Sites ({@code fileName@searchIndex}) where the token "n't" is searched for as "'t". */
private static final String[] NT_AS_T_SITES = {
    "wsj_2300@1457", "wsj_2330@6344", "wsj_2351@1040", "wsj_2360@2066", "wsj_2369@6434",
    "wsj_2381@2399", "wsj_2386@3522", "wsj_2386@3647", "wsj_2387@1466", "wsj_2387@5389",
    "wsj_2397@1032"
};

/** Sites ({@code fileName@searchIndex}) where the token "will" is searched for as "wo". */
private static final String[] WILL_AS_WO_SITES = {
    "wsj_2306@5692", "wsj_2308@2373", "wsj_2315@1056", "wsj_2321@1279", "wsj_2330@1563",
    "wsj_2345@1838", "wsj_2350@699", "wsj_2351@778", "wsj_2351@2391", "wsj_2363@2868",
    "wsj_2367@1379", "wsj_2376@6687", "wsj_2377@2464", "wsj_2379@4711", "wsj_2379@5174",
    "wsj_2381@565", "wsj_2387@5430", "wsj_2387@5779", "wsj_2394@179", "wsj_2397@5243"
};

/**
 * Applies the hand-maintained per-file corrections to a parser token: at the listed sites
 * "n't" becomes "'t" and "will" becomes "wo"; every other token is returned unchanged.
 */
private static String correctAutoParseToken(String fileName, int index, String word) {
    if (word.equals("n't") && containsSite(NT_AS_T_SITES, fileName + "@" + index)) {
        return "'t";
    }
    if (word.equals("will") && containsSite(WILL_AS_WO_SITES, fileName + "@" + index)) {
        return "wo";
    }
    return word;
}

/** Returns whether {@code site} occurs in {@code sites}. */
private static boolean containsSite(String[] sites, String site) {
    for (String candidate : sites) {
        if (candidate.equals(site)) {
            return true;
        }
    }
    return false;
}