List of usage examples for edu.stanford.nlp.trees.TreeReader.close()
@Override public void close() throws IOException;
TreeReader
From source file: sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
private static void sharedTaskSpanGen(File treeFile) throws IOException { TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), Util.ENCODING)); TreeReader tr = new PennTreeReader(r, tf); Tree root = tr.readTree();//w w w . jav a 2 s.c o m String article = treeFile.getName().substring(0, 8); String outFileName = treeFile.toString(); outFileName = outFileName.substring(0, outFileName.lastIndexOf('.')); BufferedReader reader = Util.reader(outFileName + ".tkn"); PrintWriter printer = new PrintWriter(outFileName + ".csv"); int treeNumber = 0; while (root != null) { String lineRead = reader.readLine(); if (root.children().length > 0) { List<Tree> leaves = root.getLeaves(); HashMap<String, String[]> tokens = sharedTaskTokens(lineRead); for (Tree leaf : leaves) { int nodeNumber = leaf.nodeNumber(root); String word = leaf.toString(); String wordKey = word.replaceAll("/", "\\\\/"); wordKey = wordKey.replaceAll("\\*", "\\\\*"); String[] spanLine = tokens.get(wordKey); String key = article + "," + treeNumber + "," + nodeNumber; word = word.trim().replaceAll("\\s+", ""); word = word.replaceAll(",", "COMMA"); printer.println(key + "," + spanLine[1] + "," + word + "," + spanLine[2]); } } root = tr.readTree(); printer.flush(); ++treeNumber; } printer.close(); tr.close(); }
From source file:sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
public static void anyTextToSpanGen(File treeFile, File inputFile) throws IOException { log.info("Generating the spans of each node in the parse trees."); String orgText = Util.readFile(inputFile); orgText = orgText.replaceAll("`", "'").replaceAll("", "\"").replaceAll("", "'"); ;//from w w w . ja v a 2s.c om PrintWriter pw = new PrintWriter(treeFile + ".csv"); TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), Util.ENCODING)); TreeReader tr = new PennTreeReader(r, tf); int index = 0; Tree root = tr.readTree(); int treeNumber = 0; while (root != null) { List<Tree> leaves = root.getLeaves(); for (Tree leaf : leaves) { int nodeNumber = leaf.nodeNumber(root); String parentValue = leaf.parent(root).value(); if (parentValue.equals("-NONE-")) { continue; } String word = nodeToString(leaf).trim(); word = word.replaceAll("`", "'"); word = word.replaceAll("", "'"); word = word.replaceAll("\\.\\.\\.", ". . ."); int span = orgText.indexOf(word, index); if (span == -1) { continue; } index = span + word.length() - 1; String spanString = (span + ".." + (span + word.length())); String key = treeFile.getName() + "," + treeNumber + "," + nodeNumber; word = word.trim().replaceAll("\\s+", ""); word = word.replaceAll(",", "COMMA"); pw.println(key + "," + spanString + "," + word); } root = tr.readTree(); pw.flush(); ++treeNumber; } pw.close(); tr.close(); log.info("Done."); }
From source file:sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
/** * Generate the spans of each node in the auto parse trees. * /*from ww w. j a v a 2 s . c o m*/ * @param treePath * @param rawTextPath * @throws IOException */ @SuppressWarnings("unused") public static void textToSpanGenAuto(String treePath, String rawTextPath) throws IOException { log.info("Generating the spans of each node in the auto parse trees."); String folder = "23/"; File[] files = new File(treePath + folder).listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.startsWith("wsj_") && name.endsWith(".mrg"); } }); for (File file : files) { String fileName = file.getName().replaceAll("\\.mrg", ""); String orgText = Util.readFile(rawTextPath + folder + fileName); orgText = orgText.replaceAll("`", "'"); PrintWriter pw = new PrintWriter(treePath + folder + fileName + ".csv"); TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); TreeReader tr = new PennTreeReader(r, tf); int index = 9; Tree root = tr.readTree(); int treeNumber = 0; while (root != null) { StringBuilder tmp = new StringBuilder(); List<Tree> leaves = root.getLeaves(); for (Tree leaf : leaves) { int nodeNumber = leaf.nodeNumber(root); String parentValue = leaf.parent(root).value(); if (parentValue.equals("-NONE-")) { continue; } String word = Corpus.nodeToString(leaf).trim(); if (word.equals(".")) { continue; } word = word.replaceAll("`", "'"); word = word.replaceAll("^\\p{Punct}*", ""); word = word.replaceAll("\\p{Punct}*$", ""); if (fileName.equals("wsj_2300") && index == 1457 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2330") && index == 6344 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2351") && index == 1040 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2360") && index == 2066 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2369") && index == 6434 && word.equals("n't")) { word = "'t"; 
} if (fileName.equals("wsj_2381") && index == 2399 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2386") && index == 3522 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2386") && index == 3647 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2387") && index == 1466 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2387") && index == 5389 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2397") && index == 1032 && word.equals("n't")) { word = "'t"; } if (fileName.equals("wsj_2306") && index == 5692 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2308") && index == 2373 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2315") && index == 1056 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2321") && index == 1279 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2330") && index == 1563 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2345") && index == 1838 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2350") && index == 699 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2351") && index == 778 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2351") && index == 2391 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2363") && index == 2868 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2367") && index == 1379 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2376") && index == 6687 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2377") && index == 2464 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2379") && index == 4711 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2379") && index == 5174 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2381") && index == 565 && word.equals("will")) { word = "wo"; } if 
(fileName.equals("wsj_2387") && index == 5430 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2387") && index == 5779 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2394") && index == 179 && word.equals("will")) { word = "wo"; } if (fileName.equals("wsj_2397") && index == 5243 && word.equals("will")) { word = "wo"; } int span = orgText.indexOf(word, index); while (span == -1) { span = orgText.indexOf(word, index); } if (span - index > 1) { String difference = orgText.substring(index, span).trim(); boolean isError = true; isError &= !difference.matches("(\\p{Punct}+\\s*)+") && difference.length() > 0; isError &= !difference.equals(""); } index = span + word.length(); String spanString = (span + ".." + (span + word.length())); String key = fileName + "," + treeNumber + "," + nodeNumber; word = word.trim().replaceAll("\\s+", ""); word = word.replaceAll(",", "COMMA"); tmp.append(key + "," + spanString + "," + word); tmp.append('\n'); } root = tr.readTree(); pw.print(tmp); ++treeNumber; } pw.close(); tr.close(); } log.info("Done."); }
From source file:sg.edu.nus.comp.pdtb.runners.SpanTreeExtractor.java
License:Open Source License
/** * Generate the spans of each node in the parse trees. * /*from w ww . ja v a 2 s.c o m*/ * @param treePath * @param rawTextPath * @throws IOException */ @SuppressWarnings("unused") public static void textToSpanGen(String treePath, String rawTextPath) throws IOException { log.info("Generating the spans of each node in the parse trees."); String[] topFolders = new File(treePath).list(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return new File(dir, name).isDirectory(); } }); for (String topFolder : topFolders) { String folder = topFolder + "/"; File tmp = new File(treePath + folder); if (tmp.isDirectory() && tmp.exists()) { File[] files = tmp.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.endsWith("mrg"); } }); for (File file : files) { log.info("Processing tree: " + file.getName()); String fileName = file.getName().replaceAll("\\.mrg", ""); String orgText = Util.readFile(rawTextPath + folder + fileName); orgText = orgText.replaceAll("`", "'"); PrintWriter pw = new PrintWriter(treePath + folder + fileName + ".csv"); TreeFactory tf = new LabeledScoredTreeFactory(); Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); TreeReader tr = new PennTreeReader(r, tf); int index = 9; if (fileName.equals("wsj_0285")) { index = 200; } if (fileName.equals("wsj_0901")) { index = 14; } Tree root = tr.readTree(); int treeNumber = 0; while (root != null) { List<Tree> leaves = root.getLeaves(); for (Tree leaf : leaves) { int nodeNumber = leaf.nodeNumber(root); String parentValue = leaf.parent(root).value(); if (parentValue.equals("-NONE-")) { continue; } String word = Corpus.nodeToString(leaf).trim(); if (fileName.equals("wsj_0998") && index == 4644) { continue; } if (word.equals(".") && !fileName.startsWith("wsj_23")) { continue; } // skipping dots after U.S. 
present in the parse // trees but not present in the original // text if (word.equals(".")) { if (fileName.equals("wsj_2303") && index == 1526) { continue; } if (fileName.equals("wsj_2314") && (index == 7625 || index == 7929)) { continue; } if (fileName.equals("wsj_2320") && (index == 474 || index == 3180)) { continue; } if (fileName.equals("wsj_2321") && index == 268) { continue; } if (fileName.equals("wsj_2324") && index == 490) { continue; } if (fileName.equals("wsj_2361") && index == 6563) { continue; } if (fileName.equals("wsj_2397") && (index == 2845 || index == 3273 || index == 3515)) { continue; } if (fileName.equals("wsj_2398") && index == 2793) { continue; } } word = word.replaceAll("`", "'"); word = word.replaceAll("\\.\\.\\.", ". . ."); if (fileName.equals("wsj_0004") && word.equals("IBC")) { word = "IBC/Donoghue"; } if (fileName.equals("wsj_0032") && word.equals("S.p.A.")) { word = "S.p.\nA."; } if (fileName.equals("wsj_0986") && index == 1804) { word = "5/ 16"; } if (fileName.equals("wsj_1737") && index == 689 && word.equals("U.S.")) { word = "U. 
S."; } if (fileName.equals("wsj_1974") && index == 1802 && word.equals("5/16")) { word = "5/ 16"; } int span = orgText.indexOf(word, index); if (fileName.equals("wsj_0110") && word.equals("7/16")) { word = "7/ 16"; } if (fileName.equals("wsj_0111") && word.equals("Rey/Fawcett")) { word = "Rey/ Fawcett"; } if (fileName.equals("wsj_0162") && word.equals("International")) { word = "In< ternational"; } if (fileName.equals("wsj_0359") && word.equals("Stovall/Twenty-First")) { word = "Stovall/ Twenty-First"; } if (fileName.equals("wsj_0400") && word.equals("16/32")) { word = "16/ 32"; } if (fileName.equals("wsj_0463") && word.equals("G.m.b.H.")) { word = "G.m.b.\nH."; } if (fileName.matches("wsj_(0660|1368|1371)") && word.matches("S\\.p\\.A\\.?(-controlled)?")) { word = word.replaceAll("S\\.p\\.A", "S.p.\nA"); } if (fileName.equals("wsj_0911") && word.equals("mystery/comedy")) { word = "mystery/ comedy"; } if (fileName.matches("wsj_(0917|1329)") && word.equals("G.m.b.H.")) { word = "G.m.b.\nH."; } if (fileName.equals("wsj_0998") && word.equals("Co.")) { word = "Co,."; } if (fileName.equals("wsj_1237") && word.equals("Bard/EMS")) { word = "Bard/ EMS"; } if (fileName.equals("wsj_1457")) { if (word.equals("fancy'shvartzer")) { word = "fancy 'shvartzer"; } else if (word.equals("the'breakup")) { word = "the 'breakup"; } } if (fileName.equals("wsj_1503") && word.equals("Gaming")) { word = "gaming"; } if (fileName.equals("wsj_1568") && word.equals(". . .")) { word = "..."; } if (fileName.equals("wsj_1583") && word.equals("'T-")) { word = "'T"; } if (fileName.equals("wsj_1625") && word.equals("staff")) { word = "staf"; } if (fileName.equals("wsj_1773") && word.equals("H.F.")) { word = "H. 
F."; } span = orgText.indexOf(word, index); if (fileName.equals("wsj_2170") && index == 7227 && word.equals("'s")) { span = 7227; word = ""; } if (span == -1) { continue; } if (span - index > 1) { String difference = orgText.substring(index, span).trim(); boolean isError = true; isError &= !(fileName.equals("wsj_0118") && difference.equals(".START")); isError &= !(fileName.matches("wsj_(0166|1156|2346)") && difference.equals(". \n\n.START")); isError &= !(fileName.equals("wsj_0203") && index == 2835 && span == 2955); isError &= !difference.matches("\\p{Punct}") && difference.length() > 0; isError &= !difference.equals("") && !difference.equals("><") && !difference.equals(". \n\n>"); isError &= !(fileName.equals("wsj_1625") && difference.equals("f")); isError &= !(fileName.equals("wsj_1839") && difference.equals(". .")); isError &= !(fileName.equals("wsj_2170") && difference.equals("'s")); isError &= !(fileName.equals("wsj_2346") && difference.equals(".START")); } index = span + word.length(); String spanString = (span + ".." + (span + word.length())); String key = fileName + "," + treeNumber + "," + nodeNumber; word = word.trim().replaceAll("\\s+", ""); word = word.replaceAll(",", "COMMA"); pw.println(key + "," + spanString + "," + word); } root = tr.readTree(); pw.flush(); ++treeNumber; } pw.close(); tr.close(); } } } log.info("Done."); }