Usage examples for the edu.stanford.nlp.trees.BobChrisTreeNormalizer constructor
public BobChrisTreeNormalizer()
From source file:elkfed.expletives.TrainingData.java
License:Apache License
public static void extractExamples(String file, Set<String> anaphoricPronouns, List<ExpletiveInstance> instances) throws FileNotFoundException, IOException { TreeReader tr = new PennTreeReader(new FileReader(file), new LabeledScoredTreeFactory(), new BobChrisTreeNormalizer()); Tree t;//from w w w.ja v a 2 s .co m String file_id = file.substring(file.length() - 8, file.length() - 4); int sent_idx = 1; while ((t = tr.readTree()) != null) { //t.pennPrint(); int word_idx = 1; for (Tree t1 : t.getLeaves()) { String s = t1.toString(); if ("it".equals(s) || "It".equals(s)) { String id = String.format("%s:S%d:%d-%d", file_id, sent_idx, word_idx, word_idx); ExpletiveInstance inst = new ExpletiveInstance(t, t1, id); boolean is_positive = anaphoricPronouns.contains(id); inst.setFeature(PairInstance.FD_POSITIVE, !is_positive); instances.add(inst); String cls = is_positive ? "+1" : "-1"; System.out.format("%s\t%s\t(%s)\n", s, id, cls); } word_idx++; } //System.out.println(); //System.out.println(t); sent_idx++; } }
From source file:elkfed.mmax.importer.ImportOntonotes.java
License:Apache License
public MiniDiscourse importFile(String fname) { try {/* w w w . ja v a2 s . c o m*/ boolean had_space = true; boolean need_bugfix = System.getProperty("elkfed.BuggyOntonotes", "no").matches("y|yes|true"); List<Tag> names_stack = new ArrayList<Tag>(); Alphabet<String> sets = new Alphabet<String>(); sets.lookupIndex("*DUMMY*"); int sent_id = 0; Tag sentence_tag = null; OntonotesReader reader = new OntonotesReader(new File(fname + ".coref")); OntonotesReader readerNE = new OntonotesReader(new File(fname + ".name")); TreeReader tr = new PennTreeReader(new FileReader(fname + ".parse"), new LabeledScoredTreeFactory(), new BobChrisTreeNormalizer()); Tree tree = null; int eventType = reader.getNextEvent(); boolean in_text = false; do { if (eventType == OntonotesReader.START_TAG && "COREF".equals(reader.getName())) { Tag t; if (need_bugfix) { t = buggy_push_tag("coref", tag_stack); } else { t = push_tag("coref"); } if ("IDENT".equals(reader.getAttribute("TYPE"))) { t.attrs.put("coref_set", "set_" + sets.lookupIndex(reader.getAttribute("ID"))); } had_space = true; } else if (eventType == OntonotesReader.END_TAG && "COREF".equals(reader.getName())) { Tag t = pop_tag("coref"); DetermineMinSpan.addMinSpan(sentence_tag.start, tree, t, tokens); had_space = true; } else if (in_text && eventType == OntonotesReader.TOKEN) { if (!reader.isTrace()) { // process up to the next token in the names part int names_event = readerNE.getNextEvent(); while (names_event != OntonotesReader.TOKEN) { if (names_event == OntonotesReader.START_TAG && "ENAMEX".equals(readerNE.getName())) { Tag t = push_tag("enamex", names_stack); t.attrs.put("tag", readerNE.getAttribute("TYPE")); } else if (names_event == OntonotesReader.END_TAG && "ENAMEX".equals(readerNE.getName())) { Tag t = pop_tag("enamex", names_stack); } else { throw new IllegalStateException("Unexpected event:" + names_event); } names_event = readerNE.getNextEvent(); } assert (reader.getToken().equals(readerNE.getToken())); String tok = 
reader.getToken(); if (tok.equals("-LRB-")) tok = "("; if (tok.equals("-RRB-")) tok = ")"; if (tok.equals("-LSB-")) tok = "["; if (tok.equals("-RSB-")) tok = "]"; if (tok.equals("-LCB-")) tok = "{"; if (tok.equals("-RCB-")) tok = "}"; add_token(tok); } } else if (in_text && eventType == OntonotesReader.NEWLINE) { //System.out.println("sentence break"); if (sentence_tag != null) { sentence_tag.end = tokens.size() - 1; if (sentence_tag.end >= sentence_tag.start) { tags.add(sentence_tag); if (tree != null) { Tag parse_tag = new Tag(); parse_tag.tag = "parse"; parse_tag.start = sentence_tag.start; parse_tag.end = sentence_tag.end; parse_tag.attrs.put("tag", tree.toString()); tags.add(parse_tag); assert sentence_tag.end - sentence_tag.start + 1 == tree.yield().size() : String .format("%s / %s", tokens.subList(sentence_tag.start, sentence_tag.end + 1), tree.yield()); addParseInfo(sentence_tag.start, tree); } } } // process up to end of sentence in names annotation int names_event = readerNE.getNextEvent(); while (names_event != OntonotesReader.NEWLINE) { if (names_event == OntonotesReader.START_TAG && "ENAMEX".equals(readerNE.getName())) { Tag t = push_tag("enamex", names_stack); t.attrs.put("tag", readerNE.getAttribute("TYPE")); } else if (names_event == OntonotesReader.END_TAG && "ENAMEX".equals(readerNE.getName())) { Tag t = pop_tag("enamex", names_stack); } else if (names_event == OntonotesReader.END_TAG && "DOC".equals(readerNE.getName())) { // ignore } else { throw new IllegalStateException( "Unexpected event:" + readerNE.describeEvent(names_event)); } names_event = readerNE.getNextEvent(); } // prepare new parse and sentence sentence_tag = new Tag(); sentence_tag.start = tokens.size(); sentence_tag.tag = "sentence"; sentence_tag.attrs.put("orderid", "" + sent_id++); tree = tr.readTree(); } else if (eventType == OntonotesReader.END_TAG && "DOCNO".equals(reader.getName())) { in_text = true; // go to the end of the DOCNO part in name doc int names_event = 
readerNE.getNextEvent(); while (names_event != OntonotesReader.END_TAG || !"DOCNO".equals(reader.getName())) { names_event = readerNE.getNextEvent(); } } else if (eventType == OntonotesReader.START_TAG && "TURN".equals(reader.getName())) { int names_event = readerNE.getNextEvent(); if (names_event != OntonotesReader.START_TAG || !"TURN".equals(readerNE.getName())) { throw new UnsupportedOperationException("TURN in coref but not in names"); } // parse level seems to be inconsistent... so don't check here :-| System.err.println("TURN parse:" + tree.toString()); tree = tr.readTree(); eventType = reader.getNextEvent(); names_event = readerNE.getNextEvent(); if (eventType != OntonotesReader.NEWLINE || names_event != OntonotesReader.NEWLINE) { throw new UnsupportedOperationException("No Newline after TURN"); } } eventType = reader.getNextEvent(); } while (eventType != OntonotesReader.END_DOCUMENT); return create(); } catch (IOException ex) { throw new RuntimeException("Cannot read file", ex); } }