List of usage examples for edu.stanford.nlp.trees.Tree.toString()
@Override
public String toString()
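Before the project-specific examples, a minimal self-contained sketch of what Tree.toString() produces may help: it serializes a tree back into single-line Penn Treebank bracket notation, which is why the snippets below use it both for logging and for writing parses to disk. The sentence and class name here are illustrative, not taken from any of the source files below.

import java.io.StringReader;

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;

public class TreeToStringDemo {
    public static void main(String[] args) throws Exception {
        // Parse one-line Penn Treebank notation into a Tree ...
        TreeReader tr = new PennTreeReader(
                new StringReader("(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps)) (. .)))"),
                new LabeledScoredTreeFactory());
        Tree tree = tr.readTree();
        tr.close();
        // ... and toString() round-trips it back to the same notation.
        System.out.println(tree.toString());
        // prints: (ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps)) (. .)))
    }
}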
From source file:elkfed.mmax.importer.ImportOntonotes.java
License:Apache License
public MiniDiscourse importFile(String fname) {
    try {
        boolean had_space = true;
        boolean need_bugfix = System.getProperty("elkfed.BuggyOntonotes", "no").matches("y|yes|true");
        List<Tag> names_stack = new ArrayList<Tag>();
        Alphabet<String> sets = new Alphabet<String>();
        sets.lookupIndex("*DUMMY*");
        int sent_id = 0;
        Tag sentence_tag = null;
        OntonotesReader reader = new OntonotesReader(new File(fname + ".coref"));
        OntonotesReader readerNE = new OntonotesReader(new File(fname + ".name"));
        TreeReader tr = new PennTreeReader(new FileReader(fname + ".parse"),
                new LabeledScoredTreeFactory(), new BobChrisTreeNormalizer());
        Tree tree = null;
        int eventType = reader.getNextEvent();
        boolean in_text = false;
        do {
            if (eventType == OntonotesReader.START_TAG && "COREF".equals(reader.getName())) {
                Tag t;
                if (need_bugfix) {
                    t = buggy_push_tag("coref", tag_stack);
                } else {
                    t = push_tag("coref");
                }
                if ("IDENT".equals(reader.getAttribute("TYPE"))) {
                    t.attrs.put("coref_set", "set_" + sets.lookupIndex(reader.getAttribute("ID")));
                }
                had_space = true;
            } else if (eventType == OntonotesReader.END_TAG && "COREF".equals(reader.getName())) {
                Tag t = pop_tag("coref");
                DetermineMinSpan.addMinSpan(sentence_tag.start, tree, t, tokens);
                had_space = true;
            } else if (in_text && eventType == OntonotesReader.TOKEN) {
                if (!reader.isTrace()) {
                    // process up to the next token in the names part
                    int names_event = readerNE.getNextEvent();
                    while (names_event != OntonotesReader.TOKEN) {
                        if (names_event == OntonotesReader.START_TAG && "ENAMEX".equals(readerNE.getName())) {
                            Tag t = push_tag("enamex", names_stack);
                            t.attrs.put("tag", readerNE.getAttribute("TYPE"));
                        } else if (names_event == OntonotesReader.END_TAG && "ENAMEX".equals(readerNE.getName())) {
                            Tag t = pop_tag("enamex", names_stack);
                        } else {
                            throw new IllegalStateException("Unexpected event:" + names_event);
                        }
                        names_event = readerNE.getNextEvent();
                    }
                    assert (reader.getToken().equals(readerNE.getToken()));
                    String tok = reader.getToken();
                    if (tok.equals("-LRB-")) tok = "(";
                    if (tok.equals("-RRB-")) tok = ")";
                    if (tok.equals("-LSB-")) tok = "[";
                    if (tok.equals("-RSB-")) tok = "]";
                    if (tok.equals("-LCB-")) tok = "{";
                    if (tok.equals("-RCB-")) tok = "}";
                    add_token(tok);
                }
            } else if (in_text && eventType == OntonotesReader.NEWLINE) {
                if (sentence_tag != null) {
                    sentence_tag.end = tokens.size() - 1;
                    if (sentence_tag.end >= sentence_tag.start) {
                        tags.add(sentence_tag);
                        if (tree != null) {
                            Tag parse_tag = new Tag();
                            parse_tag.tag = "parse";
                            parse_tag.start = sentence_tag.start;
                            parse_tag.end = sentence_tag.end;
                            parse_tag.attrs.put("tag", tree.toString());
                            tags.add(parse_tag);
                            assert sentence_tag.end - sentence_tag.start + 1 == tree.yield().size()
                                    : String.format("%s / %s",
                                            tokens.subList(sentence_tag.start, sentence_tag.end + 1),
                                            tree.yield());
                            addParseInfo(sentence_tag.start, tree);
                        }
                    }
                }
                // process up to end of sentence in names annotation
                int names_event = readerNE.getNextEvent();
                while (names_event != OntonotesReader.NEWLINE) {
                    if (names_event == OntonotesReader.START_TAG && "ENAMEX".equals(readerNE.getName())) {
                        Tag t = push_tag("enamex", names_stack);
                        t.attrs.put("tag", readerNE.getAttribute("TYPE"));
                    } else if (names_event == OntonotesReader.END_TAG && "ENAMEX".equals(readerNE.getName())) {
                        Tag t = pop_tag("enamex", names_stack);
                    } else if (names_event == OntonotesReader.END_TAG && "DOC".equals(readerNE.getName())) {
                        // ignore
                    } else {
                        throw new IllegalStateException(
                                "Unexpected event:" + readerNE.describeEvent(names_event));
                    }
                    names_event = readerNE.getNextEvent();
                }
                // prepare new parse and sentence
                sentence_tag = new Tag();
                sentence_tag.start = tokens.size();
                sentence_tag.tag = "sentence";
                sentence_tag.attrs.put("orderid", "" + sent_id++);
                tree = tr.readTree();
            } else if (eventType == OntonotesReader.END_TAG && "DOCNO".equals(reader.getName())) {
                in_text = true;
                // go to the end of the DOCNO part in name doc
                int names_event = readerNE.getNextEvent();
                while (names_event != OntonotesReader.END_TAG || !"DOCNO".equals(reader.getName())) {
                    names_event = readerNE.getNextEvent();
                }
            } else if (eventType == OntonotesReader.START_TAG && "TURN".equals(reader.getName())) {
                int names_event = readerNE.getNextEvent();
                if (names_event != OntonotesReader.START_TAG || !"TURN".equals(readerNE.getName())) {
                    throw new UnsupportedOperationException("TURN in coref but not in names");
                }
                // parse level seems to be inconsistent... so don't check here :-|
                System.err.println("TURN parse:" + tree.toString());
                tree = tr.readTree();
                eventType = reader.getNextEvent();
                names_event = readerNE.getNextEvent();
                if (eventType != OntonotesReader.NEWLINE || names_event != OntonotesReader.NEWLINE) {
                    throw new UnsupportedOperationException("No Newline after TURN");
                }
            }
            eventType = reader.getNextEvent();
        } while (eventType != OntonotesReader.END_DOCUMENT);
        return create();
    } catch (IOException ex) {
        throw new RuntimeException("Cannot read file", ex);
    }
}
From source file:knowledgeextraction.EntityAttributeGraph.java
public static void main(String[] args) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(filePath));
    String text = reader.readLine();
    Annotation document = new Annotation(text);
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit"); // , pos, lemma, ner, parse, dcoref
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        String input = sentence.toString();
        System.out.println(input);
        Tree tree = new EntityAttributeGraph().parse(input);
        System.out.println("tree: " + tree.toString());
        // Get dependency tree
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> td = gs.typedDependenciesCollapsed();
        Object[] list = td.toArray();
        PrintBestPath(list);
    }
}
From source file:knu.univ.lingvo.coref.MentionExtractor.java
License:Open Source License
private String treeToKey(Tree t) {
    int idx = getHeadIndex(t);
    String key = Integer.toString(idx) + ':' + t.toString();
    return key;
}
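The key concatenates the head index with the full bracketed string from toString(), so two subtrees collide only if they match both structurally and in head position. A hedged sketch of how such a key might back a cache follows; the map, Mention type, and helper names are hypothetical, not taken from the knu.univ.lingvo source:

// Hypothetical memoization keyed by treeToKey(t); all names here are illustrative.
private final Map<String, Mention> mentionCache = new HashMap<String, Mention>();

private Mention cachedMention(Tree t) {
    String key = treeToKey(t); // e.g. "2:(NP (DT the) (NN cat))"
    Mention m = mentionCache.get(key);
    if (m == null) {
        m = buildMention(t); // hypothetical expensive construction
        mentionCache.put(key, m);
    }
    return m;
}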
From source file:MedArkRef.AnalysisUtilities.java
License:Open Source License
public arkref.parsestuff.AnalysisUtilities.ParseResult parseSentence(String sentence) {
    String result = "";
    // See if a parser socket server is available.
    int port = new Integer(GlobalProperties.getProperties().getProperty("parserServerPort", "5556"));
    String host = "127.0.0.1";
    Socket client;
    PrintWriter pw;
    BufferedReader br;
    String line;
    Tree parse = null;
    double parseScore = Double.MIN_VALUE;

    try {
        client = new Socket(host, port);
        pw = new PrintWriter(client.getOutputStream());
        br = new BufferedReader(new InputStreamReader(client.getInputStream()));
        pw.println(sentence);
        pw.flush(); // flush to complete the transmission
        while ((line = br.readLine()) != null) {
            if (br.ready()) {
                line = line.replaceAll("\n", "");
                line = line.replaceAll("\\s+", " ");
                result += line + " ";
            } else {
                parseScore = new Double(line);
            }
        }
        br.close();
        pw.close();
        client.close();

        if (parse == null) {
            parse = readTreeFromString("(ROOT (. .))");
            parseScore = -99999.0;
        }

        if (GlobalProperties.getDebug())
            System.err.println("result (parse):" + result);
        parse = readTreeFromString(result);
        return new arkref.parsestuff.AnalysisUtilities.ParseResult(true, parse, parseScore);
    } catch (Exception ex) {
        if (GlobalProperties.getDebug())
            System.err.println("Could not connect to parser server.");
    }

    System.err.println("parsing:" + sentence);

    // If the socket server is not available, fall back to a local parser object.
    if (parser == null) {
        try {
            Options op = new Options();
            String serializedInputFileOrUrl = GlobalProperties.getProperties()
                    .getProperty("parserGrammarFile", "config" + File.separator + "englishFactored.ser.gz");
            parser = new LexicalizedParser(serializedInputFileOrUrl, op);
            int maxLength = new Integer(GlobalProperties.getProperties().getProperty("parserMaxLength", "40"))
                    .intValue();
            parser.setMaxLength(maxLength);
            parser.setOptionFlags("-outputFormat", "oneline");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    try {
        if (parser.parse(sentence)) {
            parse = parser.getBestParse();
            // Remove all the parent annotations (this is a hacky way to do it).
            String ps = parse.toString().replaceAll("\\[[^\\]]+/[^\\]]+\\]", "");
            parse = AnalysisUtilities.getInstance().readTreeFromString(ps);
            parseScore = parser.getPCFGScore();
            return new arkref.parsestuff.AnalysisUtilities.ParseResult(true, parse, parseScore);
        }
    } catch (Exception e) {
    }

    parse = readTreeFromString("(ROOT (. .))");
    parseScore = -99999.0;
    return new arkref.parsestuff.AnalysisUtilities.ParseResult(false, parse, parseScore);
}
From source file:opennlp.tools.parse_thicket.kernel_interface.MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java
License:Apache License
protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits,
        boolean isPositive) {
    List<String[]> treeBankBuffer = new ArrayList<String[]>();
    try {
        // Get the parses from the original documents and form the training dataset.
        ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);
        List<Tree> forest = pt.getSentences();
        // The caller decides the polarity of this text via isPositive.
        String posOrNeg = null;
        if (isPositive)
            posOrNeg = " 1 ";
        else
            posOrNeg = " -1 ";
        // Form the list of training samples.
        for (Tree t : forest) {
            treeBankBuffer.add(new String[] { posOrNeg + " |BT| " + t.toString() + " |ET|" });
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return treeBankBuffer;
}
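For reference, each element added to treeBankBuffer here (and in the related methods below) is one line in the |BT| ... |ET| markup consumed by tree-kernel learners such as SVM-Light-TK: a class label followed by the bracketed parse from toString() between the markers. Assuming a trivial one-sentence parse (illustrative, not from the source), a positive line looks roughly like:

 1 |BT| (ROOT (S (NP (PRP It)) (VP (VBZ works)) (. .))) |ET|

The variants that follow differ only in the label: " -1 " for negative examples and " 0 " for unlabeled input in classifySentences.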
From source file:opennlp.tools.parse_thicket.kernel_interface.MultiSentenceKernelBasedSearchResultsProcessor.java
License:Apache License
protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) {
    List<String[]> treeBankBuffer = new ArrayList<String[]>();
    try {
        // Get the parses from the original documents and form the training dataset.
        ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);
        List<Tree> forest = pt.getSentences();
        // If from the first half of the ranked docs, then positive, otherwise negative.
        String posOrNeg = null;
        if (count < hits.size() / 2)
            posOrNeg = " 1 ";
        else
            posOrNeg = " -1 ";
        // Form the list of training samples.
        for (Tree t : forest) {
            treeBankBuffer.add(new String[] { posOrNeg + " |BT| " + t.toString() + " |ET|" });
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return treeBankBuffer;
}
From source file:opennlp.tools.parse_thicket.kernel_interface.PT2ExtendedTreeForestBuilder.java
License:Apache License
private List<String[]> formTrainingSetFromText(String para, boolean positive) {
    String prefix = null;
    if (positive)
        prefix = " 1 ";
    else
        prefix = " -1 ";
    ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);
    List<Tree> forest = pt.getSentences();
    List<String[]> treeBankBuffer = new ArrayList<String[]>();
    for (Tree t : forest) {
        treeBankBuffer.add(new String[] { prefix + "|BT| " + t.toString() + " |ET|" });
    }
    return treeBankBuffer;
}
From source file:opennlp.tools.parse_thicket.kernel_interface.PT2ExtendedTreeForestBuilder.java
License:Apache License
private String formTrainingSetFromTextOneLine(String para, boolean positive) {
    String prefix = null;
    if (positive)
        prefix = " 1 ";
    else
        prefix = " -1 ";
    ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);
    List<Tree> forest = pt.getSentences();
    String line = prefix;
    for (Tree t : forest) {
        line += "|BT| " + t.toString() + " |ET| ";
    }
    return line;
}
From source file:opennlp.tools.parse_thicket.kernel_interface.PT2ExtendedTreeForestBuilder.java
License:Apache License
public void classifySentences(String sentences, String path) {
    ParseThicket pt = matcher.buildParseThicketFromTextWithRST(sentences);
    List<Tree> forest = pt.getSentences();
    List<String[]> treeBankBuffer = new ArrayList<String[]>();
    for (Tree t : forest) {
        treeBankBuffer.add(new String[] { " 0 |BT| " + t.toString() + " |ET|" });
    }
    ProfileReaderWriter.writeReport(treeBankBuffer, path + "unknown.txt", ' ');
    tkRunner.runClassifier(path, "unknown.txt", modelFileName, "classifier_output.txt");
}
From source file:opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree.java
License:Apache License
public List<String> buildForestForCorefArcs(ParseThicket pt) {
    List<String> results = new ArrayList<String>();
    for (WordWordInterSentenceRelationArc arc : pt.getArcs()) {
        int fromSent = arc.getCodeFrom().getFirst();
        int toSent = arc.getCodeTo().getFirst();
        if (fromSent < 1 || toSent < 1)
            // TODO problem in sentence enumeration => skip building extended trees
            return results;
        String wordFrom = arc.getLemmaFrom();
        String wordTo = arc.getLemmaTo();
        List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent - 1),
                pt.getSentences().get(fromSent - 1), new String[] { wordFrom });
        if (trees == null || trees.size() < 1)
            continue;
        System.out.println(trees);
        StringBuilder sb = new StringBuilder(10000);
        toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent - 1), trees.get(0),
                new String[] { wordTo });
        System.out.println(sb.toString());
        results.add(sb.toString());
    }
    // If there are no arcs, fall back to the original sentences.
    if (results.isEmpty()) {
        for (Tree t : pt.getSentences()) {
            results.add(t.toString());
        }
    }
    return results;
}