List of usage examples for org.jdom2.xpath XPathFactory instance
public static final XPathFactory instance()
From source file:elh.eus.absa.CorpusReader.java
License:Open Source License
private void extractOpinionsAbsaSemEval2014(InputStream fileName) { SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try {/* w w w .j ava 2 s. c om*/ Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); Integer sId = 0; //sentence id Integer oId = 0; //opinion id for (Element sent : sentences) { sId++; StringBuilder sb = new StringBuilder(); String sentString = sent.getChildText("text"); sb = sb.append(sentString); Element aspectTerms = sent.getChild("aspectTerms"); if (aspectTerms != null) { List<Element> aspectTermList = aspectTerms.getChildren(); for (Element aspectElem : aspectTermList) { oId++; String trgt = aspectElem.getAttributeValue("target"); Integer offsetFrom = Integer.parseInt(aspectElem.getAttributeValue("from")); Integer offsetTo = Integer.parseInt(aspectElem.getAttributeValue("to")); String polarity = aspectElem.getAttributeValue("polarity"); //String cat = aspectElem.getAttributeValue("category"); //create and add opinion to the structure Opinion op = new Opinion("o" + oId, trgt, offsetFrom, offsetTo, polarity, null, "s" + sId); this.addOpinion(op); } //System.out.println(sb.toString()); } } } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file:elh.eus.absa.CorpusReader.java
License:Open Source License
/** * Read semeval-absa 2015 shared task formatted corpus and extract opinions. * /* w w w.ja v a 2 s. co m*/ * @param InputStream fileName: corpus * @param boolean nullSentOps: whether null opinions should be created for sentence with no opinion * (only used for semeval-absa 2015 formatted corpora) * */ private void extractOpinionsAbsaSemEval2015(InputStream fileName, boolean nullSentenceOps) { SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try { Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); String rId = ""; String sId = ""; //sentence id Integer oId = 0; //opinion id for (Element sent : sentences) { sId = sent.getAttributeValue("id"); rId = sId.replaceAll(":[0-9]+$", ""); if (rId.equalsIgnoreCase(sId)) { rId = sId.replaceAll("#[0-9]+$", ""); } //store the sentence and the corresponding review addRevSent(rId, sId); StringBuilder sb = new StringBuilder(); String sentString = sent.getChildText("text"); //add sentence to the reader object this.addSentence(sId, sentString); sb = sb.append(sentString); Element opinions = sent.getChild("Opinions"); if (opinions != null) { List<Element> opinionList = opinions.getChildren(); //there is no opinions if (opinionList.isEmpty()) { //System.err.println("kkkkk"); //create sentence at list, even if it has no opinion elements sId = sent.getAttributeValue("id"); addRevSent(rId, sId); String sentStr = sent.getChildText("text"); //add sentence to the reader object this.addSentence(sId, sentStr); if (nullSentenceOps) { oId++; //create and add opinion to the structure Opinion op = new Opinion("o" + oId, "NULL", 0, 0, "NULL", "NULL", sId); this.addOpinion(op); } } for (Element opElem : opinionList) { oId++; String trgt = opElem.getAttributeValue("target"); Integer offsetFrom = 0; Integer offsetTo = 0; try { offsetFrom = Integer.parseInt(opElem.getAttributeValue("from")); offsetTo = 
Integer.parseInt(opElem.getAttributeValue("to")); } catch (NumberFormatException ne) { } String polarity = opElem.getAttributeValue("polarity"); String cat = opElem.getAttributeValue("category"); //create and add opinion to the structure Opinion op = new Opinion("o" + oId, trgt, offsetFrom, offsetTo, polarity, cat, sId); this.addOpinion(op); //debugging sb.append("\n\t> " + "o" + oId + " " + trgt + " " + offsetFrom + "-" + offsetTo + " " + polarity + " " + cat); } //System.out.println(sb.toString()); } else { //System.err.println("kkkkk"); //create sentence at list, even if it has no opinion elements sId = sent.getAttributeValue("id"); addRevSent(rId, sId); String sentStr = sent.getChildText("text"); //add sentence to the reader object this.addSentence(sId, sentStr); if (nullSentenceOps) { oId++; //create and add opinion to the structure Opinion op = new Opinion("o" + oId, "NULL", 0, 0, "NULL", "NULL", sId); this.addOpinion(op); } } } } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file:elh.eus.absa.CorpusReader.java
License:Open Source License
/** * Extract sentence texts from tabulated format. The function assumes the text is PoS tagged in * Conll tabulated format.//from w ww .j ava2 s . co m * * @param fileName string: input corpus file path */ private void extractOpinionsTabText(InputStream fileName) { SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try { Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); String rId = ""; String sId = ""; //sentence id Integer oId = 0; //opinion id for (Element sent : sentences) { sId = sent.getAttributeValue("id"); rId = sId; oId++; /*store the sentence and the corresponding review * (in this case this info is redundant, because a whole review is represented by a sentence) */ addRevSent(rId, sId); //StringBuilder sb = new StringBuilder(); String sentString = sent.getChildText("text"); //add sentence to the reader object this.addSentence(sId, sentString); String polarity = sent.getAttributeValue("polarity"); if (polarity == null) { System.err.println("no polarity annotation for review " + rId + "." + " Review won't be taken into account"); continue; } String trgt = ""; String cat = "global"; Integer offsetFrom = 0; Integer offsetTo = 0; //create and add opinion to the structure Opinion op = new Opinion("o" + oId, trgt, offsetFrom, offsetTo, polarity, cat, sId); this.addOpinion(op); //debugging //sb.append("\n\t> "+"o"+oId+" "+trgt+" "+offsetFrom+"-"+offsetTo+" "+polarity+" "+cat); } //System.out.println(sentString); } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file:es.ehu.si.ixa.pipe.convert.Convert.java
License:Apache License
/**
 * Prints every sentence of a SemEval 2014 ABSA corpus with its aspect terms wrapped in
 * {@code <START:term> ... <END>} markers.
 *
 * <p>Spans are taken from the {@code from}/{@code to} attributes of the children of each
 * sentence's {@code aspectTerms} element and applied in order of their start offset. When two
 * annotations share a start offset only the first one is used, and a span that begins inside
 * the previously tagged span is skipped, so repeated or overlapping annotations cannot corrupt
 * the markup. Sentences without an {@code aspectTerms} element are printed unchanged.
 *
 * <p>JDOM parsing errors and I/O errors are printed with {@code printStackTrace()} and are not
 * propagated.
 *
 * @param fileName location of the XML corpus, as accepted by {@code SAXBuilder.build(String)}
 * @throws NumberFormatException if a {@code from} or {@code to} attribute is missing or is not
 *         an integer
 */
public void absaSemEvalToNER(String fileName) {
    final String startTag = "<START:term> ";
    final String endTag = " <END>";
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        for (Element sent : expr.evaluate(doc)) {
            String sentString = sent.getChildText("text");
            StringBuilder sb = new StringBuilder();
            sb.append(sentString);
            Element aspectTerms = sent.getChild("aspectTerms");
            if (aspectTerms != null) {
                // start offset -> end offset, iterated in ascending start order
                java.util.TreeMap<Integer, Integer> spans = new java.util.TreeMap<Integer, Integer>();
                for (Element aspectElem : aspectTerms.getChildren()) {
                    int from = Integer.parseInt(aspectElem.getAttributeValue("from"));
                    int to = Integer.parseInt(aspectElem.getAttributeValue("to"));
                    if (!spans.containsKey(from)) {
                        spans.put(from, to);
                    }
                }
                // Each inserted tag pair moves all later text right by its own length.
                int shift = 0;
                int lastEnd = Integer.MIN_VALUE;
                for (java.util.Map.Entry<Integer, Integer> span : spans.entrySet()) {
                    int from = span.getKey();
                    int to = span.getValue();
                    if (from < lastEnd) {
                        continue; // overlaps the span that was just tagged
                    }
                    String aspectString = sentString.substring(from, to);
                    sb.replace(from + shift, to + shift, startTag + aspectString + endTag);
                    shift += startTag.length() + endTag.length();
                    lastEnd = to;
                }
            }
            System.out.println(sb.toString());
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file:es.ehu.si.ixa.pipe.convert.Convert.java
License:Apache License
/**
 * Prints every sentence of a SemEval 2015 ABSA corpus that has an {@code Opinions} element,
 * with its opinion targets wrapped in {@code <START:target> ... <END>} markers.
 *
 * <p>Opinions whose {@code target} attribute is absent or equal to {@code "NULL"} contribute no
 * span. The remaining spans are keyed by their start offset and applied in ascending order:
 * when several opinions share a start offset only the first is used, and a span that begins
 * inside the previously tagged span is skipped. Pairing is therefore done per annotation, so
 * nested or boundary-sharing spans no longer leave a dangling offset behind.
 *
 * <p>JDOM parsing errors and I/O errors are printed with {@code printStackTrace()} and are not
 * propagated.
 *
 * @param fileName location of the XML corpus, as accepted by {@code SAXBuilder.build(String)}
 * @throws NumberFormatException if a targeted opinion has a {@code from} or {@code to}
 *         attribute that is missing or is not an integer
 */
public void absaSemEvalToNER2015(String fileName) {
    final String startTag = "<START:target> ";
    final String endTag = " <END>";
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        for (Element sent : expr.evaluate(doc)) {
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement == null) {
                continue; // sentences without an Opinions element are not printed
            }
            String sentString = sent.getChildText("text");
            StringBuilder sb = new StringBuilder();
            sb.append(sentString);

            // start offset -> end offset, iterated in ascending start order
            java.util.TreeMap<Integer, Integer> spans = new java.util.TreeMap<Integer, Integer>();
            for (Element aspectElem : opinionsElement.getChildren()) {
                String target = aspectElem.getAttributeValue("target");
                if (target == null || "NULL".equals(target)) {
                    continue;
                }
                int from = Integer.parseInt(aspectElem.getAttributeValue("from"));
                int to = Integer.parseInt(aspectElem.getAttributeValue("to"));
                if (!spans.containsKey(from)) {
                    spans.put(from, to);
                }
            }

            // Each inserted tag pair moves all later text right by its own length.
            int shift = 0;
            int lastEnd = Integer.MIN_VALUE;
            for (java.util.Map.Entry<Integer, Integer> span : spans.entrySet()) {
                int from = span.getKey();
                int to = span.getValue();
                if (from < lastEnd) {
                    continue; // overlaps the span that was just tagged
                }
                String aspectString = sentString.substring(from, to);
                sb.replace(from + shift, to + shift, startTag + aspectString + endTag);
                shift += startTag.length() + endTag.length();
                lastEnd = to;
            }
            System.out.println(sb.toString());
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file:es.ehu.si.ixa.pipe.convert.Convert.java
License:Apache License
public void absaSemEvalToMultiClassNER2015(String fileName) { SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try {//from ww w.j a v a 2 s . c om Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); for (Element sent : sentences) { String sentString = sent.getChildText("text"); StringBuilder sb = new StringBuilder(); sb = sb.append(sentString); Element opinionsElement = sent.getChild("Opinions"); if (opinionsElement != null) { List<List<Integer>> offsetList = new ArrayList<List<Integer>>(); HashSet<String> targetClassSet = new LinkedHashSet<String>(); List<Integer> offsets = new ArrayList<Integer>(); List<Element> opinionList = opinionsElement.getChildren(); for (Element opinion : opinionList) { if (!opinion.getAttributeValue("target").equals("NULL")) { String className = opinion.getAttributeValue("category"); String targetString = opinion.getAttributeValue("target"); Integer offsetFrom = Integer.parseInt(opinion.getAttributeValue("from")); Integer offsetTo = Integer.parseInt(opinion.getAttributeValue("to")); offsets.add(offsetFrom); offsets.add(offsetTo); targetClassSet.add(targetString + "JAR!" 
+ className + opinion.getAttributeValue("from") + opinion.getAttributeValue("to")); } } List<Integer> offsetsWithoutDuplicates = new ArrayList<Integer>(new HashSet<Integer>(offsets)); Collections.sort(offsetsWithoutDuplicates); List<String> targetClassList = new ArrayList<String>(targetClassSet); for (int i = 0; i < offsetsWithoutDuplicates.size(); i++) { List<Integer> offsetArray = new ArrayList<Integer>(); offsetArray.add(offsetsWithoutDuplicates.get(i++)); if (offsetsWithoutDuplicates.size() > i) { offsetArray.add(offsetsWithoutDuplicates.get(i)); } offsetList.add(offsetArray); } int counter = 0; for (int i = 0; i < offsetList.size(); i++) { Integer offsetFrom = offsetList.get(i).get(0); Integer offsetTo = offsetList.get(i).get(1); String className = targetClassList.get(i); String aspectString = sentString.substring(offsetFrom, offsetTo); sb.replace(offsetFrom + counter, offsetTo + counter, "<START:" + className.split("JAR!")[1].substring(0, 3) + "> " + aspectString + " <END>"); counter += 18; } System.out.println(sb.toString()); } } } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file:es.ehu.si.ixa.pipe.convert.Convert.java
License:Apache License
/**
 * Prints the {@code text} child of every {@code sentence} element found in the XML read from
 * the given reader, one per line on {@code System.out}.
 *
 * <p>JDOM parsing errors and I/O errors are printed with {@code printStackTrace()} and are not
 * propagated.
 *
 * @param reader source of the XML corpus
 */
public void absaSemEvalText(Reader reader) {
    SAXBuilder builder = new SAXBuilder();
    XPathFactory xpathFactory = XPathFactory.instance();
    try {
        Document corpus = builder.build(reader);
        XPathExpression<Element> sentenceQuery = xpathFactory.compile("//sentence", Filters.element());
        for (Element sentence : sentenceQuery.evaluate(corpus)) {
            System.out.println(sentence.getChildText("text"));
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file:es.ehu.si.ixa.pipe.convert.Convert.java
License:Apache License
/**
 * Tokenizes the text of every {@code sentence} element of an XML corpus and returns the
 * resulting {@code KAFDocument} (created with {@code "en"} and {@code "v1.naf"}) as a string.
 *
 * <p>Each token becomes a word form carrying the token's value and start offset, the 1-based
 * position of its XML sentence in the document, and the sentence's {@code id} attribute set
 * through {@code setXpath}.
 *
 * <p>JDOM parsing errors and I/O errors are printed with {@code printStackTrace()}; the
 * document built up to that point is still returned.
 *
 * @param fileName location of the XML corpus, as accepted by {@code SAXBuilder.build(String)}
 * @return {@code toString()} of the populated {@code KAFDocument}
 */
public String absa15testToNAF(String fileName) {
    KAFDocument kaf = new KAFDocument("en", "v1.naf");
    Segmenter segmenter = new Segmenter();
    TokenFactory tokenFactory = new TokenFactory();
    Properties properties = setAnnotateProperties();
    SAXBuilder builder = new SAXBuilder();
    XPathFactory xpathFactory = XPathFactory.instance();
    try {
        Document corpus = builder.build(fileName);
        XPathExpression<Element> sentenceQuery = xpathFactory.compile("//sentence", Filters.element());
        int sentenceNumber = 0;
        for (Element xmlSentence : sentenceQuery.evaluate(corpus)) {
            sentenceNumber++;
            String sentId = xmlSentence.getAttributeValue("id");
            String sentString = xmlSentence.getChildText("text");
            BufferedReader textReader = new BufferedReader(new StringReader(sentString));
            IxaPipeTokenizer<Token> tokenizer = new IxaPipeTokenizer<Token>(textReader, tokenFactory, properties);
            List<Token> tokens = tokenizer.tokenize();
            // All segments of one XML sentence share that sentence's number and id.
            for (List<Token> segment : segmenter.segment(tokens)) {
                for (Token token : segment) {
                    WF wordForm = kaf.newWF(token.value(), token.startOffset(), sentenceNumber);
                    wordForm.setXpath(sentId);
                }
            }
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return kaf.toString();
}
From source file:eu.himeros.hocr.FlatXml.java
License:Open Source License
/**
 * Rewrites an hOCR file into a flattened XHTML document and writes it to {@code outFile}.
 *
 * <p>The input's {@code head} and {@code body} are moved under a new {@code html} root in the
 * XHTML namespace, and a {@code title} element with the text {@code ocr} is appended to the
 * head. The body content is replaced by a single {@code div class="ocr_page"} whose id is
 * derived from the input file name. For every element with {@code class='ocr_carea'} the page
 * receives XML comments standing in for the original {@code div} and {@code p} wrappers, and
 * copies of the grandchildren of that element with their {@code id} attribute removed; the
 * children of those copies additionally lose {@code id} and {@code lang}.
 *
 * <p>A missing {@code head} or {@code body} is tolerated: the corresponding step is skipped.
 * The output is written through an {@code OutputStream} so that the bytes are encoded with the
 * encoding configured in the output {@code Format}, and the stream is always closed.
 *
 * @param inFile hOCR file to read
 * @param outFile file that receives the flattened XHTML
 * @throws Exception if the input cannot be parsed or the output cannot be written
 */
private void init(File inFile, File outFile) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFile);
    Element root = doc.getRootElement();
    Namespace oldns = root.getNamespace();
    Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
    Namespace xmlns = newRoot.getNamespace();

    Element head = root.getChild("head", oldns);
    if (head != null) {
        head.setNamespace(xmlns);
        for (Element child : head.getChildren()) {
            child.setNamespace(xmlns);
        }
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        head.addContent(title);
    }

    Element body = root.getChild("body", oldns);
    if (body != null) {
        body.setNamespace(xmlns);
    }

    Element page = new Element("div", xmlns);
    page.setAttribute("class", "ocr_page");
    page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));

    XPathExpression<Element> xpath = XPathFactory.instance().compile("//*[@class='ocr_carea']",
            Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    // The expression is document-absolute, so the document itself serves as context
    // when there is no body element to start from.
    Object context = (body != null) ? body : doc;
    List<Element> careaElL = xpath.evaluate(context);
    for (Element careaEl : careaElL) {
        page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class") + "\" title=\""
                + careaEl.getAttributeValue("title") + "\">"));
        for (Element pEl : careaEl.getChildren()) {
            page.addContent(new Comment("<p>"));
            for (Element lineEl : pEl.getChildren()) {
                lineEl.removeAttribute("id");
                lineEl.setNamespace(xmlns);
                for (Element child : lineEl.getChildren()) {
                    child.removeAttribute("id");
                    child.removeAttribute("lang");
                    child.removeAttribute("lang", xmlns);
                    child.setNamespace(xmlns);
                }
                // Cloned because the line is still attached to its original parent.
                page.addContent(lineEl.clone());
            }
            page.addContent(new Comment("</p>"));
        }
        page.addContent(new Comment("</div>"));
    }

    if (body != null) {
        body.removeContent();
        body.addContent(page);
    }
    newRoot.addContent(root.removeContent());
    doc.detachRootElement();
    doc.setRootElement(newRoot);

    XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
    java.io.OutputStream out = new java.io.FileOutputStream(outFile);
    try {
        xmlOutputter.output(doc, out);
    } finally {
        out.close();
    }
}
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
/**
 * Annotates every XHTML {@code span} below {@code root} that has a non-empty {@code uc}
 * attribute.
 *
 * <p>Each such span receives an {@code occ} attribute holding the value {@code occHm} maps its
 * {@code uc} to. When that value is 1, the span is also given {@code anchor} and
 * {@code anchor-id} attributes copied from the {@code uc} and {@code id} attributes of the
 * element {@code nearGtHm} maps the same key to; if the span's class is {@code CORRWORD} or
 * {@code UCWORD}, its {@code title} is additionally prefixed with that element's {@code text}
 * attribute followed by U+261A and a space.
 *
 * <p>The anchoring step is best effort: any exception raised while processing a span ends the
 * work on that span (attributes already set stay set) and processing moves on to the next one.
 */
private void updateElements() {
    xpath = XPathFactory.instance().compile("//ns:span[@uc!='']", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    for (Element span : xpath.evaluate(root)) {
        String uc = span.getAttributeValue("uc");
        span.setAttribute("occ", "" + occHm.get(uc));
        try {
            // Only spans whose key has exactly one occurrence are anchored.
            if (occHm.get(uc) != 1) {
                continue;
            }
            span.setAttribute("anchor", nearGtHm.get(uc).getAttributeValue("uc"));
            span.setAttribute("anchor-id", nearGtHm.get(uc).getAttributeValue("id"));
            String cssClass = span.getAttributeValue("class");
            boolean correctable = "CORRWORD".equals(cssClass) || "UCWORD".equals(cssClass);
            if (correctable) {
                String oldTitle = span.getAttributeValue("title");
                span.setAttribute("title", nearGtHm.get(uc).getAttributeValue("text") + "\u261a " + oldTitle);
            }
        } catch (Exception ex) {
            // Intentional: a span that cannot be anchored is left as it is.
            continue;
        }
    }
}