List of usage examples for org.jdom2.input.SAXBuilder.build
@Override public Document build(final String systemId) throws JDOMException, IOException
This builds a document from the supplied URI.
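A minimal, self-contained sketch of this overload; the class name and URL below are illustrative placeholders, not taken from any of the projects listed on this page. The system ID may be an http(s) URL or a plain file path.

import java.io.IOException;

import org.jdom2.Document;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

public class BuildFromSystemId {
    public static void main(String[] args) throws JDOMException, IOException {
        SAXBuilder builder = new SAXBuilder();
        // placeholder URI: any resolvable URL or local file path works as the system ID
        Document doc = builder.build("https://example.com/data.xml");
        System.out.println("Root element: " + doc.getRootElement().getName());
    }
}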
From source file: esiptestbed.mudrod.ontology.pre.AggregateTriples.java
License: Apache License

/**
 * Load OWL file into memory.
 *
 * @param filePathName local path of OWL file
 * @throws JDOMException JDOMException
 * @throws IOException IOException
 */
public void loadxml(String filePathName) throws JDOMException, IOException {
    SAXBuilder saxBuilder = new SAXBuilder();
    File file = new File(filePathName);
    document = saxBuilder.build(file);
    rootNode = document.getRootElement();
}
From source file: eu.himeros.cophi.ocr.proofreader.controller.pojo.HocrDocumentBufferedReader.java
License: Open Source License

/**
 * Load the resource.
 *
 * @param origin the buffered reader used to read the resource.
 * @return the DOM document created by processing the original document.
 */
@Override
public Document load(BufferedReader origin) {
    try {
        SAXBuilder builder = new SAXBuilder();
        return builder.build(origin);
    } catch (IOException | JDOMException ex) {
        ex.printStackTrace(System.err);
        return null;
    }
}
From source file: eu.himeros.hocr.FlatXml.java
License: Open Source License

private void init(File inFile, File outFile) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFile);
    Element root = doc.getRootElement();
    Namespace oldns = root.getNamespace();
    Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
    Namespace xmlns = newRoot.getNamespace();
    // move the head into the new namespace and give it a title
    Element head = root.getChild("head", oldns);
    if (head != null) {
        head.setNamespace(xmlns);
        for (Element child : head.getChildren()) {
            child.setNamespace(xmlns);
        }
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        head.addContent(title);
    }
    Element page = new Element("div", xmlns);
    page.setAttribute("class", "ocr_page");
    page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));
    Element body = root.getChild("body", oldns);
    if (body != null) {
        body.setNamespace(xmlns);
        XPathExpression<Element> xpath = XPathFactory.instance().compile(
                "//*[@class='ocr_carea']", Filters.element(), null,
                Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
        List<Element> careaElL = xpath.evaluate(body);
        for (Element careaEl : careaElL) {
            page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class")
                    + "\" title=\"" + careaEl.getAttributeValue("title") + "\">"));
            for (Element pEl : careaEl.getChildren()) {
                page.addContent(new Comment("<p>"));
                for (Element lineEl : pEl.getChildren()) {
                    lineEl.removeAttribute("id");
                    lineEl.setNamespace(xmlns);
                    for (Element child : lineEl.getChildren()) {
                        child.removeAttribute("id");
                        child.removeAttribute("lang");
                        child.removeAttribute("lang", xmlns);
                        child.setNamespace(xmlns);
                    }
                    page.addContent(lineEl.clone());
                }
                page.addContent(new Comment("</p>"));
            }
            page.addContent(new Comment("</div>"));
        }
        body.removeContent();
        body.addContent(page);
    }
    newRoot.addContent(root.removeContent());
    doc.detachRootElement();
    doc.setRootElement(newRoot);
    XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(outFile))) {
        xmlOutputter.output(doc, bw);
    }
}
From source file: eu.himeros.hocr.XmlWordListExtractor.java
License: Open Source License

private void init(String inFileName, String outFileName) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFileName);
    BufferedWriter bw = new BufferedWriter(new FileWriter(outFileName));
    Element el = doc.getRootElement();
    String s = el.getValue();
    s = s.replaceAll("\n", " ");
    s = s.replaceAll(" +", " ");
    s = s.replace("- ", "");
    s = s.replace(" ", "\n");
    String[] ss = s.split("\n");
    for (String item : ss) {
        // keep only Greek and Greek Extended characters; the original listing
        // contained a second, garbled character class here that is omitted
        item = item.replaceAll("[^\u0370-\u03FF\u1F00-\u1FFF]", "");
        if (item.length() < 1) {
            continue;
        }
        bw.write(item);
        bw.newLine();
    }
    bw.close();
}
From source file: eu.knux.passmanager.helper.FileHelper.java
License: Apache License

public static LinkedHashMap<String, Category> loadPassword(File f) {
    SAXBuilder builder = new SAXBuilder();
    Element racine = null;
    LinkedHashMap<String, Category> categoriesReturned = new LinkedHashMap<>();
    try {
        Document doc = builder.build(f);
        racine = doc.getRootElement();
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    if (racine != null) {
        List<Element> categories = racine.getChildren("category");
        categoriesReturned.put("root", new Category("root"));
        for (Element e : categories) {
            String name = e.getAttributeValue("name");
            List<Element> passes = e.getChildren("password");
            for (Element e2 : passes) {
                if (name != null && !categoriesReturned.containsKey(name)) {
                    categoriesReturned.put(name, new Category(name));
                }
                Category currCate = (name == null) ? categoriesReturned.get("root")
                        : categoriesReturned.get(name);
                Password p = new Password();
                p.setName(e2.getChildText("name"));
                p.setPass(e2.getChildText("pass"));
                p.setComment(e2.getChildText("comment"));
                p.setEncrypted(true);
                currCate.addPassword(p);
            }
        }
    }
    return categoriesReturned;
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

private static void absa2015ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence;
            // each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + category + ", " + targetString);
                    // adding OTE
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct offsets manually
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

public static String absa2015ToWFs(String fileName, String language) {
    KAFDocument kaf = new KAFDocument("en", "v1.naf");
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        int counter = 1;
        for (Element sent : sentences) {
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentences) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                }
            }
            counter++;
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return kaf.toString();
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

public static String absa2015ToDocCatFormatForPolarity(String fileName, String language,
        int windowMin, int windowMax) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document doc = null;
    String text = "";
    try {
        doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            String sentStringTmp = sent.getChildText("text");
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentStringTmp, language);
            List<Token> sentence = segmentedSentence.get(0);
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String sentString = "";
                    String targetString = opinion.getAttributeValue("target");
                    String polarityString = opinion.getAttributeValue("polarity");
                    if (targetString.equalsIgnoreCase("NULL") || opinionList.size() == 1) {
                        for (Token token : sentence) {
                            sentString += token.getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    } else {
                        int posTargetMin = -1;
                        int posTargetMax = -1;
                        List<List<Token>> segmentedTarget = StringUtils.tokenizeSentence(targetString, language);
                        List<Token> target = segmentedTarget.get(0);
                        String targetMin = target.get(0).getTokenValue();
                        String targetMax = target.get(target.size() - 1).getTokenValue();
                        int count = 0;
                        for (Token token : sentence) {
                            if (token.getTokenValue().equals(targetMin)) {
                                posTargetMin = count;
                            }
                            if (token.getTokenValue().equals(targetMax) && posTargetMin > -1) {
                                posTargetMax = count;
                                break;
                            }
                            count++;
                        }
                        if (posTargetMin - windowMin >= 0) {
                            posTargetMin = posTargetMin - windowMin;
                        } else {
                            posTargetMin = 0;
                        }
                        if (posTargetMax + windowMax < sentence.size()) {
                            posTargetMax = posTargetMax + windowMax;
                        } else {
                            posTargetMax = sentence.size() - 1;
                        }
                        for (int x = posTargetMin; x <= posTargetMax; x++) {
                            sentString += sentence.get(x).getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return text;
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

private static void absa2014ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through the aspectTerms element of each sentence;
            // it can contain one or more aspect terms
            Element aspectTermsElem = sent.getChild("aspectTerms");
            if (aspectTermsElem != null) {
                List<Element> aspectTermsList = aspectTermsElem.getChildren();
                for (Element aspectTerm : aspectTermsList) {
                    // adding OTE
                    int fromOffset = Integer.parseInt(aspectTerm.getAttributeValue("from"));
                    int toOffset = Integer.parseInt(aspectTerm.getAttributeValue("to"));
                    int startIndex = -1;
                    int endIndex = -1;
                    for (int i = 0; i < wfFromOffsets.size(); i++) {
                        if (wfFromOffsets.get(i) == fromOffset) {
                            startIndex = i;
                        }
                    }
                    for (int i = 0; i < wfToOffsets.size(); i++) {
                        if (wfToOffsets.get(i) == toOffset) {
                            // span is +1 with respect to the last token of the span
                            endIndex = i + 1;
                        }
                    }
                    // TODO remove this condition to correct offsets manually
                    if (startIndex != -1 && endIndex != -1) {
                        List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                        List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                        if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                            List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                            ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                            List<ixa.kaflib.Span<Term>> references = new ArrayList<>();
                            references.add(neSpan);
                            Entity neEntity = kaf.newEntity(references);
                            neEntity.setType("term");
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file: eus.ixa.ixa.pipe.convert.DSRCCorpus.java
License: Apache License

private static void DSRCToNAFNER(KAFDocument kaf, String wordsDoc, String markablesDoc)
        throws JDOMException, IOException {
    // reading the words xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document docWords = sax.build(wordsDoc);
    XPathExpression<Element> expr = xFactory.compile("//word", Filters.element());
    List<Element> words = expr.evaluate(docWords);
    List<WF> sentWFs = new ArrayList<>();
    List<Term> sentTerms = new ArrayList<>();
    // building the NAF containing the WFs and Terms
    // naf sentence counter
    int sentCounter = 1;
    for (Element word : words) {
        String token = word.getText();
        // no character offsets are available in the words file, so 0 is used
        WF wf = kaf.newWF(0, token, sentCounter);
        final List<WF> wfTarget = new ArrayList<>();
        wfTarget.add(wf);
        sentWFs.add(wf);
        Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
        term.setPos("O");
        term.setLemma(token);
        sentTerms.add(term);
        Matcher endMatcher = endOfSentence.matcher(token);
        if (endMatcher.matches()) {
            sentCounter++;
        }
    } // end of processing words
    String[] tokenIds = new String[sentWFs.size()];
    for (int i = 0; i < sentWFs.size(); i++) {
        tokenIds[i] = sentWFs.get(i).getId();
    }
    // processing the markables document in mmax opinion expression files
    Document markDoc = sax.build(markablesDoc);
    XPathFactory markFactory = XPathFactory.instance();
    XPathExpression<Element> markExpr = markFactory.compile("//ns:markable", Filters.element(), null,
            Namespace.getNamespace("ns", "www.eml.org/NameSpaces/OpinionExpression"));
    List<Element> markables = markExpr.evaluate(markDoc);
    for (Element markable : markables) {
        if (markable.getAttributeValue("annotation_type").equalsIgnoreCase("target")) {
            String markSpan = markable.getAttributeValue("span");
            System.err.println("--> span: " + markSpan);
            String removeCommaSpan = markSpan.replaceAll(",word_.*", "");
            System.err.println("--> newSpan: " + removeCommaSpan);
            String[] spanWords = removeCommaSpan.split("\\.\\.");
            int startIndex = Integer.parseInt(spanWords[0].replace("word_", ""));
            int endIndex = Integer.parseInt(spanWords[spanWords.length - 1].replace("word_", "")) + 1;
            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex - 1, endIndex - 1));
            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                List<ixa.kaflib.Span<Term>> references = new ArrayList<>();
                references.add(neSpan);
                Entity neEntity = kaf.newEntity(references);
                neEntity.setType("TARGET");
                System.err.println("--> target: " + neEntity.getStr());
            }
        } // end of create entity
    }
}