Usage examples for the no-argument constructor of org.jdom2.input.SAXBuilder
public SAXBuilder()
From source file:esiptestbed.mudrod.ontology.pre.AggregateTriples.java
License:Apache License
/**
 * Parses the OWL file at the given path and keeps the result in memory.
 *
 * <p>The parsed document is stored in {@code document} and its root element in
 * {@code rootNode}.
 *
 * @param filePathName local path of the OWL file
 * @throws JDOMException if the document cannot be built from the file
 * @throws IOException if the file cannot be read
 */
public void loadxml(String filePathName) throws JDOMException, IOException {
    final File owlFile = new File(filePathName);
    document = new SAXBuilder().build(owlFile);
    rootNode = document.getRootElement();
}
From source file:eu.himeros.cophi.ocr.proofreader.controller.pojo.HocrDocumentBufferedReader.java
License:Open Source License
/** * Load the resource./*www . j av a2s. c om*/ * @param origin the buffered reader used to read the resource. * @return the DOM document created processing the original document. */ @Override public Document load(BufferedReader origin) { try { SAXBuilder builder = new SAXBuilder(); return builder.build(origin); } catch (IOException | JDOMException ex) { ex.printStackTrace(System.err); return null; } }
From source file:eu.himeros.digitaledition.AlignedQuotationParser.java
License:Open Source License
public Element getRoot(String inFileName) throws Exception { builder = new SAXBuilder(); docIn = builder.build(inFileName);// w w w. j a v a 2s. com return docIn.getRootElement(); }
From source file:eu.himeros.hocr.FlatXml.java
License:Open Source License
/**
 * Flattens an hOCR file into a single {@code ocr_page} div and writes the result.
 *
 * <p>The input is parsed, its {@code head} and {@code body} are moved into the XHTML
 * namespace, a {@code title} element containing "ocr" is appended to the head, and the
 * body content is replaced by one page div (see {@link #buildFlattenedPage}). The
 * content is then re-parented under a new {@code html} root and pretty-printed to
 * {@code outFile}.
 *
 * <p>A missing {@code head} or {@code body} is tolerated: the corresponding step is
 * skipped instead of failing with a {@code NullPointerException}.
 *
 * <p>NOTE(review): {@code FileWriter} encodes with the platform default charset;
 * presumably the output is expected to be UTF-8 - confirm and switch to an explicit
 * charset if so.
 *
 * @param inFile  hOCR input file; its name (minus the first character, with ".html"
 *                replaced by ".png") becomes the id of the page div
 * @param outFile file the flattened document is written to
 * @throws Exception if the input cannot be parsed or the output cannot be written
 */
private void init(File inFile, File outFile) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFile);
    Element root = doc.getRootElement();
    Namespace oldns = root.getNamespace();
    Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
    Namespace xmlns = newRoot.getNamespace();

    Element head = root.getChild("head", oldns);
    if (head != null) {
        head.setNamespace(xmlns);
        for (Element child : head.getChildren()) {
            child.setNamespace(xmlns);
        }
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        head.addContent(title);
    }

    Element body = root.getChild("body", oldns);
    if (body != null) {
        body.setNamespace(xmlns);
        // Build the page from the still-attached body before its content is dropped.
        Element page = buildFlattenedPage(inFile, body, xmlns);
        body.removeContent();
        body.addContent(page);
    }

    // Move head/body under the new XHTML root and swap the document root.
    newRoot.addContent(root.removeContent());
    doc.detachRootElement();
    doc.setRootElement(newRoot);

    XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
    // try-with-resources guarantees the file handle is released even if output fails.
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
        xmlOutputter.output(doc, writer);
    }
}

/**
 * Builds the {@code ocr_page} div: for every element with class {@code ocr_carea},
 * clones of its line elements are appended, while the area and paragraph boundaries
 * are kept only as XML comment nodes.
 */
private static Element buildFlattenedPage(File inFile, Element body, Namespace xmlns) {
    Element page = new Element("div", xmlns);
    page.setAttribute("class", "ocr_page");
    page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));
    XPathExpression<Element> xpath = XPathFactory.instance().compile("//*[@class='ocr_carea']",
            Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> careaElL = xpath.evaluate(body);
    for (Element careaEl : careaElL) {
        page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class") + "\" title=\""
                + careaEl.getAttributeValue("title") + "\">"));
        for (Element pEl : careaEl.getChildren()) {
            page.addContent(new Comment("<p>"));
            for (Element lineEl : pEl.getChildren()) {
                lineEl.removeAttribute("id");
                lineEl.setNamespace(xmlns);
                for (Element child : lineEl.getChildren()) {
                    child.removeAttribute("id");
                    child.removeAttribute("lang");
                    child.removeAttribute("lang", xmlns);
                    child.setNamespace(xmlns);
                }
                // Clone so the line can live in the page while the original stays attached.
                page.addContent(lineEl.clone());
            }
            page.addContent(new Comment("</p>"));
        }
        page.addContent(new Comment("</div>"));
    }
    return page;
}
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
public void initFile(String inFileName) throws Exception { builder = new SAXBuilder(); doc = builder.build(inFileName);//from w w w . j a v a 2s. com root = doc.getRootElement(); xmlns = root.getNamespace(); l1Fm = new GreekContextFilterMananger(); //TODO: generalize aqp = new AlignedQuotationParser(); try { nearGt = aqp.parse(inFileName.substring(0, inFileName.length() - 5) + ".ngt.xml"); //TODO : generalize makeNearGtHm(); } catch (Exception e) { // solving problems by ignoring them } }
From source file:eu.himeros.hocr.NgtMaker.java
License:Open Source License
public void parseDoc(File file) throws Exception { adjustFile(file);//from w w w . j ava 2s .c o m start = -1; end = -1; prevValue = -1; ocrAl = new ArrayList<>(1000); outFileName = file.getAbsolutePath().substring(0, file.getAbsolutePath().length() - 4) + "ngt.xml"; builder = new SAXBuilder(); doc = builder.build(file); root = doc.getRootElement(); xmlns = root.getNamespace(); xpath = XPathFactory.instance().compile("//ns:span[@class='ocr_word']", Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml")); List<Element> elements = xpath.evaluate(root); for (Element element : elements) { parseOcrWord(element); } ocrAl.add("%%%"); ocrAl.add("%%%"); findAnchors(); writeFragment(start, end); }
From source file:eu.himeros.hocr.XmlWordListExtractor.java
License:Open Source License
private void init(String inFileName, String outFileName) throws Exception { SAXBuilder builder = new SAXBuilder(); Document doc = builder.build(inFileName); BufferedWriter bw = new BufferedWriter(new FileWriter(outFileName)); Element el = doc.getRootElement(); String s = el.getValue();//from ww w .j a v a2 s.c o m s = s.replaceAll("\n", " "); s = s.replaceAll(" +", " "); s = s.replace("- ", ""); s = s.replace(" ", "\n"); String[] ss = s.split("\n"); for (String item : ss) { item = item.replaceAll("[^\u0370-\u03FF\u1F00-\u1FFF]|[]", ""); if (item.length() < 1) continue; bw.write(item); bw.newLine(); } bw.close(); }
From source file:eu.knux.passmanager.helper.FileHelper.java
License:Apache License
public static LinkedHashMap<String, Category> loadPassword(File f) { SAXBuilder builder = new SAXBuilder(); Element racine = null;//from w w w . j a v a 2 s .c o m LinkedHashMap<String, Category> categoriesReturned = new LinkedHashMap<>(); try { Document doc = builder.build(f); racine = doc.getRootElement(); } catch (JDOMException | IOException e) { e.printStackTrace(); } if (racine != null) { List<Element> categories = racine.getChildren("category"); categoriesReturned.put("root", new Category("root")); for (Element e : categories) { String name = e.getAttributeValue("name"); List<Element> passes = e.getChildren("password"); for (Element e2 : passes) { Category currCate = null; if (name != null && !categoriesReturned.containsKey(name)) { categoriesReturned.put(name, new Category(name)); } currCate = (name == null) ? categoriesReturned.get("root") : categoriesReturned.get(name); Password p = new Password(); p.setName(e2.getChildText("name")); p.setPass(e2.getChildText("pass")); p.setComment(e2.getChildText("comment")); p.setEncrypted(true); currCate.addPassword(p); } } } return categoriesReturned; }
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
private static void absa2015ToNAFNER(KAFDocument kaf, String fileName, String language) { // reading the ABSA xml file SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try {//from w ww .ja v a 2 s . c o m Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); // naf sentence counter int counter = 1; for (Element sent : sentences) { List<Integer> wfFromOffsets = new ArrayList<>(); List<Integer> wfToOffsets = new ArrayList<>(); List<WF> sentWFs = new ArrayList<>(); List<Term> sentTerms = new ArrayList<>(); // sentence id and original text String sentId = sent.getAttributeValue("id"); String sentString = sent.getChildText("text"); // the list contains just one list of tokens List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language); for (List<Token> sentence : segmentedSentence) { for (Token token : sentence) { WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter); wf.setXpath(sentId); final List<WF> wfTarget = new ArrayList<>(); wfTarget.add(wf); wfFromOffsets.add(wf.getOffset()); wfToOffsets.add(wf.getOffset() + wf.getLength()); sentWFs.add(wf); Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget)); term.setPos("O"); term.setLemma(token.getTokenValue()); sentTerms.add(term); } } counter++; String[] tokenIds = new String[sentWFs.size()]; for (int i = 0; i < sentWFs.size(); i++) { tokenIds[i] = sentWFs.get(i).getId(); } // going through every opinion element for each sentence // each opinion element can contain one or more opinions Element opinionsElement = sent.getChild("Opinions"); if (opinionsElement != null) { // iterating over every opinion in the opinions element List<Element> opinionList = opinionsElement.getChildren(); for (Element opinion : opinionList) { String category = opinion.getAttributeValue("category"); String targetString = 
opinion.getAttributeValue("target"); System.err.println("-> " + category + ", " + targetString); // adding OTE if (!targetString.equalsIgnoreCase("NULL")) { int fromOffset = Integer.parseInt(opinion.getAttributeValue("from")); int toOffset = Integer.parseInt(opinion.getAttributeValue("to")); int startIndex = -1; int endIndex = -1; for (int i = 0; i < wfFromOffsets.size(); i++) { if (wfFromOffsets.get(i) == fromOffset) { startIndex = i; } } for (int i = 0; i < wfToOffsets.size(); i++) { if (wfToOffsets.get(i) == toOffset) { // span is +1 with respect to the last token of the span endIndex = i + 1; } } // TODO remove this condition to correct manually offsets if (startIndex != -1 && endIndex != -1) { List<String> wfIds = Arrays .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex)); List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms); if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) { List<Term> nameTerms = kaf.getTermsFromWFs(wfIds); ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms); List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>(); references.add(neSpan); Entity neEntity = kaf.newEntity(references); neEntity.setType(category); } } } } } } // end of sentence } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
/**
 * Tokenizes every sentence of an ABSA 2015 XML file into a fresh NAF document and
 * returns that document as a string.
 *
 * <p>Each token of the {@code text} child of a {@code sentence} element becomes a WF
 * numbered with the 1-based position of its sentence and tagged with the sentence id.
 * Parsing and I/O errors are printed to stderr; whatever was built so far is returned.
 *
 * <p>NOTE(review): the document is always created with language "en", while
 * {@code language} is only used for tokenization - confirm this is intended.
 *
 * @param fileName the ABSA XML input, handed to {@code SAXBuilder.build(String)}
 * @param language language passed to the tokenizer
 * @return the string form of the NAF document
 */
public static String absa2015ToWFs(String fileName, String language) {
    final KAFDocument naf = new KAFDocument("en", "v1.naf");
    final SAXBuilder parser = new SAXBuilder();
    final XPathFactory xpathFactory = XPathFactory.instance();
    try {
        final Document absaDoc = parser.build(fileName);
        final List<Element> sentenceElements =
                xpathFactory.compile("//sentence", Filters.element()).evaluate(absaDoc);
        for (int index = 0; index < sentenceElements.size(); index++) {
            final Element sentenceEl = sentenceElements.get(index);
            final int sentenceNumber = index + 1;
            final String sentenceId = sentenceEl.getAttributeValue("id");
            final String sentenceText = sentenceEl.getChildText("text");
            for (List<Token> segment : StringUtils.tokenizeSentence(sentenceText, language)) {
                for (Token tok : segment) {
                    final WF wordForm = naf.newWF(tok.startOffset(), tok.getTokenValue(), sentenceNumber);
                    wordForm.setXpath(sentenceId);
                }
            }
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return naf.toString();
}