List of usage examples for org.jdom2.input.SAXBuilder.build
@Override public Document build(final String systemId) throws JDOMException, IOException
This builds a document from the supplied URI.
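A minimal, self-contained sketch of this overload; the class name and URL below are illustrative placeholders, not taken from any of the projects listed on this page. The system ID may be an http(s) URL or a plain file path.

import java.io.IOException;

import org.jdom2.Document;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

public class BuildFromSystemId {
    public static void main(String[] args) throws JDOMException, IOException {
        SAXBuilder builder = new SAXBuilder();
        // placeholder URI: any resolvable URL or local file path works as the system ID
        Document doc = builder.build("https://example.com/data.xml");
        System.out.println("Root element: " + doc.getRootElement().getName());
    }
}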
From source file: esiptestbed.mudrod.ontology.pre.AggregateTriples.java
License: Apache License

/**
 * Load OWL file into memory.
 *
 * @param filePathName local path of OWL file
 * @throws JDOMException JDOMException
 * @throws IOException IOException
 */
public void loadxml(String filePathName) throws JDOMException, IOException {
    SAXBuilder saxBuilder = new SAXBuilder();
    File file = new File(filePathName);
    document = saxBuilder.build(file);
    rootNode = document.getRootElement();
}
From source file: eu.himeros.cophi.ocr.proofreader.controller.pojo.HocrDocumentBufferedReader.java
License: Open Source License

/**
 * Load the resource.
 *
 * @param origin the buffered reader used to read the resource.
 * @return the DOM document created by processing the original document.
 */
@Override
public Document load(BufferedReader origin) {
    try {
        SAXBuilder builder = new SAXBuilder();
        return builder.build(origin);
    } catch (IOException | JDOMException ex) {
        ex.printStackTrace(System.err);
        return null;
    }
}
From source file: eu.himeros.hocr.FlatXml.java
License: Open Source License

private void init(File inFile, File outFile) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFile);
    Element root = doc.getRootElement();
    Namespace oldns = root.getNamespace();
    Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
    Namespace xmlns = newRoot.getNamespace();
    // move the head into the new namespace and give it a title
    Element head = root.getChild("head", oldns);
    if (head != null) {
        head.setNamespace(xmlns);
        for (Element child : head.getChildren()) {
            child.setNamespace(xmlns);
        }
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        head.addContent(title);
    }
    Element page = new Element("div", xmlns);
    page.setAttribute("class", "ocr_page");
    page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));
    Element body = root.getChild("body", oldns);
    if (body != null) {
        body.setNamespace(xmlns);
        XPathExpression<Element> xpath = XPathFactory.instance().compile(
                "//*[@class='ocr_carea']", Filters.element(), null,
                Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
        List<Element> careaElL = xpath.evaluate(body);
        for (Element careaEl : careaElL) {
            page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class")
                    + "\" title=\"" + careaEl.getAttributeValue("title") + "\">"));
            for (Element pEl : careaEl.getChildren()) {
                page.addContent(new Comment("<p>"));
                for (Element lineEl : pEl.getChildren()) {
                    lineEl.removeAttribute("id");
                    lineEl.setNamespace(xmlns);
                    for (Element child : lineEl.getChildren()) {
                        child.removeAttribute("id");
                        child.removeAttribute("lang");
                        child.removeAttribute("lang", xmlns);
                        child.setNamespace(xmlns);
                    }
                    page.addContent(lineEl.clone());
                }
                page.addContent(new Comment("</p>"));
            }
            page.addContent(new Comment("</div>"));
        }
        body.removeContent();
        body.addContent(page);
    }
    newRoot.addContent(root.removeContent());
    doc.detachRootElement();
    doc.setRootElement(newRoot);
    XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(outFile))) {
        xmlOutputter.output(doc, bw);
    }
}
From source file: eu.himeros.hocr.XmlWordListExtractor.java
License: Open Source License

private void init(String inFileName, String outFileName) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFileName);
    BufferedWriter bw = new BufferedWriter(new FileWriter(outFileName));
    Element el = doc.getRootElement();
    String s = el.getValue();
    s = s.replaceAll("\n", " ");
    s = s.replaceAll(" +", " ");
    s = s.replace("- ", "");
    s = s.replace(" ", "\n");
    String[] ss = s.split("\n");
    for (String item : ss) {
        // keep only Greek and Greek Extended characters; the original listing
        // contained a second, garbled character class here that is omitted
        item = item.replaceAll("[^\u0370-\u03FF\u1F00-\u1FFF]", "");
        if (item.length() < 1) {
            continue;
        }
        bw.write(item);
        bw.newLine();
    }
    bw.close();
}
From source file: eu.knux.passmanager.helper.FileHelper.java
License: Apache License

public static LinkedHashMap<String, Category> loadPassword(File f) {
    SAXBuilder builder = new SAXBuilder();
    Element racine = null;
    LinkedHashMap<String, Category> categoriesReturned = new LinkedHashMap<>();
    try {
        Document doc = builder.build(f);
        racine = doc.getRootElement();
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    if (racine != null) {
        List<Element> categories = racine.getChildren("category");
        categoriesReturned.put("root", new Category("root"));
        for (Element e : categories) {
            String name = e.getAttributeValue("name");
            List<Element> passes = e.getChildren("password");
            for (Element e2 : passes) {
                if (name != null && !categoriesReturned.containsKey(name)) {
                    categoriesReturned.put(name, new Category(name));
                }
                Category currCate = (name == null) ? categoriesReturned.get("root")
                        : categoriesReturned.get(name);
                Password p = new Password();
                p.setName(e2.getChildText("name"));
                p.setPass(e2.getChildText("pass"));
                p.setComment(e2.getChildText("comment"));
                p.setEncrypted(true);
                currCate.addPassword(p);
            }
        }
    }
    return categoriesReturned;
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

private static void absa2015ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence;
            // each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + category + ", " + targetString);
                    // adding OTE
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct offsets manually
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

public static String absa2015ToWFs(String fileName, String language) {
    KAFDocument kaf = new KAFDocument("en", "v1.naf");
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        int counter = 1;
        for (Element sent : sentences) {
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentences) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                }
            }
            counter++;
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return kaf.toString();
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

public static String absa2015ToDocCatFormatForPolarity(String fileName, String language,
        int windowMin, int windowMax) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document doc = null;
    String text = "";
    try {
        doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            String sentStringTmp = sent.getChildText("text");
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentStringTmp, language);
            List<Token> sentence = segmentedSentence.get(0);
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String sentString = "";
                    String targetString = opinion.getAttributeValue("target");
                    String polarityString = opinion.getAttributeValue("polarity");
                    if (targetString.equalsIgnoreCase("NULL") || opinionList.size() == 1) {
                        for (Token token : sentence) {
                            sentString += token.getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    } else {
                        int posTargetMin = -1;
                        int posTargetMax = -1;
                        List<List<Token>> segmentedTarget = StringUtils.tokenizeSentence(targetString, language);
                        List<Token> target = segmentedTarget.get(0);
                        String targetMin = target.get(0).getTokenValue();
                        String targetMax = target.get(target.size() - 1).getTokenValue();
                        int count = 0;
                        for (Token token : sentence) {
                            if (token.getTokenValue().equals(targetMin)) {
                                posTargetMin = count;
                            }
                            if (token.getTokenValue().equals(targetMax) && posTargetMin > -1) {
                                posTargetMax = count;
                                break;
                            }
                            count++;
                        }
                        if (posTargetMin - windowMin >= 0) {
                            posTargetMin = posTargetMin - windowMin;
                        } else {
                            posTargetMin = 0;
                        }
                        if (posTargetMax + windowMax < sentence.size()) {
                            posTargetMax = posTargetMax + windowMax;
                        } else {
                            posTargetMax = sentence.size() - 1;
                        }
                        for (int x = posTargetMin; x <= posTargetMax; x++) {
                            sentString += sentence.get(x).getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return text;
}
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License: Apache License

private static void absa2014ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through the aspectTerms element of each sentence;
            // it can contain one or more aspect terms
            Element aspectTermsElem = sent.getChild("aspectTerms");
            if (aspectTermsElem != null) {
                List<Element> aspectTermsList = aspectTermsElem.getChildren();
                for (Element aspectTerm : aspectTermsList) {
                    // adding OTE
                    int fromOffset = Integer.parseInt(aspectTerm.getAttributeValue("from"));
                    int toOffset = Integer.parseInt(aspectTerm.getAttributeValue("to"));
                    int startIndex = -1;
                    int endIndex = -1;
                    for (int i = 0; i < wfFromOffsets.size(); i++) {
                        if (wfFromOffsets.get(i) == fromOffset) {
                            startIndex = i;
                        }
                    }
                    for (int i = 0; i < wfToOffsets.size(); i++) {
                        if (wfToOffsets.get(i) == toOffset) {
                            // span is +1 with respect to the last token of the span
                            endIndex = i + 1;
                        }
                    }
                    // TODO remove this condition to correct offsets manually
                    if (startIndex != -1 && endIndex != -1) {
                        List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                        List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                        if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                            List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                            ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                            List<ixa.kaflib.Span<Term>> references = new ArrayList<>();
                            references.add(neSpan);
                            Entity neEntity = kaf.newEntity(references);
                            neEntity.setType("term");
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file: eus.ixa.ixa.pipe.convert.DSRCCorpus.java
License: Apache License

private static void DSRCToNAFNER(KAFDocument kaf, String wordsDoc, String markablesDoc)
        throws JDOMException, IOException {
    // reading the words xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document docWords = sax.build(wordsDoc);
    XPathExpression<Element> expr = xFactory.compile("//word", Filters.element());
    List<Element> words = expr.evaluate(docWords);
    List<WF> sentWFs = new ArrayList<>();
    List<Term> sentTerms = new ArrayList<>();
    // building the NAF containing the WFs and Terms
    // naf sentence counter
    int sentCounter = 1;
    for (Element word : words) {
        String token = word.getText();
        // no character offsets are available in the words file, so 0 is used
        WF wf = kaf.newWF(0, token, sentCounter);
        final List<WF> wfTarget = new ArrayList<>();
        wfTarget.add(wf);
        sentWFs.add(wf);
        Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
        term.setPos("O");
        term.setLemma(token);
        sentTerms.add(term);
        Matcher endMatcher = endOfSentence.matcher(token);
        if (endMatcher.matches()) {
            sentCounter++;
        }
    } // end of processing words
    String[] tokenIds = new String[sentWFs.size()];
    for (int i = 0; i < sentWFs.size(); i++) {
        tokenIds[i] = sentWFs.get(i).getId();
    }
    // processing the markables document in mmax opinion expression files
    Document markDoc = sax.build(markablesDoc);
    XPathFactory markFactory = XPathFactory.instance();
    XPathExpression<Element> markExpr = markFactory.compile("//ns:markable", Filters.element(), null,
            Namespace.getNamespace("ns", "www.eml.org/NameSpaces/OpinionExpression"));
    List<Element> markables = markExpr.evaluate(markDoc);
    for (Element markable : markables) {
        if (markable.getAttributeValue("annotation_type").equalsIgnoreCase("target")) {
            String markSpan = markable.getAttributeValue("span");
            System.err.println("--> span: " + markSpan);
            String removeCommaSpan = markSpan.replaceAll(",word_.*", "");
            System.err.println("--> newSpan: " + removeCommaSpan);
            String[] spanWords = removeCommaSpan.split("\\.\\.");
            int startIndex = Integer.parseInt(spanWords[0].replace("word_", ""));
            int endIndex = Integer.parseInt(spanWords[spanWords.length - 1].replace("word_", "")) + 1;
            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex - 1, endIndex - 1));
            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                List<ixa.kaflib.Span<Term>> references = new ArrayList<>();
                references.add(neSpan);
                Entity neEntity = kaf.newEntity(references);
                neEntity.setType("TARGET");
                System.err.println("--> target: " + neEntity.getStr());
            }
        } // end of create entity
    }
}