Example usage for org.jdom2.xpath XPathFactory instance

A list of usage examples for org.jdom2.xpath.XPathFactory.instance().

Introduction

On this page you can find example usages of the org.jdom2.xpath XPathFactory.instance() method.

Prototype

public static final XPathFactory instance() 

Document

Obtain an instance of an XPathFactory using the default mechanisms to determine what XPathFactory implementation to use.
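
A minimal, self-contained sketch of the typical pattern (the sample XML and class name are illustrative, not taken from the examples below): obtain the default factory, compile a typed expression, and evaluate it against a parsed document.

import java.io.StringReader;
import java.util.List;

import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.filter.Filters;
import org.jdom2.input.SAXBuilder;
import org.jdom2.xpath.XPathExpression;
import org.jdom2.xpath.XPathFactory;

public class XPathFactoryDemo {
    public static void main(String[] args) throws Exception {
        // parse a small in-memory document (illustrative sample data)
        Document doc = new SAXBuilder()
                .build(new StringReader("<root><item id='a'/><item id='b'/></root>"));
        // obtain the default XPathFactory implementation
        XPathFactory xFactory = XPathFactory.instance();
        // compile a typed expression; Filters.element() restricts results to Elements
        XPathExpression<Element> expr = xFactory.compile("//item", Filters.element());
        // evaluate against the document and print each matching element's id
        List<Element> items = expr.evaluate(doc);
        for (Element item : items) {
            System.out.println(item.getAttributeValue("id"));
        }
    }
}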

Usage

From source file:eu.himeros.hocr.HocrInfoAggregator.java

License:Open Source License

public void alignToGroundTruth() {
    ArrayList<Element> ocrAl = new ArrayList<>();
    ArrayList<Element> nearGtAl;
    int start = 1;
    int end;
    xpath = XPathFactory.instance().compile("//ns:span[@id]", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);
    for (Element element : elements) {
        if (element.getAttributeValue("anchor-id") == null) {
            if ("".equals(element.getAttributeValue("uc"))) {
                continue;
            }
            ocrAl.add(element);
        } else {
            end = Math.max(1, Integer.parseInt(element.getAttributeValue("anchor-id")) - 1);
            nearGtAl = makeNearGtAl(start, end);
            makeAlignment(ocrAl, nearGtAl);
            ocrAl = new ArrayList<>();
            start = end + 2;
        }
    }
}
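
Note that the query above uses the four-argument compile(expression, filter, variables, namespaces) overload: the ns prefix is bound to the XHTML namespace so the path can match namespaced elements, and the variable map may be null when no XPath variables are used.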

From source file:eu.himeros.hocr.HocrInfoAggregator.java

License:Open Source License

private void makeCompliantHocr() {
    xpath = XPathFactory.instance().compile("//ns:span[@id|@idx]", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);
    int spanId = 0;
    for (Element span : elements) {
        if (span.getAttribute("idx") != null) {
            try {
                span = span.getChildren().get(0);
            } catch (Exception ex) {
                // ignore: the span has no child element to descend into
            }
        }
        LinkedList<Attribute> attributeLl = new LinkedList<>(span.getParentElement().getAttributes());
        attributeLl.addFirst(new Attribute("id", "w_" + spanId++));
        span.getParentElement().setAttributes(attributeLl);
        String[] suggestions = null;
        String title = span.getAttributeValue("title");
        if (title != null) {
            suggestions = title.split(" ");
        }
        if (suggestions == null) {
            suggestions = new String[] { "" };
        }
        Element ins = new Element("ins", xmlns);
        ins.setAttribute("class", "alt");
        ins.setAttribute("title", makeNlp(span.getAttributeValue("class")));
        ins.setText(span.getText());
        span.removeContent();
        span.addContent(ins);
        span.setAttribute("class", "alternatives");
        span.removeAttribute("uc");
        span.removeAttribute("occ");
        span.removeAttribute("title");
        span.removeAttribute("anchor");
        span.removeAttribute("anchor-id");
        span.removeAttribute("id");
        span.getParentElement().removeAttribute("idx");
        span.removeAttribute("whole");
        span.getParentElement().removeAttribute("whole");
        if (title == null || "".equals(title)) {
            continue;
        }
        double score = 0.90;
        for (String suggestion : suggestions) {
            if (suggestion == null || "".equals(suggestion)) {
                continue;
            }
            Element del = new Element("del", xmlns);
            del.setAttribute("title", "nlp " + String.format("%.2f", score).replaceAll(",", "."));
            score = score - 0.01;
            suggestion = suggestion.replaceAll(l1PunctMarkFilter, "");
            Matcher leftMatcher = l1LeftPunctMarkPattern.matcher(ins.getText());
            if (leftMatcher.matches()) {
                suggestion = leftMatcher.group(1) + suggestion;
            }
            Matcher rightMatcher = l1RightPunctMarkPattern.matcher(ins.getText());
            if (rightMatcher.matches()) {
                String ngtSymbol = "";
                if (suggestion.endsWith("\u261a")) {
                    ngtSymbol = "\u261a";
                    suggestion = suggestion.substring(0, suggestion.length() - 1);
                }
                suggestion = suggestion + rightMatcher.group(1) + ngtSymbol;
            }
            // if a flagged suggestion (ending in \u261a) closely matches the OCR text, swap it in
            if (suggestion.endsWith("\u261a") && ins.getParentElement().getParentElement()
                    .getAttributeValue("lang", Namespace.XML_NAMESPACE) != null) {
                String buff = suggestion.substring(0, suggestion.length() - 1);
                sa.align(buff, ins.getText());
                double sim = 1 - sa.getEditDistance()
                        / Math.max((double) buff.length(), (double) ins.getText().length());
                if (sim > 0.6) {

                    suggestion = ins.getText() + "\u261b";
                    ins.setText(buff);
                    ins.setAttribute("title", "nlp 0.70");
                }
            }
            del.addContent(suggestion);
            span.addContent(del);
        }
    }
}

From source file:eu.himeros.hocr.NgtMaker.java

License:Open Source License

public void parseDoc(File file) throws Exception {
    adjustFile(file);
    start = -1;
    end = -1;
    prevValue = -1;
    ocrAl = new ArrayList<>(1000);
    outFileName = file.getAbsolutePath().substring(0, file.getAbsolutePath().length() - 4) + "ngt.xml";
    builder = new SAXBuilder();
    doc = builder.build(file);
    root = doc.getRootElement();
    xmlns = root.getNamespace();
    xpath = XPathFactory.instance().compile("//ns:span[@class='ocr_word']", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);
    for (Element element : elements) {
        parseOcrWord(element);
    }

    ocrAl.add("%%%");
    ocrAl.add("%%%");
    findAnchors();
    writeFragment(start, end);
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

private static void absa2015ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence
            // each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + category + ", " + targetString);
                    // adding OTE
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct manually offsets
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

public static String absa2015ToWFs(String fileName, String language) {
    KAFDocument kaf = new KAFDocument("en", "v1.naf");
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        int counter = 1;
        for (Element sent : sentences) {
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentences) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                }
            }
            counter++;
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return kaf.toString();
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

public static String absa2015ToDocCatFormatForPolarity(String fileName, String language, int windowMin,
        int windowMax) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document doc = null;
    String text = "";

    try {
        doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            String sentStringTmp = sent.getChildText("text");

            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentStringTmp, language);
            List<Token> sentence = segmentedSentence.get(0);

            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();

                for (Element opinion : opinionList) {

                    String sentString = "";

                    String targetString = opinion.getAttributeValue("target");
                    String polarityString = opinion.getAttributeValue("polarity");

                    if (targetString.equalsIgnoreCase("NULL") || opinionList.size() == 1) {
                        for (Token token : sentence) {
                            sentString += token.getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    } else {
                        int posTargetMin = -1;
                        int posTargetMax = -1;
                        // List<String> itemsTarget = Arrays.asList(targetString.split("
                        // "));
                        List<List<Token>> segmentedtarget = StringUtils.tokenizeSentence(targetString,
                                language);
                        List<Token> target = segmentedtarget.get(0);
                        String targetMin = target.get(0).getTokenValue();
                        String targetMax = target.get(target.size() - 1).getTokenValue();
                        int count = 0;
                        for (Token token : sentence) {
                            if (token.getTokenValue().equals(targetMin)) {
                                posTargetMin = count;
                            }
                            if (token.getTokenValue().equals(targetMax) && posTargetMin > -1) {
                                posTargetMax = count;
                                break;
                            }
                            count++;
                        }
                        if (posTargetMin - windowMin >= 0) {
                            posTargetMin = posTargetMin - windowMin;
                        } else {
                            posTargetMin = 0;
                        }
                        if (posTargetMax + windowMax < sentence.size()) {
                            posTargetMax = posTargetMax + windowMax;
                        } else {
                            posTargetMax = sentence.size() - 1;
                        }
                        for (int x = posTargetMin; x <= posTargetMax; x++) {
                            sentString += sentence.get(x).getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    }
                }

            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }

    return text;
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

private static void absa2014ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<WF>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence
            // each opinion element can contain one or more opinions
            Element aspectTermsElem = sent.getChild("aspectTerms");

            if (aspectTermsElem != null) {

                List<Element> aspectTermsList = aspectTermsElem.getChildren();
                // iterating over every opinion in the opinions element
                if (!aspectTermsList.isEmpty()) {
                    for (Element aspectTerm : aspectTermsList) {
                        // String targetString = aspectTerm.getAttributeValue("term");
                        // System.err.println("-> " + targetString);
                        // adding OTE
                        int fromOffset = Integer.parseInt(aspectTerm.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(aspectTerm.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct manually offsets
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType("term");
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:eus.ixa.ixa.pipe.convert.DSRCCorpus.java

License:Apache License

private static void DSRCToNAFNER(KAFDocument kaf, String wordsDoc, String markablesDoc)
        throws JDOMException, IOException {
    // reading the words xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document docWords = sax.build(wordsDoc);
    XPathExpression<Element> expr = xFactory.compile("//word", Filters.element());
    List<Element> words = expr.evaluate(docWords);
    List<WF> sentWFs = new ArrayList<>();
    List<Term> sentTerms = new ArrayList<>();
    // building the NAF containing the WFs and Terms
    // naf sentence counter
    int sentCounter = 1;
    for (Element word : words) {
        // sentence id and original text
        String token = word.getText();
        // the list contains just one list of tokens
        WF wf = kaf.newWF(0, token, sentCounter);
        final List<WF> wfTarget = new ArrayList<WF>();
        wfTarget.add(wf);
        sentWFs.add(wf);
        Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
        term.setPos("O");
        term.setLemma(token);
        sentTerms.add(term);
        Matcher endMatcher = endOfSentence.matcher(token);
        if (endMatcher.matches()) {
            sentCounter++;
        }
    } // end of processing words

    String[] tokenIds = new String[sentWFs.size()];
    for (int i = 0; i < sentWFs.size(); i++) {
        tokenIds[i] = sentWFs.get(i).getId();
    }
    // processing markables document in mmax opinion expression files
    Document markDoc = sax.build(markablesDoc);
    XPathFactory markFactory = XPathFactory.instance();
    XPathExpression<Element> markExpr = markFactory.compile("//ns:markable", Filters.element(), null,
            Namespace.getNamespace("ns", "www.eml.org/NameSpaces/OpinionExpression"));
    List<Element> markables = markExpr.evaluate(markDoc);
    for (Element markable : markables) {
        if (markable.getAttributeValue("annotation_type").equalsIgnoreCase("target")) {
            String markSpan = markable.getAttributeValue("span");
            System.err.println("--> span: " + markSpan);
            String removeCommaSpan = markSpan.replaceAll(",word_.*", "");
            System.err.println("--> newSpan: " + removeCommaSpan);
            String[] spanWords = removeCommaSpan.split("\\.\\.");
            int startIndex = Integer.parseInt(spanWords[0].replace("word_", ""));
            int endIndex = Integer.parseInt(spanWords[spanWords.length - 1].replace("word_", "")) + 1;

            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex - 1, endIndex - 1));
            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                references.add(neSpan);
                Entity neEntity = kaf.newEntity(references);
                neEntity.setType("TARGET");
                System.err.println("--> target: " + neEntity.getStr());
            }
        } // end of create entity
    }
}

From source file:eus.ixa.ixa.pipe.convert.TassFormat.java

License:Apache License

public static void generalToTabulated(String fileName) throws JDOMException, IOException {
    StringBuilder sb = new StringBuilder();
    // reading the TASS General Corpus xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document doc = sax.build(fileName);
    XPathExpression<Element> expr = xFactory.compile("//tweet", Filters.element());
    List<Element> tweets = expr.evaluate(doc);

    for (Element tweet : tweets) {
        String tokenizedTweetContent = null;
        String tweetPolarity = null;
        String tweetId = tweet.getChildText("tweetid");
        String tweetContentString = tweet.getChildText("content");
        // the list contains just one list of tokens
        List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(tweetContentString, LANGUAGE);
        for (List<Token> tokenizedSentence : segmentedSentences) {
            String[] tokenizedTweetArray = eus.ixa.ixa.pipe.ml.utils.StringUtils
                    .convertListTokenToArrayStrings(tokenizedSentence);
            tokenizedTweetContent = StringUtils.getStringFromTokens(tokenizedTweetArray);
        }
        if (tweet.getChild("sentiments").getChild("polarity").getChildText("value") != null) {
            tweetPolarity = tweet.getChild("sentiments").getChild("polarity").getChildText("value");
        }
        sb.append(tweetId).append("\t").append(tweetPolarity).append("\t").append(tokenizedTweetContent)
                .append("\n");
    }
    System.out.println(sb.toString());
}

From source file:eus.ixa.ixa.pipe.convert.TassFormat.java

License:Apache License

public static void generalToWFs(String fileName) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//tweet", Filters.element());
        List<Element> tweets = expr.evaluate(doc);

        for (Element tweet : tweets) {
            String tweetId = tweet.getChildText("tweetid");
            KAFDocument kaf = new KAFDocument(LANGUAGE, "v1.naf");
            kaf.createPublic().publicId = tweetId;

            String tweetContentString = tweet.getChildText("content");
            List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(tweetContentString, LANGUAGE);
            for (List<Token> sentence : segmentedSentences) {
                for (Token token : sentence) {
                    kaf.newWF(token.startOffset(), token.getTokenValue(), 1);
                }
            }
            Path outfile = Files.createFile(Paths.get(tweetId + ".naf"));
            Files.write(outfile, kaf.toString().getBytes(StandardCharsets.UTF_8));
            System.err.println(">> Wrote naf document to " + outfile);
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}