Example usage for org.jdom2.xpath XPathFactory compile

List of usage examples for org.jdom2.xpath XPathFactory compile

Introduction

In this page you can find the example usage for org.jdom2.xpath XPathFactory compile.

Prototype

public <T> XPathExpression<T> compile(String expression, Filter<T> filter) 

Source Link

Document

Create an XPathExpression<T> instance from this factory.

Usage

From source file:es.ehu.si.ixa.pipe.convert.Convert.java

License:Apache License

/**
 * Reads an ABSA SemEval XML document from the given reader and prints the
 * text of every {@code <sentence>} element to standard output.
 *
 * @param reader source of the ABSA XML document
 */
public void absaSemEvalText(Reader reader) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(reader);
        XPathExpression<Element> sentenceExpr = xFactory.compile("//sentence", Filters.element());
        for (Element sentence : sentenceExpr.evaluate(doc)) {
            System.out.println(sentence.getChildText("text"));
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:es.ehu.si.ixa.pipe.convert.Convert.java

License:Apache License

/**
 * Converts an ABSA SemEval 2015 test XML file into NAF, creating one WF
 * (word form) per token. Each WF stores the id of its source sentence in
 * its xpath attribute so it can be traced back to the original document.
 *
 * @param fileName path to the ABSA 2015 XML file
 * @return the NAF document serialized as a string
 */
public String absa15testToNAF(String fileName) {
    KAFDocument kaf = new KAFDocument("en", "v1.naf");
    Segmenter segmenter = new Segmenter();
    TokenFactory tokenFactory = new TokenFactory();
    Properties properties = setAnnotateProperties();
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        // NAF sentence counter: one per ABSA <sentence> element
        int counter = 1;
        for (Element sent : sentences) {
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // try-with-resources closes the reader (the original never closed
            // the BufferedReader it created per sentence)
            try (BufferedReader breader = new BufferedReader(new StringReader(sentString))) {
                IxaPipeTokenizer<Token> tokenizer = new IxaPipeTokenizer<Token>(breader, tokenFactory,
                        properties);
                List<Token> tokens = tokenizer.tokenize();
                List<List<Token>> segmentedSentences = segmenter.segment(tokens);
                for (List<Token> sentence : segmentedSentences) {
                    for (Token token : sentence) {
                        WF wf = kaf.newWF(token.value(), token.startOffset(), counter);
                        // keep a back-reference to the source sentence id
                        wf.setXpath(sentId);
                    }
                }
            }
            counter++;
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return kaf.toString();
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

/**
 * Reads an ABSA SemEval 2015 XML file and populates the given NAF document
 * with WFs, Terms, and one Entity per opinion target (OTE) whose character
 * offsets align with token boundaries; the entity type is the opinion
 * category.
 *
 * @param kaf      NAF document to populate (mutated in place)
 * @param fileName path to the ABSA 2015 XML file
 * @param language language code used for sentence tokenization
 */
private static void absa2015ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            // per-sentence character offsets and created WFs/Terms; the offset
            // lists are used below to map opinion from/to offsets onto tokens
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    // back-reference to the source sentence id
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    // one single-WF term per token, with placeholder POS "O"
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence
            // each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + category + ", " + targetString);
                    // adding OTE
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                        // map the opinion's character offsets to token indexes;
                        // -1 means the offset did not match a token boundary
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct manually offsets
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                // create an entity over the target token span,
                                // typed with the opinion category
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

/**
 * Converts an ABSA SemEval 2015 XML file into a NAF document containing
 * only WF (word form) elements, one per token.
 *
 * <p>NOTE(review): the KAF document is created with a hard-coded "en"
 * language code even though a {@code language} parameter is supplied for
 * tokenization — confirm whether the parameter should be used here too.
 *
 * @param fileName path to the ABSA 2015 XML file
 * @param language language code used for sentence tokenization
 * @return the NAF document serialized as a string
 */
public static String absa2015ToWFs(String fileName, String language) {
    KAFDocument kaf = new KAFDocument("en", "v1.naf");
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> sentenceExpr = xFactory.compile("//sentence", Filters.element());

        int nafSentence = 1;
        for (Element sentenceElem : sentenceExpr.evaluate(doc)) {
            String sentenceId = sentenceElem.getAttributeValue("id");
            String rawText = sentenceElem.getChildText("text");
            for (List<Token> tokens : StringUtils.tokenizeSentence(rawText, language)) {
                for (Token token : tokens) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), nafSentence);
                    // back-reference to the source sentence id
                    wf.setXpath(sentenceId);
                }
            }
            nafSentence++;
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return kaf.toString();
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

/**
 * Converts an ABSA SemEval 2015 XML file into a tabulated document
 * classification format for polarity: one output line per opinion,
 * consisting of the polarity label, a tab, and a token string. When the
 * opinion has an explicit target, only a window of tokens around the
 * target span ({@code windowMin} before, {@code windowMax} after) is
 * emitted; otherwise the whole sentence is used.
 *
 * @param fileName  path to the ABSA 2015 XML file
 * @param language  language code used for sentence tokenization
 * @param windowMin number of tokens to include before the target
 * @param windowMax number of tokens to include after the target
 * @return the tabulated corpus as a string (empty on parse failure)
 */
public static String absa2015ToDocCatFormatForPolarity(String fileName, String language, int windowMin,
        int windowMax) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    // StringBuilder instead of repeated String += in loops (the original
    // accumulation was quadratic in the output size)
    StringBuilder text = new StringBuilder();

    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            String sentStringTmp = sent.getChildText("text");

            // tokenizeSentence returns a list of sentences; only the first is used
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentStringTmp, language);
            List<Token> sentence = segmentedSentence.get(0);

            if (opinionsElement != null) {
                // each <Opinions> element can contain one or more opinions
                List<Element> opinionList = opinionsElement.getChildren();

                for (Element opinion : opinionList) {

                    StringBuilder sentString = new StringBuilder();

                    String targetString = opinion.getAttributeValue("target");
                    String polarityString = opinion.getAttributeValue("polarity");

                    if (targetString.equalsIgnoreCase("NULL") || opinionList.size() == 1) {
                        // no explicit target (or a single opinion): emit the whole sentence
                        for (Token token : sentence) {
                            sentString.append(token.getTokenValue()).append(" ");
                        }
                        text.append(polarityString).append("\t").append(sentString).append("\n");
                    } else {
                        // locate the target span in the tokenized sentence by
                        // matching its first and last token values
                        int posTargetMin = -1;
                        int posTargetMax = -1;
                        List<List<Token>> segmentedtarget = StringUtils.tokenizeSentence(targetString,
                                language);
                        List<Token> target = segmentedtarget.get(0);
                        String targetMin = target.get(0).getTokenValue();
                        String targetMax = target.get(target.size() - 1).getTokenValue();
                        int count = 0;
                        for (Token token : sentence) {
                            if (token.getTokenValue().equals(targetMin)) {
                                posTargetMin = count;
                            }
                            if (token.getTokenValue().equals(targetMax) && posTargetMin > -1) {
                                posTargetMax = count;
                                break;
                            }
                            count++;
                        }
                        // widen the span by the requested window, clamped to the sentence.
                        // NOTE(review): if the target tokens are never found, the
                        // indexes stay -1 and the window math may misbehave — confirm
                        // the target always appears verbatim in the sentence.
                        if (posTargetMin - windowMin >= 0) {
                            posTargetMin = posTargetMin - windowMin;
                        } else {
                            posTargetMin = 0;
                        }
                        if (posTargetMax + windowMax < sentence.size()) {
                            posTargetMax = posTargetMax + windowMax;
                        } else {
                            posTargetMax = sentence.size() - 1;
                        }
                        for (int x = posTargetMin; x <= posTargetMax; x++) {
                            sentString.append(sentence.get(x).getTokenValue()).append(" ");
                        }
                        text.append(polarityString).append("\t").append(sentString).append("\n");
                    }
                }

            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }

    return text.toString();
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

License:Apache License

/**
 * Reads an ABSA SemEval 2014 XML file and populates the given NAF document
 * with WFs, Terms, and one Entity (typed "term") per aspect term whose
 * character offsets align with token boundaries.
 *
 * @param kaf      NAF document to populate (mutated in place)
 * @param fileName path to the ABSA 2014 XML file
 * @param language language code used for sentence tokenization
 */
private static void absa2014ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            // per-sentence character offsets and created WFs/Terms; the offset
            // lists are used below to map aspect-term from/to offsets to tokens
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    // back-reference to the source sentence id
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<WF>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    // one single-WF term per token, with placeholder POS "O"
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence
            // each opinion element can contain one or more opinions
            Element aspectTermsElem = sent.getChild("aspectTerms");

            if (aspectTermsElem != null) {

                List<Element> aspectTermsList = aspectTermsElem.getChildren();
                // iterating over every opinion in the opinions element
                if (!aspectTermsList.isEmpty()) {
                    for (Element aspectTerm : aspectTermsList) {
                        // String targetString = aspectTerm.getAttributeValue("term");
                        // System.err.println("-> " + targetString);
                        // adding OTE
                        int fromOffset = Integer.parseInt(aspectTerm.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(aspectTerm.getAttributeValue("to"));
                        // map the aspect term's character offsets to token indexes;
                        // -1 means the offset did not match a token boundary
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct manually offsets
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                // create an entity over the aspect-term token span
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType("term");
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:eus.ixa.ixa.pipe.convert.DSRCCorpus.java

License:Apache License

/**
 * Builds NAF WFs and Terms from an MMAX words document and creates one
 * "TARGET" Entity per target markable found in the opinion-expression
 * markables document.
 *
 * @param kaf          NAF document to populate (mutated in place)
 * @param wordsDoc     path to the MMAX words XML file (one <word> per token)
 * @param markablesDoc path to the MMAX opinion-expression markables XML file
 * @throws JDOMException if either XML file cannot be parsed
 * @throws IOException   if either file cannot be read
 */
private static void DSRCToNAFNER(KAFDocument kaf, String wordsDoc, String markablesDoc)
        throws JDOMException, IOException {
    // reading the words xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document docWords = sax.build(wordsDoc);
    XPathExpression<Element> expr = xFactory.compile("//word", Filters.element());
    List<Element> words = expr.evaluate(docWords);
    List<WF> sentWFs = new ArrayList<>();
    List<Term> sentTerms = new ArrayList<>();
    // building the NAF containing the WFs and Terms
    // naf sentence counter
    int sentCounter = 1;
    for (Element word : words) {
        // sentence id and original text
        String token = word.getText();
        // NOTE(review): every WF is created with character offset 0 —
        // confirm whether real offsets are needed downstream.
        WF wf = kaf.newWF(0, token, sentCounter);
        final List<WF> wfTarget = new ArrayList<WF>();
        wfTarget.add(wf);
        sentWFs.add(wf);
        // one single-WF term per token, with placeholder POS "O"
        Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
        term.setPos("O");
        term.setLemma(token);
        sentTerms.add(term);
        // a token matching the end-of-sentence pattern closes the sentence
        Matcher endMatcher = endOfSentence.matcher(token);
        if (endMatcher.matches()) {
            sentCounter++;
        }
    } // end of processing words

    String[] tokenIds = new String[sentWFs.size()];
    for (int i = 0; i < sentWFs.size(); i++) {
        tokenIds[i] = sentWFs.get(i).getId();
    }
    // processing markables document in mmax opinion expression files
    Document markDoc = sax.build(markablesDoc);
    XPathFactory markFactory = XPathFactory.instance();
    XPathExpression<Element> markExpr = markFactory.compile("//ns:markable", Filters.element(), null,
            Namespace.getNamespace("ns", "www.eml.org/NameSpaces/OpinionExpression"));
    List<Element> markables = markExpr.evaluate(markDoc);
    for (Element markable : markables) {
        if (markable.getAttributeValue("annotation_type").equalsIgnoreCase("target")) {
            // spans look like "word_3..word_5"; any comma-separated tail
            // (discontinuous span) is dropped by the replaceAll below
            String markSpan = markable.getAttributeValue("span");
            System.err.println("--> span: " + markSpan);
            String removeCommaSpan = markSpan.replaceAll(",word_.*", "");
            System.err.println("--> newSpan: " + removeCommaSpan);
            String[] spanWords = removeCommaSpan.split("\\.\\.");
            // word ids are 1-based; tokenIds is 0-based, hence the -1 below
            int startIndex = Integer.parseInt(spanWords[0].replace("word_", ""));
            int endIndex = Integer.parseInt(spanWords[spanWords.length - 1].replace("word_", "")) + 1;

            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex - 1, endIndex - 1));
            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                // create a TARGET entity over the markable's token span
                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                references.add(neSpan);
                Entity neEntity = kaf.newEntity(references);
                neEntity.setType("TARGET");
                System.err.println("--> target: " + neEntity.getStr());
            }
        } // end of create entity
    }
}

From source file:eus.ixa.ixa.pipe.convert.TassFormat.java

License:Apache License

/**
 * Converts a TASS General Corpus XML file into a tabulated format
 * (tweetid TAB polarity TAB tokenized content, one tweet per line) and
 * prints the result to standard output.
 *
 * @param fileName path to the TASS General Corpus XML file
 * @throws JDOMException if the XML cannot be parsed
 * @throws IOException   if the file cannot be read
 */
public static void generalToTabulated(String fileName) throws JDOMException, IOException {
    StringBuilder sb = new StringBuilder();
    // parse the TASS General Corpus xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document doc = sax.build(fileName);
    XPathExpression<Element> tweetExpr = xFactory.compile("//tweet", Filters.element());

    for (Element tweet : tweetExpr.evaluate(doc)) {
        String tokenizedTweetContent = null;
        String tweetPolarity = null;
        String tweetId = tweet.getChildText("tweetid");
        String tweetContentString = tweet.getChildText("content");
        // the tokenizer returns one segmented sentence for the whole tweet
        for (List<Token> tokenizedSentence : StringUtils.tokenizeSentence(tweetContentString, LANGUAGE)) {
            String[] tokenizedTweetArray = eus.ixa.ixa.pipe.ml.utils.StringUtils
                    .convertListTokenToArrayStrings(tokenizedSentence);
            tokenizedTweetContent = StringUtils.getStringFromTokens(tokenizedTweetArray);
        }
        // polarity stays null (printed as "null") when the value is absent
        String polarityValue = tweet.getChild("sentiments").getChild("polarity").getChildText("value");
        if (polarityValue != null) {
            tweetPolarity = polarityValue;
        }
        sb.append(tweetId).append("\t").append(tweetPolarity).append("\t").append(tokenizedTweetContent)
                .append("\n");
    }
    System.out.println(sb.toString());
}

From source file:eus.ixa.ixa.pipe.convert.TassFormat.java

License:Apache License

/**
 * Converts each tweet in a TASS General Corpus XML file into its own NAF
 * document containing only WF elements, written to a file named
 * "&lt;tweetid&gt;.naf" in the working directory.
 *
 * @param fileName path to the TASS General Corpus XML file
 */
public static void generalToWFs(String fileName) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> tweetExpr = xFactory.compile("//tweet", Filters.element());

        for (Element tweet : tweetExpr.evaluate(doc)) {
            String tweetId = tweet.getChildText("tweetid");
            // one NAF document per tweet, identified by the tweet id
            KAFDocument kaf = new KAFDocument(LANGUAGE, "v1.naf");
            kaf.createPublic().publicId = tweetId;

            String content = tweet.getChildText("content");
            for (List<Token> sentence : StringUtils.tokenizeSentence(content, LANGUAGE)) {
                for (Token token : sentence) {
                    kaf.newWF(token.startOffset(), token.getTokenValue(), 1);
                }
            }
            Path outfile = Files.createFile(Paths.get(tweetId + ".naf"));
            Files.write(outfile, kaf.toString().getBytes(StandardCharsets.UTF_8));
            System.err.println(">> Wrote naf document to " + outfile);
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:nl.colorize.util.xml.XPath.java

License:Apache License

/**
 * Creates an {@code XPath} instance from an XPath expression in text form.
 * @throws IllegalArgumentException if the expression is not valid XPath.
 */
public static XPath parse(String expression) {
    // compile eagerly so invalid expressions fail here, not on first use
    XPathExpression<Element> compiled = XPathFactory.instance().compile(expression, Filters.element());
    return new XPath(compiled);
}