Usage examples for org.jdom2.xpath.XPathFactory#instance()
public static final XPathFactory instance()
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
public void alignToGroundTruth() { ArrayList<Element> ocrAl = new ArrayList<>(); ArrayList<Element> nearGtAl; int start = 1; int end;//from ww w .ja v a 2 s . c o m xpath = XPathFactory.instance().compile("//ns:span[@id]", Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml")); List<Element> elements = xpath.evaluate(root); for (Element element : elements) { if (element.getAttributeValue("anchor-id") == null) { if ("".equals(element.getAttributeValue("uc"))) { continue; } ocrAl.add(element); } else { end = ((end = Integer.parseInt(element.getAttributeValue("anchor-id")) - 1) < 1 ? 1 : end); nearGtAl = makeNearGtAl(start, end); makeAlignment(ocrAl, nearGtAl); ocrAl = new ArrayList<>(); start = end + 2; } } }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private void makeCompliantHocr() { xpath = XPathFactory.instance().compile("//ns:span[@id|@idx]", Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml")); List<Element> elements = xpath.evaluate(root); int spanId = 0; for (Element span : elements) { if (span.getAttribute("idx") != null) { try { span = span.getChildren().get(0); } catch (Exception ex) { // }/*w w w . j a v a 2 s . co m*/ } LinkedList<Attribute> attributeLl = new LinkedList(span.getParentElement().getAttributes()); attributeLl.addFirst(new Attribute("id", "w_" + spanId++)); span.getParentElement().setAttributes(attributeLl); String[] suggestions = null; String title = span.getAttributeValue("title"); if (title != null) { suggestions = title.split(" "); } if (suggestions == null) { suggestions = new String[] { "" }; } Element ins = new Element("ins", xmlns); ins.setAttribute("class", "alt"); ins.setAttribute("title", makeNlp(span.getAttributeValue("class"))); ins.setText(span.getText()); span.removeContent(); span.addContent(ins); span.setAttribute("class", "alternatives"); span.removeAttribute("uc"); span.removeAttribute("occ"); span.removeAttribute("title"); span.removeAttribute("anchor"); span.removeAttribute("anchor-id"); span.removeAttribute("id"); span.getParentElement().removeAttribute("idx"); span.removeAttribute("whole"); span.getParentElement().removeAttribute("whole"); if (title == null || "".equals(title)) { continue; } double score = 0.90; for (String suggestion : suggestions) { if (suggestion == null || "".equals(suggestion)) { continue; } Element del = new Element("del", xmlns); del.setAttribute("title", "nlp " + String.format("%.2f", score).replaceAll(",", ".")); score = score - 0.01; suggestion = suggestion.replaceAll(l1PunctMarkFilter, ""); Matcher leftMatcher = l1LeftPunctMarkPattern.matcher(ins.getText()); if (leftMatcher.matches()) { suggestion = leftMatcher.group(1) + suggestion; } Matcher rightMatcher = 
l1RightPunctMarkPattern.matcher(ins.getText()); if (rightMatcher.matches()) { String ngtSymbol = ""; if (suggestion.endsWith("\u261a")) { ngtSymbol = "\u261a"; suggestion = suggestion.substring(0, suggestion.length() - 1); } suggestion = suggestion + rightMatcher.group(1) + ngtSymbol; } ///!!!! if (suggestion.endsWith("\u261a") && ins.getParentElement().getParentElement() .getAttributeValue("lang", Namespace.XML_NAMESPACE) != null) { String buff = suggestion.substring(0, suggestion.length() - 1); sa.align(buff, ins.getText()); double sim = 1 - sa.getEditDistance() / Math.max((double) buff.length(), (double) ins.getText().length()); if (sim > 0.6) { suggestion = ins.getText() + "\u261b"; ins.setText(buff); ins.setAttribute("title", "nlp 0.70"); } } del.addContent(suggestion); span.addContent(del); } } }
From source file:eu.himeros.hocr.NgtMaker.java
License:Open Source License
/**
 * Parses one hOCR file and writes the corresponding near-ground-truth
 * ("ngt") fragment next to it.
 *
 * Resets the instance state, builds the JDOM tree, collects every
 * {@code <span class="ocr_word">} via XPath, feeds each one to
 * {@code parseOcrWord}, then runs anchor detection and writes the fragment.
 *
 * @param file the hOCR input file
 * @throws Exception if parsing or writing fails
 */
public void parseDoc(File file) throws Exception {
    adjustFile(file);
    // Reset per-document state.
    start = -1;
    end = -1;
    prevValue = -1;
    ocrAl = new ArrayList<>(1000);
    // Output path: input path with its last 4 characters replaced by "ngt.xml"
    // (assumes a 4-character extension such as ".xml" — TODO confirm).
    outFileName = file.getAbsolutePath().substring(0, file.getAbsolutePath().length() - 4) + "ngt.xml";
    builder = new SAXBuilder();
    doc = builder.build(file);
    root = doc.getRootElement();
    xmlns = root.getNamespace();
    // Select every OCR word span in the XHTML namespace.
    xpath = XPathFactory.instance().compile("//ns:span[@class='ocr_word']", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);
    for (Element element : elements) {
        parseOcrWord(element);
    }
    // Sentinel entries so anchor lookup can run past the last real word.
    ocrAl.add("%%%");
    ocrAl.add("%%%");
    findAnchors();
    writeFragment(start, end);
}
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
/**
 * Converts an ABSA SemEval 2015 XML file into NAF NER annotations.
 *
 * For every {@code <sentence>}: tokenizes the text, creates one WF and one
 * Term per token, then for each non-NULL opinion target maps its character
 * offsets ({@code from}/{@code to}) back onto token indices and creates an
 * entity of the opinion's category over the matching term span.
 *
 * @param kaf      the NAF document being populated (mutated in place)
 * @param fileName path of the ABSA XML file
 * @param language language code passed to the tokenizer
 */
private static void absa2015ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<>();
                    wfTarget.add(wf);
                    // Record character offsets so opinion targets can be
                    // mapped back to token positions below.
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every opinion element for each sentence;
            // each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + category + ", " + targetString);
                    // adding OTE; the literal string "NULL" marks a targetless opinion
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct manually offsets
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
public static String absa2015ToWFs(String fileName, String language) { KAFDocument kaf = new KAFDocument("en", "v1.naf"); SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try {// w w w.j a v a2 s.co m Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); int counter = 1; for (Element sent : sentences) { String sentId = sent.getAttributeValue("id"); String sentString = sent.getChildText("text"); List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(sentString, language); for (List<Token> sentence : segmentedSentences) { for (Token token : sentence) { WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter); wf.setXpath(sentId); } } counter++; } } catch (JDOMException | IOException e) { e.printStackTrace(); } return kaf.toString(); }
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
/**
 * Converts an ABSA SemEval 2015 XML file into a tabulated document-category
 * format for polarity classification: one line per opinion, of the form
 * {@code polarity<TAB>tokenized-text}.
 *
 * Targetless opinions (target "NULL") and single-opinion sentences emit the
 * whole sentence; otherwise only a token window of [windowMin, windowMax]
 * around the first occurrence of the target phrase is emitted.
 *
 * @param fileName  path of the ABSA XML file
 * @param language  language code passed to the tokenizer
 * @param windowMin tokens to keep before the target
 * @param windowMax tokens to keep after the target
 * @return the accumulated tabulated text (empty on parse error)
 */
public static String absa2015ToDocCatFormatForPolarity(String fileName, String language, int windowMin,
        int windowMax) {
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document doc = null;
    String text = "";
    try {
        doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            String sentStringTmp = sent.getChildText("text");
            // The tokenizer returns a single-sentence segmentation here;
            // only the first (and presumably only) sentence is used.
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentStringTmp, language);
            List<Token> sentence = segmentedSentence.get(0);
            if (opinionsElement != null) {
                // iterating over every opinion in the opinions element
                List<Element> opinionList = opinionsElement.getChildren();
                for (Element opinion : opinionList) {
                    String sentString = "";
                    String targetString = opinion.getAttributeValue("target");
                    String polarityString = opinion.getAttributeValue("polarity");
                    if (targetString.equalsIgnoreCase("NULL") || opinionList.size() == 1) {
                        // No usable target (or only one opinion): emit the whole sentence.
                        for (Token token : sentence) {
                            sentString += token.getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    } else {
                        // Locate the first/last tokens of the target phrase in
                        // the sentence, then emit a window around them.
                        int posTargetMin = -1;
                        int posTargetMax = -1;
                        List<List<Token>> segmentedtarget = StringUtils.tokenizeSentence(targetString, language);
                        List<Token> target = segmentedtarget.get(0);
                        String targetMin = target.get(0).getTokenValue();
                        String targetMax = target.get(target.size() - 1).getTokenValue();
                        int count = 0;
                        for (Token token : sentence) {
                            if (token.getTokenValue().equals(targetMin)) {
                                posTargetMin = count;
                            }
                            if (token.getTokenValue().equals(targetMax) && posTargetMin > -1) {
                                posTargetMax = count;
                                break;
                            }
                            count++;
                        }
                        // Clamp the window to the sentence bounds.
                        if (posTargetMin - windowMin >= 0) {
                            posTargetMin = posTargetMin - windowMin;
                        } else
                            posTargetMin = 0;
                        if (posTargetMax + windowMax < sentence.size()) {
                            posTargetMax = posTargetMax + windowMax;
                        } else
                            posTargetMax = sentence.size() - 1;
                        for (int x = posTargetMin; x <= posTargetMax; x++) {
                            sentString += sentence.get(x).getTokenValue() + " ";
                        }
                        text += polarityString + "\t" + sentString + "\n";
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
    return text;
}
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
/**
 * Converts an ABSA SemEval 2014 XML file into NAF NER annotations.
 *
 * Same scheme as the 2015 variant, but aspect terms live under
 * {@code <aspectTerms>} instead of {@code <Opinions>}, and every created
 * entity gets the fixed type "term".
 *
 * @param kaf      the NAF document being populated (mutated in place)
 * @param fileName path of the ABSA XML file
 * @param language language code passed to the tokenizer
 */
private static void absa2014ToNAFNER(KAFDocument kaf, String fileName, String language) {
    // reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);
        // naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            // sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            // the list contains just one list of tokens
            List<List<Token>> segmentedSentence = StringUtils.tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<WF>();
                    wfTarget.add(wf);
                    // Record character offsets so aspect terms can be mapped
                    // back to token positions below.
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            // going through every aspect-term element for each sentence;
            // each aspectTerms element can contain one or more aspect terms
            Element aspectTermsElem = sent.getChild("aspectTerms");
            if (aspectTermsElem != null) {
                List<Element> aspectTermsList = aspectTermsElem.getChildren();
                // iterating over every aspect term in the aspectTerms element
                if (!aspectTermsList.isEmpty()) {
                    for (Element aspectTerm : aspectTermsList) {
                        // adding OTE
                        int fromOffset = Integer.parseInt(aspectTerm.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(aspectTerm.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                // span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        // TODO remove this condition to correct manually offsets
                        if (startIndex != -1 && endIndex != -1) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = NAFUtils.getWFIdsFromTerms(sentTerms);
                            if (NAFUtils.checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType("term");
                            }
                        }
                    }
                }
            }
        } // end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}
From source file:eus.ixa.ixa.pipe.convert.DSRCCorpus.java
License:Apache License
/**
 * Converts a DSRC corpus (MMAX format) into NAF NER annotations.
 *
 * First pass: reads the words document, creating one WF and one Term per
 * {@code <word>} element and bumping the sentence counter whenever a token
 * matches {@code endOfSentence}. Second pass: reads the markables document
 * and, for every markable of annotation_type "target", parses its
 * {@code span} attribute (e.g. {@code word_3..word_5}) into 1-based word
 * indices and creates a NAF entity of type "TARGET" over the matching terms.
 *
 * @param kaf          the NAF document being populated (mutated in place)
 * @param wordsDoc     path of the MMAX words XML file
 * @param markablesDoc path of the MMAX markables XML file
 * @throws JDOMException if either XML file is malformed
 * @throws IOException   if either file cannot be read
 */
private static void DSRCToNAFNER(KAFDocument kaf, String wordsDoc, String markablesDoc)
        throws JDOMException, IOException {
    // reading the words xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    Document docWords = sax.build(wordsDoc);
    XPathExpression<Element> expr = xFactory.compile("//word", Filters.element());
    List<Element> words = expr.evaluate(docWords);
    List<WF> sentWFs = new ArrayList<>();
    List<Term> sentTerms = new ArrayList<>();
    // building the NAF containing the WFs and Terms;
    // naf sentence counter
    int sentCounter = 1;
    for (Element word : words) {
        String token = word.getText();
        // NOTE(review): all WFs are created with offset 0 — the words file
        // apparently carries no character offsets.
        WF wf = kaf.newWF(0, token, sentCounter);
        final List<WF> wfTarget = new ArrayList<WF>();
        wfTarget.add(wf);
        sentWFs.add(wf);
        Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
        term.setPos("O");
        term.setLemma(token);
        sentTerms.add(term);
        // A sentence-final token starts the next NAF sentence.
        Matcher endMatcher = endOfSentence.matcher(token);
        if (endMatcher.matches()) {
            sentCounter++;
        }
    } // end of processing words
    String[] tokenIds = new String[sentWFs.size()];
    for (int i = 0; i < sentWFs.size(); i++) {
        tokenIds[i] = sentWFs.get(i).getId();
    }
    // processing markables document in mmax opinion expression files
    Document markDoc = sax.build(markablesDoc);
    XPathFactory markFactory = XPathFactory.instance();
    XPathExpression<Element> markExpr = markFactory.compile("//ns:markable", Filters.element(), null,
            Namespace.getNamespace("ns", "www.eml.org/NameSpaces/OpinionExpression"));
    List<Element> markables = markExpr.evaluate(markDoc);
    for (Element markable : markables) {
        if (markable.getAttributeValue("annotation_type").equalsIgnoreCase("target")) {
            String markSpan = markable.getAttributeValue("span");
            System.err.println("--> span: " + markSpan);
            // Keep only the first comma-separated span fragment.
            String removeCommaSpan = markSpan.replaceAll(",word_.*", "");
            System.err.println("--> newSpan: " + removeCommaSpan);
            // "word_3..word_5" -> 1-based start/end word indices;
            // endIndex is exclusive after the +1.
            String[] spanWords = removeCommaSpan.split("\\.\\.");
            int startIndex = Integer.parseInt(spanWords[0].replace("word_", ""));
            int endIndex = Integer.parseInt(spanWords[spanWords.length - 1].replace("word_", "")) + 1;
            // Shift to 0-based indices into tokenIds.
            List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex - 1, endIndex - 1));
            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                references.add(neSpan);
                Entity neEntity = kaf.newEntity(references);
                neEntity.setType("TARGET");
                System.err.println("--> target: " + neEntity.getStr());
            }
        } // end of create entity
    }
}
From source file:eus.ixa.ixa.pipe.convert.TassFormat.java
License:Apache License
public static void generalToTabulated(String fileName) throws JDOMException, IOException { StringBuilder sb = new StringBuilder(); // reading the TASS General Corpus xml file SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//tweet", Filters.element()); List<Element> tweets = expr.evaluate(doc); for (Element tweet : tweets) { String tokenizedTweetContent = null; String tweetPolarity = null; String tweetId = tweet.getChildText("tweetid"); String tweetContentString = tweet.getChildText("content"); // the list contains just one list of tokens List<List<Token>> segmentedSentences = StringUtils.tokenizeSentence(tweetContentString, LANGUAGE); for (List<Token> tokenizedSentence : segmentedSentences) { String[] tokenizedTweetArray = eus.ixa.ixa.pipe.ml.utils.StringUtils .convertListTokenToArrayStrings(tokenizedSentence); tokenizedTweetContent = StringUtils.getStringFromTokens(tokenizedTweetArray); }// w ww. j av a 2s. c o m if (tweet.getChild("sentiments").getChild("polarity").getChildText("value") != null) { tweetPolarity = tweet.getChild("sentiments").getChild("polarity").getChildText("value"); } sb.append(tweetId).append("\t").append(tweetPolarity).append("\t").append(tokenizedTweetContent) .append("\n"); } System.out.println(sb.toString()); }
From source file:eus.ixa.ixa.pipe.convert.TassFormat.java
License:Apache License
/**
 * Converts each tweet of the TASS General Corpus XML into its own NAF file.
 *
 * For every {@code <tweet>}: a NAF document is created with the tweet id as
 * its public id, the content is tokenized and each token becomes a WF, and
 * the document is written to {@code <tweetid>.naf} in the working directory.
 *
 * @param fileName path of the TASS General Corpus XML file
 */
public static void generalToWFs(String fileName) {
    SAXBuilder xmlBuilder = new SAXBuilder();
    XPathExpression<Element> tweetQuery = XPathFactory.instance().compile("//tweet", Filters.element());
    try {
        Document corpus = xmlBuilder.build(fileName);
        for (Element tweet : tweetQuery.evaluate(corpus)) {
            String tweetId = tweet.getChildText("tweetid");
            KAFDocument naf = new KAFDocument(LANGUAGE, "v1.naf");
            naf.createPublic().publicId = tweetId;
            String content = tweet.getChildText("content");
            // One WF per token; all tokens are attributed to sentence 1.
            for (List<Token> sentence : StringUtils.tokenizeSentence(content, LANGUAGE)) {
                for (Token token : sentence) {
                    naf.newWF(token.startOffset(), token.getTokenValue(), 1);
                }
            }
            // Fails if the target file already exists (Files.createFile).
            Path outfile = Files.createFile(Paths.get(tweetId + ".naf"));
            Files.write(outfile, naf.toString().getBytes(StandardCharsets.UTF_8));
            System.err.println(">> Wrote naf document to " + outfile);
        }
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}