List of usage examples for org.jsoup.nodes Node attr
public String attr(String attributeKey)
From source file:com.bibisco.manager.TextEditorManager.java
private static void parseNode(HtmlParsingResult pHtmlParsingResult, Node pNode, boolean pBlnExcludeSpellCheck) { mLog.debug("Start parseNode(HtmlParsingResult, Node, boolean): ", pNode.nodeName()); if ("#text".equals(pNode.nodeName())) { parseTextNode(pHtmlParsingResult, pNode); } else if ("spellerror".equals(pNode.nodeName()) && pBlnExcludeSpellCheck) { // Do nothing } else if ("span".equals(pNode.nodeName()) && pNode.attr("style").equals("display: none;")) { // Do nothing } else {/*w ww .ja v a2 s . c om*/ if ("ul".equals(pNode.nodeName())) { pHtmlParsingResult.ulOpen = true; } if ("ol".equals(pNode.nodeName())) { pHtmlParsingResult.olOpen = true; } if ("li".equals(pNode.nodeName())) { if (pHtmlParsingResult.ulOpen) { pHtmlParsingResult.characterCount += 1; } else if (pHtmlParsingResult.olOpen) { pHtmlParsingResult.characterCount += 1; pHtmlParsingResult.olLiPosition += 1; pHtmlParsingResult.characterCount += String.valueOf(pHtmlParsingResult.olLiPosition).length(); } } for (Node lNode : pNode.childNodes()) { parseNode(pHtmlParsingResult, lNode, pBlnExcludeSpellCheck); } if ("ul".equals(pNode.nodeName())) { pHtmlParsingResult.ulOpen = false; } if ("ol".equals(pNode.nodeName())) { pHtmlParsingResult.olOpen = false; pHtmlParsingResult.olLiPosition = 0; } } mLog.debug("End parseNode(HtmlParsingResult, Node, boolean)"); }
From source file:com.screenslicer.core.util.Util.java
/**
 * Tells whether the node's {@code class} attribute contains {@code HIDDEN_MARKER}.
 *
 * @param node node whose class attribute is inspected
 * @return true when the marker occurs anywhere in the class attribute
 */
public static boolean isHidden(Node node) {
    final String classes = node.attr("class");
    return classes.indexOf(HIDDEN_MARKER) != -1;
}
From source file:com.screenslicer.core.util.Util.java
/**
 * Tells whether the node's {@code class} attribute contains {@code FILTERED_MARKER}.
 *
 * @param node node whose class attribute is inspected
 * @return true when the marker occurs anywhere in the class attribute
 */
public static boolean isFiltered(Node node) {
    final String classes = node.attr("class");
    return classes.indexOf(FILTERED_MARKER) != -1;
}
From source file:com.screenslicer.core.util.Util.java
/**
 * Tells whether the node's {@code class} attribute contains either
 * {@code FILTERED_MARKER} or {@code FILTERED_LENIENT_MARKER}.
 *
 * @param node node whose class attribute is inspected
 * @return true when at least one of the two markers occurs in the class attribute
 */
public static boolean isFilteredLenient(Node node) {
    final String classes = node.attr("class");
    if (classes.indexOf(FILTERED_MARKER) != -1) {
        return true;
    }
    return classes.indexOf(FILTERED_LENIENT_MARKER) != -1;
}
From source file:com.screenslicer.core.util.Util.java
/**
 * Extracts the first substring of the node's {@code class} attribute that matches
 * the {@code nodeMarker} pattern.
 *
 * @param node node to inspect; may be null
 * @return the matched text, or null when the node is null, its class attribute is
 *         empty, or the pattern does not match
 */
public static String classId(Node node) {
    if (node == null) {
        return null;
    }
    final String classes = node.attr("class");
    if (CommonUtil.isEmpty(classes)) {
        return null;
    }
    final Matcher marker = nodeMarker.matcher(classes);
    if (!marker.find()) {
        return null;
    }
    return marker.group(0);
}
From source file:com.screenslicer.core.util.BrowserUtil.java
private static WebElement toElement(Browser browser, Node node, HtmlNode htmlNode, boolean recurse) { if (node != null || htmlNode != null) { try {//ww w .j a v a 2 s .c om String classId = NodeUtil.classId(node); if (classId != null) { WebElement element = browser.findElementByClassName(classId); if (element != null) { return element; } } } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { Log.exception(t); } } if (recurse) { Log.warn("Could not convert Node to WebElement... trying fuzzy search"); if (node != null) { try { HtmlNode find = new HtmlNode(); find.alt = node.attr("alt"); find.classes = CommonUtil.isEmpty(node.attr("class")) ? null : node.attr("class").split("\\s"); find.href = node.attr("href"); find.id = node.attr("id"); find.innerText = node instanceof Element ? ((Element) node).text() : null; find.name = node.attr("name"); find.tagName = node.nodeName(); find.title = node.attr("title"); find.role = node.attr("role"); find.type = node.attr("type"); find.value = node.attr("value"); find.fuzzy = true; WebElement found = toElement(browser, find, BrowserUtil.openElement(browser, false, null, null, null, null), false); found = found == null ? toElement(browser, find, null, false) : found; if (found != null) { return found; } } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { Log.exception(t); } } if (htmlNode != null) { try { WebElement found = toElement(browser, htmlNode, BrowserUtil.openElement(browser, false, null, null, null, null), false); found = found == null ? toElement(browser, htmlNode, null, false) : found; if (found != null) { return found; } } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { Log.exception(t); } } } Log.warn("Could not convert Node to WebElement... failed permanently"); return null; }
From source file:com.screenslicer.core.util.BrowserUtil.java
public static Element openElement(final Browser browser, boolean init, final String[] whitelist, final String[] patterns, final HtmlNode[] urlNodes, final UrlTransform[] transforms) throws ActionFailed { try {// w w w .j a v a 2 s .c om if (init) { int myStartId; synchronized (startIdLock) { startId = startId == Integer.MAX_VALUE ? 0 : startId + 1; myStartId = startId; } browser.executeScript(" var all = document.body.getElementsByTagName('*');" + "for(var i = 0; i < all.length; i++){" + " if(all[i].className && typeof all[i].className == 'string'){" + " all[i].className=all[i].className.replace(/" + HIDDEN_MARKER + "/g,'').replace(/" + FILTERED_MARKER + "/g,'').replace(/" + FILTERED_LENIENT_MARKER + "/g,'').replace(/\\s+/g,' ').trim();" + " }" + "}" + isVisible + "for(var j = 0; j < all.length; j++){" + " if(!all[j].className.match(/" + NODE_MARKER + "\\d+_\\d+/g)){" + " all[j].className += ' " + NODE_MARKER + myStartId + "_'+j+' ';" + " }" + " if(!isVisible(all[j])){" + " all[j].className += ' " + HIDDEN_MARKER + " ';" + " }" + "}"); } String url = browser.getCurrentUrl(); new URL(url); Element element = CommonUtil.parse(browser.getPageSource(), url, false).body(); element.traverse(new NodeVisitor() { @Override public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { if (!node.nodeName().equals("#text") && !NodeUtil.isEmpty(node)) { NodeUtil.markVisible(node); } } }); if ((whitelist != null && whitelist.length > 0) || (patterns != null && patterns.length > 0) || (urlNodes != null && urlNodes.length > 0)) { element.traverse(new NodeVisitor() { @Override public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { if (node.nodeName().equals("a")) { if (UrlUtil.isUrlFiltered(browser.getCurrentUrl(), node.attr("href"), node, whitelist, patterns, urlNodes, transforms)) { NodeUtil.markFiltered(node, false); } } else { String urlAttr = UrlUtil.urlFromAttr(node); if (!CommonUtil.isEmpty(urlAttr) && 
UrlUtil.isUrlFiltered(browser.getCurrentUrl(), urlAttr, node, whitelist, patterns, urlNodes, transforms)) { NodeUtil.markFiltered(node, true); } } } }); } if (WebApp.DEBUG) { try { FileUtils.writeStringToFile(new File("./" + System.currentTimeMillis() + ".log.scrape"), element.outerHtml(), "utf-8"); } catch (IOException e) { } } return element; } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { throw new ActionFailed(t); } }
From source file:com.screenslicer.core.util.Util.java
/**
 * Normalizes the {@code class} attribute of {@code node} and every node below it.
 *
 * <p>Each visited node's class attribute is passed through {@code cleanClass}; the
 * attribute is removed when the result is empty and overwritten with the result
 * otherwise.
 *
 * @param node root of the subtree to clean; modified in place
 */
public static void clean(Node node) {
    node.traverse(new NodeVisitor() {
        @Override
        public void head(Node visited, int depth) {
            final String cleaned = cleanClass(visited.attr("class"));
            if (!CommonUtil.isEmpty(cleaned)) {
                visited.attr("class", cleaned);
            } else {
                visited.removeAttr("class");
            }
        }

        @Override
        public void tail(Node visited, int depth) {
            // Nothing to do when leaving a node.
        }
    });
}
From source file:com.screenslicer.core.util.Util.java
public static Element openElement(final RemoteWebDriver driver, final String[] whitelist, final String[] patterns, final UrlTransform[] transforms) throws ActionFailed { try {// ww w .j a va2 s . com driver.executeScript(" var all = document.getElementsByTagName('*');" + "for(var i = 0; i < all.length; i++){" + " if(all[i].className){" + " all[i].className=all[i].className.replace(/" + NODE_MARKER + "\\d+/g,'').replace(/" + HIDDEN_MARKER + "/g,'').replace(/" + FILTERED_MARKER + "/g,'').replace(/" + FILTERED_LENIENT_MARKER + "/g,'').replace(/\\s+/g,' ').trim();" + " }" + "}" + isVisible + "for(var j = 0; j < all.length; j++){" + " all[j].className += ' " + NODE_MARKER + "'+j+' ';" + " if(!isVisible(all[j])){" + " all[j].className += ' " + HIDDEN_MARKER + " ';" + " }" + "}"); String url = driver.getCurrentUrl(); new URL(url); Element element = parse(driver.getPageSource(), url).body(); element.traverse(new NodeVisitor() { @Override public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { if (!node.nodeName().equals("#text") && !isEmpty(node)) { markVisible(node); } } }); if ((whitelist != null && whitelist.length > 0) || (patterns != null && patterns.length > 0)) { element.traverse(new NodeVisitor() { @Override public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { if (node.nodeName().equals("a")) { if (isUrlFiltered(driver.getCurrentUrl(), node.attr("href"), whitelist, patterns, transforms)) { markFiltered(node, false); } } else { String urlAttr = Util.urlFromAttr(node); if (!CommonUtil.isEmpty(urlAttr) && isUrlFiltered(driver.getCurrentUrl(), urlAttr, whitelist, patterns, transforms)) { markFiltered(node, true); } } } }); } return element; } catch (Exception e) { Log.exception(e); throw new ActionFailed(e); } }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected DetailledItem parse_result(String html) throws IOException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//from www . j a v a 2s .com String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING); Document doc2 = Jsoup.parse(html2); doc2.setBaseUri(opac_url); String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive", ENCODING); Document doc3 = Jsoup.parse(html3); doc3.setBaseUri(opac_url); DetailledItem result = new DetailledItem(); try { result.setId(doc.select("#bibtip_id").text().trim()); } catch (Exception ex) { ex.printStackTrace(); } List<String> reservationlinks = new ArrayList<>(); for (Element link : doc3.select("#vormerkung a, #tab-content a")) { String href = link.absUrl("href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } // Vormerken if (hrefq.get("methodToCall") != null) { if (hrefq.get("methodToCall").equals("doVormerkung") || hrefq.get("methodToCall").equals("doBestellung")) { reservationlinks.add(href.split("\\?")[1]); } } } if (reservationlinks.size() == 1) { result.setReservable(true); result.setReservation_info(reservationlinks.get(0)); } else if (reservationlinks.size() == 0) { result.setReservable(false); } else { // TODO: Multiple options - handle this case! 
} if (doc.select(".data td img").size() == 1) { result.setCover(doc.select(".data td img").first().attr("abs:src")); try { downloadCover(result); } catch (Exception e) { } } if (doc.select(".aw_teaser_title").size() == 1) { result.setTitle(doc.select(".aw_teaser_title").first().text().trim()); } else if (doc.select(".data td strong").size() > 0) { result.setTitle(doc.select(".data td strong").first().text().trim()); } else { result.setTitle(""); } if (doc.select(".aw_teaser_title_zusatz").size() > 0) { result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim())); } String title = ""; String text = ""; boolean takeover = false; Element detailtrs = doc2.select(".box-container .data td").first(); for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { title = ((Element) node).text().trim(); text = ""; } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); takeover = true; break; } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } if (!takeover) { text = ""; title = ""; } detailtrs = doc2.select("#tab-content .data td").first(); if (detailtrs != null) { for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } text = ""; } title = ((Element) node).text().trim(); } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); } else { text = text + ((Element) node).text(); } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } } else { if 
(doc2.select("#tab-content .fulltitle tr").size() > 0) { Elements rows = doc2.select("#tab-content .fulltitle tr"); for (Element tr : rows) { if (tr.children().size() == 2) { Element valcell = tr.child(1); String value = valcell.text().trim(); if (valcell.select("a").size() == 1) { value = valcell.select("a").first().absUrl("href"); } result.addDetail(new Detail(tr.child(0).text().trim(), value)); } } } else { result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR), stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL))); } } if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } } for (Element link : doc3.select("#tab-content a")) { Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href")); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } } for (Element link : doc3.select(".box-container a")) { if (link.text().trim().equals("Download")) { result.addDetail( new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href"))); } } Map<String, Integer> copy_columnmap = new HashMap<>(); // Default values copy_columnmap.put("barcode", 1); copy_columnmap.put("branch", 3); copy_columnmap.put("status", 4); Elements copy_columns = doc.select("#tab-content .data tr#bg2 th"); for (int i = 0; i < copy_columns.size(); i++) { Element th = copy_columns.get(i); String head = th.text().trim(); if (head.contains("Status")) { copy_columnmap.put("status", i); } if (head.contains("Zweigstelle")) { copy_columnmap.put("branch", i); } if (head.contains("Mediennummer")) { copy_columnmap.put("barcode", i); } if (head.contains("Standort")) { copy_columnmap.put("location", i); } if (head.contains("Signatur")) { copy_columnmap.put("signature", i); } } Pattern status_lent = Pattern.compile( "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) 
\\(gesamte Vormerkungen: ([0-9]+)\\)$"); Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$"); Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (Element tr : exemplartrs) { try { Copy copy = new Copy(); Element status = tr.child(copy_columnmap.get("status")); Element barcode = tr.child(copy_columnmap.get("barcode")); String barcodetext = barcode.text().trim().replace(" Wegweiser", ""); // STATUS String statustext; if (status.getElementsByTag("b").size() > 0) { statustext = status.getElementsByTag("b").text().trim(); } else { statustext = status.text().trim(); } if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) { Matcher matcher1 = status_and_barcode.matcher(statustext); if (matcher1.matches()) { statustext = matcher1.group(1); barcodetext = matcher1.group(2); } } Matcher matcher = status_lent.matcher(statustext); if (matcher.matches()) { copy.setStatus(matcher.group(1)); copy.setReservations(matcher.group(3)); copy.setReturnDate(fmt.parseLocalDate(matcher.group(2))); } else { copy.setStatus(statustext); } copy.setBarcode(barcodetext); if (status.select("a[href*=doVormerkung]").size() == 1) { copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]); } String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", ""); copy.setBranch(branchtext); if (copy_columnmap.containsKey("location")) { copy.setLocation( tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", "")); } if (copy_columnmap.containsKey("signature")) { copy.setShelfmark( tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", "")); } result.addCopy(copy); } catch (Exception ex) { ex.printStackTrace(); } } try { Element isvolume = null; Map<String, String> volume = new HashMap<>(); Elements links = doc.select(".data td a"); int elcount = 
links.size(); for (int eli = 0; eli < elcount; eli++) { List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8"); for (NameValuePair nv : anyurl) { if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) { isvolume = links.get(eli); } else if (nv.getName().equals("catKey")) { volume.put("catKey", nv.getValue()); } else if (nv.getName().equals("dbIdentifier")) { volume.put("dbIdentifier", nv.getValue()); } } if (isvolume != null) { volume.put("volume", "true"); result.setVolumesearch(volume); break; } } } catch (Exception e) { e.printStackTrace(); } return result; }