List of usage examples for org.jsoup.select NodeVisitor NodeVisitor
NodeVisitor
From source file:com.screenslicer.core.scrape.ProcessPage.java
/**
 * Detaches every node in the subtree rooted at {@code body} that
 * {@code Util.isHidden} reports as hidden.
 *
 * <p>Matching nodes are first collected during the traversal and only removed
 * once the traversal has finished, so the tree is not modified while it is
 * being walked.
 *
 * @param body root of the subtree to prune
 */
private static void trim(Element body) {
  final List<Node> hiddenNodes = new ArrayList<Node>();
  NodeVisitor hiddenCollector = new NodeVisitor() {
    @Override
    public void head(Node node, int d) {
      if (!Util.isHidden(node)) {
        return;
      }
      hiddenNodes.add(node);
    }

    @Override
    public void tail(Node n, int d) {
      // Nothing to do when leaving a node.
    }
  };
  body.traverse(hiddenCollector);
  for (int i = 0; i < hiddenNodes.size(); i++) {
    hiddenNodes.get(i).remove();
  }
}
From source file:hello.Scraper.java
/**
 * Parses the HTML body of {@code payload} and collects elements found under
 * the selector chain {@code body} / {@code ul} / {@code li}.
 *
 * <p>The matched items are traversed with a {@link NodeVisitor}; every visited
 * node that is an {@code Element} is appended to the result in visit order.
 *
 * @param payload response whose body is the HTML text to parse
 * @return the collected elements (possibly empty)
 */
@Splitter(inputChannel = "channel1", outputChannel = "channel2")
public List<Element> scrape(ResponseEntity<String> payload) {
  final List<Element> anchorList = new ArrayList<Element>();
  final Document htmlDoc = Jsoup.parse(payload.getBody());
  final Elements listItems = htmlDoc.select("body").select("ul").select("li");
  NodeVisitor elementCollector = new NodeVisitor() {
    @Override
    public void tail(Node node, int depth) {
      // Nothing to do when leaving a node.
    }

    @Override
    public void head(org.jsoup.nodes.Node node, int depth) {
      if (!(node instanceof org.jsoup.nodes.Element)) {
        return;
      }
      anchorList.add((Element) node);
    }
  };
  listItems.traverse(elementCollector);
  return anchorList;
}
From source file:crawler.AScraper.java
@Splitter(inputChannel = "channel1", outputChannel = "channel2") public List<Element> scrape(ResponseEntity<String> payload) { String html = payload.getBody(); final Document htmlDoc; try {//w w w.j a va 2s. c om htmlDoc = Jsoup.parse(new String(html.getBytes("ISO-8859-1"), "GBK")); } catch (UnsupportedEncodingException e) { LOG.error("Unsupported page encoding."); return null; } final Elements anchorNodes = htmlDoc.select("body").select("div[id^=read]").select("a"); final List<Element> anchorList = new ArrayList<>(); anchorNodes.traverse(new NodeVisitor() { @Override public void head(org.jsoup.nodes.Node node, int depth) { if (node instanceof org.jsoup.nodes.Element) { Element e = (Element) node; if (StringUtils.containsIgnoreCase(e.text(), ANCHOR_TEXT_PATTERN)) { anchorList.add(e); } } } @Override public void tail(Node node, int depth) { } }); return anchorList; }
From source file:com.screenslicer.common.CommonUtil.java
private static Element sanitize(Document doc, final boolean ascii) { if (ascii) {// w w w . jav a2 s. com doc.outputSettings().charset("ascii"); } else { doc.outputSettings().charset("utf-8"); } doc.traverse(new NodeVisitor() { @Override public void tail(Node n, int d) { } @Override public void head(Node n, int d) { try { if (n.nodeName().equals("#text") && !CommonUtil.isEmpty(n.outerHtml())) { ((TextNode) n).text(HtmlCoder.decode(n.toString())); } } catch (Throwable t) { Log.exception(t); } } }); return doc; }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/** * metaTitle?metaTitle,metaTitle??????title * * @param contentElement/*from w ww.j a v a 2 s. com*/ * @return * @throws Exception */ protected String getTitle(final Element contentElement) throws Exception { final ArrayList<Element> titleList = new ArrayList<Element>(); final ArrayList<Double> titleSim = new ArrayList<Double>(); final String metaTitle = getText(doc.title().trim()); if (!metaTitle.isEmpty()) { doc.body().traverse(new NodeVisitor() { @Override public void head(Node node, int i) { if (node instanceof Element) { Element tag = (Element) node; String tagName = tag.tagName(); if (Pattern.matches("h[1-6]", tagName)) { String title = tag.text().trim(); double sim = strSim(title, metaTitle); titleSim.add(sim); titleList.add(tag); } } } @Override public void tail(Node node, int i) { } }); int index = titleSim.size(); if (index >= 0) { double maxScore = 0; int maxIndex = -1; for (int i = 0; i < index; i++) { double score = (i + 1) * titleSim.get(i); if (score > maxScore) { maxScore = score; maxIndex = i; } } if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) { String title = getText(metaTitle); if (!title.endsWith("") && title.length() > 7) { return title; } Collections.sort(titleList, new Comparator<Element>() { @Override public int compare(Element o1, Element o2) { int len1 = 1; int len2 = 1; if (o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() > 26 || o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() < 7) { len1 = 0; } if (o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() > 26 || o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() < 7) { len2 = 0; } if (len1 == len2) { return o1.tagName().charAt(1) - o2.tagName().charAt(1); } return len2 - len1; } }); return getText(titleList.get(0).text()); } return titleList.get(maxIndex).text(); } } /** * ? 
*/ Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]"); if (titles.size() > 0) { String title = titles.first().text(); if (title.length() > 5 && title.length() < 40) { return titles.first().text(); } } try { return getTitleByEditDistance(contentElement); } catch (Exception ex) { throw new Exception("title not found"); } }
From source file:com.screenslicer.core.util.BrowserUtil.java
public static Element openElement(final Browser browser, boolean init, final String[] whitelist, final String[] patterns, final HtmlNode[] urlNodes, final UrlTransform[] transforms) throws ActionFailed { try {//from w ww .j ava2s. co m if (init) { int myStartId; synchronized (startIdLock) { startId = startId == Integer.MAX_VALUE ? 0 : startId + 1; myStartId = startId; } browser.executeScript(" var all = document.body.getElementsByTagName('*');" + "for(var i = 0; i < all.length; i++){" + " if(all[i].className && typeof all[i].className == 'string'){" + " all[i].className=all[i].className.replace(/" + HIDDEN_MARKER + "/g,'').replace(/" + FILTERED_MARKER + "/g,'').replace(/" + FILTERED_LENIENT_MARKER + "/g,'').replace(/\\s+/g,' ').trim();" + " }" + "}" + isVisible + "for(var j = 0; j < all.length; j++){" + " if(!all[j].className.match(/" + NODE_MARKER + "\\d+_\\d+/g)){" + " all[j].className += ' " + NODE_MARKER + myStartId + "_'+j+' ';" + " }" + " if(!isVisible(all[j])){" + " all[j].className += ' " + HIDDEN_MARKER + " ';" + " }" + "}"); } String url = browser.getCurrentUrl(); new URL(url); Element element = CommonUtil.parse(browser.getPageSource(), url, false).body(); element.traverse(new NodeVisitor() { @Override public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { if (!node.nodeName().equals("#text") && !NodeUtil.isEmpty(node)) { NodeUtil.markVisible(node); } } }); if ((whitelist != null && whitelist.length > 0) || (patterns != null && patterns.length > 0) || (urlNodes != null && urlNodes.length > 0)) { element.traverse(new NodeVisitor() { @Override public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { if (node.nodeName().equals("a")) { if (UrlUtil.isUrlFiltered(browser.getCurrentUrl(), node.attr("href"), node, whitelist, patterns, urlNodes, transforms)) { NodeUtil.markFiltered(node, false); } } else { String urlAttr = UrlUtil.urlFromAttr(node); if (!CommonUtil.isEmpty(urlAttr) && 
UrlUtil.isUrlFiltered(browser.getCurrentUrl(), urlAttr, node, whitelist, patterns, urlNodes, transforms)) { NodeUtil.markFiltered(node, true); } } } }); } if (WebApp.DEBUG) { try { FileUtils.writeStringToFile(new File("./" + System.currentTimeMillis() + ".log.scrape"), element.outerHtml(), "utf-8"); } catch (IOException e) { } } return element; } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { throw new ActionFailed(t); } }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Returns the trimmed text of the text node under the document body that
 * {@code strSim} rates as most similar to the document's {@code <title>}.
 *
 * <p>Only similarities greater than zero are considered, and on equal
 * similarity the node encountered first wins.
 *
 * @param contentElement not used by this method
 * @return the best-matching text
 * @throws Exception when no text node yields a non-empty best match
 */
protected String getTitleByEditDistance(Element contentElement) throws Exception {
  final String metaTitle = doc.title();
  // One-element arrays act as mutable holders the anonymous visitor can update.
  final double[] bestSim = {0.0};
  final String[] bestText = {""};
  doc.body().traverse(new NodeVisitor() {
    @Override
    public void head(Node node, int i) {
      if (!(node instanceof TextNode)) {
        return;
      }
      String candidate = ((TextNode) node).text().trim();
      double sim = strSim(candidate, metaTitle);
      if (sim > 0 && sim > bestSim[0]) {
        bestSim[0] = sim;
        bestText[0] = candidate;
      }
    }

    @Override
    public void tail(Node node, int i) {
      // Nothing to do when leaving a node.
    }
  });
  if (!bestText[0].isEmpty()) {
    return bestText[0];
  }
  throw new Exception();
}
From source file:com.screenslicer.core.util.Util.java
public static void clean(Node node) { node.traverse(new NodeVisitor() { @Override//w w w . j a v a2 s.com public void tail(Node node, int depth) { } @Override public void head(Node node, int depth) { String classAttr = node.attr("class"); classAttr = cleanClass(classAttr); if (CommonUtil.isEmpty(classAttr)) { node.removeAttr("class"); } else { node.attr("class", classAttr); } } }); }
From source file:com.screenslicer.core.util.Util.java
private static void markFiltered(Node node, final boolean lenient) { if (lenient) { if (!isFilteredLenient(node)) { node.attr("class", node.attr("class") + " " + FILTERED_LENIENT_MARKER + " "); }// www.j a v a 2 s. c o m } else { node.traverse(new NodeVisitor() { @Override public void tail(Node n, int d) { } @Override public void head(Node n, int d) { if (!isFiltered(n)) { n.attr("class", n.attr("class") + " " + FILTERED_MARKER + " "); } } }); } }
From source file:com.screenslicer.core.util.Util.java
/**
 * Re-numbers the node markers in the {@code class} attributes of
 * {@code element} and every node beneath it.
 *
 * <p>A first pass removes everything matched by {@code nodeMarker} from each
 * node's class value. A second pass appends {@code NODE_MARKER} followed by a
 * running counter (starting at 1, in traversal order) to each node's class.
 *
 * @param element root of the subtree to mark; modified in place
 * @return the same {@code element} instance
 */
public static Element markTestElement(Element element) {
  NodeVisitor stripMarkers = new NodeVisitor() {
    @Override
    public void head(Node node, int level) {
      String stripped = nodeMarker.matcher(node.attr("class")).replaceAll("");
      node.attr("class", stripped);
    }

    @Override
    public void tail(Node node, int level) {
      // Nothing to do when leaving a node.
    }
  };
  NodeVisitor assignMarkers = new NodeVisitor() {
    int seq = 0;

    @Override
    public void head(Node node, int level) {
      seq += 1;
      String marked = node.attr("class") + " " + NODE_MARKER + seq + " ";
      node.attr("class", marked);
    }

    @Override
    public void tail(Node node, int level) {
      // Nothing to do when leaving a node.
    }
  };
  element.traverse(stripMarkers);
  element.traverse(assignMarkers);
  return element;
}