Example usage for org.jsoup.select NodeVisitor NodeVisitor

List of usage examples for org.jsoup.select NodeVisitor NodeVisitor

Introduction

On this page you can find example usages of org.jsoup.select.NodeVisitor.

Prototype

NodeVisitor

Source Link

Usage

From source file:com.screenslicer.core.scrape.ProcessPage.java

/**
 * Removes from the tree every node under {@code body} (the root included) for which
 * {@code Util.isHidden} returns true.
 *
 * <p>Matches are gathered during the traversal and only removed once it has finished, so the
 * tree is not modified while it is being walked.
 *
 * @param body root of the subtree to prune
 */
private static void trim(Element body) {
    final List<Node> hiddenNodes = new ArrayList<Node>();
    body.traverse(new NodeVisitor() {
        @Override
        public void head(Node candidate, int depth) {
            if (!Util.isHidden(candidate)) {
                return;
            }
            hiddenNodes.add(candidate);
        }

        @Override
        public void tail(Node candidate, int depth) {
            // nothing to do when leaving a node
        }
    });
    for (int i = 0; i < hiddenNodes.size(); i++) {
        hiddenNodes.get(i).remove();
    }
}

From source file:hello.Scraper.java

/**
 * Parses the response body as HTML and returns the elements reached by traversing the
 * {@code body ul li} selection.
 *
 * @param payload HTTP response whose body is the HTML to parse
 * @return every {@code Element} node visited by the traversal, in visit order
 */
@Splitter(inputChannel = "channel1", outputChannel = "channel2")
public List<Element> scrape(ResponseEntity<String> payload) {
    final Document htmlDoc = Jsoup.parse(payload.getBody());
    final Elements listItems = htmlDoc.select("body").select("ul").select("li");
    final List<Element> collected = new ArrayList<Element>();

    listItems.traverse(new NodeVisitor() {
        @Override
        public void head(org.jsoup.nodes.Node node, int depth) {
            // Text, comment and other non-element nodes are skipped.
            if (!(node instanceof org.jsoup.nodes.Element)) {
                return;
            }
            collected.add((Element) node);
        }

        @Override
        public void tail(Node node, int depth) {
            // nothing to do when leaving a node
        }
    });

    return collected;
}

From source file:crawler.AScraper.java

/**
 * Re-decodes the response body from ISO-8859-1 bytes as GBK, parses it as HTML, and returns
 * the elements under {@code body div[id^=read] a} whose text contains
 * {@code ANCHOR_TEXT_PATTERN} (case-insensitive).
 *
 * @param payload HTTP response whose body is the HTML to parse
 * @return the matching elements in visit order, or {@code null} if one of the two charsets
 *         is not supported by the runtime
 */
@Splitter(inputChannel = "channel1", outputChannel = "channel2")
public List<Element> scrape(ResponseEntity<String> payload) {
    final String rawBody = payload.getBody();
    final String recoded;
    try {
        byte[] latin1Bytes = rawBody.getBytes("ISO-8859-1");
        recoded = new String(latin1Bytes, "GBK");
    } catch (UnsupportedEncodingException e) {
        LOG.error("Unsupported page encoding.");
        return null;
    }

    final Document htmlDoc = Jsoup.parse(recoded);
    final Elements links = htmlDoc.select("body").select("div[id^=read]").select("a");
    final List<Element> matches = new ArrayList<>();
    links.traverse(new NodeVisitor() {
        @Override
        public void head(org.jsoup.nodes.Node node, int depth) {
            if (!(node instanceof org.jsoup.nodes.Element)) {
                return;
            }
            Element candidate = (Element) node;
            boolean textMatches = StringUtils.containsIgnoreCase(candidate.text(), ANCHOR_TEXT_PATTERN);
            if (textMatches) {
                matches.add(candidate);
            }
        }

        @Override
        public void tail(Node node, int depth) {
            // nothing to do when leaving a node
        }
    });
    return matches;
}

From source file:com.screenslicer.common.CommonUtil.java

/**
 * Sets the document's output charset and rewrites each non-empty text node with the result of
 * {@code HtmlCoder.decode} applied to the node's string form.
 *
 * <p>Any failure while processing a node is logged through {@code Log.exception} and the
 * traversal continues.
 *
 * @param doc   document to sanitize in place
 * @param ascii {@code true} to output as "ascii", {@code false} to output as "utf-8"
 * @return the same {@code doc} instance
 */
private static Element sanitize(Document doc, final boolean ascii) {
    final String outputCharset = ascii ? "ascii" : "utf-8";
    doc.outputSettings().charset(outputCharset);

    doc.traverse(new NodeVisitor() {
        @Override
        public void head(Node n, int d) {
            try {
                if (!n.nodeName().equals("#text")) {
                    return;
                }
                if (CommonUtil.isEmpty(n.outerHtml())) {
                    return;
                }
                TextNode textNode = (TextNode) n;
                textNode.text(HtmlCoder.decode(n.toString()));
            } catch (Throwable t) {
                Log.exception(t);
            }
        }

        @Override
        public void tail(Node n, int d) {
            // nothing to do when leaving a node
        }
    });
    return doc;
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * metaTitle?metaTitle,metaTitle??????title
 *
 * @param contentElement/*from  w  ww.j a v  a 2  s.  com*/
 * @return
 * @throws Exception
 */
protected String getTitle(final Element contentElement) throws Exception {
    final ArrayList<Element> titleList = new ArrayList<Element>();
    final ArrayList<Double> titleSim = new ArrayList<Double>();
    final String metaTitle = getText(doc.title().trim());
    if (!metaTitle.isEmpty()) {
        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int i) {
                if (node instanceof Element) {
                    Element tag = (Element) node;
                    String tagName = tag.tagName();
                    if (Pattern.matches("h[1-6]", tagName)) {
                        String title = tag.text().trim();
                        double sim = strSim(title, metaTitle);
                        titleSim.add(sim);
                        titleList.add(tag);
                    }
                }
            }

            @Override
            public void tail(Node node, int i) {
            }
        });
        int index = titleSim.size();
        if (index >= 0) {
            double maxScore = 0;
            int maxIndex = -1;
            for (int i = 0; i < index; i++) {
                double score = (i + 1) * titleSim.get(i);
                if (score > maxScore) {
                    maxScore = score;
                    maxIndex = i;
                }
            }

            if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) {
                String title = getText(metaTitle);
                if (!title.endsWith("") && title.length() > 7) {
                    return title;
                }
                Collections.sort(titleList, new Comparator<Element>() {
                    @Override
                    public int compare(Element o1, Element o2) {
                        int len1 = 1;
                        int len2 = 1;
                        if (o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() > 26
                                || o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() < 7) {
                            len1 = 0;
                        }
                        if (o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() > 26
                                || o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() < 7) {
                            len2 = 0;
                        }
                        if (len1 == len2) {
                            return o1.tagName().charAt(1) - o2.tagName().charAt(1);
                        }
                        return len2 - len1;
                    }
                });
                return getText(titleList.get(0).text());
            }
            return titleList.get(maxIndex).text();
        }
    }

    /**
     * ?
     */
    Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
    if (titles.size() > 0) {
        String title = titles.first().text();
        if (title.length() > 5 && title.length() < 40) {
            return titles.first().text();
        }
    }
    try {
        return getTitleByEditDistance(contentElement);
    } catch (Exception ex) {
        throw new Exception("title not found");
    }

}

From source file:com.screenslicer.core.util.BrowserUtil.java

/**
 * Parses the browser's current page into a jsoup element and annotates it.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>If {@code init} is set, a script is run in the browser that strips the hidden/filtered
 *       marker class names from every element under {@code document.body}, tags elements that do
 *       not yet carry a node marker with one built from a fresh start id and the element's index,
 *       and adds the hidden marker to elements for which the script's {@code isVisible} returns
 *       false.</li>
 *   <li>The page source is parsed and every non-text, non-empty node is passed to
 *       {@code NodeUtil.markVisible}.</li>
 *   <li>If any of {@code whitelist}, {@code patterns} or {@code urlNodes} is non-empty, nodes
 *       whose URL is rejected by {@code UrlUtil.isUrlFiltered} are passed to
 *       {@code NodeUtil.markFiltered}.</li>
 *   <li>When {@code WebApp.DEBUG} is set, the resulting HTML is written to a timestamped file in
 *       the working directory.</li>
 * </ol>
 *
 * @param browser    browser whose current page is read
 * @param init       whether to run the marker script in the browser first
 * @param whitelist  passed through to {@code UrlUtil.isUrlFiltered}; may be null
 * @param patterns   passed through to {@code UrlUtil.isUrlFiltered}; may be null
 * @param urlNodes   passed through to {@code UrlUtil.isUrlFiltered}; may be null
 * @param transforms passed through to {@code UrlUtil.isUrlFiltered}
 * @return the annotated body element of the parsed page
 * @throws ActionFailed wrapping any failure other than {@code Browser.Retry} and
 *                      {@code Browser.Fatal}, which are rethrown unchanged
 */
public static Element openElement(final Browser browser, boolean init, final String[] whitelist,
        final String[] patterns, final HtmlNode[] urlNodes, final UrlTransform[] transforms)
        throws ActionFailed {
    try {
        if (init) {
            int myStartId;
            // Take the next start id under the lock; wraps to 0 at Integer.MAX_VALUE.
            synchronized (startIdLock) {
                startId = startId == Integer.MAX_VALUE ? 0 : startId + 1;
                myStartId = startId;
            }
            // First loop: remove old hidden/filtered markers and collapse whitespace in className.
            // Second loop: add a node marker where none matches, and a hidden marker where
            // isVisible(...) is false. The Java variable isVisible is spliced into the script
            // between the loops (presumably the JS definition of that function - confirm).
            browser.executeScript("      var all = document.body.getElementsByTagName('*');"
                    + "for(var i = 0; i < all.length; i++){"
                    + "  if(all[i].className && typeof all[i].className == 'string'){"
                    + "    all[i].className=all[i].className.replace(/" + HIDDEN_MARKER + "/g,'').replace(/"
                    + FILTERED_MARKER + "/g,'').replace(/" + FILTERED_LENIENT_MARKER
                    + "/g,'').replace(/\\s+/g,' ').trim();" + "  }" + "}" + isVisible
                    + "for(var j = 0; j < all.length; j++){" + "  if(!all[j].className.match(/" + NODE_MARKER
                    + "\\d+_\\d+/g)){" + "    all[j].className += ' " + NODE_MARKER + myStartId + "_'+j+' ';"
                    + "  }" + "  if(!isVisible(all[j])){" + "    all[j].className += ' " + HIDDEN_MARKER + " ';"
                    + "  }" + "}");
        }
        String url = browser.getCurrentUrl();
        // Result is discarded: constructing the URL only validates it. A malformed URL throws
        // and ends up wrapped in ActionFailed by the catch-all below.
        new URL(url);
        Element element = CommonUtil.parse(browser.getPageSource(), url, false).body();
        // Pass 1: mark every non-text, non-empty node as visible.
        element.traverse(new NodeVisitor() {
            @Override
            public void tail(Node node, int depth) {
            }

            @Override
            public void head(Node node, int depth) {
                if (!node.nodeName().equals("#text") && !NodeUtil.isEmpty(node)) {
                    NodeUtil.markVisible(node);
                }
            }
        });
        // Pass 2 runs only when at least one URL filter input was supplied.
        if ((whitelist != null && whitelist.length > 0) || (patterns != null && patterns.length > 0)
                || (urlNodes != null && urlNodes.length > 0)) {
            element.traverse(new NodeVisitor() {
                @Override
                public void tail(Node node, int depth) {
                }

                @Override
                public void head(Node node, int depth) {
                    if (node.nodeName().equals("a")) {
                        // Anchors are checked by href and marked with lenient == false.
                        if (UrlUtil.isUrlFiltered(browser.getCurrentUrl(), node.attr("href"), node, whitelist,
                                patterns, urlNodes, transforms)) {
                            NodeUtil.markFiltered(node, false);
                        }
                    } else {
                        // Other nodes are checked by whatever URL UrlUtil.urlFromAttr finds on
                        // them and marked with lenient == true.
                        String urlAttr = UrlUtil.urlFromAttr(node);
                        if (!CommonUtil.isEmpty(urlAttr) && UrlUtil.isUrlFiltered(browser.getCurrentUrl(),
                                urlAttr, node, whitelist, patterns, urlNodes, transforms)) {
                            NodeUtil.markFiltered(node, true);
                        }
                    }
                }
            });
        }
        if (WebApp.DEBUG) {
            try {
                FileUtils.writeStringToFile(new File("./" + System.currentTimeMillis() + ".log.scrape"),
                        element.outerHtml(), "utf-8");
            } catch (IOException e) {
                // A failed debug dump is ignored; the parsed element is still returned.
            }
        }
        return element;
    } catch (Browser.Retry r) {
        // Retry and Fatal pass through untouched so they are not wrapped below.
        throw r;
    } catch (Browser.Fatal f) {
        throw f;
    } catch (Throwable t) {
        throw new ActionFailed(t);
    }
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Returns the trimmed text of the body text node that {@code strSim} rates most similar to
 * the document's title.
 *
 * <p>Only similarities strictly greater than zero are considered; on ties the earliest text
 * node in traversal order wins.
 *
 * @param contentElement not used by this method
 * @return the trimmed text of the best-matching text node
 * @throws Exception if no text node has a positive similarity, or the best match is empty
 */
protected String getTitleByEditDistance(Element contentElement) throws Exception {
    final String metaTitle = doc.title();

    // One-slot array so the anonymous visitor can update the running maximum.
    final double[] bestSim = { 0.0 };
    final StringBuilder bestText = new StringBuilder();
    doc.body().traverse(new NodeVisitor() {

        @Override
        public void head(Node node, int i) {
            if (!(node instanceof TextNode)) {
                return;
            }
            String text = ((TextNode) node).text().trim();
            double sim = strSim(text, metaTitle);
            // bestSim starts at 0 and only grows, so beating it also implies sim > 0.
            if (sim > bestSim[0]) {
                bestSim[0] = sim;
                bestText.setLength(0);
                bestText.append(text);
            }
        }

        @Override
        public void tail(Node node, int i) {
        }
    });
    if (bestText.length() > 0) {
        return bestText.toString();
    }
    throw new Exception("no text node in the body is similar to the document title");

}

From source file:com.screenslicer.core.util.Util.java

/**
 * Runs {@code cleanClass} over the {@code class} attribute of every node in the subtree
 * rooted at {@code node}; the attribute is removed when the cleaned value is empty according to
 * {@code CommonUtil.isEmpty}.
 *
 * @param node root of the subtree to clean in place
 */
public static void clean(Node node) {
    node.traverse(new NodeVisitor() {
        @Override
        public void head(Node visited, int depth) {
            final String cleaned = cleanClass(visited.attr("class"));
            if (!CommonUtil.isEmpty(cleaned)) {
                visited.attr("class", cleaned);
                return;
            }
            visited.removeAttr("class");
        }

        @Override
        public void tail(Node visited, int depth) {
            // nothing to do when leaving a node
        }
    });
}

From source file:com.screenslicer.core.util.Util.java

/**
 * Appends a filter marker to the {@code class} attribute.
 *
 * <p>In lenient mode only {@code node} itself receives {@code FILTERED_LENIENT_MARKER}, and
 * only if {@code isFilteredLenient} is false for it. Otherwise every node in the subtree rooted
 * at {@code node} for which {@code isFiltered} is false receives {@code FILTERED_MARKER}.
 *
 * @param node    node (or subtree root) to mark
 * @param lenient selects the lenient, single-node marking
 */
private static void markFiltered(Node node, final boolean lenient) {
    if (!lenient) {
        node.traverse(new NodeVisitor() {
            @Override
            public void head(Node n, int d) {
                if (isFiltered(n)) {
                    return;
                }
                String marked = n.attr("class") + " " + FILTERED_MARKER + " ";
                n.attr("class", marked);
            }

            @Override
            public void tail(Node n, int d) {
                // nothing to do when leaving a node
            }
        });
        return;
    }

    if (isFilteredLenient(node)) {
        return;
    }
    String marked = node.attr("class") + " " + FILTERED_LENIENT_MARKER + " ";
    node.attr("class", marked);
}

From source file:com.screenslicer.core.util.Util.java

/**
 * Renumbers the node markers in a subtree.
 *
 * <p>A first traversal removes every match of {@code nodeMarker} from each node's
 * {@code class} attribute. A second traversal appends {@code NODE_MARKER} followed by a
 * counter that starts at 1 and increases by one per visited node.
 *
 * @param element root of the subtree to mark in place
 * @return the same {@code element} instance
 */
public static Element markTestElement(Element element) {
    // Pass 1: strip existing markers.
    element.traverse(new NodeVisitor() {
        @Override
        public void head(Node node, int level) {
            String stripped = nodeMarker.matcher(node.attr("class")).replaceAll("");
            node.attr("class", stripped);
        }

        @Override
        public void tail(Node node, int level) {
            // nothing to do when leaving a node
        }
    });

    // Pass 2: assign sequential markers in visit order.
    element.traverse(new NodeVisitor() {
        int visited = 0;

        @Override
        public void head(Node node, int level) {
            visited += 1;
            String marked = node.attr("class") + " " + NODE_MARKER + visited + " ";
            node.attr("class", marked);
        }

        @Override
        public void tail(Node node, int level) {
            // nothing to do when leaving a node
        }
    });
    return element;
}