List of usage examples for org.jsoup.nodes Attributes iterator
public Iterator<Attribute> iterator()
From source file:com.kingfong.webcrawler.util.DOMContentUtils.java
/** * This method finds all anchors below the supplied DOM * <code>node</code>, and creates appropriate {@link Outlink} * records for each (relative to the supplied <code>base</code> * URL), and adds them to the <code>outlinks</code> {@link * ArrayList}.//w w w. j a v a 2 s . c om * * <p> * * Links without inner structure (tags, text, etc) are discarded, as * are links which contain only single nested links and empty text * nodes (this is a common DOM-fixup artifact, at least with * nekohtml). */ public void getOutlinks(String html, URL url, HashSet<String> outlinks) { Document document = Jsoup.parse(html); Elements elements = document.getAllElements(); for (Element currentNode : elements) { String nodeName = currentNode.tagName(); // short nodeType = currentNode.; Elements children = currentNode.children(); nodeName = nodeName.toLowerCase(); LinkParams params = linkParams.get(nodeName); if (params != null) { // if (!shouldThrowAwayLink(currentNode, children, childLen, // params)) { // StringBuilder linkText = new StringBuilder(); // getText(linkText, currentNode, true); Attributes attrs = currentNode.attributes(); String target = null; boolean noFollow = false; boolean post = false; Iterator<Attribute> iterator = attrs.iterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); String attrName = attr.getKey(); if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getValue(); } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getValue())) { noFollow = true; } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getValue())) { post = true; } } if (StringUtils.startsWith(target, "/")) { target = url.getProtocol() + "://" + url.getHost() + target; } if (target != null && URLFilter.filt(target)) { outlinks.add(target); } // } // this should not have any children, skip them if (params.childLen == 0) continue; } } }