List of usage examples for org.jsoup.nodes.Element.absUrl
public String absUrl(String attributeKey)
From source file:de.geeksfactory.opacclient.apis.Zones.java
/**
 * Looks up the link to the next result page.
 *
 * @param doc the parsed page to search
 * @return the absolute {@code href} of the first element matching
 *         {@code .pageNavLink[title*=nchsten]}, or {@code null} when nothing matches
 */
static String findNextPageUrl(Document doc) {
    // first() yields null for an empty selection, which covers the "no match" case.
    Element nextLink = doc.select(".pageNavLink[title*=nchsten]").first();
    if (nextLink == null) {
        return null;
    }
    return nextLink.absUrl("href");
}
From source file:googleranking.processing.GoogleData.java
public List<String> getLinksInPage() { Document doc = getGoogleHtml(); List<String> ret = new ArrayList<String>(); try {//from w w w .ja va 2s . co m Elements links = doc.select(".g>.r>a"); for (Element link : links) { String url = link.absUrl("href"); url = URLDecoder.decode(url.substring(url.indexOf("=") + 1, url.indexOf("&")), "UTF-8"); if (url.startsWith("http") || url.startsWith("https")) { ret.add(getDomain(url)); // Ads/news/etc } } } catch (Exception e) { Logger.getLogger(GoogleData.class.getName()).log(Level.SEVERE, null, e); } return ret; }
From source file:github.srlee309.lessWrongBookCreator.scraper.PostSectionExtractor.java
/** * Saves all images in the given Element to a local newUrl and converts the src for all img tags to the local file * @param postContent - from which to extract the images *///from ww w. j ava2s. c o m protected final void convertImagesToLocal(Element postContent) { Elements imgs = postContent.getElementsByTag("img"); for (Element img : imgs) { String src = img.absUrl("src"); String folder = "htmlOutput"; int indexName = src.lastIndexOf("/"); String name = src; if (indexName != -1) { indexName = src.lastIndexOf("/") + 1; name = src.substring(indexName, src.length()); } img.attr("src", name); saveImage(src, folder, name); } }
From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBForumProvider.java
/** * Try to find further URLs in the document that need to be requested to complete the resource * data./*ww w . j a v a2 s . co m*/ * Used e.g. to parse the result of paging in HTML pages. The default implementation returns an * empty list. * <p/> * This implementation tries to locate the paging area of PHPBB and selects the last link of the * paging, which will be the "next" page. * * @param document * @param requestUrl * @return */ @Override protected List<String> findAdditionalRequestUrls(String resource, Document document, String requestUrl) { List<String> result = new LinkedList<String>(); // return the next page in the result list String foo = "a[href*='start='][href*='viewforum.php']"; List<Element> values = document.select(foo); for (Element e : values) { String baseUrl = e.absUrl("href"); if (baseUrl.length() > 0) { result.add(baseUrl); } } return result; }
From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBTopicProvider.java
/** * Try to find further URLs in the document that need to be requested to complete the resource data. * Used e.g. to parse the result of paging in HTML pages. The default implementation returns an empty list. * <p/>/*from www . ja v a2 s .c o m*/ * This implementation tries to locate the paging area of PHPBB and selects the last link of the paging, which will * be the "next" page. * * @param document * @param requestUrl * @return */ @Override protected List<String> findAdditionalRequestUrls(String resource, Document document, String requestUrl) { List<String> result = new LinkedList<String>(); // return the next page in the result list Elements values = document.select("div#pagecontent a[href~=viewtopic\\.php.*start=]"); for (Element o : values) { String baseUrl = o.absUrl("href"); if (baseUrl.length() > 0) { result.add(baseUrl); } } return result; }
From source file:com.lumata.lib.lupa.extractor.internal.HtmlBiggestImageExtractor.java
@Override public Image extractBestImage(URL sourceUrl, Elements htmlSection, ImageExtractionRequirements requirements) { Map<String, Image> imagesToExplore = new HashMap<String, Image>(); Set<ImageDownloadTask> imagesToDownload = new HashSet<ImageDownloadTask>(); Iterator<org.jsoup.nodes.Element> it = htmlSection.iterator(); // collect valid images while (it.hasNext() && imagesToExplore.size() < requirements.getMaxImagesToExplore()) { Element imageElement = it.next(); String imageUrl = imageElement.absUrl("src"); // Do not process empty img tags, duplicated images or tracking // pixels and other assorted ads if (imageUrl == null || imagesToExplore.containsKey(imageUrl) || isTrackingPixelOrAd(imageUrl)) { continue; }//from w w w. j a v a 2 s . c o m // remember this image Image imageContent = new Image(imageUrl); if (imageElement.hasAttr(WIDTH_ATTRIBUTE)) { // TODO: We need to convert other picture size units supported by html (there must be a lib for this) imageContent.setWidth(Integer.parseInt(imageElement.attr(WIDTH_ATTRIBUTE).replace("px", ""))); } if (imageElement.hasAttr(HEIGHT_ATTRIBUTE)) { imageContent.setHeight(Integer.parseInt(imageElement.attr(HEIGHT_ATTRIBUTE).replace("px", ""))); } if (imageContent.getWidth() == null || imageContent.getHeight() == null) {// mark image to download imagesToDownload.add(new ImageDownloadTask(imageContent)); } imagesToExplore.put(imageUrl, imageContent); } // if dimensions are empty -> download image if (CollectionUtils.isNotEmpty(imagesToDownload)) { try { ExecutorService pool = Executors.newFixedThreadPool(imagesToDownload.size(), getThreadFactory(sourceUrl)); pool.invokeAll(imagesToDownload); pool.shutdown(); } catch (InterruptedException e) { LOG.error("InterruptedException while downloading images", e); } } // select biggest image Image biggestImage = null; try { biggestImage = Collections.max(imagesToExplore.values(), new Comparator<Image>() { @Override public int compare(Image o1, Image o2) { return 
getSquarePixels(o1) - getSquarePixels(o2); } }); } catch (NoSuchElementException e) { return null; } // if image is too small, discard return (biggestImage.getWidth() < requirements.getMinImageSize() || biggestImage.getHeight() < requirements.getMinImageSize()) ? null : biggestImage; }
From source file:sachin.spider.Page.java
/** * This function is called to get the list of the source of the images on * the page/* w ww . j a va 2 s .c o m*/ * * @return List of all images */ public List<String> getImageLinks() { List<String> imageLinks = new ArrayList<String>(); Elements slinks = document.select("img[src]"); for (Element slink : slinks) { String link = slink.absUrl("src"); if (!link.isEmpty() && !imageLinks.contains(link)) { imageLinks.add(link); } } return imageLinks; }
From source file:sachin.spider.Page.java
/**
 * Returns the targets of all anchor tags on the page.
 *
 * <p>Every {@code a} element carrying an {@code href} attribute contributes its absolute
 * URL; empty URLs, the literal {@code "#"} and URLs already collected are left out.
 *
 * @return List of all anchor tags, without duplicates, in document order
 */
public List<String> getHyperLinks() {
    List<String> targets = new ArrayList<String>();
    for (Element anchor : document.select("a[href]")) {
        String absolute = anchor.absUrl("href");
        boolean unusable = absolute.isEmpty() || absolute.equals("#");
        if (unusable || targets.contains(absolute)) {
            continue;
        }
        targets.add(absolute);
    }
    return targets;
}
From source file:org.apache.marmotta.ldclient.provider.phpbb.mapping.PHPBBForumHrefMapper.java
/**
 * Maps a selected link element to the URI of the forum it points to.
 *
 * <p>The result is built from {@code resourceUri} up to its last {@code '/'}, followed by
 * {@code /viewforum.php?f=} and the value of the {@code f} query parameter of the element's
 * absolute {@code href}. When {@code f} occurs several times, the last occurrence is used.
 *
 * @param resourceUri   URI of the resource being processed; supplies the site prefix
 * @param selectedValue the element whose {@code href} carries the {@code f} parameter
 * @param factory       used to create the resulting URI value
 * @return a single-element list holding the forum URI
 * @throws RuntimeException if the element's absolute {@code href} is not a valid URI
 */
@Override
public List<Value> map(String resourceUri, Element selectedValue, ValueFactory factory) {
    int lastSlash = resourceUri.lastIndexOf('/');
    String forumUriPrefix = resourceUri.substring(0, lastSlash) + "/viewforum.php?";

    try {
        URI link = new URI(selectedValue.absUrl("href"));

        // Only "f" is needed; later occurrences overwrite earlier ones.
        String forumId = null;
        for (NameValuePair pair : URLEncodedUtils.parse(link, "UTF-8")) {
            if ("f".equals(pair.getName())) {
                forumId = pair.getValue();
            }
        }

        Value forum = (Value) factory.createURI(forumUriPrefix + "f=" + forumId);
        return Collections.singletonList(forum);
    } catch (URISyntaxException ex) {
        throw new RuntimeException("invalid syntax for URI", ex);
    }
}
From source file:sachin.spider.Page.java
/**
 * Returns the outgoing links of the page and stores them in {@code outgoingLinks}.
 *
 * <p>The list is rebuilt on every call: first the absolute URLs of all elements with an
 * {@code href} attribute, then those of all elements with a {@code src} attribute. Empty
 * URLs and URLs already collected are left out.
 *
 * @return List of all outgoing links, without duplicates
 */
public List<String> getOutgoingLinks() {
    outgoingLinks = new ArrayList<String>();

    // Both lookups happen up front; href holders are processed before src holders.
    String[] attributes = {"href", "src"};
    Elements[] holders = {
        document.getElementsByAttribute("href"),
        document.getElementsByAttribute("src")
    };

    for (int i = 0; i < attributes.length; i++) {
        for (Element holder : holders[i]) {
            String absolute = holder.absUrl(attributes[i]);
            if (absolute.isEmpty() || outgoingLinks.contains(absolute)) {
                continue;
            }
            outgoingLinks.add(absolute);
        }
    }

    return outgoingLinks;
}