Example usage for org.jsoup.nodes Element absUrl

List of usage examples for org.jsoup.nodes Element absUrl

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.absUrl.

Prototype

public String absUrl(String attributeKey) 

Source Link

Document

Get an absolute URL from a URL attribute that may be relative (i.e. an <a href> or <img src>).

Usage

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Looks up the pager link whose {@code title} contains "nchsten" and returns its target.
 *
 * @param doc parsed page to search
 * @return the absolute {@code href} of the first element matching
 *         {@code .pageNavLink[title*=nchsten]}, or {@code null} when nothing matches
 */
static String findNextPageUrl(Document doc) {
    // Evaluate the selector once; first() yields null when there is no match.
    Element link = doc.select(".pageNavLink[title*=nchsten]").first();
    return link == null ? null : link.absUrl("href");
}

From source file:googleranking.processing.GoogleData.java

/**
 * Extracts the target domains of the links matched by {@code .g>.r>a} in the document
 * returned by {@code getGoogleHtml()}.
 * <p>
 * For each link, the text between the first {@code '='} and the following {@code '&'} (or the
 * end of the URL when there is no {@code '&'}) is URL-decoded; when the decoded value starts
 * with {@code "http"} its domain, as computed by {@code getDomain}, is added to the result.
 *
 * @return the collected domains in document order (duplicates are kept); any exception is
 *         logged at SEVERE level and the entries gathered so far are returned
 */
public List<String> getLinksInPage() {
    Document doc = getGoogleHtml();
    List<String> ret = new ArrayList<String>();
    try {
        Elements links = doc.select(".g>.r>a");
        for (Element link : links) {
            String url = link.absUrl("href");
            int start = url.indexOf('=') + 1;
            // Search for '&' only after the '='; a missing '&' means the value runs to the end.
            // Previously a missing '&' made substring throw and aborted all remaining links.
            int end = url.indexOf('&', start);
            if (end < 0) {
                end = url.length();
            }
            url = URLDecoder.decode(url.substring(start, end), "UTF-8");
            // "https" also starts with "http", so a single prefix test covers both schemes.
            if (url.startsWith("http")) {
                ret.add(getDomain(url)); // Ads/news/etc
            }
        }
    } catch (Exception e) {
        Logger.getLogger(GoogleData.class.getName()).log(Level.SEVERE, null, e);
    }
    return ret;
}

From source file:github.srlee309.lessWrongBookCreator.scraper.PostSectionExtractor.java

/**
 * Rewrites the {@code src} of every {@code img} tag in the given element to the bare file
 * name (the part of the absolute URL after its last {@code '/'}) and hands the absolute URL,
 * the folder {@code "htmlOutput"} and that file name to {@code saveImage}.
 * <p>
 * Images whose {@code src} is missing or cannot be resolved to an absolute URL are left
 * untouched.
 *
 * @param postContent element from which to extract the images
 */
protected final void convertImagesToLocal(Element postContent) {
    final String folder = "htmlOutput";
    for (Element img : postContent.getElementsByTag("img")) {
        String src = img.absUrl("src");
        if (src.isEmpty()) {
            // absUrl yields "" when the attribute is absent or unresolvable: nothing to save.
            continue;
        }
        // lastIndexOf gives -1 when there is no '/', so "+ 1" keeps the whole string.
        String name = src.substring(src.lastIndexOf('/') + 1);
        img.attr("src", name);
        saveImage(src, folder, name);
    }
}

From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBForumProvider.java

/**
 * Try to find further URLs in the document that need to be requested to complete the resource
 * data./*ww  w . j  a v a2 s  . co m*/
 * Used e.g. to parse the result of paging in HTML pages. The default implementation returns an
 * empty list.
 * <p/>
 * This implementation tries to locate the paging area of PHPBB and selects the last link of the
 * paging, which will be the "next" page.
 * 
 * @param document
 * @param requestUrl
 * @return
 */
@Override
protected List<String> findAdditionalRequestUrls(String resource, Document document, String requestUrl) {
    List<String> result = new LinkedList<String>();

    // return the next page in the result list
    String foo = "a[href*='start='][href*='viewforum.php']";

    List<Element> values = document.select(foo);
    for (Element e : values) {
        String baseUrl = e.absUrl("href");
        if (baseUrl.length() > 0) {
            result.add(baseUrl);
        }
    }

    return result;

}

From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBTopicProvider.java

/**
 * Try to find further URLs in the document that need to be requested to complete the resource data.
 * Used e.g. to parse the result of paging in HTML pages. The default implementation returns an empty list.
 * <p/>/*from  www  .  ja v a2  s  .c o m*/
 * This implementation tries to locate the paging area of PHPBB and selects the last link of the paging, which will
 * be the "next" page.
 *
 * @param document
 * @param requestUrl
 * @return
 */
@Override
protected List<String> findAdditionalRequestUrls(String resource, Document document, String requestUrl) {
    List<String> result = new LinkedList<String>();

    // return the next page in the result list
    Elements values = document.select("div#pagecontent a[href~=viewtopic\\.php.*start=]");
    for (Element o : values) {
        String baseUrl = o.absUrl("href");
        if (baseUrl.length() > 0) {
            result.add(baseUrl);
        }
    }

    return result;

}

From source file:com.lumata.lib.lupa.extractor.internal.HtmlBiggestImageExtractor.java

/**
 * Picks the largest image, as ranked by {@code getSquarePixels}, among the elements of an
 * HTML section.
 * <p>
 * At most {@code requirements.getMaxImagesToExplore()} distinct images are considered.
 * Elements without a resolvable {@code src}, duplicates, and URLs rejected by
 * {@code isTrackingPixelOrAd} are skipped. Images whose width or height cannot be read from
 * the element's attributes are submitted as {@code ImageDownloadTask}s before ranking
 * (presumably these tasks fill in the missing dimensions — confirm against that class).
 *
 * @param sourceUrl    passed to {@code getThreadFactory} for the download threads
 * @param htmlSection  candidate image elements
 * @param requirements limits on how many images to explore and the minimum accepted size
 * @return the biggest image, or {@code null} when there is no candidate, when its dimensions
 *         are still unknown, or when its width or height is below
 *         {@code requirements.getMinImageSize()}
 */
@Override
public Image extractBestImage(URL sourceUrl, Elements htmlSection, ImageExtractionRequirements requirements) {
    Map<String, Image> imagesToExplore = new HashMap<String, Image>();
    Set<ImageDownloadTask> imagesToDownload = new HashSet<ImageDownloadTask>();
    Iterator<org.jsoup.nodes.Element> it = htmlSection.iterator();

    // collect valid images
    while (it.hasNext() && imagesToExplore.size() < requirements.getMaxImagesToExplore()) {
        Element imageElement = it.next();
        String imageUrl = imageElement.absUrl("src");

        // Do not process empty img tags, duplicated images or tracking pixels and other
        // assorted ads. absUrl reports a missing/unresolvable src as "", not null.
        if (imageUrl == null || imageUrl.isEmpty() || imagesToExplore.containsKey(imageUrl)
                || isTrackingPixelOrAd(imageUrl)) {
            continue;
        }

        // remember this image
        Image imageContent = new Image(imageUrl);
        if (imageElement.hasAttr(WIDTH_ATTRIBUTE)) {
            // TODO: We need to convert other picture size units supported by html (there must be a lib for this)
            Integer width = parsePixelSize(imageElement.attr(WIDTH_ATTRIBUTE));
            if (width != null) {
                imageContent.setWidth(width.intValue());
            }
        }
        if (imageElement.hasAttr(HEIGHT_ATTRIBUTE)) {
            Integer height = parsePixelSize(imageElement.attr(HEIGHT_ATTRIBUTE));
            if (height != null) {
                imageContent.setHeight(height.intValue());
            }
        }
        if (imageContent.getWidth() == null || imageContent.getHeight() == null) {
            // mark image to download
            imagesToDownload.add(new ImageDownloadTask(imageContent));
        }
        imagesToExplore.put(imageUrl, imageContent);
    }

    // if dimensions are empty -> download image
    if (CollectionUtils.isNotEmpty(imagesToDownload)) {
        ExecutorService pool = Executors.newFixedThreadPool(imagesToDownload.size(),
                getThreadFactory(sourceUrl));
        try {
            pool.invokeAll(imagesToDownload);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            LOG.error("InterruptedException while downloading images", e);
        } finally {
            // Always release the worker threads, including on interruption.
            pool.shutdown();
        }
    }

    if (imagesToExplore.isEmpty()) {
        return null;
    }

    // select biggest image
    Image biggestImage = Collections.max(imagesToExplore.values(), new Comparator<Image>() {
        @Override
        public int compare(Image o1, Image o2) {
            // Explicit comparison instead of subtraction, which can overflow.
            int first = getSquarePixels(o1);
            int second = getSquarePixels(o2);
            return first < second ? -1 : (first == second ? 0 : 1);
        }
    });

    // Dimensions can still be unknown here; unboxing null below would throw.
    if (biggestImage.getWidth() == null || biggestImage.getHeight() == null) {
        return null;
    }

    // if image is too small, discard
    return (biggestImage.getWidth() < requirements.getMinImageSize()
            || biggestImage.getHeight() < requirements.getMinImageSize()) ? null : biggestImage;
}

/**
 * Parses an HTML size attribute given in pixels, with or without a "px" suffix.
 *
 * @param rawValue attribute value as found in the markup
 * @return the pixel count, or {@code null} when the value is not a plain integer
 */
private static Integer parsePixelSize(String rawValue) {
    try {
        return Integer.valueOf(Integer.parseInt(rawValue.replace("px", "")));
    } catch (NumberFormatException e) {
        // e.g. "100%" or "auto": treat as unknown so the image gets downloaded instead
        return null;
    }
}

From source file:sachin.spider.Page.java

/**
 * This function is called to get the list of the source of the images on
 * the page/*  w ww  . j a  va  2  s  .c  o  m*/
 *
 * @return List of all images
 */
public List<String> getImageLinks() {
    List<String> imageLinks = new ArrayList<String>();
    Elements slinks = document.select("img[src]");
    for (Element slink : slinks) {
        String link = slink.absUrl("src");
        if (!link.isEmpty() && !imageLinks.contains(link)) {
            imageLinks.add(link);
        }
    }
    return imageLinks;
}

From source file:sachin.spider.Page.java

/**
 * Returns the targets of all anchor tags on the page.
 * <p>
 * Every {@code a} element with an {@code href} attribute is inspected; its target is
 * resolved to an absolute URL. Placeholder anchors whose raw {@code href} is just
 * {@code "#"}, unresolvable targets and duplicates are dropped.
 *
 * @return distinct absolute link URLs, in document order
 */
public List<String> getHyperLinks() {
    List<String> aLinks = new ArrayList<String>();
    for (Element anchor : document.select("a[href]")) {
        // The "#" test must look at the raw attribute: once resolved by absUrl the value is
        // either absolute or empty, so comparing the resolved URL to "#" never matches.
        if ("#".equals(anchor.attr("href").trim())) {
            continue;
        }
        String link = anchor.absUrl("href");
        if (!link.isEmpty() && !aLinks.contains(link)) {
            aLinks.add(link);
        }
    }
    return aLinks;
}

From source file:org.apache.marmotta.ldclient.provider.phpbb.mapping.PHPBBForumHrefMapper.java

/**
 * Maps a selected link element to a single forum URI.
 * <p>
 * The result is built from {@code resourceUri} truncated at its last {@code '/'}, followed by
 * {@code "/viewforum.php?f="} and the value of the {@code f} query parameter of the element's
 * absolute {@code href} (the last occurrence wins; the text "null" is appended when the
 * parameter is absent).
 *
 * @param resourceUri   URI of the resource being processed; everything up to its last
 *                      {@code '/'} is used as the site base
 * @param selectedValue element whose {@code href} carries the {@code f} parameter
 * @param factory       factory used to create the resulting URI value
 * @return a singleton list holding the created URI
 * @throws RuntimeException when the element's absolute {@code href} is not a valid URI
 */
@Override
public List<Value> map(String resourceUri, Element selectedValue, ValueFactory factory) {
    int lastSlash = resourceUri.lastIndexOf('/');
    String siteBase = resourceUri.substring(0, lastSlash);
    String forumBase = siteBase + "/viewforum.php?";

    final URI href;
    try {
        href = new URI(selectedValue.absUrl("href"));
    } catch (URISyntaxException ex) {
        throw new RuntimeException("invalid syntax for URI", ex);
    }

    // Keep the value of the last "f" parameter, matching map-style overwrite semantics.
    String forumId = null;
    for (NameValuePair pair : URLEncodedUtils.parse(href, "UTF-8")) {
        if ("f".equals(pair.getName())) {
            forumId = pair.getValue();
        }
    }

    Value forum = factory.createURI(forumBase + "f=" + forumId);
    return Collections.singletonList(forum);
}

From source file:sachin.spider.Page.java

/**
 * Returns the outgoing links of the page.
 * <p>
 * Replaces the {@code outgoingLinks} field with a fresh list containing the absolute URLs of
 * all elements carrying an {@code href} attribute, followed by those carrying a {@code src}
 * attribute. Empty results and duplicates are dropped.
 *
 * @return the newly populated {@code outgoingLinks} list
 */
public List<String> getOutgoingLinks() {
    outgoingLinks = new ArrayList<String>();
    // href targets first, then src targets — same order as two separate passes
    for (String attribute : new String[] { "href", "src" }) {
        for (Element element : document.getElementsByAttribute(attribute)) {
            String target = element.absUrl(attribute);
            if (target.isEmpty() || outgoingLinks.contains(target)) {
                continue;
            }
            outgoingLinks.add(target);
        }
    }
    return outgoingLinks;
}