List of usage examples for org.jsoup.nodes.Document#select
public Elements select(String cssQuery)
From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java
/**
 * Downloads {@code pageURL} and reads the {@code width} and {@code height} attributes
 * of the first {@code img} found inside a {@code div.fullImageLink} element.
 *
 * @param pageURL address of the page to download and inspect
 * @return a two-element array {@code {width, height}}; it stays {@code {0, 0}} when the
 *         page cannot be fetched, no matching image exists, or either attribute is not
 *         an integer
 */
public int[] scrapeImageSizeFromPage(String pageURL) {
    LOG.debug("[scrapeImageSizeFromPage] - BEGIN");
    int[] result = { 0, 0 };
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        if (image == null) {
            // first() yields null on an empty selection; report it explicitly instead of
            // relying on a NullPointerException being caught below.
            LOG.error("[scrapeImageSizeFromPage] - no img under div.fullImageLink for page: " + pageURL);
        } else {
            // Parse both values before storing either, so a malformed height cannot
            // leave a half-filled result behind.
            int width = Integer.parseInt(image.attr("width"));
            int height = Integer.parseInt(image.attr("height"));
            result[0] = width;
            result[1] = height;
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageSizeFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeImageSizeFromPage] - END");
    return result;
}
From source file:com.isoftstone.proxy.api.sdk.KuaidailiProxySDK.java
/**
 * Builds one {@link ProxyVo} per table row matched by {@code #list table tr}, skipping
 * the first matched row.
 *
 * @param doc parsed page containing the proxy table
 * @return proxies in row order; empty when at most one row matches
 */
private List<ProxyVo> parseHtml(Document doc) {
    List<ProxyVo> proxies = new ArrayList<ProxyVo>();
    boolean firstRowSeen = false;
    for (Element row : doc.select("#list table tr")) {
        if (!firstRowSeen) {
            // The original loop starts at index 1, so the first row is never parsed.
            firstRowSeen = true;
            continue;
        }
        Element ipCell = row.select("td:eq(0)").first();
        Element portCell = row.select("td:eq(1)").first();

        ProxyVo proxy = new ProxyVo();
        proxy.setProxyIp(ipCell.text());
        proxy.setProxyPort(Integer.parseInt(portCell.text()));
        proxies.add(proxy);
    }
    return proxies;
}
From source file:org.manalith.ircbot.plugin.linuxpkgfinder.PhPortageProvider.java
/**
 * Queries the phportage XML service for {@code arg} and formats the first package hit.
 *
 * @param arg search keyword, appended to the query string as-is
 * @return {@code "[Gentoo] "} when the service reports zero results; a line with the
 *         package's category/name, description and version when it reports at least
 *         one; {@code ": "} followed by the exception message when anything fails;
 *         otherwise an empty string (result code other than 0)
 */
@Override
public String find(String arg) {
    String result = "";
    // NOTE(review): arg is concatenated without URL-encoding -- confirm callers pass a
    // value that is already safe for a query string.
    String url = "http://darkcircle.kr/phportage/phportage.xml?k=" + arg + "&limit=1&similarity=exact"
            + "&showmasked=true&livebuild=false";
    try {
        Document d = Jsoup.connect(url).get();
        // Read the result code once; the previous version queried it twice and echoed
        // it to stdout as a debugging leftover.
        int code = NumberUtils.toInt(d.select("result>code").get(0).text());
        if (code == 0) {
            int hits = NumberUtils.toInt(d.select("result>actualnumofres").get(0).text());
            if (hits == 0) {
                result = "[Gentoo] ";
            } else {
                Element e = d.select("result>packages>pkg").get(0);
                String pkgname = e.select("category").get(0).text() + "/" + e.select("name").get(0).text();
                String ver = e.select("version").get(0).text();
                String description = e.select("description").get(0).text();
                result = "[Gentoo] \u0002" + pkgname + "\u0002 - " + description + ", " + ver;
            }
        }
    } catch (Exception e) {
        // Covers network failures as well as missing elements (get(0) on an empty match).
        logger.error(e.getMessage(), e);
        result = ": " + e.getMessage();
    }
    return result;
}
From source file:it.polito.tellmefirst.apimanager.VideoManager.java
public String extractVideoIdFromResult(String input) { LOG.debug("[extractVideoIdFromResult] - BEGIN"); String result = null;//from w w w . j av a 2 s .c o m Document doc = Jsoup.parse(input); Elements ids = doc.select("id"); if (ids != null && ids.size() > 1) { String idDirty = ids.get(1).text(); System.out.println("ID dirty: " + idDirty); String[] idArray = idDirty.split("video:"); result = idArray[idArray.length - 1]; } else LOG.error("no video id available"); LOG.debug("[extractVideoIdFromResult] - END"); return result; }
From source file:org.commonjava.indy.ftest.core.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; client.content().store(hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(client.content().contentUrl(hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.source-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }/*from ww w .java2 s .co m*/ }
From source file:org.commonjava.indy.ftest.core.urls.StoreOneAndVerifyInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; client.content().store(hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(client.content().contentUrl(hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.item-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE, root, fname); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }//from w w w .ja v a 2 s . com }
From source file:org.mashupmedia.task.MetaTaskScheduler.java
public void getMashupMediaLatestReleaseInformation() { String url = "http://www.mashupmedia.org/latest-release/final"; try {//from www.j av a 2s .c o m ProxyTextFile proxyTextFile = (ProxyTextFile) proxyManager.loadProxyFile(url, ProxyType.TEXT_FILE); if (proxyTextFile == null) { logger.info( "Unable to find latest release from page: http://www.mashupmedia.org/latest-release/final"); return; } Document document = Jsoup.parse(proxyTextFile.getText()); Elements elements = document.select("div.view-latest-final-release div.views-row"); String releaseType = elements.select("div.views-field-field-release-type").text(); String version = elements.select("div.views-field-field-version").text(); logger.info("Found latest release information, type = " + releaseType + ", version = " + version); configurationManager.saveConfiguration(MashUpMediaConstants.LATEST_RELEASE_FINAL_VERSION, version); } catch (IOException e) { logger.error("Unable to get latest version information from www.mashupmedia.org", e); return; } }
From source file:com.clonephpscrapper.crawler.ClonePhpScrapper.java
public void crawledCategories() throws URISyntaxException, IOException, InterruptedException, Exception { String url = "http://clonephp.com/"; // Document doc = Jsoup.parse(fetchPage(new URI(url))); String response = ""; response = new GetRequestHandler().doGetRequest(new URL(url)); Document doc = Jsoup.parse(response); Elements ele = doc.select("table[class=dir] tbody tr td table[class=dir_cat] tbody tr th a");//.first(); for (Element ele1 : ele) { objCategories = new Categories(); String categoryName = ele1.text(); String categoryUrl = "http://clonephp.com/" + ele1.attr("href"); System.out.println("CATEGORY_NAME : " + categoryName); System.out.println("CATEGORY_URL : " + categoryUrl); objCategories.setCategoryName(categoryName); objCategories.setCategoryUrl(categoryUrl); objClonePhpDaoImpl.insertCategoriesData(objCategories); // objCrawlingEachUrlData.crawlingUrlData(categoryUrl); }/* w w w . j a v a 2 s. c o m*/ List<Future<String>> list = new ArrayList<Future<String>>(); ExecutorService executor = Executors.newFixedThreadPool(5); List<Categories> listCatogories = objClonePhpDaoImpl.getCategoriesDataList(); for (Categories listCatogory : listCatogories) { try { Callable worker = new CrawlingEachUrlData(listCatogory, objClonePhpDaoImpl); Future<String> future = executor.submit(worker); list.add(future); } catch (Exception exx) { System.out.println(exx); } } for (Future<String> fut : list) { try { //print the return value of Future, notice the output delay in console // because Future.get() waits for task to get completed System.out.println(new Date() + "::" + fut.get()); } catch (InterruptedException | ExecutionException ep) { ep.printStackTrace(); } } //shut down the executor service now executor.shutdown(); }
From source file:org.commonjava.aprox.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final AproxClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.source-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }//from ww w . j a v a 2 s .c om }
From source file:org.commonjava.aprox.folo.ftest.urls.StoreOneAndVerifyInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final AproxClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.item-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE, root, fname); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }// w w w. j av a 2 s. c om }