List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:it.polito.tellmefirst.apimanager.ImageManager.java
public int[] scrapeImageSizeFromPage(String pageURL) { LOG.debug("[scrapeImageSizeFromPage] - BEGIN"); int[] result = { 0, 0 }; try {// w ww .j a v a 2 s . co m Document doc = Jsoup.connect(pageURL).timeout(10 * 1000).get(); Element image = doc.select("img").first(); result[0] = Integer.valueOf(image.attr("width")); result[1] = Integer.valueOf(image.attr("height")); } catch (Exception e) { LOG.error("[scrapeImageSizeFromPage] - EXCEPTION: ", e); } LOG.debug("[scrapeImageSizeFromPage] - END"); return result; }
From source file:org.brunocvcunha.taskerbox.impl.custom.slickdeals.SlickDealsEmailAction.java
@Override public void spreadAction(final String url, String postTitle) { EmailAction email = getEmailAction(); EmailValueVO emailVO = new EmailValueVO(); StringBuffer sb = new StringBuffer(); sb.append(url);//from ww w . j a va 2s. c o m emailVO.setTitle("SlickDeals - " + postTitle); try { Document doc = TaskerboxHttpBox.getInstance().getDocumentForURL(url); for (Element post : doc.select(".post_message")) { sb.append("<br>"); sb.append(post.html()); } } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IllegalStateException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (URISyntaxException e) { e.printStackTrace(); } emailVO.setBody(sb.toString()); email.action(emailVO); }
From source file:com.webcrawler.MailCrawlerService.java
/** * Gets the link elements./* ww w.java 2 s . c o m*/ * * @param doc the doc * @param tagSelector the tag selector * @return the link elements */ private Elements getLinkElements(Document doc, String tagSelector) { return doc.select(tagSelector); }
From source file:hu.petabyte.redflags.engine.gear.parser.DocFamilyFetcher.java
@Override protected Notice processImpl(Notice notice) throws Exception { TedResponse r = ted.get().requestNoticeTabQuietly(notice.getId(), lang, Tab.DATA); if (null != r) { Document dataTab = r.getParsedDocument(); if (!dataTab.select("a[href~=tabId=4").isEmpty()) { TedResponse r2 = ted.get().requestNoticeTabQuietly(notice.getId(), lang, Tab.DOCUMENT_FAMILY); if (null != r2) { Document docFamilyTab = r2.getParsedDocument(); notice = parseDocFamilyTab(notice, docFamilyTab); }//from w ww . j av a2 s . c o m } } determineDocFamilyId(notice); return notice; }
From source file:com.dsh105.nexus.command.module.information.TimeCommand.java
@Override public boolean onCommand(CommandPerformEvent event) { if (event.getArgs().length > 0) { String args = StringUtil.combineSplit(0, event.getArgs(), " "); try {//from www . j a v a 2 s . c o m HttpResponse<JsonNode> jsonResponse = Unirest.get(GOOGLE_COORDS_URL + args) .header("accept", "application/json").asJson(); JSONArray response = jsonResponse.getBody().getObject().getJSONArray("results"); if (!jsonResponse.getBody().getObject().getString("status").equalsIgnoreCase("OK")) { event.errorWithPing("Invalid request"); } else { double lat = response.getJSONObject(0).getJSONObject("geometry").getJSONObject("location") .getDouble("lat"); double lng = response.getJSONObject(0).getJSONObject("geometry").getJSONObject("location") .getDouble("lng"); String loc = response.getJSONObject(0).getString("formatted_address"); Document doc = Jsoup.connect(TIME_URL + lat + "/" + lng).get(); Element timeEl = doc.select("localtime").first(); String time = timeEl.text(); event.respond("Time in " + Colors.BOLD + loc + ": " + time); return true; } } catch (Exception e) { throw new TimeDataLookupException("An error occurred in the lookup process", e); } } else { return false; } return true; }
From source file:com.johan.vertretungsplan.parser.UntisMonitorParser.java
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl) throws IOException { String html = httpGet(url, encoding).replace(" ", ""); Document doc = Jsoup.parse(html); docs.add(doc);//from www . j a va2s . c om if (following && doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!redirectUrl.equals(startUrl)) loadUrl(redirectUrl, encoding, true, docs, startUrl); } }
From source file:ru.xxlabaza.popa.pack.PackingService.java
private void processJavaScript(Document document) { document.select("script[src$=.js]:not([src^=http])").forEach(script -> { Path path = build.resolve(createPath(script.attr("src"))); log.info("Processing script '{}'", path); // String content = commentRemoveService.removeComments(path); String content = FileSystemUtils.getContent(path); if (!path.getFileName().toString().endsWith(".min.js")) { content = compressService.compress(content, JAVASCRIPT); }/* w w w.ja va2 s.com*/ script.removeAttr("src"); script.html(content); }); }
From source file:org.sonatype.nexus.testsuite.misc.nxcm4389.NXCM4389FavIconIT.java
private void assertFavIcons(final String text) throws IOException { // assert that shortcut icon mentioned in the HTML is actually available Document doc = Jsoup.parse(text); // favicon is used with absolute url here assertExists(doc.select("link[rel=icon]").attr("href")); doc = extractIELink(doc);/* ww w. j a v a 2 s. c om*/ // favicon is used with absolute url here assertExists(doc.select("link[rel=shortcut icon]").attr("href")); }
From source file:hello.Scraper.java
@Splitter(inputChannel = "channel1", outputChannel = "channel2") public List<Element> scrape(ResponseEntity<String> payload) { String html = payload.getBody(); final Document htmlDoc = Jsoup.parse(html); final Elements anchorNodes = htmlDoc.select("body").select("ul").select("li"); final List<Element> anchorList = new ArrayList<Element>(); anchorNodes.traverse(new NodeVisitor() { @Override//from w w w. j ava2 s.c o m public void head(org.jsoup.nodes.Node node, int depth) { if (node instanceof org.jsoup.nodes.Element) { Element e = (Element) node; anchorList.add(e); } } @Override public void tail(Node node, int depth) { } }); return anchorList; }
From source file:it.polito.tellmefirst.apimanager.ImageManager.java
public String scrapeDBpediaImageFromPage(String pageURL) { LOG.debug("[scrapeDBpediaImageFromPage] - BEGIN url=" + pageURL); long startTime = System.currentTimeMillis(); String result = ""; try {//from w w w. j a va 2 s .com Document doc = Jsoup.connect(pageURL).timeout(10 * 1000).get(); Element image = doc.select("img").first(); result = "http:" + image.attr("src"); } catch (Exception e) { LOG.error("[scrapeDBpediaImageFromPage] - EXCEPTION: ", e); } long endTime = System.currentTimeMillis(); long duration = (endTime - startTime) / 1000; //no prod LOG.debug("########### [scrapeDBpediaImageFromPage] took " + duration + " seconds. ###########"); LOG.debug("[scrapeDBpediaImageFromPage] - END"); return result; }