List of usage examples for org.jsoup.select.Elements.first()
public Element first()
From source file:org.sbs.goodcrawler.extractor.selector.UrlElementCssSelector.java
@SuppressWarnings("unchecked") @Override// w ww.j a v a2s . c o m public HashMap<String, Object> getContent() throws ExtractException { if (null != content && !newDoc) { return content; } // ??document if (StringUtils.isNotBlank(this.url) && !newDoc) { return content; } // ?documentSelector if (super.document != null) { Elements elements = super.document.select(value); if (elements.isEmpty()) return null; switch ($Attr) { case text: this.url = elements.first().text(); break; default: this.url = elements.first().attr(attr); break; } } if (StringUtils.isNotBlank(this.url)) { Document doc = null; PageFetchResult result = null; try { WebURL webUrl = new WebURL(); webUrl.setURL(this.url); result = FetchForeman.fetcher.fetchHeader(webUrl); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { return null; } if (statusCode != HttpStatus.SC_OK) { return null; } else { Page page = new Page(webUrl); if (!result.fetchContent(page)) { return null; } if (!parser.parse(page, webUrl.getURL())) { return null; } doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); } } catch (IOException e) { e.printStackTrace(); throw new ExtractException(e.getMessage()); } finally { if (result != null) result.discardContentIfNotConsumed(); } content = Maps.newHashMap(); if (selectors != null) for (ElementCssSelector<?> selector : selectors) { if (selector instanceof FileElementCssSelector) { Map<String, Object> m = ((FileElementCssSelector) selector).setResult(content) .setDocument(doc).getContentMap(); if ((null == m || m.size() == 0) && selector.isRequired()) { return null; } else { if (null != m && m.size() > 0) content.putAll(m); } } else { Map<String, Object> m = selector.setDocument(doc).getContentMap(); if ((null == m || m.size() == 0) && selector.isRequired()) { return null; } else { if (null != m && m.size() > 0) content.putAll(m); } } } return content; } newDoc = false; 
return null; }
From source file:eu.masconsult.bgbanking.banks.dskbank.DskClient.java
@Override public String authenticate(String username, String password) throws IOException, ParseException, CaptchaException { final HttpResponse resp; final ArrayList<NameValuePair> params = new ArrayList<NameValuePair>(); params.add(new BasicNameValuePair(PARAM_USERNAME, username)); params.add(new BasicNameValuePair(PARAM_PASSWORD, password)); final HttpEntity entity; try {// w w w .j a va 2s.co m entity = new UrlEncodedFormEntity(params); } catch (final UnsupportedEncodingException e) { // this should never happen. throw new IllegalStateException(e); } String uri = BASE_URL + "?" + URLEncodedUtils.format(Arrays.asList(new BasicNameValuePair(XML_ID, AUTH_XML_ID)), ENCODING); Log.i(TAG, "Authenticating to: " + uri); final HttpPost post = new HttpPost(uri); post.addHeader(entity.getContentType()); post.setHeader("Accept", "*/*"); post.setEntity(entity); try { resp = getHttpClient().execute(post); if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { throw new ParseException("login: unhandled http status " + resp.getStatusLine().getStatusCode() + " " + resp.getStatusLine().getReasonPhrase()); } String response = EntityUtils.toString(resp.getEntity()); Log.v(TAG, "response = " + response); Document doc = Jsoup.parse(response, BASE_URL); Element mainForm = doc.getElementById("mainForm"); if (mainForm == null) { throw new ParseException("login: missing mainForm"); } String action = BASE_URL + mainForm.attr("action"); Log.v(TAG, "action=" + action); UrlQuerySanitizer sanitizer = new UrlQuerySanitizer(action); String user_id = sanitizer.getValue(PARAM_USER_ID); String session_id = sanitizer.getValue(PARAM_SESSION_ID); if (user_id == null || "".equals(user_id) || session_id == null || "".equals(session_id)) { if (doc.getElementsByClass("redtext").size() > 0) { // bad authentication return null; } else { // TODO handle captcha Elements captcha = doc.select("input[name=captcha_hkey]"); if (captcha != null && captcha.size() == 1) { String captchaHash = 
captcha.first().attr("value"); String captchaUri = BASE_URL + "?" + URLEncodedUtils .format(Arrays.asList(new BasicNameValuePair(XML_ID, CAPTCHA_XML_ID), new BasicNameValuePair("captcha_key", captchaHash)), ENCODING); throw new CaptchaException(captchaUri); } throw new ParseException("no user_id or session_id: " + action); } } return URLEncodedUtils.format(Arrays.asList(new BasicNameValuePair(PARAM_USER_ID, user_id), new BasicNameValuePair(PARAM_SESSION_ID, session_id)), ENCODING); } catch (ClientProtocolException e) { throw new IOException(e.getMessage()); } }
From source file:com.sinelead.car.club.NewsFragment.java
public void parseNewsUrl() { HttpCache httpCache = new HttpCache(context); httpCache.httpGet("http://m.xincheping.com/", new HttpCacheListener() { protected void onPreGet() { // do something like show progressBar before httpGet, runs on // the UI thread }// www . ja v a 2 s .c o m protected void onPostGet(HttpResponse httpResponse, boolean isInCache) { // do something like show data after httpGet, runs on the UI // thread if (httpResponse != null) { // get data success String html = httpResponse.getResponseBody(); Document doc = Jsoup.parse(html); Elements uls = doc.select("ul.slides"); // classul bannerList = uls.first().getElementsByTag("a"); if (imagePagerAdapter != null) { imagePagerAdapter.setBannerList(bannerList); imagePagerAdapter.notifyDataSetChanged(); } } else { // get data fail } } }); return; }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Copies the release date shown in {@code div#video_date td.text} onto the video.
 * The text of the first match is parsed as {@code yyyy-MM-dd}; if nothing matches or
 * the text does not parse, the video is left unchanged.
 *
 * @param doc   page to read from
 * @param video target to update
 */
private void setVideoReleaseDate(Document doc, Video video) {
    Elements dateCells = doc.select("div#video_date td.text");
    if (!CollectionUtils.isNotEmpty(dateCells)) {
        return;
    }

    String rawDate = dateCells.first().text();
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    try {
        video.setReleaseDate(dayFormat.parse(rawDate));
    } catch (ParseException ignored) {
        // Unparseable date text: keep the release date unset.
    }
}
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
private void setVideoDuration(Document doc, Video video) { Elements dmElements = doc.select("div#video_length span.text"); if (CollectionUtils.isNotEmpty(dmElements)) { String durationMinutes = dmElements.first().text().toString(); video.setDurationMinutes(Integer.valueOf(durationMinutes)); }//from w ww. ja v a 2 s . c om }
From source file:org.confab.PhpBB3Parser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table/* ww w . j a va 2 s . c o m*/ Elements forum_tables = root.select("ul[class=topiclist forums]"); assert !forum_tables.isEmpty() : root.html(); for (Element forum_table : forum_tables) { Elements els_li = forum_table.select("li.row"); assert !els_li.isEmpty(); for (Element el_li : els_li) { Forum new_forum = new Forum(parent); // Get the forum url Elements els_a = el_li.select("a.forumtitle"); Element el_a = els_a.first(); assert el_a != null; new_forum.url = el_a.attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text new_forum.title = el_a.text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element _el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get the description/message of this topic String el_description = el_a.parent().text(); if (el_description != null) { new_forum.description = el_description; } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } } Utilities.debug("end parseForums"); return ret; }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
private void setVideoPeople(Document doc, Video video) { Elements dElements = doc.select("div#video_director td.text"); if (CollectionUtils.isNotEmpty(dElements)) { String director = dElements.first().text().toString(); video.setDirector(director);// w ww . j av a 2 s .c om } Elements pElements = doc.select("div#video_maker td.text"); if (CollectionUtils.isNotEmpty(pElements)) { String producer = pElements.first().text().toString(); video.setProducer(producer); } Elements disElements = doc.select("div#video_label td.text"); if (CollectionUtils.isNotEmpty(disElements)) { String distributor = disElements.first().text().toString(); video.setDistributor(distributor); } }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Copies the review score shown in {@code div#video_review td.text span.score} onto
 * the video. Parentheses are stripped from the text of the first match before it is
 * parsed as a float; a missing, blank or unparseable score leaves the video unchanged.
 *
 * @param doc   page to read from
 * @param video target to update
 */
private void setVideoScore(Document doc, Video video) {
    Elements scoreSpans = doc.select("div#video_review td.text span.score");
    if (!CollectionUtils.isNotEmpty(scoreSpans)) {
        return;
    }

    String rawScore = scoreSpans.first().text();
    String score = StringUtils.replace(StringUtils.replace(rawScore, "(", ""), ")", "");
    if (!StringUtils.isNotBlank(score)) {
        return;
    }

    try {
        video.setScore(Float.valueOf(score));
    } catch (Exception ignored) {
        // Best effort: a score that cannot be parsed or stored is skipped.
    }
}
From source file:lolthx.autohome.buy.AutohomePriceListFetch.java
/**
 * Parses a price-list page and stores one record per {@code li.price-item} element.
 *
 * <p>For each item the method reads the post time, post id, user link, the labelled
 * rows under {@code div.price-item-bd li} and the seller details, writes them onto an
 * {@code AutohomePriceInfoBean} and calls {@code saveOnNotExist()}. Items whose post
 * time is rejected by {@code isTime} or whose id is blank are skipped. Any exception
 * raised for one item is printed and processing continues with the next item.
 *
 * <p>NOTE(review): the label prefixes tested with {@code startsWith} below are empty
 * strings or {@code "?"} in this copy. They look like non-ASCII labels lost to an
 * encoding problem; restore them from the upstream file before relying on this code.
 *
 * <p>NOTE(review): a single bean instance is created before the loop and reused for
 * every item, so a field that is not set for one item keeps the previous item's value.
 * Confirm that this is intended.
 *
 * @param result page HTML; a blank value returns immediately
 * @param task   supplies the date window, url, extra ("forumId:keyword") and project name
 * @throws Exception declared by the overridden method
 */
@Override
public void parse(String result, Task task) throws Exception {
    if (StringUtils.isBlank(result)) {
        return;
    }
    Date start = task.getStartDate();
    Date end = task.getEndDate();
    Document doc = Jsoup.parse(result);
    Elements lis = doc.select("li.price-item");
    AutohomePriceInfoBean bean = new AutohomePriceInfoBean();
    for (Element li : lis) {
        try {
            Elements postTimeEl = li.select("div.user-name span");
            String postTime = "";
            if (!postTimeEl.isEmpty()) {
                postTime = StringUtils.trim(
                        StringUtils.substringBefore(postTimeEl.first().text(), "?").replaceAll("", ""));
                // Skip items outside the task's date window.
                if (!isTime(postTime, start, end)) {
                    continue;
                }
            }
            bean.setPostTime(postTime);
            bean.setUrl(task.getUrl());
            // task.getExtra() is split at the first ':' into forum id and keyword.
            bean.setForumId(StringUtils.substringBefore(task.getExtra(), ":"));
            bean.setProjectName(task.getProjectName());
            bean.setKeyword(StringUtils.substringAfter(task.getExtra(), ":"));
            // post id: the part of data-target after the last '_'
            Elements id = li.select("div.price-share a.share");
            if (!id.isEmpty()) {
                String idStr = id.first().attr("data-target");
                idStr = StringUtils.substringAfterLast(idStr, "_");
                if (StringUtils.isBlank(idStr)) {
                    continue;
                }
                bean.setId(idStr);
            }
            // user: id is the last path segment of the profile link
            Elements user = li.select("div.user-name a");
            if (!user.isEmpty()) {
                String userUrl = user.first().absUrl("href");
                String userId = StringUtils.substringAfterLast(userUrl, "/");
                String userName = user.first().text();
                bean.setUserId(userId);
                bean.setUserUrl(userUrl);
                bean.setUserName(userName);
            }
            Elements dataLis = li.select("div.price-item-bd li");
            for (Element dataLi : dataLis) {
                String data = dataLi.text();
                // Each row is matched by its label prefix and the remainder is stored.
                if (StringUtils.startsWith(data, "")) {
                    bean.setCar(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setGuidePrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "?")) {
                    bean.setTotalPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setPurchaseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "?")) {
                    bean.setCommercialInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setVehicleUseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setCompulsoryInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setLicenseFee(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "?")) {
                    bean.setPromotion(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    bean.setBuyTime(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                if (StringUtils.startsWith(data, "")) {
                    // Purchase area "province,city"; a single value is used for both.
                    String area = StringUtils.trim(StringUtils.substringAfter(data, ""));
                    String[] pAndC = StringUtils.splitByWholeSeparator(area, ",", 2);
                    if (pAndC.length == 1) {
                        bean.setBuyProvince(pAndC[0]);
                        bean.setBuyCity(pAndC[0]);
                    }
                    if (pAndC.length == 2) {
                        bean.setBuyProvince(pAndC[0]);
                        bean.setBuyCity(pAndC[1]);
                    }
                }
                if (StringUtils.startsWith(data, "")) {
                    Elements level = dataLi.select("span.level");
                    // seller rating
                    if (!level.isEmpty()) {
                        bean.setSellerComment(level.first().text());
                    }
                    // seller link: id is the last path segment of the link
                    Elements seller = dataLi.select("a.title");
                    if (!seller.isEmpty()) {
                        String sellerUrl = seller.first().absUrl("href");
                        String sellerName = seller.first().text();
                        String sellerId = StringUtils.substringAfterLast(sellerUrl, "/");
                        bean.setSellerId(sellerId);
                        bean.setSellerName(sellerName);
                        bean.setSellerUrl(sellerUrl);
                    }
                    // seller phone
                    Elements sellerPhone = dataLi.select("em.phone-num");
                    if (!sellerPhone.isEmpty()) {
                        bean.setSellerPhone(sellerPhone.first().text());
                    }
                    // seller address (not extracted)
                    // Elements sellerAddress =
                    // dataLi.select("em.phone-num");
                }
                if (StringUtils.startsWith(data, "?")) {
                    bean.setBuyFeeling(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
            }
            bean.saveOnNotExist();
        } catch (Exception e) {
            // Best effort per item: report the failure and move on to the next item.
            e.printStackTrace();
            continue;
        }
    }
}
From source file:eu.masconsult.bgbanking.banks.dskbank.DskClient.java
@Override public List<RawBankAccount> getBankAccounts(String authToken) throws IOException, ParseException, AuthenticationException { String uri = BASE_URL + "?" + URLEncodedUtils.format( Arrays.asList(new BasicNameValuePair(XML_ID, LIST_ACCOUNTS_XML_ID)), ENCODING) + "&" + authToken; // Get the accounts list Log.i(TAG, "Getting from: " + uri); final HttpGet get = new HttpGet(uri); get.setHeader("Accept", "*/*"); DefaultHttpClient httpClient = getHttpClient(); Log.v(TAG, "sending " + get.toString()); final HttpResponse resp = httpClient.execute(get); if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { throw new ParseException("getBankAccounts: unhandled http status " + resp.getStatusLine().getStatusCode() + " " + resp.getStatusLine().getReasonPhrase()); }// w ww . j ava2 s .c o m HttpEntity entity = resp.getEntity(); Document doc = Jsoup.parse(entity.getContent(), "utf-8", BASE_URL); if (!checkLoggedIn(doc)) { throw new AuthenticationException("session expired!"); } Element content = doc.getElementById("PageContent"); if (content == null) { throw new ParseException("getBankAccounts: can't find PageContent"); } Elements tables = content.getElementsByTag("table"); if (tables == null || tables.size() == 0) { throw new ParseException("getBankAccounts: can't find table in PageContent"); } Elements rows = tables.first().getElementsByTag("tr"); if (rows == null || rows.size() == 0) { throw new ParseException("getBankAccounts: first table is empty in PageContent"); } ArrayList<RawBankAccount> bankAccounts = new ArrayList<RawBankAccount>(rows.size()); String lastCurrency = null; for (Element row : rows) { RawBankAccount bankAccount = obtainBankAccountFromHtmlTableRow(row); if (bankAccount != null) { if (bankAccount.getCurrency() == null) { bankAccount.setCurrency(lastCurrency); } else { lastCurrency = bankAccount.getCurrency(); } bankAccounts.add(bankAccount); } } return bankAccounts; }