List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:strat.mining.multipool.stats.jersey.client.impl.WaffleRestClientImpl.java
/**
 * Downloads the Wafflepool statistics page and scrapes it into a {@link GlobalStats}.
 *
 * <p>The pool note is read on a best-effort basis: any failure while extracting it is
 * logged and the rest of the page is still parsed. The hash rate, miner count, mined
 * coin and the three bitcoin totals are taken from fixed positions in the page text,
 * so a layout change surfaces as an unchecked exception from this method.
 *
 * @return the scraped statistics; {@code null} when an {@link IOException} is raised
 *         before the result object has been created (for example when the download fails)
 */
@Override
public GlobalStats getGlobalStats() {
    GlobalStats stats = null;
    try {
        LOGGER.debug("Start to get the waffle global stats.");

        long fetchStart = System.currentTimeMillis();
        Document page = Jsoup.connect(WAFFLE_POOL_GLOBAL_STATS_URL)
                .userAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)")
                .get();
        PERF_LOGGER.info("Retrieved Wafflepool stats page in {} ms.", System.currentTimeMillis() - fetchStart);

        stats = new GlobalStats();

        // The note is optional: never let a problem here abort the whole scrape.
        try {
            Elements notes = page.select("#note");
            if (notes != null && !notes.isEmpty()) {
                String css = notes.get(0).attr("style");
                boolean hidden = css != null
                        && (css.contains("display:none") || css.contains("visibility:hidden"));
                if (!hidden) {
                    stats.setNote(notes.get(0).html());
                }
            }
        } catch (Exception e) {
            LOGGER.error("Failed to get the last note.", e);
        }

        // Header line: whitespace-separated tokens read by position.
        String[] headerTokens = page.select("#pool_stats").get(0).text().split("\\s");
        stats.setMegaHashesPerSeconds(parsePower(headerTokens[1], headerTokens[2]));
        stats.setNbMiners(Integer.parseInt(headerTokens[6]));
        stats.setMiningCoin(headerTokens[10]);

        // Body text: each amount sits between two consecutive labels.
        String contentText = page.select("#content").get(0).text();
        String[] afterSent = contentText.split("Bitcoins sent to miners:");
        String[] aroundEarned = afterSent[1].split("Bitcoins earned \\(not yet sent\\):");
        String paidoutText = aroundEarned[0];
        String[] aroundUnconverted = aroundEarned[1].split("Bitcoins unconverted \\(approximate\\):");
        String balanceText = aroundUnconverted[0];
        String[] aroundTable = aroundUnconverted[1].split("Date BTC");
        String unexchangedText = aroundTable[0];

        // Thousands separators are stripped before parsing.
        stats.setTotalPaidout(Float.parseFloat(paidoutText.replaceAll(",", "")));
        stats.setTotalBalance(Float.parseFloat(balanceText.replaceAll(",", "")));
        stats.setTotalUnexchanged(Float.parseFloat(unexchangedText.replaceAll(",", "")));

        LOGGER.debug("Global stats from waffle retreived.");
    } catch (IOException e) {
        LOGGER.error("Failed to retrieve the stats page of Wafflepool.", e);
    }
    return stats;
}
From source file:com.liato.bankdroid.banking.banks.coop.Coop.java
@Override public void update() throws BankException, LoginException, BankChoiceException { super.update(); if (username == null || password == null || username.length() == 0 || password.length() == 0) { throw new LoginException(res.getText(R.string.invalid_username_password).toString()); }//from www . j av a2 s .c om login(); try { for (AccountType at : AccountType.values()) { response = urlopen.open(at.getUrl()); Document d = Jsoup.parse(response); Elements historik = d.select("#historik section"); TransactionParams params = new TransactionParams(); mTransactionParams.put(at, params); if (historik != null && !historik.isEmpty()) { String data = historik.first().attr("data-controller"); Matcher m = rePageGuid.matcher(data); if (m.find()) { params.setPageGuid(m.group(1)); } } Element date = d.getElementById("dateFrom"); if (date != null) { params.setMinDate(date.hasAttr("min") ? date.attr("min") : null); params.setMaxDate(date.hasAttr("max") ? date.attr("max") : null); } Elements es = d.select(".List:contains(Saldo)"); if (es != null && !es.isEmpty()) { List<String> names = new ArrayList<String>(); List<String> values = new ArrayList<String>(); for (Element e : es.first().select("dt")) { names.add(e.text().replaceAll(":", "").trim()); } for (Element e : es.first().select("dd")) { values.add(e.text().trim()); } for (int i = 0; i < Math.min(names.size(), values.size()); i++) { Account a = new Account(names.get(i), Helpers.parseBalance(values.get(i)), String.format("%s%d", at.getPrefix(), i)); a.setCurrency(Helpers.parseCurrency(values.get(i), "SEK")); if (a.getName().toLowerCase().contains("disponibelt")) { a.setType(Account.REGULAR); balance = a.getBalance(); setCurrency(a.getCurrency()); } else { a.setType(Account.OTHER); } if (i > 0) { a.setAliasfor(String.format("%s%d", at.getPrefix(), 0)); } accounts.add(a); } } } } catch (ClientProtocolException e) { e.printStackTrace(); throw new BankException(e.getMessage()); } catch (IOException e) { e.printStackTrace(); 
throw new BankException(e.getMessage()); } try { RefundSummaryRequest refsumReq = new RefundSummaryRequest(mUserId, mToken, APPLICATION_ID); HttpEntity e = new StringEntity(getObjectmapper().writeValueAsString(refsumReq)); InputStream is = urlopen .openStream("https://www.coop.se/ExternalServices/RefundService.svc/RefundSummary", e, true); RefundSummaryResponse refsumResp = readJsonValue(is, RefundSummaryResponse.class); if (refsumResp != null && refsumResp.getRefundSummaryResult() != null) { Account a = new Account("terbring p ditt kort", BigDecimal.valueOf(refsumResp.getRefundSummaryResult().getAccountBalance()), "refsummary"); a.setCurrency("SEK"); if (accounts.isEmpty()) { balance = a.getBalance(); setCurrency(a.getCurrency()); } accounts.add(a); a = new Account( String.format("terbring fr %s", refsumResp.getRefundSummaryResult().getMonthName()), BigDecimal.valueOf(refsumResp.getRefundSummaryResult().getTotalRefund()), "refsummary_month"); accounts.add(a); } } catch (JsonParseException e) { e.printStackTrace(); throw new BankException(e.getMessage()); } catch (ClientProtocolException e) { e.printStackTrace(); throw new BankException(e.getMessage()); } catch (IOException e) { e.printStackTrace(); throw new BankException(e.getMessage()); } if (accounts.isEmpty()) { throw new BankException(res.getText(R.string.no_accounts_found).toString()); } super.updateComplete(); }
From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java
/**
 * Reads the hidden form values of a captcha page.
 *
 * <p>The values of the {@code continue} and {@code q} inputs are URL-encoded (UTF-8)
 * and stored in the {@code continueCaptcha} and {@code q} fields.
 *
 * @param doc the parsed captcha page
 * @return the {@code value} of the first input whose value consists only of digits
 * @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}
 */
public String getIDCaptcha(Document doc) throws UnsupportedEncodingException {
    String rawContinue = doc.select("input[name=continue]").first().attr("value");
    continueCaptcha = URLEncoder.encode(rawContinue, "UTF-8");

    String rawQuery = doc.select("input[name=q]").first().attr("value");
    q = URLEncoder.encode(rawQuery, "UTF-8");

    String captchaId = doc.select("input[value~=^\\d+$]").first().attr("value");
    return captchaId;
}
From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java
/**
 * Parse the description out of the meta tag if one exists.
 *
 * @param doc The Document to parse
 * @return The {@code content} attribute of the description meta tag, or an empty
 *         string (never {@code null}) when the document has no such tag.
 */
private String getDescriptionFromDocument(Document doc) {
    Elements metaDescriptionElements = doc.select("meta[name=description]");
    if (metaDescriptionElements.isEmpty()) {
        return "";
    }
    return metaDescriptionElements.attr("content");
}
From source file:gov.medicaid.screening.dao.impl.BBHTLicenseDAOBean.java
/** * Parses the nursing license details page. * * @param page the details page// ww w . ja v a 2 s. c o m * @param licenseNo if user has multiple licenses, this one will be used * @return the parsed license details * @throws ParsingException if the page does not contain the expected elements */ private License parseLicense(Document page, String licenseNo) throws ParsingException { if (!page.select("span#lblFormTitle").text().equals("License Details")) { throw new ParsingException(ErrorCode.MITA50002.getDesc()); } License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); String fullNameWithType = page.select("#_ctl7_lblName").text(); String fullName = fullNameWithType.indexOf(",") != -1 ? fullNameWithType.substring(0, fullNameWithType.indexOf(",")) : fullNameWithType; User user = new User(); profile.setUser(user); String[] nameParts = fullName.split(" "); user.setLastName(nameParts[nameParts.length - 1]); if (nameParts.length > 1) { user.setFirstName(nameParts[0]); } // everything else goes to middle name (per site behavior) if (nameParts.length > 2) { StringBuffer sb = new StringBuffer(); for (int i = 1; i < nameParts.length - 1; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(nameParts[i]); } user.setMiddleName(sb.toString()); } String gender = page.select("#_ctl7_lblGender").text(); if (Util.isNotBlank(gender)) { if ("Female".equals(gender)) { profile.setSex(Sex.FEMALE); } else { profile.setSex(Sex.MALE); } } String city = page.select("#_ctl7_lblPublicCity").text(); if (Util.isNotBlank(city)) { List<Address> addresses = new ArrayList<Address>(); Address address = new Address(); addresses.add(address); address.setCity(city); profile.setAddresses(addresses); } Elements licenses = page.select("#_ctl7_dgLicense tr.Normal"); for (Element row : licenses) { String licenseNumber = row.select("td:eq(1)").text(); if (licenseNo != null && !licenseNumber.startsWith(licenseNo)) { // user has multiple licenses, 
the results will show this user twice (search by name) continue; } license.setLicenseNumber(licenseNumber); LicenseType type = new LicenseType(); type.setName(row.select("td:eq(0)").text()); license.setType(type); LicenseStatus status = new LicenseStatus(); status.setName(row.select("td:eq(2)").text()); license.setStatus(status); String issueDate = row.select("td:eq(3)").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String expirationDate = row.select("td:eq(4)").text(); if (Util.isNotBlank(expirationDate)) { license.setExpireDate(parseDate(expirationDate, DATE_FORMAT)); } } licenses.clear(); return license; }
From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java
/** * ?page??/* w w w . j a v a 2 s.co m*/ */ @Override public void visit(Page page) { try { String url = page.getWebURL().getURL(); page.setContentType("text/html; charset=" + gather.getEncoding()); Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get(); String title = doc.title(); if (gather.getTitleExternal() && gather.getTitleRegex() != null && gather.getTitleRegex().length() > 0) { Elements titleEles = doc.select(gather.getTitleRegex()); if (!titleEles.isEmpty()) { String tempTitle = titleEles.text(); if (tempTitle != null && tempTitle.length() > 0) { title = tempTitle; } } } if (title != null && title.trim().length() > 0) { Elements elements = doc.select(matchRegex); if (filterRegex != null && filterRegex.trim().length() > 0) { elements = elements.not(filterRegex); } if (!elements.isEmpty()) { String subHtml = elements.html(); Document blockDoc = Jsoup.parse(subHtml); String contentText = blockDoc.html(); if (gather.getRemoveHref()) { Document moveDoc = Jsoup.parse(contentText); Elements moveEles = moveDoc.select("*").not("a"); contentText = moveEles.html(); } if (gather.getRemoveHtmlTag()) contentText = doc.text(); if (isLocal) { contentText = doc.text(); Boolean isMatcher = true; for (int i = 0; i < keys.length; i++) { Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find(); if (!result) { isMatcher = false; break; } } if (isMatcher) { Storage storage = new Storage(); storage.setGatherId(gather.getId()); storage.setGatherName(gather.getName()); storage.setTitle(title); storage.setUrl(url); try { gatherService.addStorage(storage); } catch (Exception e) { logger.error("save storage error : {}", e.getLocalizedMessage()); } finally { storage = null; } } } else { Content content = new Content(); content.setDetail(contentText); content.setPage(1); List<Content> contents = new ArrayList<Content>(); contents.add(content); Article article = new Article(); article.setTitle(title); article.setContents(contents); 
articleMainService.addArticleMainByCrawler(article, gather.getChannelId(), CrawlerUtil.USER_NAME); } } } } catch (IOException e) { logger.warn(e.getLocalizedMessage()); } }
From source file:info.smartkit.hairy_batman.query.SogouSearchQuery.java
public void parseWxUserId() { Document doc; try {//www . ja v a 2 s. c om // need http protocol doc = Jsoup.connect(GlobalConsts.SOGOU_SEARCH_URL_BASE + wxFoo.getSubscribeId()).get(); // get all "?:" value of html <span> Elements openIdSpans = doc.select(GlobalConsts.SOGOU_SEARCH_WX_USER_ID_HTML_ELEMENTS); // for (Element openIdSpan : openIdSpans) { if (openIdSpan.hasText()) { if (openIdSpan.text().contains(GlobalConsts.SOGOU_SEARCH_WX_USER_ID_KEYWORDS)) { // get the value from href attribute LOG.info("openId span text : " + openIdSpan.text()); // FIXME:???? if (this.wxFoo.getUserId() == null) { this.wxFoo.setOpenId( openIdSpan.text().split(GlobalConsts.SOGOU_SEARCH_WX_USER_ID_KEYWORDS)[1]); LOG.info("saved wxUserId value: " + this.wxFoo.getUserId()); GlobalVariables.wxFooListWithUserId.add(this.wxFoo); } } } } } catch (IOException e) { // e.printStackTrace(); LOG.error(e.toString()); } }
From source file:org.brunocvcunha.taskerbox.impl.jobs.LinkedInJobSeeker.java
/**
 * Evaluates one job of a search result and performs the configured action on it when
 * it passes all filters (source domain, title, employer, location, on-site apply,
 * visa and experience wording, score).
 *
 * <p>Rejected jobs are remembered through {@code addAlreadyPerformedAction} so that
 * they are not fetched again.
 *
 * @param job the job as returned by the search
 * @return {@code false} if the job is already applied to, if its id was already seen
 *         by this instance, or if the action limit was reached (the seeker is then
 *         paused); {@code true} in every other case
 * @throws JSONException if an expected property is missing from {@code job}
 * @throws ClientProtocolException if the job page cannot be fetched
 * @throws IOException if the job page cannot be fetched or read
 * @throws URISyntaxException if the job URL is rejected by the HTTP layer
 */
private boolean handleJob(JSONObject job)
        throws JSONException, ClientProtocolException, IOException, URISyntaxException {
    if (job.getBoolean("isApplied")) {
        return false;
    }

    long jobId = job.getLong("id");
    if (!this.openIds.contains(jobId)) {
        this.openIds.add(jobId);
    } else {
        return false;
    }

    String jobTitle = job.getString("fmt_jobTitle").replaceAll("</?B>", "");

    if (!this.externalApply && job.has("sourceDomain")) {
        logInfo(log, jobId + " - " + jobTitle + " - " + job.getString("sourceDomain")
                + " --> ignored [external]");
        String sourceDomain = job.getString("sourceDomain");
        // Only these two external domains are still processed further.
        if (!sourceDomain.contains("jobvite") && !sourceDomain.contains("ziprecruiter")) {
            return true;
        }
    }

    String jobEmployer = job.getString("fmt_companyName");
    String jobUrl = "https://www.linkedin.com/jobs2/view/" + jobId;
    if (alreadyPerformedAction(jobUrl)) {
        return true;
    }

    String location = "";
    if (job.has("fmt_location")) {
        location = job.getString("fmt_location");
    }
    String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer;

    if (job.has("sourceDomain")) {
        String sourceDomain = job.getString("sourceDomain");
        if (this.externalApply && (sourceDomain.contains("empregocerto.uol.com.br")
                || sourceDomain.contains("jobomas.com") || sourceDomain.contains("curriculum.com.br"))) {
            logInfo(log, "-- Ignored [externalApply - domain " + sourceDomain + "] " + headline);
            addAlreadyPerformedAction(jobUrl);
            return true;
        }
    }

    if (!considerTitle(jobTitle)) {
        logInfo(log, "-- Ignored [title] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    appendTitleToJobDb(jobTitle);

    if (!considerEmployer(jobEmployer)) {
        logInfo(log, "-- Ignored [employer] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }
    if (!considerLocation(location)) {
        logInfo(log, "-- Ignored [location] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl);
    String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity);
    Document jobDocument = Jsoup.parse(jobResult);
    Elements elDescription = jobDocument.select("div.description-section").select("div.rich-text");
    Elements elSkills = jobDocument.select("div.skills-section").select("div.rich-text");

    if (!this.externalApply && !jobResult.contains("onsite-apply")) {
        logInfo(log, "-- Ignored [onsite apply] " + headline);
        addAlreadyPerformedAction(jobUrl);
        sleepBetweenJobRequests();
        return true;
    }
    if (!considerVisaDescription(elDescription.html()) || !considerVisaDescription(elSkills.html())) {
        logInfo(log, "-- Ignored [visa] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }
    if (!considerExperienceDescription(elDescription.html())
            || !considerExperienceDescription(elSkills.html())) {
        logInfo(log, "-- Ignored [exp] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    ScorerResult result = LinkedInJobDBComparer.getScore(elDescription.html() + " - " + elSkills.html());
    if (result.getScore() < this.requiredScore) {
        logInfo(log, "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - "
                + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    headline = headline + " - " + result.getMatches();
    logInfo(log, headline);
    logInfo(log, elDescription.html());

    // The counter is incremented even when the limit is hit.
    if (this.actionCount++ == this.maxCount) {
        this.setPaused(true);
        return false;
    }

    performUnique(jobUrl);
    sleepBetweenJobRequests();
    return true;
}

/**
 * Appends the title to the titles file on a best-effort basis: a failure is logged and
 * never aborts job handling. The writer is closed even when the write fails.
 */
private void appendTitleToJobDb(String jobTitle) {
    FileWriter out = null;
    try {
        out = new FileWriter(new File(this.tempDir + "\\job-db\\_titles.txt"), true);
        out.write(jobTitle + "\r\n");
    } catch (Exception e) {
        logInfo(log, "-- Could not record job title: " + e);
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                logInfo(log, "-- Could not close job title file: " + e);
            }
        }
    }
}

/**
 * Waits five seconds between job requests; when interrupted, stops waiting and restores
 * the thread's interrupt flag so that callers can still observe the interruption.
 */
private void sleepBetweenJobRequests() {
    try {
        Thread.sleep(5000L);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java
/** * ???//from w w w . jav a 2 s.c o m */ @Override public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception { if (CollectionUtils.isEmpty(seeds)) { return null; } Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>(); for (HttpSeed seed : seeds) { Document doc = parse(seed.getHtml()); // ?URL Elements page_form_elements = doc.select("#pageForm"); if (page_form_elements.isEmpty()) { return null; } Element page_form_e = page_form_elements.get(0); // URL String url = DOMAIN + page_form_e.attr("action"); Elements param_elements = page_form_e.select("input"); // int totalPageNum = this.getTotalPageNum(doc); for (int pageNo = 1; pageNo <= totalPageNum; pageNo++) { // ? Map<String, String> params = new HashMap<String, String>(); for (Element param_e : param_elements) { params.put(param_e.attr("name"), param_e.attr("value")); } // params.put("curstart", String.valueOf(pageNo)); HttpSeed httpSeed = this.initListHttpSeed(url, params); seedGroups.add(httpSeed); } } return seedGroups; }
From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java
/** * Proxy a HTTP URL locally.//from w w w. j a va 2 s . c om * * @param url the HTTP URL to proxy. * @param filter regex filter. Only artifacts URL matching the filter will be considered. * @throws Exception in case of proxy failure. */ private void proxyHttp(String url, String filter) throws Exception { LOGGER.debug("Proxying HTTP URL {}", url); HttpClient httpClient = new DefaultHttpClient(); HttpGet httpGet = new HttpGet(url); HttpResponse response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); if (entity != null) { if (entity.getContentType().getValue().equals("application/java-archive") || entity.getContentType().getValue().equals("application/octet-stream")) { // I have a jar/binary, potentially a resource try { if ((filter == null) || (url.matches(filter))) { Resource resource = new DataModelHelperImpl().createResource(new URL(url)); if (resource != null) { obrRepository.addResource(resource); obrRepository.setLastModified(System.currentTimeMillis()); } } } catch (IllegalArgumentException e) { LOGGER.warn(e.getMessage()); } } else { // try to find link to "browse" try { Document document = Jsoup.connect(url).get(); Elements links = document.select("a"); if (links.size() > 1) { for (int i = 1; i < links.size(); i++) { Element link = links.get(i); String absoluteHref = link.attr("abs:href"); this.proxyHttp(absoluteHref, filter); } } } catch (UnsupportedMimeTypeException e) { // ignore } } } }