List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:me.vertretungsplan.parser.DSBLightParser.java
/**
 * Fetches the player page, follows its first iframe, optionally logs in, and hands every
 * iframe of the resulting page (or only the configured one) to {@code parsePreProgram}.
 *
 * <p>Login handling depends on the type of the {@code PARAM_LOGIN} entry in {@code data}:
 * a {@code Boolean true} posts the ASP.NET login form using the configured
 * {@link UserPasswordCredential}; a {@link JSONObject} delegates to {@code LoginHandler}.
 *
 * @return the schedule populated from the parsed iframes, with classes, teachers and website set
 * @throws IOException if an HTTP request fails or the player page contains no iframe
 * @throws JSONException if a required configuration value is missing or has the wrong type
 * @throws CredentialInvalidException if the login form reports a failed login
 * @throws IllegalArgumentException if form login is enabled but the credential is not a
 *         {@link UserPasswordCredential}
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    String id = data.getString(PARAM_ID);
    SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData);

    String baseUrl = data.optString(PARAM_BASEURL, BASE_URL);
    String playerUrl = baseUrl + "/Player.aspx?ID=" + id;
    Map<String, String> referer = new HashMap<>();
    referer.put("Referer", playerUrl);

    String response = httpGet(playerUrl, ENCODING, referer);
    Document doc = Jsoup.parse(response);

    // The player page embeds the actual content (IFrame.aspx) in an iframe.
    // Elements.first() yields null when nothing matched, so fail with a descriptive error
    // instead of a NullPointerException.
    Element outerFrame = doc.select("iframe").first();
    if (outerFrame == null) {
        throw new IOException("No iframe found on player page " + playerUrl);
    }
    String iframeUrl = outerFrame.attr("src");

    response = httpGet(iframeUrl, ENCODING, referer);
    doc = Jsoup.parse(response);

    if (data.has(PARAM_LOGIN) && data.get(PARAM_LOGIN) instanceof Boolean
            && data.getBoolean(PARAM_LOGIN)) {
        if (!(credential instanceof UserPasswordCredential)) {
            throw new IllegalArgumentException("no login");
        }
        String username = ((UserPasswordCredential) credential).getUsername();
        String password = ((UserPasswordCredential) credential).getPassword();

        // Echo the hidden ASP.NET state fields back together with the credentials.
        List<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("__VIEWSTATE",
                doc.select("#__VIEWSTATE").attr("value")));
        params.add(new BasicNameValuePair("__VIEWSTATEGENERATOR",
                doc.select("#__VIEWSTATEGENERATOR").attr("value")));
        params.add(new BasicNameValuePair("__EVENTVALIDATION",
                doc.select("#__EVENTVALIDATION").attr("value")));
        params.add(new BasicNameValuePair("ctl02$txtBenutzername", username));
        params.add(new BasicNameValuePair("ctl02$txtPasswort", password));
        params.add(new BasicNameValuePair("ctl02$btnLogin", "weiter"));

        response = httpPost(iframeUrl, ENCODING, params, referer);
        doc = Jsoup.parse(response);

        // The page shows this label when the login was rejected.
        if (doc.select("#ctl02_lblLoginFehlgeschlagen").size() > 0) {
            throw new CredentialInvalidException();
        }
    } else if (data.has(PARAM_LOGIN) && data.get(PARAM_LOGIN) instanceof JSONObject) {
        new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);
    }

    Elements iframes = doc.select("iframe");
    if (data.has(PARAM_IFRAME_INDEX)) {
        parsePreProgram(v, referer, iframes.get(data.getInt(PARAM_IFRAME_INDEX)));
    } else {
        for (Element iframe : iframes) {
            parsePreProgram(v, referer, iframe);
        }
    }

    v.setClasses(getAllClasses());
    v.setTeachers(getAllTeachers());
    v.setWebsite(playerUrl);
    return v;
}
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDynamicExtractor.java
/**
 * Extracts the text of the first element matching a CSS selector, optionally narrowed by a
 * regular expression.
 *
 * <p>{@code extractor_args[0]} is the CSS selector. If {@code extractor_args[1]} is present it
 * is compiled as a regular expression and capture group 1 of its first match in the element
 * text becomes the result.
 *
 * @param attributeDetail supplies {@code extractor_args}
 * @param url not used by this extractor
 * @param articleDoc the document to search
 * @return the extracted text, or {@code null} when there are no arguments, the selector is
 *         blank, no element matches, or the regular expression does not match
 * @throws Exception if the selector or the regular expression is invalid
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {
    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Check the selector BEFORE handing it to jsoup: select() rejects empty queries.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                attrib_value = element.text();
            }
        }
    }

    if ((attrib_value != null) && (attributeDetail.extractor_args != null)
            && (attributeDetail.extractor_args.size() >= 2)) {
        String regexSelector = attributeDetail.extractor_args.get(1);
        // `pattern` is declared outside this method; the assignment is kept as in the original.
        pattern = Pattern.compile(regexSelector);
        Matcher m = pattern.matcher(attrib_value);
        // Calling group() after a failed find() throws IllegalStateException, so treat
        // "no match" as "no value".
        attrib_value = m.find() ? m.group(1) : null;
    }

    return attrib_value;
}
From source file:br.gov.jfrj.siga.base.SigaHTTP.java
private String getAttributeValueFromHtml(String htmlContent, String attribute) { String value = ""; Document doc = Jsoup.parse(htmlContent); // Get SAMLRequest value for (Element el : doc.select("input")) { if (el.attr("name").equals(attribute)) { value = el.attr("value"); }/*w w w . java 2s. c om*/ } return value; }
From source file:uk.co.jassoft.markets.utils.article.ContentGrabber.java
public Date getPublishedDate(String html) { try {//from w ww . ja v a2 s . co m Document doc = Jsoup.parse(html); List<Date> possibleDates = new ArrayList<>(); for (String selector : getSelectors()) { Elements metalinks = doc.select(selector); if (metalinks.isEmpty()) continue; Date value = getDateValue(metalinks.get(0).toString()); if (value != null) { return value; } if (possibleDates.isEmpty()) { LOG.info("Date Format Not recognised for [{}]", metalinks.get(0).toString()); missingDateFormatRepository .save(new MissingDateFormat(metalinks.get(0).toString(), new Date())); } } if (!possibleDates.isEmpty()) { if (possibleDates.size() > 1) { possibleDates.sort(Date::compareTo); } return possibleDates.get(possibleDates.size() - 1); } return null; } catch (Exception exception) { LOG.error("Failed to get Published Date", exception); return null; } }
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementAttrValueDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;//from w w w.ja v a 2 s . c o m if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 2)) { String cssSelector = attributeDetail.extractor_args.get(0); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { int arg_count = 0; for (String value_name : attributeDetail.extractor_args) { if (arg_count > 0) { // skip the first one, its the cssSelector if (element != null && element.attr(value_name) != null) { attrib_value = element.attr(value_name); if (StringUtils.isNotBlank(attrib_value)) { break; } } } arg_count++; } } } return attrib_value; }
From source file:com.liato.bankdroid.banking.banks.MinPension.java
private Account updateAccount(String URL, String selector, String name) throws IOException { String response = urlopen.open(URL); Document dResponse = Jsoup.parse(response); List<Transaction> transactions = new ArrayList<>(); String institute = ""; String subInstitute = ""; for (Element e : dResponse.select(selector)) { if (e.hasClass("GroupRow")) { institute = e.children().first().text(); } else if (e.hasClass("GroupMemberRow") || e.hasClass("SubRow")) { Elements elements = e.children(); if (elements.size() == 6) { //Special case for "Allmn pension" if (elements.get(2).text().isEmpty()) { // subInstitute = " " + elements.get(1).text(); /* Doesn't fit atm. */ } else { transactions.add(new Transaction(elements.get(5).text(), institute + subInstitute + "\n " + elements.get(1).text(), Helpers.parseBalance(elements.get(2).text()))); subInstitute = ""; }//from ww w .ja va 2 s . c o m } else if (elements.size() >= 7) { transactions.add( new Transaction(elements.get(6).text(), institute + "\n " + elements.get(1).text(), Helpers.parseBalance(elements.get(4).text()))); } } } balance = BigDecimal.ZERO; for (Transaction t : transactions) { balance = balance.add(t.getAmount()); } Account account = new Account(name, balance, name, Account.REGULAR, ""); account.setTransactions(transactions); return account; }
From source file:gov.medicaid.screening.dao.impl.PharmacyLicenseDAOBean.java
/**
 * Builds a {@link License} from the labelled {@code span} elements of a license details page.
 *
 * <p>The name is split at its last space into first and last name; a name without a space is
 * stored as the last name only. The license is flagged as disciplined unless the disciplinary
 * action text, once trimmed, is exactly {@code "No"}.
 *
 * @param details The html document.
 * @return the license information
 * @throws ParseException When an error occurs while parsing the dates.
 */
private License parseLicense(Document details) throws ParseException {
    // Read every field up front; select(...).text() yields "" when the span is missing.
    String fullName = details.select("span#lblName2").text();
    String addressText = details.select("span#lblAddress").text();
    String typeName = details.select("span#lblLicenseType").text();
    String number = details.select("span#lblLicenseNumber").text();
    String statusName = details.select("span#lblLicenseStatus").text();
    String issuedText = details.select("span#lblOriginalIssueDate").text();
    String expiresText = details.select("span#lblExpirationDate").text();
    String disciplineText = details.select("span#lblDisciplinaryAction").text();

    License result = new License();
    result.setLicenseNumber(number);

    ProviderProfile providerProfile = new ProviderProfile();
    result.setProfile(providerProfile);

    User person = new User();
    int lastSpace = fullName.lastIndexOf(" ");
    if (lastSpace > -1) {
        person.setFirstName(fullName.substring(0, lastSpace).trim());
        person.setLastName(fullName.substring(lastSpace).trim());
    } else {
        person.setLastName(fullName);
    }

    List<Address> addressList = new ArrayList<Address>();
    addressList.add(parseAddress(addressText.trim()));
    providerProfile.setAddresses(addressList);
    providerProfile.setUser(person);

    LicenseType type = new LicenseType();
    type.setName(typeName);
    result.setType(type);

    Date issued = parseDate(issuedText, DATE_FORMAT);
    if (issued != null) {
        result.setOriginalIssueDate(issued);
    }

    Date expires = parseDate(expiresText, DATE_FORMAT);
    if (expires != null) {
        result.setExpireDate(expires);
    }

    LicenseStatus licenseState = new LicenseStatus();
    licenseState.setName(statusName);
    result.setStatus(licenseState);

    boolean noDiscipline = "No".equals(disciplineText.trim());
    result.setDiscipline(!noDiscipline);
    return result;
}
From source file:accountgen.controller.Controller.java
private void setBday(Document doc, Person p) { Element bday = doc.select(".bday").first(); Date bd = new Date(); Date date = null;//from www . ja v a 2 s . c o m try { date = new SimpleDateFormat("MMM", Locale.ENGLISH).parse(bday.text().split(" ")[0]); } catch (ParseException ex) { Logger.getLogger(Controller.class.getName()).log(Level.SEVERE, null, ex); } Calendar cal = Calendar.getInstance(); cal.setTime(date); int month = cal.get(Calendar.MONTH); bd.setMonth(month); bd.setDate(Integer.parseInt(bday.text().split(" ")[1].replace(",", ""))); bd.setYear(Integer.parseInt(bday.text().split(",")[1].substring(1, 5)) - 1900); p.setBirthday(bd); }
From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBTopicProvider.java
/** * Try to find further URLs in the document that need to be requested to complete the resource data. * Used e.g. to parse the result of paging in HTML pages. The default implementation returns an empty list. * <p/>/* w ww . j av a 2 s. c o m*/ * This implementation tries to locate the paging area of PHPBB and selects the last link of the paging, which will * be the "next" page. * * @param document * @param requestUrl * @return */ @Override protected List<String> findAdditionalRequestUrls(String resource, Document document, String requestUrl) { List<String> result = new LinkedList<String>(); // return the next page in the result list Elements values = document.select("div#pagecontent a[href~=viewtopic\\.php.*start=]"); for (Element o : values) { String baseUrl = o.absUrl("href"); if (baseUrl.length() > 0) { result.add(baseUrl); } } return result; }
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementAttrListValueDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;// w w w . ja v a 2 s . com if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 2)) { String cssSelector = attributeDetail.extractor_args.get(0); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { int arg_count = 0; for (String value_name : attributeDetail.extractor_args) { if (arg_count > 0) { // skip the first one, its the cssSelector if (element != null && element.attr(value_name) != null) { String rawList = element.attr(value_name); if (StringUtils.isNotBlank(rawList)) { String[] parts = rawList.split(","); for (int i = 0; i < parts.length; i++) { parts[i] = parts[i].trim().toLowerCase(); } attrib_value = StringUtils.join(parts, ','); break; } } } arg_count++; } } } return attrib_value; }