Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of the org.jsoup.nodes.Document select method.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:me.vertretungsplan.parser.DSBLightParser.java

/**
 * Downloads the player page for the configured ID, follows its first iframe, optionally logs in,
 * and feeds the iframes of the resulting page to {@code parsePreProgram}.
 *
 * @return the schedule built from {@code scheduleData}, with classes, teachers and website set
 * @throws IOException                declared by this method; raised by the calls it makes
 * @throws JSONException              declared by this method; raised when reading {@code data}
 * @throws CredentialInvalidException if the page returned after the form login contains the
 *                                    {@code #ctl02_lblLoginFehlgeschlagen} element
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    final String id = data.getString(PARAM_ID);
    final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    final String baseUrl = data.optString(PARAM_BASEURL, BASE_URL);
    final String playerUrl = baseUrl + "/Player.aspx?ID=" + id;

    // The player URL doubles as the Referer header for every request below.
    final Map<String, String> headers = new HashMap<>();
    headers.put("Referer", playerUrl);

    // Player.aspx -> first iframe (IFrame.aspx)
    Document page = Jsoup.parse(httpGet(playerUrl, ENCODING, headers));
    final String iframeUrl = page.select("iframe").first().attr("src");
    page = Jsoup.parse(httpGet(iframeUrl, ENCODING, headers));

    // PARAM_LOGIN may be absent, a boolean flag (form login) or a JSON object (LoginHandler).
    final Object loginConfig = data.has(PARAM_LOGIN) ? data.get(PARAM_LOGIN) : null;
    if (Boolean.TRUE.equals(loginConfig)) {
        if (!(credential instanceof UserPasswordCredential)) {
            throw new IllegalArgumentException("no login");
        }
        final UserPasswordCredential userPassword = (UserPasswordCredential) credential;
        final String username = userPassword.getUsername();
        final String password = userPassword.getPassword();

        // Echo the hidden form fields of the iframe page back together with the credentials.
        final List<NameValuePair> form = new ArrayList<>();
        form.add(new BasicNameValuePair("__VIEWSTATE", page.select("#__VIEWSTATE").attr("value")));
        form.add(new BasicNameValuePair("__VIEWSTATEGENERATOR",
                page.select("#__VIEWSTATEGENERATOR").attr("value")));
        form.add(new BasicNameValuePair("__EVENTVALIDATION", page.select("#__EVENTVALIDATION").attr("value")));
        form.add(new BasicNameValuePair("ctl02$txtBenutzername", username));
        form.add(new BasicNameValuePair("ctl02$txtPasswort", password));
        form.add(new BasicNameValuePair("ctl02$btnLogin", "weiter"));

        page = Jsoup.parse(httpPost(iframeUrl, ENCODING, form, headers));
        if (!page.select("#ctl02_lblLoginFehlgeschlagen").isEmpty()) {
            throw new CredentialInvalidException();
        }
    } else if (loginConfig instanceof JSONObject) {
        new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);
    }

    final Elements iframes = page.select("iframe");
    if (!data.has(PARAM_IFRAME_INDEX)) {
        // No index configured: process every iframe on the page.
        for (Element iframe : iframes) {
            parsePreProgram(schedule, headers, iframe);
        }
    } else {
        parsePreProgram(schedule, headers, iframes.get(data.getInt(PARAM_IFRAME_INDEX)));
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    schedule.setWebsite(playerUrl);

    return schedule;
}

From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDynamicExtractor.java

/**
 * Extracts the text of the first element matching a CSS selector, optionally reduced to the first
 * capture group of a regular expression.
 * <p>
 * {@code extractor_args[0]} is the CSS selector; {@code extractor_args[1]}, when present, is a regex
 * applied to the element text.
 *
 * @param attributeDetail supplies {@code extractor_args}
 * @param url             not used by this extractor
 * @param articleDoc      the document to search
 * @return the extracted text; {@code null} if there are no arguments, the selector is blank, no
 *         element matches, or a regex was given but did not match (or has no capture group)
 * @throws Exception declared by the signature; an invalid selector or regex propagates
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Validate the selector BEFORE querying: select() rejects an empty query with an exception,
        // so checking afterwards (as before) could never take effect.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                attrib_value = element.text();
            }
        }
    }

    // attrib_value can only be non-null here if extractor_args was non-null.
    if ((attrib_value != null) && (attributeDetail.extractor_args.size() >= 2)) {
        String regexSelector = attributeDetail.extractor_args.get(1);
        // 'pattern' is declared outside this method; it is still assigned here as before.
        pattern = Pattern.compile(regexSelector);
        Matcher m = pattern.matcher(attrib_value);
        // Only read group(1) when the regex matched and actually defines a capture group;
        // otherwise report "nothing extracted" instead of throwing from group().
        attrib_value = (m.find() && m.groupCount() >= 1) ? m.group(1) : null;
    }

    return attrib_value;
}

From source file:br.gov.jfrj.siga.base.SigaHTTP.java

/**
 * Returns the {@code value} attribute of the {@code <input>} element whose {@code name} equals
 * the given attribute name.
 *
 * @param htmlContent HTML markup to parse
 * @param attribute   the input name to look for
 * @return the value of the last matching input in document order, or an empty string if none matches
 */
private String getAttributeValueFromHtml(String htmlContent, String attribute) {
    String result = "";

    for (Element input : Jsoup.parse(htmlContent).select("input")) {
        if (!input.attr("name").equals(attribute)) {
            continue;
        }
        // Keep overwriting: a later input with the same name wins.
        result = input.attr("value");
    }

    return result;
}

From source file:uk.co.jassoft.markets.utils.article.ContentGrabber.java

/**
 * Tries each selector from {@code getSelectors()} against the parsed HTML and returns the first
 * date that {@code getDateValue} can read from the first matching element.
 * <p>
 * For every selector that matches but whose first element yields no date, the element markup is
 * logged and stored as a {@code MissingDateFormat}.
 *
 * @param html the page markup
 * @return the first recognised date, or {@code null} if none was recognised or any exception occurred
 */
public Date getPublishedDate(String html) {
    try {
        Document doc = Jsoup.parse(html);

        for (String selector : getSelectors()) {
            Elements metalinks = doc.select(selector);

            if (metalinks.isEmpty()) {
                continue;
            }

            // Serialise the first hit once; it is needed for parsing, logging and persistence.
            String candidate = metalinks.get(0).toString();

            Date value = getDateValue(candidate);
            if (value != null) {
                return value;
            }

            // The former 'possibleDates' list was never filled, so its guard here was always
            // true and the sort-and-return branch after the loop was unreachable; both removed.
            LOG.info("Date Format Not recognised for [{}]", candidate);
            missingDateFormatRepository.save(new MissingDateFormat(candidate, new Date()));
        }

        return null;
    } catch (Exception exception) {
        LOG.error("Failed to get Published Date", exception);
        return null;
    }
}

From source file:io.seldon.importer.articles.dynamicextractors.FirstElementAttrValueDynamicExtractor.java

/**
 * Returns the value of the first non-blank attribute, out of a list of candidate attribute names,
 * on the first element matching a CSS selector.
 * <p>
 * {@code extractor_args[0]} is the CSS selector; the remaining entries are attribute names tried
 * in order.
 *
 * @param attributeDetail supplies {@code extractor_args} (at least two entries are required)
 * @param url             not used by this extractor
 * @param articleDoc      the document to search
 * @return the first non-blank attribute value; the last attribute value read (possibly blank) if
 *         none is non-blank; {@code null} if there are too few arguments, the selector is blank,
 *         or no element matches
 * @throws Exception declared by the signature; an invalid selector propagates
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 2)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Validate the selector BEFORE querying: select() rejects an empty query with an exception,
        // so checking afterwards (as before) could never take effect.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                // Index 0 is the selector itself; attribute names start at index 1.
                for (int i = 1; i < attributeDetail.extractor_args.size(); i++) {
                    attrib_value = element.attr(attributeDetail.extractor_args.get(i));
                    if (StringUtils.isNotBlank(attrib_value)) {
                        break;
                    }
                }
            }
        }
    }

    return attrib_value;
}

From source file:com.liato.bankdroid.banking.banks.MinPension.java

/**
 * Loads a page, turns the rows matched by {@code selector} into transactions and wraps them in an
 * account whose balance is the sum of the transaction amounts.
 * <p>
 * Note: the running total is written to {@code balance}, which is declared outside this method.
 *
 * @param URL      address of the page to open
 * @param selector CSS selector for the table rows to inspect
 * @param name     used as both arguments named-by-position 1 and 3 of the {@code Account} constructor
 * @return the populated account
 * @throws IOException declared by this method; raised by the calls it makes
 */
private Account updateAccount(String URL, String selector, String name) throws IOException {
    Document page = Jsoup.parse(urlopen.open(URL));
    List<Transaction> transactions = new ArrayList<>();
    String institute = "";
    String subInstitute = "";

    for (Element row : page.select(selector)) {
        if (row.hasClass("GroupRow")) {
            // A group header: remember its first cell as the institute for the rows that follow.
            institute = row.children().first().text();
            continue;
        }
        if (!row.hasClass("GroupMemberRow") && !row.hasClass("SubRow")) {
            continue;
        }

        Elements cells = row.children();
        int cellCount = cells.size();
        if (cellCount == 6) { // Special case for "Allmän pension"
            // Rows with an empty third cell are skipped; they were once meant to provide a
            // sub-institute prefix ("  " + cells.get(1).text()), which does not fit at the moment.
            if (!cells.get(2).text().isEmpty()) {
                String description = institute + subInstitute + "\n  " + cells.get(1).text();
                transactions.add(new Transaction(cells.get(5).text(), description,
                        Helpers.parseBalance(cells.get(2).text())));
                subInstitute = "";
            }
        } else if (cellCount >= 7) {
            String description = institute + "\n  " + cells.get(1).text();
            transactions.add(new Transaction(cells.get(6).text(), description,
                    Helpers.parseBalance(cells.get(4).text())));
        }
    }

    balance = BigDecimal.ZERO;
    for (Transaction transaction : transactions) {
        balance = balance.add(transaction.getAmount());
    }

    Account account = new Account(name, balance, name, Account.REGULAR, "");
    account.setTransactions(transactions);
    return account;
}

From source file:gov.medicaid.screening.dao.impl.PharmacyLicenseDAOBean.java

/**
 * Builds a {@link License} from the labelled {@code span} elements of a license details page.
 * <p>
 * The name is split at its last space into first and last name; without a space the whole name
 * becomes the last name. Discipline is flagged unless the disciplinary-action text is exactly "No".
 *
 * @param details The html document.
 * @return the license information
 * @throws ParseException When an error occurs while parsing the dates.
 */
private License parseLicense(Document details) throws ParseException {
    final String fullName = details.select("span#lblName2").text();
    final String addressText = details.select("span#lblAddress").text();
    final String typeName = details.select("span#lblLicenseType").text();
    final String number = details.select("span#lblLicenseNumber").text();
    final String statusName = details.select("span#lblLicenseStatus").text();
    final String issuedText = details.select("span#lblOriginalIssueDate").text();
    final String expiresText = details.select("span#lblExpirationDate").text();
    final String disciplineText = details.select("span#lblDisciplinaryAction").text();

    // Person: everything before the last space is the first name, the remainder the last name.
    final User user = new User();
    final int lastSpace = fullName.lastIndexOf(" ");
    if (lastSpace < 0) {
        user.setLastName(fullName);
    } else {
        user.setFirstName(fullName.substring(0, lastSpace).trim());
        user.setLastName(fullName.substring(lastSpace).trim());
    }

    // Profile: single address plus the user.
    final List<Address> addresses = new ArrayList<Address>();
    addresses.add(parseAddress(addressText.trim()));
    final ProviderProfile profile = new ProviderProfile();
    profile.setAddresses(addresses);
    profile.setUser(user);

    final License license = new License();
    license.setLicenseNumber(number);
    license.setProfile(profile);

    final LicenseType type = new LicenseType();
    type.setName(typeName);
    license.setType(type);

    // Dates are only set when parseDate produced a value.
    final Date issued = parseDate(issuedText, DATE_FORMAT);
    if (issued != null) {
        license.setOriginalIssueDate(issued);
    }
    final Date expires = parseDate(expiresText, DATE_FORMAT);
    if (expires != null) {
        license.setExpireDate(expires);
    }

    final LicenseStatus status = new LicenseStatus();
    status.setName(statusName);
    license.setStatus(status);

    final boolean noDiscipline = "No".equals(disciplineText.trim());
    license.setDiscipline(!noDiscipline);
    return license;
}

From source file:accountgen.controller.Controller.java

/**
 * Reads the birthday from the first {@code .bday} element of the document and stores it on the person.
 * <p>
 * The element text is read as follows: the first space-separated token is the English month name,
 * the second token (comma removed) is the day of month, and the four characters after the first
 * {@code ", "} are the year. The time-of-day of the stored date is the current time, as before.
 * <p>
 * If the document has no {@code .bday} element or the month cannot be parsed, the problem is logged
 * and the person is left unchanged (previously both cases ended in a {@code NullPointerException}).
 *
 * @param doc the document containing the {@code .bday} element
 * @param p   the person whose birthday is set
 * @throws NumberFormatException if the day or year token is not a number
 */
private void setBday(Document doc, Person p) {
    Element bday = doc.select(".bday").first();
    if (bday == null) {
        Logger.getLogger(Controller.class.getName()).log(Level.SEVERE, "No .bday element found in document");
        return;
    }
    String text = bday.text();

    Date monthDate;
    try {
        monthDate = new SimpleDateFormat("MMM", Locale.ENGLISH).parse(text.split(" ")[0]);
    } catch (ParseException ex) {
        Logger.getLogger(Controller.class.getName()).log(Level.SEVERE, null, ex);
        return;
    }
    Calendar monthCal = Calendar.getInstance();
    monthCal.setTime(monthDate);
    int month = monthCal.get(Calendar.MONTH);
    int day = Integer.parseInt(text.split(" ")[1].replace(",", ""));
    int year = Integer.parseInt(text.split(",")[1].substring(1, 5));

    // Set year, month and day in ONE step. The former Date.setMonth()/setDate()/setYear() sequence
    // mutated "now" field by field, so e.g. running on the 31st with a February birthday rolled the
    // month over to March before the day was applied. A Gregorian calendar is used explicitly
    // because the deprecated Date setters it replaces also interpret the fields that way.
    Calendar birthday = new java.util.GregorianCalendar();
    birthday.set(year, month, day);
    p.setBirthday(birthday.getTime());
}

From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBTopicProvider.java

/**
 * Try to find further URLs in the document that need to be requested to complete the resource data.
 * Used e.g. to parse the result of paging in HTML pages. The default implementation returns an empty list.
 * <p/>/*  w  ww  . j  av a 2  s. c  o m*/
 * This implementation tries to locate the paging area of PHPBB and selects the last link of the paging, which will
 * be the "next" page.
 *
 * @param document
 * @param requestUrl
 * @return
 */
@Override
protected List<String> findAdditionalRequestUrls(String resource, Document document, String requestUrl) {
    List<String> result = new LinkedList<String>();

    // return the next page in the result list
    Elements values = document.select("div#pagecontent a[href~=viewtopic\\.php.*start=]");
    for (Element o : values) {
        String baseUrl = o.absUrl("href");
        if (baseUrl.length() > 0) {
            result.add(baseUrl);
        }
    }

    return result;

}

From source file:io.seldon.importer.articles.dynamicextractors.FirstElementAttrListValueDynamicExtractor.java

/**
 * Returns a normalised comma-separated list taken from the first non-blank attribute, out of a
 * list of candidate attribute names, on the first element matching a CSS selector.
 * <p>
 * {@code extractor_args[0]} is the CSS selector; the remaining entries are attribute names tried
 * in order. The chosen attribute value is split on commas, each part is trimmed and lower-cased,
 * and the parts are re-joined with a single comma.
 *
 * @param attributeDetail supplies {@code extractor_args} (at least two entries are required)
 * @param url             not used by this extractor
 * @param articleDoc      the document to search
 * @return the normalised list, or {@code null} if there are too few arguments, the selector is
 *         blank, no element matches, or every candidate attribute is blank
 * @throws Exception declared by the signature; an invalid selector propagates
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 2)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Validate the selector BEFORE querying: select() rejects an empty query with an exception,
        // so checking afterwards (as before) could never take effect.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                // Index 0 is the selector itself; attribute names start at index 1.
                for (int argIndex = 1; argIndex < attributeDetail.extractor_args.size(); argIndex++) {
                    String rawList = element.attr(attributeDetail.extractor_args.get(argIndex));
                    if (StringUtils.isNotBlank(rawList)) {
                        String[] parts = rawList.split(",");
                        for (int i = 0; i < parts.length; i++) {
                            // Locale.ROOT keeps the result independent of the JVM default locale
                            // (e.g. the Turkish dotless i).
                            parts[i] = parts[i].trim().toLowerCase(java.util.Locale.ROOT);
                        }
                        attrib_value = StringUtils.join(parts, ',');
                        break;
                    }
                }
            }
        }
    }

    return attrib_value;
}