Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of the org.jsoup.nodes.Document select method.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:strat.mining.multipool.stats.jersey.client.impl.WaffleRestClientImpl.java

/**
 * Downloads the Wafflepool statistics page and converts it into a {@link GlobalStats}.
 * <p>
 * The optional {@code #note} element is read on a best-effort basis: any exception raised while
 * extracting it is logged and the remaining figures are still parsed. Pool figures come from
 * fixed token positions of the {@code #pool_stats} text; the payout figures are the text found
 * between known labels of the {@code #content} element. A page whose layout differs from what
 * is expected surfaces as an unchecked exception (missing element, missing token or
 * unparsable number).
 *
 * @return the parsed statistics, or {@code null} if an {@link IOException} occurred while the
 *         page was being retrieved
 */
@Override
public GlobalStats getGlobalStats() {
    GlobalStats stats = null;
    try {
        LOGGER.debug("Start to get the waffle global stats.");
        long begin = System.currentTimeMillis();
        Document page = Jsoup.connect(WAFFLE_POOL_GLOBAL_STATS_URL)
                .userAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)").get();
        PERF_LOGGER.info("Retrieved Wafflepool stats page in {} ms.", System.currentTimeMillis() - begin);

        stats = new GlobalStats();

        // The note is optional: a failure here must not prevent the figures from being read.
        try {
            Elements notes = page.select("#note");
            if (notes != null && !notes.isEmpty()) {
                String css = notes.get(0).attr("style");
                boolean hidden = css != null
                        && (css.contains("display:none") || css.contains("visibility:hidden"));
                if (!hidden) {
                    stats.setNote(notes.get(0).html());
                }
            }
        } catch (Exception e) {
            LOGGER.error("Failed to get the last note.", e);
        }

        // Whitespace-separated tokens of the pool header; the positions used below are fixed.
        String[] poolTokens = page.select("#pool_stats").get(0).text().split("\\s");
        stats.setMegaHashesPerSeconds(parsePower(poolTokens[1], poolTokens[2]));
        stats.setNbMiners(Integer.parseInt(poolTokens[6]));
        stats.setMiningCoin(poolTokens[10]);

        // Each figure is the text between one label and the next one.
        String contentText = page.select("#content").get(0).text();
        String[] aroundSent = contentText.split("Bitcoins sent to miners:");
        String[] aroundEarned = aroundSent[1].split("Bitcoins earned \\(not yet sent\\):");
        String paidoutText = aroundEarned[0];
        String[] aroundUnconverted = aroundEarned[1].split("Bitcoins unconverted \\(approximate\\):");
        String balanceText = aroundUnconverted[0];
        String[] aroundHistory = aroundUnconverted[1].split("Date BTC");
        String unexchangedText = aroundHistory[0];

        // Thousands separators are stripped before the numbers are parsed.
        stats.setTotalPaidout(Float.parseFloat(paidoutText.replaceAll(",", "")));
        stats.setTotalBalance(Float.parseFloat(balanceText.replaceAll(",", "")));
        stats.setTotalUnexchanged(Float.parseFloat(unexchangedText.replaceAll(",", "")));

        LOGGER.debug("Global stats from waffle retreived.");
    } catch (IOException e) {
        LOGGER.error("Failed to retrieve the stats page of Wafflepool.", e);
    }
    return stats;
}

From source file:com.liato.bankdroid.banking.banks.coop.Coop.java

/**
 * Logs in and rebuilds the account list.
 * <p>
 * For every {@link AccountType} the corresponding page is fetched and parsed: transaction
 * paging parameters (page GUID, min/max date) are stored in {@code mTransactionParams}, and
 * each {@code dt}/{@code dd} pair of the first list containing "Saldo" becomes an account.
 * Afterwards the refund summary service is queried and, if it answers, two more accounts are
 * added.
 *
 * @throws LoginException if the username or password is missing or empty
 * @throws BankException  if a request or response could not be processed, or if no account
 *                        was found at all
 */
@Override
public void update() throws BankException, LoginException, BankChoiceException {
    super.update();
    boolean missingCredentials = username == null || password == null
            || username.length() == 0 || password.length() == 0;
    if (missingCredentials) {
        throw new LoginException(res.getText(R.string.invalid_username_password).toString());
    }

    login();

    try {
        for (AccountType type : AccountType.values()) {
            response = urlopen.open(type.getUrl());
            Document page = Jsoup.parse(response);

            // Paging parameters are registered for every account type, even when the page
            // offers nothing to fill them with.
            Elements historySections = page.select("#historik section");
            TransactionParams txParams = new TransactionParams();
            mTransactionParams.put(type, txParams);
            if (historySections != null && !historySections.isEmpty()) {
                Matcher guid = rePageGuid.matcher(historySections.first().attr("data-controller"));
                if (guid.find()) {
                    txParams.setPageGuid(guid.group(1));
                }
            }

            Element dateFrom = page.getElementById("dateFrom");
            if (dateFrom != null) {
                txParams.setMinDate(dateFrom.hasAttr("min") ? dateFrom.attr("min") : null);
                txParams.setMaxDate(dateFrom.hasAttr("max") ? dateFrom.attr("max") : null);
            }

            Elements balanceLists = page.select(".List:contains(Saldo)");
            if (balanceLists == null || balanceLists.isEmpty()) {
                continue;
            }

            // Labels (dt) and amounts (dd) are collected separately and paired by position.
            List<String> labels = new ArrayList<String>();
            for (Element term : balanceLists.first().select("dt")) {
                labels.add(term.text().replaceAll(":", "").trim());
            }
            List<String> amounts = new ArrayList<String>();
            for (Element definition : balanceLists.first().select("dd")) {
                amounts.add(definition.text().trim());
            }

            int pairs = Math.min(labels.size(), amounts.size());
            for (int idx = 0; idx < pairs; idx++) {
                String amount = amounts.get(idx);
                Account account = new Account(labels.get(idx), Helpers.parseBalance(amount),
                        String.format("%s%d", type.getPrefix(), idx));
                account.setCurrency(Helpers.parseCurrency(amount, "SEK"));

                boolean disposable = account.getName().toLowerCase().contains("disponibelt");
                if (disposable) {
                    // The disposable amount also provides the bank-level balance and currency.
                    account.setType(Account.REGULAR);
                    balance = account.getBalance();
                    setCurrency(account.getCurrency());
                } else {
                    account.setType(Account.OTHER);
                }

                // Every entry after the first is an alias of the first one of this type.
                if (idx > 0) {
                    account.setAliasfor(String.format("%s%d", type.getPrefix(), 0));
                }
                accounts.add(account);
            }
        }
    } catch (ClientProtocolException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    } catch (IOException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    }

    try {
        RefundSummaryRequest summaryRequest = new RefundSummaryRequest(mUserId, mToken, APPLICATION_ID);
        HttpEntity body = new StringEntity(getObjectmapper().writeValueAsString(summaryRequest));
        InputStream stream = urlopen
                .openStream("https://www.coop.se/ExternalServices/RefundService.svc/RefundSummary", body, true);
        RefundSummaryResponse summaryResponse = readJsonValue(stream, RefundSummaryResponse.class);
        if (summaryResponse != null && summaryResponse.getRefundSummaryResult() != null) {
            // NOTE(review): the two account names below look like Swedish text whose
            // non-ASCII characters were lost - confirm against the original source.
            Account cardRefund = new Account("terbring p ditt kort",
                    BigDecimal.valueOf(summaryResponse.getRefundSummaryResult().getAccountBalance()),
                    "refsummary");
            cardRefund.setCurrency("SEK");
            if (accounts.isEmpty()) {
                // No regular account was found, so the refund balance becomes the main balance.
                balance = cardRefund.getBalance();
                setCurrency(cardRefund.getCurrency());
            }
            accounts.add(cardRefund);

            Account monthRefund = new Account(
                    String.format("terbring fr %s", summaryResponse.getRefundSummaryResult().getMonthName()),
                    BigDecimal.valueOf(summaryResponse.getRefundSummaryResult().getTotalRefund()),
                    "refsummary_month");
            accounts.add(monthRefund);
        }
    } catch (JsonParseException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    } catch (ClientProtocolException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    } catch (IOException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    }

    if (accounts.isEmpty()) {
        throw new BankException(res.getText(R.string.no_accounts_found).toString());
    }
    super.updateComplete();
}

From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java

/**
 * Reads the captcha form fields from the given page.
 * <p>
 * The values of the {@code continue} and {@code q} inputs are URL-encoded as UTF-8 and stored
 * in the {@code continueCaptcha} and {@code q} fields. Each lookup dereferences the first
 * match directly, so a page missing one of the inputs results in a
 * {@link NullPointerException}.
 *
 * @param doc the page containing the captcha form
 * @return the value of the first input whose value consists only of digits
 * @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}
 */
public String getIDCaptcha(Document doc) throws UnsupportedEncodingException {
    String continueValue = doc.select("input[name=continue]").first().attr("value");
    continueCaptcha = URLEncoder.encode(continueValue, "UTF-8");

    String queryValue = doc.select("input[name=q]").first().attr("value");
    q = URLEncoder.encode(queryValue, "UTF-8");

    String captchaId = doc.select("input[value~=^\\d+$]").first().attr("value");
    return captchaId;
}

From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java

/**
 * Parse the description out of the {@code <meta name="description">} tag if one exists.
 * Otherwise, return an empty string.
 *
 * @param doc The Document to parse
 * @return The {@code content} attribute of the description meta tag if the HTML contains
 *         one, otherwise an empty string (never {@code null}).
 */
private String getDescriptionFromDocument(Document doc) {
    Elements metaDescriptionElements = doc.select("meta[name=description]");
    return metaDescriptionElements.isEmpty() ? "" : metaDescriptionElements.attr("content");
}

From source file:gov.medicaid.screening.dao.impl.BBHTLicenseDAOBean.java

/**
 * Parses the license details page into a {@link License} with an attached provider profile.
 * <p>
 * The displayed name is cut at the first comma, then split on single spaces: the last part is
 * the last name, the first part (if there is more than one) the first name, and everything in
 * between the middle name. Any non-blank gender other than "Female" is recorded as male. When
 * several table rows qualify, the values of the last qualifying row win.
 *
 * @param page the details page
 * @param licenseNo if user has multiple licenses, only rows whose number starts with this
 *            value are used; {@code null} accepts every row
 * @return the parsed license details
 * @throws ParsingException if the page does not contain the expected elements
 */
private License parseLicense(Document page, String licenseNo) throws ParsingException {
    String formTitle = page.select("span#lblFormTitle").text();
    if (!"License Details".equals(formTitle)) {
        throw new ParsingException(ErrorCode.MITA50002.getDesc());
    }

    License license = new License();
    ProviderProfile profile = new ProviderProfile();
    license.setProfile(profile);

    // Drop whatever follows the first comma of the displayed name.
    String nameWithType = page.select("#_ctl7_lblName").text();
    int commaAt = nameWithType.indexOf(",");
    String fullName = commaAt == -1 ? nameWithType : nameWithType.substring(0, commaAt);

    User user = new User();
    profile.setUser(user);
    String[] parts = fullName.split(" ");
    int lastIndex = parts.length - 1;
    user.setLastName(parts[lastIndex]);
    if (lastIndex >= 1) {
        user.setFirstName(parts[0]);
    }
    // everything else goes to middle name (per site behavior)
    if (lastIndex >= 2) {
        StringBuilder middle = new StringBuilder();
        for (int i = 1; i < lastIndex; i++) {
            // A separator is only added once something has been written.
            if (middle.length() > 0) {
                middle.append(" ");
            }
            middle.append(parts[i]);
        }
        user.setMiddleName(middle.toString());
    }

    String gender = page.select("#_ctl7_lblGender").text();
    if (Util.isNotBlank(gender)) {
        profile.setSex("Female".equals(gender) ? Sex.FEMALE : Sex.MALE);
    }

    String city = page.select("#_ctl7_lblPublicCity").text();
    if (Util.isNotBlank(city)) {
        Address address = new Address();
        address.setCity(city);
        List<Address> addresses = new ArrayList<Address>();
        addresses.add(address);
        profile.setAddresses(addresses);
    }

    Elements licenseRows = page.select("#_ctl7_dgLicense tr.Normal");
    for (Element row : licenseRows) {
        String licenseNumber = row.select("td:eq(1)").text();
        // user has multiple licenses, the results will show this user twice (search by name)
        boolean wanted = licenseNo == null || licenseNumber.startsWith(licenseNo);
        if (wanted) {
            license.setLicenseNumber(licenseNumber);

            LicenseType type = new LicenseType();
            type.setName(row.select("td:eq(0)").text());
            license.setType(type);

            LicenseStatus status = new LicenseStatus();
            status.setName(row.select("td:eq(2)").text());
            license.setStatus(status);

            String issued = row.select("td:eq(3)").text();
            if (Util.isNotBlank(issued)) {
                license.setOriginalIssueDate(parseDate(issued, DATE_FORMAT));
            }

            String expires = row.select("td:eq(4)").text();
            if (Util.isNotBlank(expires)) {
                license.setExpireDate(parseDate(expires, DATE_FORMAT));
            }
        }
    }
    licenseRows.clear();
    return license;
}

From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java

/**
 * ?page??/* w w w .  j  a v a  2 s.co  m*/
 */
@Override
public void visit(Page page) {
    try {
        String url = page.getWebURL().getURL();

        page.setContentType("text/html; charset=" + gather.getEncoding());
        Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get();

        String title = doc.title();
        if (gather.getTitleExternal() && gather.getTitleRegex() != null
                && gather.getTitleRegex().length() > 0) {
            Elements titleEles = doc.select(gather.getTitleRegex());
            if (!titleEles.isEmpty()) {
                String tempTitle = titleEles.text();
                if (tempTitle != null && tempTitle.length() > 0) {
                    title = tempTitle;
                }
            }
        }

        if (title != null && title.trim().length() > 0) {
            Elements elements = doc.select(matchRegex);
            if (filterRegex != null && filterRegex.trim().length() > 0) {
                elements = elements.not(filterRegex);
            }
            if (!elements.isEmpty()) {
                String subHtml = elements.html();
                Document blockDoc = Jsoup.parse(subHtml);
                String contentText = blockDoc.html();

                if (gather.getRemoveHref()) {
                    Document moveDoc = Jsoup.parse(contentText);
                    Elements moveEles = moveDoc.select("*").not("a");
                    contentText = moveEles.html();
                }
                if (gather.getRemoveHtmlTag())
                    contentText = doc.text();

                if (isLocal) {
                    contentText = doc.text();

                    Boolean isMatcher = true;
                    for (int i = 0; i < keys.length; i++) {
                        Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find();
                        if (!result) {
                            isMatcher = false;
                            break;
                        }
                    }

                    if (isMatcher) {
                        Storage storage = new Storage();
                        storage.setGatherId(gather.getId());
                        storage.setGatherName(gather.getName());
                        storage.setTitle(title);
                        storage.setUrl(url);
                        try {
                            gatherService.addStorage(storage);
                        } catch (Exception e) {
                            logger.error("save storage error : {}", e.getLocalizedMessage());
                        } finally {
                            storage = null;
                        }
                    }
                } else {
                    Content content = new Content();
                    content.setDetail(contentText);
                    content.setPage(1);
                    List<Content> contents = new ArrayList<Content>();
                    contents.add(content);

                    Article article = new Article();
                    article.setTitle(title);
                    article.setContents(contents);

                    articleMainService.addArticleMainByCrawler(article, gather.getChannelId(),
                            CrawlerUtil.USER_NAME);
                }
            }
        }
    } catch (IOException e) {
        logger.warn(e.getLocalizedMessage());
    }
}

From source file:info.smartkit.hairy_batman.query.SogouSearchQuery.java

/**
 * Looks up the search result page for the current subscription and extracts an id from it.
 * <p>
 * Every selected element whose text contains the configured keyword is logged. While
 * {@code wxFoo} has no user id, the text following the keyword is stored through
 * {@code setOpenId} and {@code wxFoo} is appended to
 * {@code GlobalVariables.wxFooListWithUserId}. An {@link IOException} is logged and
 * otherwise ignored.
 */
public void parseWxUserId() {
    try {
        // need http protocol
        Document resultPage = Jsoup.connect(GlobalConsts.SOGOU_SEARCH_URL_BASE + wxFoo.getSubscribeId())
                .get();

        for (Element span : resultPage.select(GlobalConsts.SOGOU_SEARCH_WX_USER_ID_HTML_ELEMENTS)) {
            if (!span.hasText()) {
                continue;
            }
            String spanText = span.text();
            if (!spanText.contains(GlobalConsts.SOGOU_SEARCH_WX_USER_ID_KEYWORDS)) {
                continue;
            }

            LOG.info("openId span text : " + spanText);
            // NOTE(review): the guard and the log read getUserId() while the value is stored
            // with setOpenId(); the original carried an unreadable FIXME here - confirm which
            // property is meant.
            if (this.wxFoo.getUserId() != null) {
                continue;
            }
            // The id is whatever follows the keyword inside the span text.
            this.wxFoo.setOpenId(spanText.split(GlobalConsts.SOGOU_SEARCH_WX_USER_ID_KEYWORDS)[1]);
            LOG.info("saved wxUserId value: " + this.wxFoo.getUserId());
            GlobalVariables.wxFooListWithUserId.add(this.wxFoo);
        }
    } catch (IOException e) {
        LOG.error(e.toString());
    }
}

From source file:org.brunocvcunha.taskerbox.impl.jobs.LinkedInJobSeeker.java

/**
 * Evaluates one job posting and performs the configured action on it when it passes every
 * filter.
 * <p>
 * Filters are applied in this order: already applied, already seen in this run, external
 * source domain, already handled URL, ignored external domains, title, employer, location,
 * on-site apply availability, visa wording, experience wording and finally the score computed
 * from the description and skills sections of the job page. A job rejected by a filter after
 * the "already handled" check is recorded via {@code addAlreadyPerformedAction}.
 *
 * @param job the job entry; {@code isApplied}, {@code id}, {@code fmt_jobTitle} and
 *            {@code fmt_companyName} are required, {@code sourceDomain} and
 *            {@code fmt_location} are optional
 * @return {@code false} if the job was already applied to, was already seen during this run,
 *         or the action limit was reached (in which case the seeker is paused);
 *         {@code true} in every other case
 * @throws JSONException if a required key is missing from {@code job}
 * @throws ClientProtocolException if the job page request fails at the protocol level
 * @throws IOException if the job page cannot be retrieved or read
 * @throws URISyntaxException if the job URL is rejected by the HTTP helper
 */
private boolean handleJob(JSONObject job)
        throws JSONException, ClientProtocolException, IOException, URISyntaxException {
    if (job.getBoolean("isApplied")) {
        return false;
    }

    long jobId = job.getLong("id");

    if (this.openIds.contains(jobId)) {
        return false;
    }
    this.openIds.add(jobId);

    String jobTitle = job.getString("fmt_jobTitle").replaceAll("</?B>", "");

    if (!this.externalApply && job.has("sourceDomain")) {
        String sourceDomain = job.getString("sourceDomain");
        logInfo(log, jobId + " - " + jobTitle + " - " + sourceDomain + " --> ignored [external]");

        // jobvite and ziprecruiter postings are still processed further below.
        if (!sourceDomain.contains("jobvite") && !sourceDomain.contains("ziprecruiter")) {
            return true;
        }
    }

    String jobEmployer = job.getString("fmt_companyName");

    String jobUrl = "https://www.linkedin.com/jobs2/view/" + jobId;
    if (alreadyPerformedAction(jobUrl)) {
        return true;
    }

    String location = "";
    if (job.has("fmt_location")) {
        location = job.getString("fmt_location");
    }
    String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer;

    if (job.has("sourceDomain")) {
        String sourceDomain = job.getString("sourceDomain");
        if (this.externalApply && (sourceDomain.contains("empregocerto.uol.com.br")
                || sourceDomain.contains("jobomas.com") || sourceDomain.contains("curriculum.com.br"))) {
            logInfo(log, "-- Ignored [externalApply - domain " + sourceDomain + "] " + headline);
            addAlreadyPerformedAction(jobUrl);
            return true;
        }
    }

    if (!considerTitle(jobTitle)) {
        logInfo(log, "-- Ignored [title] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    // Append the accepted title to a local list. try-with-resources guarantees the writer is
    // closed even when the write itself fails.
    try (FileWriter out = new FileWriter(new File(this.tempDir + "\\job-db\\_titles.txt"), true)) {
        out.write(jobTitle + "\r\n");
    } catch (Exception ignored) {
        // Recording titles is best-effort bookkeeping: a missing directory or an unwritable
        // file must not prevent the job from being evaluated.
    }

    if (!considerEmployer(jobEmployer)) {
        logInfo(log, "-- Ignored [employer] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    if (!considerLocation(location)) {
        logInfo(log, "-- Ignored [location] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl);
    String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity);
    Document jobDocument = Jsoup.parse(jobResult);
    Elements elDescription = jobDocument.select("div.description-section").select("div.rich-text");
    Elements elSkills = jobDocument.select("div.skills-section").select("div.rich-text");

    if (!this.externalApply && !jobResult.contains("onsite-apply")) {
        logInfo(log, "-- Ignored [onsite apply] " + headline);
        addAlreadyPerformedAction(jobUrl);

        // Pause before the next job page is requested (presumably to throttle requests).
        try {
            Thread.sleep(5000L);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the caller can still observe the interruption.
            Thread.currentThread().interrupt();
        }

        return true;
    }

    if (!considerVisaDescription(elDescription.html()) || !considerVisaDescription(elSkills.html())) {
        logInfo(log, "-- Ignored [visa] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }
    if (!considerExperienceDescription(elDescription.html())
            || !considerExperienceDescription(elSkills.html())) {
        logInfo(log, "-- Ignored [exp] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    ScorerResult result = LinkedInJobDBComparer.getScore(elDescription.html() + " - " + elSkills.html());

    if (result.getScore() < this.requiredScore) {
        logInfo(log,
                "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    headline = headline + " - " + result.getMatches();

    logInfo(log, headline);
    logInfo(log, elDescription.html());

    // Once the action limit is reached the seeker pauses itself and reports "stop".
    if (this.actionCount++ == this.maxCount) {
        this.setPaused(true);
        return false;
    }

    performUnique(jobUrl);

    // Pause before the next job page is requested (presumably to throttle requests).
    try {
        Thread.sleep(5000L);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the caller can still observe the interruption.
        Thread.currentThread().interrupt();
    }

    return true;

}

From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java

/**
 * Expands each seed into one list-page seed per result page.
 * <p>
 * For every seed the HTML is parsed, the {@code #pageForm} form is located, and for each page
 * number from 1 to the total page count a new seed is created that posts the form's input
 * values with {@code curstart} set to that page number.
 *
 * @param seeds the seeds whose HTML contains the paging form
 * @return the generated page seeds; {@code null} if {@code seeds} is empty or if any seed
 *         lacks the {@code #pageForm} element
 * @throws Exception if parsing or seed creation fails
 */
@Override
public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception {

    if (CollectionUtils.isEmpty(seeds)) {
        return null;
    }

    Collection<HttpSeed> pageSeeds = new ArrayList<HttpSeed>();

    for (HttpSeed seed : seeds) {
        Document doc = parse(seed.getHtml());

        // The paging form carries both the target URL and the query parameters.
        Elements pagingForms = doc.select("#pageForm");
        if (pagingForms.isEmpty()) {
            return null;
        }
        Element pagingForm = pagingForms.get(0);

        String url = DOMAIN + pagingForm.attr("action");
        Elements inputs = pagingForm.select("input");

        int totalPageNum = this.getTotalPageNum(doc);

        int pageNo = 1;
        while (pageNo <= totalPageNum) {
            // Start from the form's own input values ...
            Map<String, String> params = new HashMap<String, String>();
            for (Element input : inputs) {
                params.put(input.attr("name"), input.attr("value"));
            }
            // ... and point the request at the current page.
            params.put("curstart", String.valueOf(pageNo));

            pageSeeds.add(this.initListHttpSeed(url, params));
            pageNo++;
        }
    }

    return pageSeeds;
}

From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java

/**
 * Proxy a HTTP URL locally.
 * <p>
 * The URL is requested once to learn its content type. A jar or generic binary is registered
 * as an OBR resource (when it matches the filter); anything else is treated as a browsable
 * page whose links - except the first one - are proxied recursively.
 *
 * @param url    the HTTP URL to proxy.
 * @param filter regex filter. Only artifacts URL matching the filter will be considered.
 * @throws Exception in case of proxy failure.
 */
private void proxyHttp(String url, String filter) throws Exception {
    LOGGER.debug("Proxying HTTP URL {}", url);

    // Only the content type of the response is needed, so the client is released before any
    // recursion starts instead of keeping one open connection per nesting level.
    String contentType = null;
    HttpClient httpClient = new DefaultHttpClient();
    try {
        HttpResponse response = httpClient.execute(new HttpGet(url));
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return;
        }
        // The Content-Type header is optional; a response without it is handled like a page.
        if (entity.getContentType() != null) {
            contentType = entity.getContentType().getValue();
        }
    } finally {
        httpClient.getConnectionManager().shutdown();
    }

    if ("application/java-archive".equals(contentType)
            || "application/octet-stream".equals(contentType)) {
        // I have a jar/binary, potentially a resource
        try {
            if ((filter == null) || (url.matches(filter))) {
                Resource resource = new DataModelHelperImpl().createResource(new URL(url));
                if (resource != null) {
                    obrRepository.addResource(resource);
                    obrRepository.setLastModified(System.currentTimeMillis());
                }
            }
        } catch (IllegalArgumentException e) {
            LOGGER.warn(e.getMessage());
        }
    } else {
        // try to find link to "browse"
        try {
            Document document = Jsoup.connect(url).get();

            Elements links = document.select("a");
            // The first link is deliberately skipped, as in the original implementation.
            for (int i = 1; i < links.size(); i++) {
                String absoluteHref = links.get(i).attr("abs:href");
                // jsoup yields an empty string when a link cannot be made absolute; there is
                // nothing to request in that case.
                if (absoluteHref.length() == 0) {
                    continue;
                }
                this.proxyHttp(absoluteHref, filter);
            }
        } catch (UnsupportedMimeTypeException e) {
            // ignore
        }
    }
}