Example usage for org.jsoup.nodes Document getElementsByTag

Usage examples for org.jsoup.nodes.Document.getElementsByTag

Introduction

This page collects usage examples for org.jsoup.nodes.Document.getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:net.GoTicketing.GoTicketing.java

/**
 * ???/*from w w w. ja v a  2  s  .  c  om*/
 * @throws Exception 
 */
private void praseFormActionSrc() throws Exception {
    Document doc = Jsoup.parse(TicketingPageHTML);
    Element form = doc.getElementsByTag("form").first();
    if (form == null)
        throw new Exception("Can't get form action source !");

    String src = form.attr("action");
    FormActionSrc = host + src.replace("/", "%2F");

    FormInputData = new TreeMap<>();
    for (Element elm : form.getElementsByTag("input")) {
        if (elm.attr("type").equals("hidden"))
            FormInputData.put(elm.attr("name"), elm.attr("value"));
    }
}

From source file:org.aliuge.crawler.fetcher.FetchWorker.java

/**
 * @param url/*from  w w  w . j  av  a 2  s. c  om*/
 * @desc 
 */
public void fetchPage(WebURL url) {

    PageFetchResult result = null;
    try {
        if (null != url && StringUtils.isNotBlank(url.getUrl())) {

            result = fetcher.fetch(url, true);
            // ??
            int statusCode = result.getStatusCode();
            if (statusCode == CustomFetchStatus.PageTooBig) {
                onIgnored(url);
                return;
            }
            if (statusCode != HttpStatus.SC_OK) {
                onFailed(url);
            } else {
                Page page = new Page(url);
                onSuccessed();
                if (!result.fetchContent(page)) {
                    onFailed(url);
                    return;
                }
                if (!parser.parse(page, url.getUrl())) {
                    onFailed(url);
                    return;
                }
                // ??
                String e_url = extractFilterAndChangeUrl(url.getUrl());
                if (StringUtils.isNoneBlank(e_url)) {
                    url.setUrl(e_url);
                    page.setWebURL(url);
                    pendingPages.addElement(page);
                    return;
                }

                // depth
                if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) {
                    return;
                }
                // ???Url?Url
                Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                        urlUtils.getBaseUrl(page.getWebURL().getUrl()));
                Elements links = doc.getElementsByTag("a");
                if (!links.isEmpty()) {
                    for (Element link : links) {
                        String linkHref = link.absUrl("href");

                        // ???url
                        if ((fetchFilter(linkHref) || extractFilter(linkHref))
                                && !bloomfilterHelper.exist(linkHref)) {
                            WebURL purl = new WebURL();
                            purl.setName(link.text());
                            purl.setUrl(linkHref);

                            purl.setDepth((short) (url.getDepth() + 1));
                            if (purl.getDepth() > config.getMaxDepthOfCrawling()
                                    && config.getMaxDepthOfCrawling() != -1)
                                return;
                            try {
                                if (!pendingUrls.addElement(purl)) {
                                    FileUtils.writeStringToFile(new File("status/_urls.good"),
                                            url.getUrl() + "\n", true);
                                }
                            } catch (QueueException e) {
                                log.error(e.getMessage());
                            }
                        }
                    }
                }
            }

        }
    } catch (QueueException e) {
        onFailed(url);
    } catch (Exception e) {
        e.printStackTrace();
        onFailed(url);
    } finally {
        if (null != result)
            result.discardContentIfNotConsumed();
    }
}

From source file:org.manalith.ircbot.plugin.weather.WeatherPlugin.java

/**
 * Looks up the current weather for a place name through the Yahoo YQL and
 * forecast RSS endpoints and formats it as a single reply line.
 *
 * @param keyword place name to search for; it is URL-encoded into the lookup query
 * @return the formatted weather line, a "not found" style message when the
 *         lookup yields the fallback WOEID, or an error message when an
 *         {@link IOException} occurs
 */
@BotCommand("")
public String getYahooWeather(@Option(name = "", help = " ? ? ") String keyword) {
    final String forecastUrlTemplate = "http://weather.yahooapis.com/forecastrss?w=%s&u=c";
    // WOEID value that the code treats as "no matching place".
    final String fallbackWoeid = "23424868";

    try {
        // TODO: the original note about the WOEID lookup is unreadable in this copy
        // example : http://query.yahooapis.com/v1/public/yql?q=select woeid
        // from geo.places where text%3D\"%2C ko-KR\" limit 1
        final String placeLookupUrl = "http://query.yahooapis.com/v1/public/yql"
                + "?q=select%20woeid%20from%20geo.places%20where%20text%3D%22"
                + URLEncoder.encode(keyword, "UTF-8") + "%20ko-KR%22%20limit%201";

        final String woeid = Jsoup.connect(placeLookupUrl).get().select("woeid").text();
        if (fallbackWoeid.equals(woeid)) {
            return String.format(
                    "[%s] ? ?  . ?   ?.",
                    keyword);
        }

        // example:
        // http://weather.yahooapis.com/forecastrss?w=1132599&u=c
        final Document forecast = Jsoup.connect(String.format(forecastUrlTemplate, woeid)).get();
        final String city = forecast.getElementsByTag("yweather:location").attr("city");
        final String sky = forecast.getElementsByTag("yweather:condition").attr("text");
        final String temperature = forecast.getElementsByTag("yweather:condition").attr("temp");
        final String humidityPercent = forecast.getElementsByTag("yweather:atmosphere").attr("humidity");
        final String windSpeed = forecast.getElementsByTag("yweather:wind").attr("speed");

        return String.format("[%s] %s ? %s, ? %s%%, ?? %skm/h", city, sky, temperature,
                humidityPercent, windSpeed);
    } catch (IOException e) {
        logger.error("failed to run command", e);
        return " ? : " + e.getMessage();
    }
}

From source file:prince.app.ccm.tools.Task.java

/**
 * Builds the URL-encoded credential string for the login form contained in the
 * given page.
 * <p>
 * As a side effect the {@code log} field is set to {@code MAIN_PAGE} plus the
 * {@code action} of the first {@code form} element. Only the inputs named
 * {@code usuario} and {@code contrasena} inside the element with id
 * {@code contenido_right} contribute to the result; their values are replaced
 * with {@code username} and {@code password}.
 *
 * @param html     markup of the login page
 * @param username value sent for the {@code usuario} field
 * @param password value sent for the {@code contrasena} field
 * @return the matching fields as {@code name=value} pairs joined with {@code &},
 *         in document order; empty when neither field is present
 * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
 * @throws IllegalStateException if the page has no {@code form} element or no
 *         element with id {@code contenido_right}
 */
public String getFormParams(String html, String username, String password) throws UnsupportedEncodingException {

    System.out.println("Extracting form's data...");

    Document doc = Jsoup.parse(html);

    // first() yields null when the page has no form; fail with a clear message
    // instead of a bare NullPointerException.
    Element form = doc.getElementsByTag("form").first();
    if (form == null) {
        throw new IllegalStateException("No <form> element found in the login page");
    }
    log = MAIN_PAGE + form.attr("action");
    Log.e(TAG, "Action: " + log);

    // Container that holds the login inputs.
    Element loginform = doc.getElementById("contenido_right");
    if (loginform == null) {
        throw new IllegalStateException("No element with id 'contenido_right' found in the login page");
    }

    StringBuilder result = new StringBuilder();
    for (Element inputElement : loginform.getElementsByTag("input")) {
        String key = inputElement.attr("name");
        String value;
        if (key.equals("usuario")) {
            value = username;
        } else if (key.equals("contrasena")) {
            value = password;
        } else {
            // Every other input is irrelevant for the login request.
            continue;
        }
        if (result.length() > 0) {
            result.append('&');
        }
        result.append(key).append('=').append(URLEncoder.encode(value, "UTF-8"));
    }

    // The payload contains the password, so it is deliberately not logged.
    Log.d(TAG, "Done in getFormParams");
    return result.toString();
}

From source file:com.danielme.muspyforandroid.services.MuspyClient.java

/**
 * Requests {@code Constants.URL_RESET} and extracts the CSRF token from the
 * returned markup.
 *
 * @return the {@code value} attribute of the first {@code input} element whose
 *         {@code name} is {@code csrfmiddlewaretoken}
 * @throws Exception if the request fails or the page contains no such input
 */
private String getCSRF() throws Exception {
    HttpGet request = new HttpGet(Constants.URL_RESET);
    HttpResponse response = getDefaultHttpClient().execute(TARGETHOST, request, getBasicHttpContext());
    String markup = EntityUtils.toString(response.getEntity());

    for (Element input : Jsoup.parse(markup).getElementsByTag("input")) {
        if (!"csrfmiddlewaretoken".equals(input.attr("name"))) {
            continue;
        }
        return input.attr("value");
    }
    throw new Exception("csrf not found");
}

From source file:net.GoTicketing.GoTicketing.java

/**
 * Maps the markup in {@code FinishTicketingPageHTML} to a result code by
 * comparing the text of its {@code font} and {@code strong} elements against
 * fixed messages.
 * <p>
 * The checks run in a fixed priority order and the first match wins. On the
 * success match the text of the element with id {@code spanOrderCode} is stored
 * on {@code ticket} via {@code setOrderCode}.
 * <p>
 * NOTE(review): the message literals below look like they lost their original
 * non-ASCII text (only {@code ?} and empty strings remain). As written,
 * {@code contains("")} is true for every string, so the checks that follow the
 * {@code TICKETING_TO_EARLY} loop cannot be reached once any {@code strong}
 * element exists - restore the literals from the upstream source.
 *
 * @return one of {@code ROCID_WRONG}, {@code RAND_OR_TIMEOUT_FAIL},
 *         {@code TRAIN_STATION_WRONG}, {@code TRAIN_NO_SEAT},
 *         {@code TICKETING_TO_LATE}, {@code TICKETING_TO_EARLY},
 *         {@code TRAIN_NUM_WRONG}, {@code TICKETING_SUCCESS}, or
 *         {@code UNKNOW_PRASE_RESULT} when nothing matched
 */
private int praseTicketingResultPage() {
    Document doc = Jsoup.parse(FinishTicketingPageHTML);

    Elements fonts = doc.getElementsByTag("font");
    // NOTE(review): presumably a defensive guard - jsoup is documented to return
    // an empty list rather than null here; confirm against the jsoup version used.
    if (fonts != null) {

        // Presumably the "wrong ID number" message, judging by the constant name.
        for (Element font : fonts) {
            if (font.text().equals(", ?"))
                return ROCID_WRONG;
        }
    }

    Elements strongs = doc.getElementsByTag("strong");
    if (strongs != null) {

        // Each loop scans ALL strong elements for one message before the next
        // message is considered, so the loop order is the priority order.

        // Presumably the captcha/timeout failure message.
        for (Element strong : strongs) {
            if (strong.text().equals(""))
                return RAND_OR_TIMEOUT_FAIL;
        }

        // Presumably the "wrong station" message.
        for (Element strong : strongs) {
            if (strong.text().contains(
                    "????"))
                return TRAIN_STATION_WRONG;
        }

        // Presumably the "no seats left" messages (two accepted wordings).
        for (Element strong : strongs) {
            if (strong.text().equals("??")
                    || strong.text().equals("????"))
                return TRAIN_NO_SEAT;
        }

        // Presumably the "booking window already closed" message.
        for (Element strong : strongs) {
            if (strong.text().contains("?(?)?"))
                return TICKETING_TO_LATE;
        }

        // Presumably the "booking not open yet" message.
        // NOTE(review): contains("") matches any text - see the method comment.
        for (Element strong : strongs) {
            if (strong.text().contains(""))
                return TICKETING_TO_EARLY;
        }

        // Presumably the "wrong train number" message.
        for (Element strong : strongs) {
            if (strong.text().contains("??"))
                return TRAIN_NUM_WRONG;
        }

        // Success: remember the order code shown on the page.
        for (Element strong : strongs) {
            if (strong.text().equals("")) {
                String orderCode = doc.getElementById("spanOrderCode").text();
                ticket.setOrderCode(orderCode);

                return TICKETING_SUCCESS;
            }
        }
    }

    // No known message was found on the page.
    return UNKNOW_PRASE_RESULT;
}

From source file:ac.simons.oembed.Oembed.java

/**
 * Walks over every anchor of the given document and tries to replace it with
 * its oEmbed representation.
 * <p>
 * For each anchor the absolute {@code href} is passed to {@code transformUrl}.
 * When a handler is registered for the response's source, that handler is
 * invoked with the document, the anchor and the response. Otherwise the
 * response is rendered and, when rendering yields a result, the markup is
 * inserted before the anchor and the anchor is removed. Anchors without a
 * response, without a rendering, or whose processing raises an
 * {@code OembedException} are left untouched.
 * <p>
 * When the document has no base URI and this instance has one, it is set for
 * the duration of the call and reset to {@code null} afterwards.
 *
 * @param document The document that should be checked for links to transform
 * @return the same document instance after the transformation
 */
public Document transformDocument(final Document document) {
    final boolean baseUriInjected = document.baseUri() == null && this.getBaseUri() != null;
    if (baseUriInjected) {
        document.setBaseUri(this.getBaseUri());
    }

    for (Element anchor : document.getElementsByTag("a")) {
        final String href = anchor.absUrl("href");
        try {
            final OembedResponse response = this.transformUrl(href);
            if (response == null) {
                // No response (or an error inside transformUrl): keep the anchor.
                continue;
            }

            if (this.getHandler().containsKey(response.getSource())) {
                // A dedicated handler decides what happens to the anchor.
                this.getHandler().get(response.getSource()).handle(document, anchor, response);
                continue;
            }

            // Fall back to the response's own rendering.
            final String rendered = response.render();
            if (rendered != null) {
                anchor.before(rendered);
                anchor.remove();
            }
        } catch (OembedException e) {
            logger.warn(String.format("Skipping '%s': %s", href, e.getMessage()));
        }
    }

    if (baseUriInjected) {
        document.setBaseUri(null);
    }
    return document;
}

From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java

/**
 * Handles a redirect to a captcha page: downloads the captcha image, asks the
 * configured solver for an answer, submits it and follows up to two redirects.
 *
 * @param captchaRedirect URL of the captcha page; it must contain
 *                        {@code ?continue=}
 * @return {@code Status.OK} when the submission ends on a 200 response;
 *         {@code Status.ERROR_IP_BANNED} when the captcha page answers 403;
 *         {@code Status.ERROR_CAPTCHA_NO_SOLVER} when no solver is configured;
 *         {@code Status.ERROR_NETWORK} when the page, image or form cannot be
 *         obtained or is malformed; {@code Status.ERROR_CAPTCHA_INCORRECT}
 *         otherwise
 */
protected Status handleCaptchaRedirect(String captchaRedirect) {
    if (captchaRedirect == null || !captchaRedirect.contains("?continue=")) {
        return Status.ERROR_NETWORK;
    }
    LOG.debug("captcha form detected via {}", http.getProxy() == null ? new DirectNoProxy() : http.getProxy());

    int status = http.get(captchaRedirect);
    if (status == 403) {
        return Status.ERROR_IP_BANNED;
    }

    if (solver == null) {
        return Status.ERROR_CAPTCHA_NO_SOLVER;
    }

    String content = http.getContentAsString();
    if (content == null) {
        return Status.ERROR_NETWORK;
    }

    // Locate the captcha image; when several images match, the last one wins.
    String imageSrc = null;
    Document captchaDocument = Jsoup.parse(content, captchaRedirect);
    Elements elements = captchaDocument.getElementsByTag("img");
    for (Element element : elements) {
        String src = element.attr("abs:src");
        if (src != null && src.contains("/sorry/image")) {
            imageSrc = src;
        }
    }

    if (imageSrc == null) {
        LOG.debug("can't find captcha img tag");
        return Status.ERROR_NETWORK;
    }

    Element form = captchaDocument.getElementsByTag("form").first();
    if (form == null) {
        LOG.debug("can't find captcha form");
        return Status.ERROR_NETWORK;
    }

    // Collect the fields that have to be echoed back with the answer.
    String continueValue = null;
    String formIdValue = null;
    String formUrl = form.attr("abs:action");
    String formQValue = null;

    Element elementCaptchaId = form.getElementsByAttributeValue("name", "id").first();
    if (elementCaptchaId != null) {
        formIdValue = elementCaptchaId.attr("value");
    }
    Element elementContinue = form.getElementsByAttributeValue("name", "continue").first();
    if (elementContinue != null) {
        continueValue = elementContinue.attr("value");
    }
    Element elementQ = form.getElementsByAttributeValue("name", "q").first();
    if (elementQ != null) {
        formQValue = elementQ.attr("value");
    }

    // The form needs "continue" plus at least one of "id" / "q".
    if (formUrl == null || (formIdValue == null && formQValue == null) || continueValue == null) {
        LOG.debug("invalid captcha form");
        return Status.ERROR_NETWORK;
    }

    int imgStatus = http.get(imageSrc, captchaRedirect);
    if (imgStatus != 200 || http.getContent() == null) {
        LOG.debug("can't download captcha image {} (status code = {})", imageSrc, imgStatus);
        return Status.ERROR_NETWORK;
    }

    CaptchaImage captcha = new CaptchaImage(new byte[][] { http.getContent() });
    boolean solved = solver.solve(captcha);
    if (!solved || !Captcha.Status.SOLVED.equals(captcha.getStatus())) {
        LOG.error("solver can't resolve captcha (overload ?) error = {}", captcha.getError());
        return Status.ERROR_CAPTCHA_INCORRECT;
    }
    LOG.debug("got captcha response {} in {} seconds from {}", captcha.getResponse(),
            captcha.getSolveDuration() / 1000L,
            (captcha.getLastSolver() == null ? "?" : captcha.getLastSolver().getFriendlyName()));

    try {
        formUrl += "?continue=" + URLEncoder.encode(continueValue, "utf-8");
    } catch (Exception ex) {
        // Continuing here would append "&captcha=..." to a URL without a query
        // string, so report the failure instead of silently swallowing it.
        LOG.debug("can't encode captcha continue value", ex);
        return Status.ERROR_NETWORK;
    }
    // NOTE(review): the captcha answer, id and q values are appended without URL
    // encoding - confirm they can never contain reserved characters.
    formUrl += "&captcha=" + captcha.getResponse();

    if (formIdValue != null) {
        formUrl += "&id=" + formIdValue;
    }
    if (formQValue != null) {
        formUrl += "&q=" + formQValue;
    }

    int postCaptchaStatus = http.get(formUrl, captchaRedirect);

    if (postCaptchaStatus == 302) {
        String redirectOnSuccess = http.getResponseHeader("location");
        if (redirectOnSuccess == null) {
            // A 302 without a target cannot be followed; previously this was a
            // NullPointerException.
            LOG.debug("captcha submission redirected without location header");
            return Status.ERROR_NETWORK;
        }
        // Upgrade plain-http redirects to https before following them.
        if (redirectOnSuccess.startsWith("http://")) {
            redirectOnSuccess = "https://" + redirectOnSuccess.substring(7);
        }

        int redirect1status = http.get(redirectOnSuccess, captchaRedirect);
        if (redirect1status == 200) {
            return Status.OK;
        }

        if (redirect1status == 302) {
            String secondRedirect = http.getResponseHeader("location");
            if (secondRedirect != null && http.get(secondRedirect, captchaRedirect) == 200) {
                return Status.OK;
            }
        }
    }

    if (postCaptchaStatus == 503) {
        LOG.debug("reporting incorrect captcha (incorrect response = {})", captcha.getResponse());
        solver.reportIncorrect(captcha);
    }

    return Status.ERROR_CAPTCHA_INCORRECT;
}

From source file:se.vgregion.portal.iframe.controller.CSViewController.java

/**
 * Returns the first {@code button} element, in document order, whose {@code id}
 * attribute starts with the given prefix.
 *
 * @param doc     document to search
 * @param pattern literal prefix the id has to start with (not a regular expression)
 * @return the first matching button, or {@code null} when there is none
 */
private Element findButtonWithIdWhichStartsWith(Document doc, String pattern) {
    for (Element button : doc.getElementsByTag("button")) {
        String buttonId = button.attr("id");
        if (buttonId != null && buttonId.startsWith(pattern)) {
            return button;
        }
    }
    return null;
}

From source file:com.zacwolf.commons.email.Email.java

/**
 * Normalises the message document before it is sent.
 * <ul>
 * <li>removes comments via {@code removeComments},</li>
 * <li>sets the first {@code title} element to the subject,</li>
 * <li>fills every element with class {@code date} with the current time from
 * {@code TimeUtils.getGMTtime()}, formatted with the element's {@code format}
 * attribute when present and valid, otherwise with the default pattern,</li>
 * <li>appends {@code border-spacing:0;} to the {@code style} of every table
 * that does not declare {@code border-spacing:} yet.</li>
 * </ul>
 *
 * @param doc the document to modify in place
 */
private void prepare(final org.jsoup.nodes.Document doc) {
    removeComments(doc);//Remove any comments from the html of the message to reduce the size

    //Change the title to match the subject of the email
    final org.jsoup.nodes.Element title = doc.getElementsByTag("title").first();
    if (title != null) {
        title.html(getSubject());
    }

    //Replace the contents of any tags with class="date" with the current date
    for (org.jsoup.nodes.Element datelem : doc.getElementsByClass("date")) {
        // Four or more 'M' letters select the full month name.
        SimpleDateFormat df = new SimpleDateFormat("MMMMMMMMMM d, yyyy");
        if (datelem.hasAttr("format")) {
            try {
                df = new SimpleDateFormat(datelem.attr("format"));
            } catch (IllegalArgumentException ignored) {
                // Invalid pattern: deliberately fall back to the default format.
            }
        }
        // Written outside the format check so that elements without a "format"
        // attribute are filled as well, using the default pattern.
        datelem.html(df.format(TimeUtils.getGMTtime()));
    }

    //tables need the border-spacing: style attribute; added for GMail compatiblity
    for (org.jsoup.nodes.Element tbl : doc.getElementsByTag("table")) {
        final String style = tbl.attr("style");
        if (style.contains("border-spacing:")) {
            continue;
        }
        // Only insert a separator when there is an unterminated declaration.
        final String separator = (style.isEmpty() || style.endsWith(";")) ? "" : ";";
        tbl.attr("style", style + separator + "border-spacing:0;");
    }
}