List of usage examples for org.jsoup.nodes Document getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:net.GoTicketing.GoTicketing.java
/** * ???/*from w w w. ja v a 2 s . c om*/ * @throws Exception */ private void praseFormActionSrc() throws Exception { Document doc = Jsoup.parse(TicketingPageHTML); Element form = doc.getElementsByTag("form").first(); if (form == null) throw new Exception("Can't get form action source !"); String src = form.attr("action"); FormActionSrc = host + src.replace("/", "%2F"); FormInputData = new TreeMap<>(); for (Element elm : form.getElementsByTag("input")) { if (elm.attr("type").equals("hidden")) FormInputData.put(elm.attr("name"), elm.attr("value")); } }
From source file:org.aliuge.crawler.fetcher.FetchWorker.java
/**
 * Fetches one URL, hands the page to the parser and queues either the page itself or the
 * links found on it.
 *
 * <p>Flow, as implemented below:
 * <ol>
 *   <li>A {@code null} URL or a blank address is skipped without any callback.</li>
 *   <li>{@code PageTooBig} triggers {@code onIgnored}; any other non-200 status triggers
 *       {@code onFailed}.</li>
 *   <li>On 200, {@code onSuccessed()} is called <em>before</em> the content is read and
 *       parsed; a failure in either of those steps additionally calls {@code onFailed}.</li>
 *   <li>If {@code extractFilterAndChangeUrl} yields a non-blank URL, the page is queued in
 *       {@code pendingPages} and no links are extracted from it.</li>
 *   <li>Otherwise every {@code <a>} on the page that passes the filters and is not yet known
 *       to the bloom filter is queued in {@code pendingUrls} with depth + 1.</li>
 * </ol>
 * The fetch result's content is always discarded in the {@code finally} block.
 *
 * @param url the URL to fetch; may be {@code null}
 */
public void fetchPage(WebURL url) {
    PageFetchResult result = null;
    try {
        if (null != url && StringUtils.isNotBlank(url.getUrl())) {
            result = fetcher.fetch(url, true);
            // Classify the response by status code.
            int statusCode = result.getStatusCode();
            if (statusCode == CustomFetchStatus.PageTooBig) {
                onIgnored(url);
                return;
            }
            if (statusCode != HttpStatus.SC_OK) {
                onFailed(url);
            } else {
                Page page = new Page(url);
                // NOTE(review): success is reported before content fetch/parse, so a page can be
                // counted by both onSuccessed() and onFailed() — confirm this is intended.
                onSuccessed();
                if (!result.fetchContent(page)) {
                    onFailed(url);
                    return;
                }
                if (!parser.parse(page, url.getUrl())) {
                    onFailed(url);
                    return;
                }
                // A non-blank result means this page is itself queued for extraction (under the
                // returned URL) instead of being scanned for links.
                String e_url = extractFilterAndChangeUrl(url.getUrl());
                if (StringUtils.isNoneBlank(e_url)) {
                    url.setUrl(e_url);
                    page.setWebURL(url);
                    pendingPages.addElement(page);
                    return;
                }
                // Depth limit: stop here when this page is already deeper than the configured
                // maximum (the -1 guard presumably means "unlimited" — verify against config docs).
                if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) {
                    return;
                }
                // Parse the page body with its base URL so relative links can be resolved.
                Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                        urlUtils.getBaseUrl(page.getWebURL().getUrl()));
                Elements links = doc.getElementsByTag("a");
                if (!links.isEmpty()) {
                    for (Element link : links) {
                        String linkHref = link.absUrl("href");
                        // Keep only links accepted by one of the filters and not seen before.
                        if ((fetchFilter(linkHref) || extractFilter(linkHref)) && !bloomfilterHelper.exist(linkHref)) {
                            WebURL purl = new WebURL();
                            purl.setName(link.text());
                            purl.setUrl(linkHref);
                            purl.setDepth((short) (url.getDepth() + 1));
                            // Every child has the same depth, so the first one over the limit ends
                            // processing of the whole page.
                            if (purl.getDepth() > config.getMaxDepthOfCrawling()
                                    && config.getMaxDepthOfCrawling() != -1)
                                return;
                            try {
                                if (!pendingUrls.addElement(purl)) {
                                    // NOTE(review): on a rejected enqueue the *parent* URL (url), not the
                                    // discovered link (purl), is appended to the file — confirm intent.
                                    FileUtils.writeStringToFile(new File("status/_urls.good"), url.getUrl() + "\n", true);
                                }
                            } catch (QueueException e) {
                                // A queue failure for one link is logged and the remaining links are still tried.
                                log.error(e.getMessage());
                            }
                        }
                    }
                }
            }
        }
    } catch (QueueException e) {
        onFailed(url);
    } catch (Exception e) {
        // NOTE(review): stack trace goes to stderr here while the rest of the method uses `log`.
        e.printStackTrace();
        onFailed(url);
    } finally {
        if (null != result)
            result.discardContentIfNotConsumed();
    }
}
From source file:org.manalith.ircbot.plugin.weather.WeatherPlugin.java
@BotCommand("") public String getYahooWeather(@Option(name = "", help = " ? ? ") String keyword) { try {/* w ww . j av a 2s . c o m*/ // TODO WOEID ? final String url_woeid = "http://query.yahooapis.com/v1/public/yql" + "?q=select%20woeid%20from%20geo.places%20where%20text%3D%22" + URLEncoder.encode(keyword, "UTF-8") + "%20ko-KR%22%20limit%201"; final String url_forecast = "http://weather.yahooapis.com/forecastrss?w=%s&u=c"; final String error_woeid = "23424868"; Document doc = Jsoup.connect(url_woeid).get(); // example : http://query.yahooapis.com/v1/public/yql?q=select woeid // from geo.places where text%3D\"%2C ko-KR\" limit 1 String woeid = doc.select("woeid").text(); if (!woeid.equals(error_woeid)) { // example: // http://weather.yahooapis.com/forecastrss?w=1132599&u=c doc = Jsoup.connect(String.format(url_forecast, woeid)).get(); String location = doc.getElementsByTag("yweather:location").attr("city"); String condition = doc.getElementsByTag("yweather:condition").attr("text"); String temp = doc.getElementsByTag("yweather:condition").attr("temp"); String humidity = doc.getElementsByTag("yweather:atmosphere").attr("humidity"); String windCondition = doc.getElementsByTag("yweather:wind").attr("speed"); return String.format("[%s] %s ? %s, ? %s%%, ?? %skm/h", location, condition, temp, humidity, windCondition); } else { return String.format( "[%s] ? ? . ? ?.", keyword); } } catch (IOException e) { logger.error("failed to run command", e); return " ? : " + e.getMessage(); } }
From source file:prince.app.ccm.tools.Task.java
public String getFormParams(String html, String username, String password) throws UnsupportedEncodingException { System.out.println("Extracting form's data..."); Document doc = Jsoup.parse(html); // Google form id Element loginform = doc.getElementById("contenido_right"); Elements loginaction = doc.getElementsByTag("form"); Element form = loginaction.first(); log = MAIN_PAGE + form.attr("action"); Log.e(TAG, "Action: " + log); Elements inputElements = loginform.getElementsByTag("input"); List<String> paramList = new ArrayList<String>(); for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (key.equals("usuario")) { value = username;/*from w ww . j av a2 s .c o m*/ paramList.add(key + "=" + URLEncoder.encode(value, "UTF-8")); } else if (key.equals("contrasena")) { value = password; paramList.add(key + "=" + URLEncoder.encode(value, "UTF-8")); } } // build parameters list StringBuilder result = new StringBuilder(); for (String param : paramList) { if (result.length() == 0) { result.append(param); } else { result.append("&" + param); } } Log.d(TAG, "Done in getFormParams: " + result.toString()); return result.toString(); }
From source file:com.danielme.muspyforandroid.services.MuspyClient.java
/**
 * Requests {@code Constants.URL_RESET} and extracts the CSRF token embedded in the page.
 *
 * @return the {@code value} attribute of the first {@code <input>} whose {@code name} is
 *         {@code csrfmiddlewaretoken}
 * @throws Exception if the request fails or the page contains no such input
 */
private String getCSRF() throws Exception {
    HttpGet request = new HttpGet(Constants.URL_RESET);
    HttpResponse response = getDefaultHttpClient().execute(TARGETHOST, request, getBasicHttpContext());
    String body = EntityUtils.toString(response.getEntity());

    Elements inputs = Jsoup.parse(body).getElementsByTag("input");
    for (Element input : inputs) {
        if (!"csrfmiddlewaretoken".equals(input.attr("name"))) {
            continue;
        }
        return input.attr("value");
    }

    throw new Exception("csrf not found");
}
From source file:net.GoTicketing.GoTicketing.java
/** * ??//from w w w .jav a2 s .c o m * @return ? */ private int praseTicketingResultPage() { Document doc = Jsoup.parse(FinishTicketingPageHTML); Elements fonts = doc.getElementsByTag("font"); if (fonts != null) { // ? for (Element font : fonts) { if (font.text().equals(", ?")) return ROCID_WRONG; } } Elements strongs = doc.getElementsByTag("strong"); if (strongs != null) { // ? for (Element strong : strongs) { if (strong.text().equals("")) return RAND_OR_TIMEOUT_FAIL; } // ? for (Element strong : strongs) { if (strong.text().contains( "????")) return TRAIN_STATION_WRONG; } // ?? for (Element strong : strongs) { if (strong.text().equals("??") || strong.text().equals("????")) return TRAIN_NO_SEAT; } // ?? for (Element strong : strongs) { if (strong.text().contains("?(?)?")) return TICKETING_TO_LATE; } // ? for (Element strong : strongs) { if (strong.text().contains("")) return TICKETING_TO_EARLY; } // ? for (Element strong : strongs) { if (strong.text().contains("??")) return TRAIN_NUM_WRONG; } // ?? for (Element strong : strongs) { if (strong.text().equals("")) { String orderCode = doc.getElementById("spanOrderCode").text(); ticket.setOrderCode(orderCode); return TICKETING_SUCCESS; } } } // ?? return UNKNOW_PRASE_RESULT; }
From source file:ac.simons.oembed.Oembed.java
/** * Parses the given html document into a document and processes * all anchor elements. If a valid anchor is found, it tries to * get an oembed response for it's url and than render the result * into the document replacing the given anchor.<br> * It returns the html representation of the new document.<br> * If there's an error or no oembed result for an url, the anchor tag * will be left as it was. //from ww w. j a va 2 s.c om * @param document The document that should be checked for links to transform * @return the transformed document */ public Document transformDocument(final Document document) { boolean changedBaseUri = false; if (document.baseUri() == null && this.getBaseUri() != null) { document.setBaseUri(this.getBaseUri()); changedBaseUri = true; } for (Element a : document.getElementsByTag("a")) { final String href = a.absUrl("href"); try { String renderedRespose = null; final OembedResponse oembedResponse = this.transformUrl(href); // There was no response or an exception happened if (oembedResponse == null) continue; // There is a handler for this response else if (this.getHandler().containsKey(oembedResponse.getSource())) this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse); // Try to render the response itself and replace the current anchor else if ((renderedRespose = oembedResponse.render()) != null) { a.before(renderedRespose); a.remove(); } } catch (OembedException e) { logger.warn(String.format("Skipping '%s': %s", href, e.getMessage())); } } if (changedBaseUri) document.setBaseUri(null); return document; }
From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java
/**
 * Follows a captcha redirect, has the configured solver solve the captcha image and submits
 * the answer.
 *
 * <p>Steps, as implemented below: load the captcha page, locate the captcha image and the
 * form, read the form's {@code id} / {@code q} / {@code continue} fields, download the image,
 * ask {@code solver} for a response, submit everything as a GET request and follow up to two
 * redirects.
 *
 * @param captchaRedirect URL of the captcha page; must contain {@code ?continue=}
 * @return {@code Status.OK} when the submission ends in a 200 after one or two redirects;
 *         {@code ERROR_IP_BANNED} on a 403 for the captcha page; {@code ERROR_CAPTCHA_NO_SOLVER}
 *         when no solver is configured; {@code ERROR_NETWORK} when the page, image or form
 *         cannot be obtained or is incomplete; {@code ERROR_CAPTCHA_INCORRECT} otherwise
 */
protected Status handleCaptchaRedirect(String captchaRedirect) {
    if (captchaRedirect == null || !captchaRedirect.contains("?continue=")) {
        return Status.ERROR_NETWORK;
    }
    LOG.debug("captcha form detected via {}", http.getProxy() == null ? new DirectNoProxy() : http.getProxy());
    int status = http.get(captchaRedirect);
    if (status == 403) {
        return Status.ERROR_IP_BANNED;
    }
    // The solver check happens only after the page request, so a 403 is reported even without a solver.
    if (solver == null) {
        return Status.ERROR_CAPTCHA_NO_SOLVER;
    }
    String content = http.getContentAsString();
    if (content == null) {
        return Status.ERROR_NETWORK;
    }
    String imageSrc = null;
    // Parsed with the redirect URL as base so "abs:" attribute lookups resolve relative URLs.
    Document captchaDocument = Jsoup.parse(content, captchaRedirect);
    Elements elements = captchaDocument.getElementsByTag("img");
    // No break: when several images match, the last one wins.
    for (Element element : elements) {
        String src = element.attr("abs:src");
        if (src != null && src.contains("/sorry/image")) {
            imageSrc = src;
        }
    }
    if (imageSrc == null) {
        LOG.debug("can't find captcha img tag");
        return Status.ERROR_NETWORK;
    }
    Element form = captchaDocument.getElementsByTag("form").first();
    if (form == null) {
        LOG.debug("can't find captcha form");
        return Status.ERROR_NETWORK;
    }
    String continueValue = null;
    String formIdValue = null;
    String formUrl = form.attr("abs:action");
    String formQValue = null;
    Element elementCaptchaId = form.getElementsByAttributeValue("name", "id").first();
    if (elementCaptchaId != null) {
        formIdValue = elementCaptchaId.attr("value");
    }
    Element elementContinue = form.getElementsByAttributeValue("name", "continue").first();
    if (elementContinue != null) {
        continueValue = elementContinue.attr("value");
    }
    Element elementQ = form.getElementsByAttributeValue("name", "q").first();
    if (elementQ != null) {
        formQValue = elementQ.attr("value");
    }
    // The form is usable only with a "continue" field and at least one of "id" / "q".
    if (formUrl == null || (formIdValue == null && formQValue == null) || continueValue == null) {
        LOG.debug("invalid captcha form");
        return Status.ERROR_NETWORK;
    }
    // The captcha page URL is passed as second argument (presumably the referrer — TODO confirm).
    int imgStatus = http.get(imageSrc, captchaRedirect);
    if (imgStatus != 200 || http.getContent() == null) {
        LOG.debug("can't download captcha image {} (status code = {})", imageSrc, imgStatus);
        return Status.ERROR_NETWORK;
    }
    CaptchaImage captcha = new CaptchaImage(new byte[][] { http.getContent() });
    boolean solved = solver.solve(captcha);
    if (!solved || !Captcha.Status.SOLVED.equals(captcha.getStatus())) {
        LOG.error("solver can't resolve captcha (overload ?) error = {}", captcha.getError());
        return Status.ERROR_CAPTCHA_INCORRECT;
    }
    LOG.debug("got captcha response {} in {} seconds from {}", captcha.getResponse(),
            captcha.getSolveDuration() / 1000l,
            (captcha.getLastSolver() == null ? "?" : captcha.getLastSolver().getFriendlyName()));
    // Build the submission URL by hand.
    // NOTE(review): if encoding ever failed, the exception is swallowed and the URL would be
    // continued with "&captcha=" but without the "?continue=" part.
    try {
        formUrl += "?continue=" + URLEncoder.encode(continueValue, "utf-8");
    } catch (Exception ex) {
    }
    // NOTE(review): captcha response, id and q are appended without URL encoding.
    formUrl += "&captcha=" + captcha.getResponse();
    if (formIdValue != null) {
        formUrl += "&id=" + formIdValue;
    }
    if (formQValue != null) {
        formUrl += "&q=" + formQValue;
    }
    int postCaptchaStatus = http.get(formUrl, captchaRedirect);
    if (postCaptchaStatus == 302) {
        // NOTE(review): the "location" header is dereferenced without a null check.
        String redirectOnSuccess = http.getResponseHeader("location");
        // Upgrade a plain-http redirect target to https before following it.
        if (redirectOnSuccess.startsWith("http://")) {
            redirectOnSuccess = "https://" + redirectOnSuccess.substring(7);
        }
        int redirect1status = http.get(redirectOnSuccess, captchaRedirect);
        if (redirect1status == 200) {
            return Status.OK;
        }
        // Follow exactly one further redirect.
        if (redirect1status == 302) {
            if (http.get(http.getResponseHeader("location"), captchaRedirect) == 200) {
                return Status.OK;
            }
        }
    }
    // Only a 503 on submission is reported back to the solver as a wrong answer.
    if (postCaptchaStatus == 503) {
        LOG.debug("reporting incorrect captcha (incorrect response = {})", captcha.getResponse());
        solver.reportIncorrect(captcha);
    }
    return Status.ERROR_CAPTCHA_INCORRECT;
}
From source file:se.vgregion.portal.iframe.controller.CSViewController.java
/**
 * Finds the first {@code <button>} in the document whose {@code id} attribute starts with
 * the given prefix.
 *
 * @param doc     document to search
 * @param pattern prefix the button id must start with (plain prefix, not a regular expression)
 * @return the first matching button in document order, or {@code null} if none matches
 */
private Element findButtonWithIdWhichStartsWith(Document doc, String pattern) {
    for (Element button : doc.getElementsByTag("button")) {
        String buttonId = button.attr("id");
        if (buttonId != null && buttonId.startsWith(pattern)) {
            return button;
        }
    }
    return null;
}
From source file:com.zacwolf.commons.email.Email.java
private void prepare(final org.jsoup.nodes.Document doc) { removeComments(doc);//Remove any comments from the html of the message to reduce the size //Change the title to match the subject of the email if (doc.getElementsByTag("title").size() > 0) doc.getElementsByTag("title").first().html(getSubject()); //Replace the contents of any tags with class="date" with the current date if (doc.getElementsByClass("date").size() > 0) { for (org.jsoup.nodes.Element datelem : doc.getElementsByClass("date")) { SimpleDateFormat df = new SimpleDateFormat("MMMMMMMMMM d, yyyy"); if (datelem.hasAttr("format")) { try { df = new SimpleDateFormat(datelem.attr("format")); } catch (Exception ee) { } //throw it away and just go back to the default format; datelem.html(df.format(TimeUtils.getGMTtime())); }//from w w w . j av a2 s . c om } } //tables need the border-spacing: style attribute; added for GMail compatiblity for (org.jsoup.nodes.Element tbl : doc.getElementsByTag("table")) if (!tbl.attr("style").contains("border-spacing:")) tbl.attr("style", tbl.attr("style") + (!tbl.attr("style").endsWith(";") ? ";" : "") + "border-spacing:0;"); }