List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if ((!prev.text().trim().toLowerCase().startsWith("editor") && !prev.text().trim().toLowerCase().startsWith("edition editor")) || prev.text().trim().toLowerCase().contains("version") || prev.text().trim().toLowerCase().endsWith("draft:")) { skip = true;//from ww w .j a v a 2s . c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("editor") || next.text().trim().toLowerCase().contains("edition editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", "This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { 
result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("h4:contains(Editor) ~ blockquote"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("h4")) { if ((!prev.text().trim().toLowerCase().startsWith("editor") && !prev.text().trim().toLowerCase().startsWith("edition editor")) || prev.text().trim().toLowerCase().endsWith("version:") || prev.text().trim().toLowerCase().endsWith("draft:")) { skip = true;// w w w . j av a 2 s .c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("editor") || next.text().trim().toLowerCase().contains("edition editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", "This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { 
result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("h4")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:org.manalith.ircbot.plugin.weather.WeatherPlugin.java
@BotCommand("") public String getYahooWeather(@Option(name = "", help = " ? ? ") String keyword) { try {//from w w w . j av a 2 s.com // TODO WOEID ? final String url_woeid = "http://query.yahooapis.com/v1/public/yql" + "?q=select%20woeid%20from%20geo.places%20where%20text%3D%22" + URLEncoder.encode(keyword, "UTF-8") + "%20ko-KR%22%20limit%201"; final String url_forecast = "http://weather.yahooapis.com/forecastrss?w=%s&u=c"; final String error_woeid = "23424868"; Document doc = Jsoup.connect(url_woeid).get(); // example : http://query.yahooapis.com/v1/public/yql?q=select woeid // from geo.places where text%3D\"%2C ko-KR\" limit 1 String woeid = doc.select("woeid").text(); if (!woeid.equals(error_woeid)) { // example: // http://weather.yahooapis.com/forecastrss?w=1132599&u=c doc = Jsoup.connect(String.format(url_forecast, woeid)).get(); String location = doc.getElementsByTag("yweather:location").attr("city"); String condition = doc.getElementsByTag("yweather:condition").attr("text"); String temp = doc.getElementsByTag("yweather:condition").attr("temp"); String humidity = doc.getElementsByTag("yweather:atmosphere").attr("humidity"); String windCondition = doc.getElementsByTag("yweather:wind").attr("speed"); return String.format("[%s] %s ? %s, ? %s%%, ?? %skm/h", location, condition, temp, humidity, windCondition); } else { return String.format( "[%s] ? ? . ? ?.", keyword); } } catch (IOException e) { logger.error("failed to run command", e); return " ? : " + e.getMessage(); } }
From source file:com.liato.bankdroid.banking.banks.Hemkop.java
@Override public void update() throws BankException, LoginException, BankChoiceException { super.update(); if (username == null || password == null || username.length() == 0 || password.length() == 0) { throw new LoginException(res.getText(R.string.invalid_username_password).toString()); }//w w w .j a va 2 s .c om urlopen = login(); Document d = Jsoup.parse(response); Elements amounts = d.select(".bonusStatement .amount"); Elements names = d.select(".bonusStatement .label"); for (int i = 0; i < Math.min(amounts.size(), names.size()); i++) { Element amount = amounts.get(i); Element name = names.get(i); BigDecimal accountBalance = Helpers.parseBalance(amount.ownText()); Account account = new Account(name.ownText().replace(":", "").trim(), accountBalance, String.format("acc_%d", i)); if (i > 0) { account.setAliasfor("acc_0"); } accounts.add(account); balance = balance.add(accountBalance); } if (accounts.isEmpty()) { throw new BankException(res.getText(R.string.no_accounts_found).toString()); } Account account = accounts.get(0); try { response = urlopen.open("https://www.hemkop.se/Mina-sidor/Kontoutdrag/"); d = Jsoup.parse(response); Elements es = d.select(".transactions tbody tr"); ArrayList<Transaction> transactions = new ArrayList<Transaction>(); for (Element e : es) { Transaction t = new Transaction(e.child(1).ownText().trim(), e.child(0).ownText().trim(), Helpers.parseBalance(e.child(3).ownText())); if (!TextUtils.isEmpty(e.child(2).ownText())) { t.setCurrency(Helpers.parseCurrency(e.child(2).ownText().trim(), "SEK")); } transactions.add(t); } account.setTransactions(transactions); es = d.select(".currentBalance,.disposable"); int i = 0; for (Element e : es) { Account a = new Account(e.child(0).ownText().trim(), Helpers.parseBalance(e.child(1).ownText()), String.format("acc_cc_%d", i)); a.setAliasfor("acc_0"); accounts.add(a); i++; } } catch (ClientProtocolException e) { e.printStackTrace(); Log.e(TAG, e.getMessage() != null ? 
e.getMessage() : ""); } catch (IOException e) { e.printStackTrace(); Log.e(TAG, e.getMessage() != null ? e.getMessage() : ""); } super.updateComplete(); }
From source file:model.ParseInfoFromSite.java
/** * Return a List of Strings (part of URLS). * /*w w w . j ava2s . c om*/ * @param URL * where are 20 links to page with information about banks * @return List<String> with 20 links to page with information about banks */ public List<String> getListOfBanks(String URL) { logger.info("run"); List<String> list = new ArrayList<String>(); Document doc; try { doc = Jsoup.connect(URL).userAgent("Mozilla").timeout(10 * 1000).get(); Elements links = doc.select("a[href]"); for (int i = 0; i < links.size(); i++) { String res = ""; if ((res = links.get(i).attr("href")).startsWith("control/uk/bankdict/bank?id=")) { list.add(res); } } } catch (IOException e) { e.printStackTrace(); } return list; }
From source file:co.dilaver.quoter.fragments.QODFragment.java
/**
 * Extracts quote and author from the response and updates the stored values and the views.
 *
 * <p>The HTML is read from {@code parse.text.*}. The quote is taken from the cells of the first
 * row and the author from the cells of the second row of the table styled
 * {@code text-align:center; width:100%}; all markup is stripped and {@code ~} is removed from the
 * author. A "refreshing" snackbar is shown when a previously stored quote or author differs from
 * the new one.
 *
 * @param response JSON object containing {@code parse.text.*}
 * @throws JSONException if the expected JSON keys are missing, or the HTML does not contain the
 *         table with at least two rows
 */
private void parseQodResponse(JSONObject response) throws JSONException {
    String content = response.getJSONObject("parse").getJSONObject("text").getString("*");

    Document doc = Jsoup.parse(content);
    Elements rows = doc.select("table[style=\"text-align:center; width:100%\"]").select("tr");
    if (rows.size() < 2) {
        // Report unexpected markup through the declared checked exception instead of
        // letting rows.get(..) fail with an unchecked IndexOutOfBoundsException.
        throw new JSONException("Quote of the day table not found in response");
    }
    Elements qod = rows.get(0).select("td");
    Elements author = rows.get(1).select("td");

    Whitelist whitelist = Whitelist.none();
    String newQuote = Html.fromHtml(Jsoup.clean(qod.toString(), whitelist)).toString();
    String newAuthor = Html.fromHtml(Jsoup.clean(author.toString(), whitelist).replace("~", "")).toString();

    boolean hadPrevious = !qodString.equals("") && !authorString.equals("");
    boolean changed = !qodString.equals(newQuote) || !authorString.equals(newAuthor);
    if (hadPrevious && changed) {
        Snackbar.make(rootLayout, getString(R.string.str_Refreshing), Snackbar.LENGTH_SHORT).show();
    }

    qodString = newQuote;
    authorString = newAuthor;
    sharedPrefStorage.setQodText(qodString);
    sharedPrefStorage.setQodAuthor(authorString);

    Log.e(TAG, "quote: " + qodString);
    Log.e(TAG, "author: " + authorString);

    qodText.setText(getString(R.string.str_WithinQuotation, qodString));
    qodAuthor.setText(authorString);
}
From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; String version = ""; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev != null) { if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("version 1") && !prev.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = true;// ww w .j a v a2s. c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("version 1") || next.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = false; continue; } } continue; } } if (editor.tagName().equals("dt")) { version = editor.text(); continue; } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for 
(String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt") && !next.text().trim().toLowerCase().startsWith("editors (version 1")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:org.abondar.experimental.eventsearch.EventFinder.java
/**
 * Fetches {@code https://afisha.yandex.ru} + {@code place} and extracts a place description from
 * its paragraphs.
 *
 * <p>Only {@code <p>} elements whose second ancestor's HTML contains {@code <div style} are
 * considered. For such a paragraph with more than one child, the HTML of the second child is used
 * if that child has an {@code href} attribute; for a paragraph without children its own HTML is
 * used. Later matches overwrite earlier ones, so the last match wins.
 *
 * @param place path appended to the site address
 * @return the last matching fragment followed by {@code " ?"}, or an empty string when nothing
 *         matched or the page could not be loaded
 */
public String getEventPlaces(String place) {
    String lastMatch = "";
    try {
        Document page = Jsoup.connect("https://afisha.yandex.ru" + place).get();
        Elements paragraphs = page.select("p");
        for (Element paragraph : paragraphs) {
            boolean insideStyledDiv = paragraph.parents().get(1).html().contains("<div style");
            if (!insideStyledDiv) {
                continue;
            }

            if (paragraph.children().size() > 1) {
                if (paragraph.child(1).hasAttr("href")) {
                    lastMatch = paragraph.child(1).html() + " ?";
                }
                continue;
            }
            if (paragraph.children().isEmpty()) {
                lastMatch = paragraph.html() + " ?";
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(EventFinder.class.getName()).log(Level.SEVERE, null, ex);
    }
    return lastMatch;
}
From source file:br.ufsc.das.gtscted.shibbauth.Connection.java
public String authenticate(String wayfLocation, String wayfActionPath, String idpUrl, String username, String password) throws ClientProtocolException, IOException { //POST para o WAYF passando o idp escolhido HttpPost httpPost1 = new HttpPost(wayfLocation + wayfActionPath); List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(); nameValuePairs.add(new BasicNameValuePair("user_idp", idpUrl)); httpPost1.setEntity(new UrlEncodedFormEntity(nameValuePairs, HTTP.UTF_8)); HttpResponse response1 = httpClient.execute(httpPost1); String strResponse1 = readResponse(response1.getEntity().getContent()).toString(); //----------------------------------------------- //Obtm o caminho indicado no campo "action" da pgina do idp (/idp/Authn/UserPassword) Document idpDoc = Jsoup.parse(strResponse1); Element idpFormElement = idpDoc.select("form").get(0); String idpActionPath = idpFormElement.attr("action"); //POST para o idp escolhido (por exemplo https://idpstcfed.sj.ifsc.edu.br/idp/Authn/UserPassword) // passando o usuario (j_username) e a senha (j_password) HttpPost httpPost2 = new HttpPost(idpUrl.replace("/idp/shibboleth", idpActionPath)); List<NameValuePair> nameValuePairs2 = new ArrayList<NameValuePair>(); nameValuePairs2.add(new BasicNameValuePair("j_username", username)); nameValuePairs2.add(new BasicNameValuePair("j_password", password)); httpPost2.setEntity(new UrlEncodedFormEntity(nameValuePairs2, HTTP.UTF_8)); HttpResponse response2 = httpClient.execute(httpPost2); String strResponse2 = readResponse(response2.getEntity().getContent()).toString(); //----------------------------------------------- // Obtm os elementos que sero passados para o SP para criar o security context Document authResponseDoc = Jsoup.parse(strResponse2); Element authResponseFormElement = authResponseDoc.select("form").get(0); Element relayStateElement = authResponseDoc.select("input").get(0); Element SAMLResponseElement = authResponseDoc.select("input").get(1); String action = 
authResponseFormElement.attr("action"); String relayStateValue = relayStateElement.attr("value"); String SAMLResponseValue = SAMLResponseElement.attr("value"); // POST para o "assertion consumer" no SP, indicado no campo "action" da resposta // recebida aps a autenticacao. Este POST contm dois valores: RelayState e // SAMLResponse. HttpPost httpPost3 = new HttpPost(action); List<NameValuePair> nameValuePairs3 = new ArrayList<NameValuePair>(); nameValuePairs3.add(new BasicNameValuePair("RelayState", relayStateValue)); nameValuePairs3.add(new BasicNameValuePair("SAMLResponse", SAMLResponseValue)); httpPost3.setEntity(new UrlEncodedFormEntity(nameValuePairs3, HTTP.UTF_8)); HttpResponse response3 = httpClient.execute(httpPost3); return readResponse(response3.getEntity().getContent()).toString(); }
From source file:me.vertretungsplan.parser.UntisMonitorParser.java
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl, int recursionDepth) throws IOException, CredentialInvalidException { String html;/*from w ww. j a v a2s .c om*/ if (url.equals(VALUE_URL_LOGIN_RESPONSE)) { html = loginResponse; } else { try { html = httpGet(url, encoding).replace(" ", ""); } catch (HttpResponseException e) { if (docs.size() == 0) { throw e; } else { return; // ignore if first page was loaded and redirect didn't work } } } Document doc = Jsoup.parse(html); doc.setBaseUri(url); if (doc.select(".mon_title").size() == 0) { // We have a problem - there seems to be no substitution schedule. Maybe it is hiding // inside a frame? if (doc.select("frameset frame[name").size() > 0) { for (Element frame : doc.select("frameset frame")) { if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?") && recursionDepth < MAX_RECURSION_DEPTH) { String frameUrl = frame.absUrl("src"); loadUrl(frame.absUrl("src"), encoding, following, docs, frameUrl, recursionDepth + 1); } } } else if (doc.text().contains("registriert")) { throw new CredentialInvalidException(); } else { if (docs.size() == 0) { // ignore if first page was loaded and redirect didn't work throw new IOException( "Could not find .mon-title, seems like there is no Untis " + "schedule here"); } } } else { findSubDocs(docs, html, doc); if (following && doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) { loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1); } } } }