List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:de.geeksfactory.opacclient.apis.TouchPoint.java
static List<ReservedItem> parse_reslist(Document doc) { List<ReservedItem> reservations = new ArrayList<>(); Elements copytrs = doc.select(".data tr"); int trs = copytrs.size(); if (trs <= 1) { return null; }//from w ww .ja v a 2 s .c o m for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); ReservedItem item = new ReservedItem(); if (tr.text().contains("keine Daten") || tr.children().size() == 1) { return null; } item.setTitle(tr.child(2).select("b, strong").text().trim()); try { String[] rowsplit2 = tr.child(2).html().split("<br[ /]*>"); String[] rowsplit3 = tr.child(3).html().split("<br[ /]*>"); if (rowsplit2.length > 1) item.setAuthor(rowsplit2[1].trim()); if (rowsplit3.length > 2) item.setBranch(rowsplit3[2].trim()); if (rowsplit3.length > 2) { item.setStatus(rowsplit3[0].trim() + " (" + rowsplit3[1].trim() + ")"); } } catch (Exception e) { e.printStackTrace(); } reservations.add(item); } return reservations; }
From source file:org.javiermoreno.torrentscratcher.Runner.java
/**
 * Fetches one page of the elitetorrent.net listing and collects the link target of every
 * {@code a.nombre} anchor on it.
 *
 * @param page page number substituted for the {@code {page}} placeholder of the listing URL
 * @return the {@code href} attribute of each matched anchor, in the order jsoup returned them
 * @throws IOException if the listing page cannot be downloaded
 */
public List<String> getRecordsUrl(int page) throws IOException {
    String template = "http://www.elitetorrent.net/categoria/13/peliculas-hdrip/modo:listado/orden:valoracion/pag:{page}";
    String listingUrl = template.replace("{page}", String.valueOf(page));

    Document listing = Jsoup.connect(listingUrl).get();

    List<String> hrefs = new ArrayList<>();
    for (Element anchor : listing.select("a.nombre")) {
        hrefs.add(anchor.attr("href"));
    }
    return hrefs;
}
From source file:com.gorsini.searcher.CanalplaySearcher.java
/** * * @param movieToSearch/*from ww w.j ava 2s .c o m*/ * @return ArrayList<Movie> empty if no result */ public ArrayList<Movie> searchMovie(Movie movieToSearch) { try { /* curl "http://vod.canalplay.com/pages/recherche/challengeexplorer.aspx?action=4&search=hercule" -H "Referer: http://vod.canalplay.com/" ramne que les films dispo. */ String titleToSearch = movieToSearch.getTitle(); LOG.log(Level.FINER, "titre rechercher : {0}", titleToSearch); String url = makeURL(titleToSearch); Document doc = Jsoup.connect(url).referrer("http://vod.canalplay.com/").get(); Elements movies = doc.select("div.list_movie"); if (movies.isEmpty()) { LOG.log(Level.FINER, "no movie found with title {0}", titleToSearch); return null; } else { ArrayList<Movie> result = new ArrayList<Movie>(); for (Element movie : movies) { MovieSelector selector = new MovieSelector(); Movie movieFound = selector.selectMovie(movie, movieToSearch); if (movieFound != null) { LOG.log(Level.FINER, "film trouv:{0}", movieFound.toString()); result.add(movieFound); } } return result; } } catch (Exception e) { System.out.println("problme HTTP"); e.printStackTrace(); return null; } }
From source file:caarray.client.examples.java.CASRemoteEJBClient.java
private HttpResponse submitCasLogin(HttpResponse casLoginPageResponse) throws ClientProtocolException, IOException { String postURL = BaseProperties.CAS_URL + "/login"; HttpPost httpPost = new HttpPost(postURL); // These 3 input fields are hidden values set in the CAS login page, CAS server expects them to facilitate its workflows, // we will be capturing them from the page and adding to username/password combination when posting login // <input type="hidden" name="lt" value="LT-38-TPi7eJ2w3UNbO2dK0qPphKUPGuZJna" /> // <input type="hidden" name="execution" value="e1s1" /> // <input type="hidden" name="_eventId" value="submit" /> Document contentDocument = getContentDocument(casLoginPageResponse); String loginTicketParam = contentDocument.select("input[name=lt]").attr("value"); String executionParam = contentDocument.select("input[name=execution]").attr("value"); String eventIdParam = contentDocument.select("input[name=_eventId]").attr("value"); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); nvps.add(new BasicNameValuePair("username", BaseProperties.CAS_USERNAME)); nvps.add(new BasicNameValuePair("password", BaseProperties.CAS_PASSWORD)); nvps.add(new BasicNameValuePair("lt", loginTicketParam)); nvps.add(new BasicNameValuePair("execution", executionParam)); nvps.add(new BasicNameValuePair("_eventId", eventIdParam)); httpPost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); return httpclient.execute(httpPost); }
From source file:hu.petabyte.redflags.engine.gear.archiver.Archiver.java
@Override protected Notice processImpl(Notice notice) throws Exception { for (DisplayLanguage lang : langs) { // get data tab TedResponse r = ted.get().requestNoticeTabQuietly(notice.getId(), lang, Tab.DATA); // get other tabs if (null != r) { Document d = r.getParsedDocument(); for (Tab tab : Tab.values()) { if (tab != Tab.DATA && !d.select("a[href~=tabId=" + tab.getId()).isEmpty()) { LOG.trace("Found tab: {}:{}:{}, fetching", notice.getId(), lang, tab); ted.get().requestNoticeTabQuietly(notice.getId(), lang, tab); }/*from ww w . j a va2 s .c o m*/ } } } return notice; }
From source file:com.soulgalore.crawler.core.impl.AhrefPageURLParser.java
private Set<CrawlerURL> fetch(String query, String attributeKey, Document doc, String url) { final Set<CrawlerURL> urls = new HashSet<CrawlerURL>(); final Elements elements = doc.select(query); for (Element src : elements) { if (src.attr(attributeKey).isEmpty()) continue; // don't fetch mailto links if (src.attr(attributeKey).startsWith(MAIL_TO)) continue; else if (IFRAME.equals(src.tag().getName())) urls.add(new CrawlerURL(src.attr(attributeKey), url)); else/*from w ww .j a va2 s . c om*/ urls.add(new CrawlerURL(src.attr(attributeKey), url)); } return urls; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule6.java
/**
 * Extracts the editors from the first paragraph of the first {@code .authlist} element, falling
 * back to the first {@code p} sibling following an {@code h4} that contains "Editors".
 *
 * <p>The paragraph's HTML is split on {@code <br />} (or on {@code <br clear="none" />} when the
 * first split yields fewer than two parts). Each non-empty part other than the literal labels
 * "WHATWG:" and "W3C:" is parsed into a {@code Person} by {@code NameParser}. Links inside the
 * part are attached to the person: an {@code href} containing "@" becomes the e-mail (with
 * "mailto:" removed), any other non-empty {@code href} is added as a website.
 *
 * @param url not used by this rule
 * @param doc document to inspect
 * @return the parsed editors, or {@code null} when no paragraph is found or no person is parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    // Locate the paragraph with null checks instead of catching IndexOutOfBoundsException.
    Element editor = null;
    Element authList = doc.select(".authlist").first();
    if (authList != null) {
        editor = authList.select("p").first();
    }
    if (editor == null) {
        editor = doc.select("h4:contains(Editors) ~ p").first();
    }
    if (editor == null) {
        return null;
    }

    String editorHtml = editor.html();
    String[] splitted = editorHtml.split("<br />");
    if (splitted.length < 2) {
        splitted = editorHtml.split("<br clear=\"none\" />");
    }

    ArrayList<Person> editorList = new ArrayList<>();
    for (String split : splitted) {
        if (split.isEmpty() || split.equals("WHATWG:") || split.equals("W3C:")) {
            continue;
        }

        Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
        Person result = NameParser.parse(newdoc.text());
        if (result == null) {
            continue;
        }

        // Select the anchors once per fragment rather than on every loop step.
        for (Element link : newdoc.select("a")) {
            String href = link.attr("href");
            if (href.isEmpty()) {
                continue;
            }
            if (href.contains("@")) {
                result.setEmail(href.replace("mailto:", ""));
            } else {
                result.addWebsite(href);
            }
        }
        editorList.add(result);
    }

    if (editorList.isEmpty()) {
        return null;
    }
    return editorList;
}
From source file:org.javiermoreno.torrentscratcher.Runner.java
public Movie enrichMovieWithFilmAffinity(Movie movie) { try {/*w w w . jav a 2 s .com*/ String url = "http://www.filmaffinity.com/es/search.php?stext={title}&stype=all"; String title = URLEncoder.encode(movie.getTitle(), "UTF8"); url = url.replace("{title}", title); Document doc = Jsoup.connect(url).get(); if (doc.select("[property=og:title]").size() == 0) { // several results found, take the first Element firstResult = doc.select(".item-search .mc-title a").first(); if (firstResult == null) { // filmaffinity search engine failed log.warn("FilmAffinity 404: " + movie.getTitle()); return movie; } url = "http://www.filmaffinity.com" + firstResult.attr("href"); doc = Jsoup.connect(url).get(); } movie.setFilmAffinityId(doc.select("div.rate-movie-box").attr("data-movie-id")); Elements movieInfo = doc.select("dl.movie-info"); String originalTitle = movieInfo.select("dd").eq(0).text(); originalTitle = originalTitle.replaceAll("\\([^\\(]*\\)", "").replaceAll("\\[[^\\(]*\\]", "") .replaceAll("aka$", "").trim(); movie.setOriginalTitle(originalTitle); movie.setDescription(movieInfo.select("dd").eq(11).text()); } catch (IOException ex) { log.warn(ex.getMessage()); } return movie; }
From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Principal Author) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("principal author")) { skip = true;// ww w . j ava 2 s .co m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("principal author")) { skip = false; continue; } } continue; } String[] splitted = editor.html().split(","); for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } if (editorList.size() == 0) return null; return editorList; }
From source file:zjut.soft.finalwork.fragment.PageFragment1.java
public void showLevelResult() { new Thread(new Runnable() { @Override// w ww . j av a2 s . c om public void run() { try { HttpGet get = new HttpGet(((YCApplication) getActivity().getApplication()).get("selectedIp") + Constant.levelQuery); YCApplication app = (YCApplication) getActivity().getApplicationContext(); HttpResponse response = app.getClient().execute(get); HttpEntity entity = response.getEntity(); BufferedReader br = new BufferedReader( new InputStreamReader(entity.getContent(), Constant.ENCODING)); StringBuilder sb = new StringBuilder(); String temp = null; while ((temp = br.readLine()) != null) { sb.append(temp); } Document doc = Jsoup.parse(sb.toString()); Elements tables = doc.select("#DJKCJ"); if (tables.size() > 0) { Element table = tables.get(0); Elements trs = table.select("tr"); levelTest = new ArrayList<LevelTest>(); info = new StringBuilder(); if (trs.size() > 1) { for (int i = 1; i < trs.size(); i++) { LevelTest test = new LevelTest(); Element tr = trs.get(i); Elements tds = tr.select("td"); String name = tds.get(0).select("span").get(0).html(); String grade = tds.get(1).select("span").get(0).html(); String date = tds.get(2).select("span").get(0).html(); System.out.println(name + "," + grade + "," + date); info.append(name + "," + grade + "," + date + "\n"); test.setName(name); test.setGrade(grade); test.setDate(date); levelTest.add(test); } } } mHandler.post(new Runnable() { @Override public void run() { tv.setText(info.toString()); } }); } catch (Exception e) { e.printStackTrace(); } } }).start(); }