List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:org.wallride.web.support.Posts.java
public String thumbnail(Post post) { if (post.getCover() != null) { return wallRideProperties.getMediaUrlPrefix() + post.getCover().getId(); } else {//from w ww . j a va 2 s . c o m Document document = Jsoup.parse(post.getBody()); Elements elements = document.select("img"); for (Element element : elements) { return element.attr("src"); } } return null; }
From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java
public Article extractArticle(String html) throws ParseException, IOException { Article result = new Article(); Document doc = Jsoup.parse(html, getBaseName()); Element element;// w w w .j av a 2s .c o m try { element = doc.select("article.rfd").iterator().next(); } catch (NoSuchElementException exception) { throw new IOException("Cannot find article.rfd element"); } // System.out.println(element); String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", ""); // time try { DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } catch (ParseException e) { // June 24, 2015 DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } // title result.setTitle(TextCleaningUtils.normalizeWithParagraphs(element.select("h1").text())); // text StringBuilder sb = new StringBuilder(); for (Element p : element.select("div.nytint-post > p")) { sb.append(p.text()); sb.append("\n"); } result.setText(TextCleaningUtils.normalizeWithParagraphs(sb.toString())); // debate title result.setDebateTitle(TextCleaningUtils .normalizeWithParagraphs(doc.select("div.nytint-discussion-overview > h2").text())); // debate url result.setDebateUrl(doc.select("div.nytint-discussion-overview > h2 > a").iterator().next().attr("href")); // document url result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content")); // debate description result.setDebateDescription(TextCleaningUtils .normalizeWithParagraphs(((TextNode) doc.select("div.nytint-discussion-overview > p").iterator() .next().childNodes().iterator().next()).text())); // aurhor result.setAuthor(element.select("div.nytint-mugshots > img").iterator().next().attr("alt")); // topics for (Element a : element.select("p.nytint-tags > a")) { result.getTopics().add(a.attr("href")); } return result; }
From source file:com.liato.bankdroid.banking.banks.PayPal.java
@Override protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException { urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_paypal)); urlopen.setUserAgent(/* ww w .j av a 2 s. co m*/ "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36"); //Get cookies and url to post to response = urlopen.open("https://www.paypal.com/en"); Document d = Jsoup.parse(response); Element e = d.select("form[name=login_form]").first(); String strPostUrl; if (e != null && !TextUtils.isEmpty(e.attr("action"))) { strPostUrl = e.attr("action").trim(); } else { throw new BankException(res.getText(R.string.unable_to_find).toString() + " post url."); } List<NameValuePair> postData = new ArrayList<NameValuePair>(); postData.add(new BasicNameValuePair("login_email", username)); postData.add(new BasicNameValuePair("login_password", password)); postData.add(new BasicNameValuePair("target_page", "0")); postData.add(new BasicNameValuePair("submit.x", "Log In")); postData.add(new BasicNameValuePair("form_charset", "UTF-8")); postData.add(new BasicNameValuePair("browser_name", "undefined")); postData.add(new BasicNameValuePair("browser_version", "undefined")); postData.add(new BasicNameValuePair("operating_system", "Windows")); postData.add(new BasicNameValuePair("bp_mid", "v=1;a1=na~a2=na~a3=na~a4=Mozilla~a5=Netscape~a6=5.0 (Windows; en-US)~a7=20100713~a8=na~a9=true~a10=Windows NT 6.1~a11=true~a12=Win32~a13=na~a14=Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.7) Gecko/20100713 Firefox/3.6.7 ( .NET CLR 3.5.30729; .NET4.0C)~a15=true~a16=en-US~a17=na~a18=www.paypal.com~a19=na~a20=na~a21=na~a22=na~a23=1280~a24=720~a25=24~a26=658~a27=na~a28=Sun Oct 31 2010 18:41:07 GMT 0100~a29=1~a30=def|qt1|qt2|qt3|qt4|qt5|qt6|swf|~a31=yes~a32=na~a33=na~a34=no~a35=no~a36=yes~a37=no~a38=online~a39=no~a40=Windows NT 6.1~a41=no~a42=no~")); postData.add(new BasicNameValuePair("bp_ks1", 
"v=1;l=16;Di0:2663Di1:48Ui0:15Ui1:81Di2:176Di3:48Ui2:32Ui3:96Di4:384Ui4:48Di5:352Ui5:48Di6:128Ui6:80Di7:112Ui7:48Di8:113Ui8:79Di9:125Ui9:51Di10:98Ui10:72Di11:227Ui11:51Di12:80Ui12:80Di13:128Ui13:64Di14:48Ui14:80Di15:416Ui15:80")); postData.add(new BasicNameValuePair("bp_ks2", "")); postData.add(new BasicNameValuePair("bp_ks3", "")); postData.add(new BasicNameValuePair("flow_name", "xpt/Marketing_CommandDriven/homepage/IndividualsHome")); postData.add(new BasicNameValuePair("fso", "k2TDENTlxEJnhbuYDYFmKMyVq0kUZPsdK6j3V1gPUwuZvyAmzzpRs4Cmjet0z19AwlxXfW")); return new LoginPackage(urlopen, postData, response, strPostUrl); }
From source file:org.javiermoreno.torrentscratcher.Runner.java
public Movie getMovie(String path) throws IOException { String url = "http://www.elitetorrent.net{path}"; url = url.replace("{path}", path); log.debug("Retrieving " + path + "."); Document doc = Jsoup.connect(url).get(); Movie movie = new Movie(); String title = doc.select("#box-ficha > h2").text(); // strip parentheses: http://stackoverflow.com/questions/1138552/replace-string-in-parentheses-using-regex title = title.replaceAll("\\([^\\(]*\\)", "").replaceAll("\\[[^\\(]*\\]", "").replaceAll("aka$", "").trim(); movie.setTitle(title);//from w w w . j a va 2 s. c o m movie.setUrl(url); movie.setDescription(doc.select("p.descrip").eq(1).text()); movie.setType("movie"); movie.setImage("http://www.elitetorrent.net/" + doc.select("img.imagen_ficha").attr("src")); Torrent torrent = new Torrent(); torrent.setMagnet(doc.select("a[href^=magnet]").attr("href")); torrent.setFilesize(doc.select("dl.info-tecnica dd").eq(3).text()); movie.getTorrents().put("720p", torrent); return movie; }
From source file:com.anhao.spring.service.impl.PhotosServiceImpl.java
private void getWallpaperTags(String wallpaperId) { String wallpaperUrl = "http://alpha.wallhaven.cc/wallpaper/" + wallpaperId; Document docDetails = getWallpaperHtmlDocument(wallpaperUrl); Elements Tags = docDetails.select("#tags li"); for (Element tag : Tags) { //iduuid ?wallhavenID String photosId = jobPhotosDAO.findByWallpaperId(wallpaperId); //tagUUID Element tagName = tag.select(".tagname").first(); String TagId = tagDAO.findByTagName(tagName.text()); System.out.println("wallpaperId:" + wallpaperId + "====tag name " + tagName.text()); PhotosTag photosTag = new PhotosTag(); photosTag.setPhotoId(photosId);//from ww w . jav a 2 s .c o m photosTag.setTagId(TagId); photostagDAO.add(photosTag); } }
From source file:com.johan.vertretungsplan.parser.SVPlanParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, docs); }//from ww w. j av a2 s.c om LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.select(".svp-tabelle").size() > 0) { VertretungsplanTag tag = new VertretungsplanTag(); String date = "Unbekanntes Datum"; if (doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").size() > 0) date = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").text(); else if (doc.title().startsWith("Vertretungsplan fr ")) date = doc.title().substring("Vertretungsplan fr ".length()); tag.setDatum(date); if (doc.select(".svp-uploaddatum").size() > 0) tag.setStand(doc.select(".svp-uploaddatum").text().replace("Aktualisierung: ", "")); Elements rows = doc.select(".svp-tabelle tr"); String lastLesson = ""; for (Element row : rows) { if (row.hasClass("svp-header")) continue; Vertretung vertretung = new Vertretung(); List<String> affectedClasses = new ArrayList<String>(); for (Element column : row.select("td")) { if (!hasData(column.text())) { continue; } String type = column.className(); if (type.startsWith("svp-stunde")) { vertretung.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse")) affectedClasses = Arrays.asList(column.text().split(", ")); else if (type.startsWith("svp-esfehlt")) vertretung.setPreviousTeacher(column.text()); else if (type.startsWith("svp-esvertritt")) vertretung.setTeacher(column.text()); else if (type.startsWith("svp-fach")) vertretung.setSubject(column.text()); else if 
(type.startsWith("svp-bemerkung")) { vertretung.setDesc(column.text()); vertretung.setType(recognizeType(column.text())); } else if (type.startsWith("svp-raum")) vertretung.setRoom(column.text()); if (vertretung.getLesson() == null) vertretung.setLesson(lastLesson); } if (vertretung.getType() == null) { vertretung.setType("Vertretung"); } for (String klasse : affectedClasses) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(vertretung); tag.getKlassen().put(klasse, kv); } } List<String> nachrichten = new ArrayList<String>(); if (doc.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = doc.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) nachrichten.add(nachricht); } sibling = sibling.nextElementSibling(); } } tag.setNachrichten(nachrichten); tage.put(date, tag); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }
From source file:mg.jerytodik.business.service.impl.JeryTodikSourceServiceImpl.java
/**
 * Downloads the page at the given URL and collects its {@code link} elements that carry an
 * {@code href} attribute.
 *
 * @param url the address of the page to fetch
 * @return every {@code link[href]} element of the fetched document
 * @throws IOException if the page cannot be retrieved
 */
private Elements getCssLinks(final String url) throws IOException {
    return Jsoup.connect(url).get().select("link[href]");
}
From source file:org.ala.lucene.CreateWordPressIndex.java
/** * Index the WP pages by parsing with Jsoup and indexing into SOLR * * @return//from w ww . j a va 2 s .c o m * @throws IOException */ protected int indexPages() throws Exception { int documentCount = 0; // Initialise SOLR SolrServer solrServer = solrUtils.getSolrServer(); logger.info("Deleting all WordPress documents in SOLR index..."); solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages solrServer.commit(); for (String pageUrl : this.pageUrls) { try { // Crawl and extract text from WP pages Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get(); String title = document.select("head > title").text(); String id = document.select("head > meta[name=id]").attr("content"); String bodyText = document.body().text(); Elements postCategories = document.select("ul[class=post-categories]"); List<String> categoriesOut = new ArrayList<String>(); Boolean excludePost = false; if (!postCategories.isEmpty()) { // Is a WP post (not page) Elements categoriesIn = postCategories.select("li > a"); // get list of li elements for (Element cat : categoriesIn) { String thisCat = cat.text(); if (thisCat != null && excludedCategories.contains(thisCat)) { // "button".equals(thisCat) // exclude category "button" posts excludePost = true; } if (thisCat != null) { // add category to list categoriesOut.add(thisCat.replaceAll(" ", "_")); } } } if (excludePost) { logger.debug("Excluding post (id: " + id + ") with category: " + StringUtils.join(categoriesOut, "|")); continue; } documentCount++; // Index with SOLR logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: " + StringUtils.substring(bodyText, 0, 100) + "... 
"); SolrInputDocument doc = new SolrInputDocument(); doc.addField("idxtype", IndexedTypes.WORDPRESS); doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field doc.addField("id", "wp" + id); // probably not needed but safer to leave in doc.addField("name", title, 1.2f); doc.addField("content", bodyText); doc.addField("australian_s", "recorded"); // so they appear in default QF search doc.addField("categories", categoriesOut); // add to index solrServer.add(doc); if (documentCount % 100 == 0) { logger.info("Committing to SOLR (count = " + documentCount + ")..."); solrServer.commit(); } } catch (IOException ex) { // catch it so we don't stop indexing other pages logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex); } } logger.info("Final Committing to SOLR..."); solrServer.commit(); //logger.info("Optimising SOLR index..."); //solrServer.optimize(); // throws errors on my machine?? logger.info("Committed to SOLR. Final document count: " + documentCount); return documentCount; }
From source file:me.vertretungsplan.parser.UntisSubstitutionParser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); String encoding = data.optString(PARAM_ENCODING, null); SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); int successfulSchedules = 0; HttpResponseException lastExceptionSchedule = null; for (String baseUrl : ParserUtils.handleUrlsWithDateFormat(urls)) { try {//from www. j a va 2 s .c o m Document doc = Jsoup.parse(this.httpGet(baseUrl, encoding)); Elements classes = doc.select("td a"); String lastChange = doc.select("td[align=right]:not(:has(b))").text(); int successfulClasses = 0; HttpResponseException lastExceptionClass = null; for (Element klasse : classes) { try { Document classDoc = Jsoup.parse( httpGet(baseUrl.substring(0, baseUrl.lastIndexOf("/")) + "/" + klasse.attr("href"), encoding)); parseSubstitutionTable(v, lastChange, classDoc); successfulClasses++; } catch (HttpResponseException e) { lastExceptionClass = e; } } if (successfulClasses == 0 && lastExceptionClass != null) { throw lastExceptionClass; } successfulSchedules++; } catch (HttpResponseException e) { lastExceptionSchedule = e; } } if (successfulSchedules == 0 && lastExceptionSchedule != null) { throw lastExceptionSchedule; } if (data.has(PARAM_WEBSITE)) { v.setWebsite(data.getString(PARAM_WEBSITE)); } else { v.setWebsite(urls.get(0)); } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }
From source file:mobi.jenkinsci.ci.client.sso.GoogleSsoHandler.java
@Override public IOException getException(final HttpResponse response) { final StatusLine httpStatusLine = response.getStatusLine(); final int statusCode = httpStatusLine.getStatusCode(); switch (statusCode) { case HttpURLConnection.HTTP_OK: try {/*from w w w. j a v a 2s. co m*/ final Document responseDoc = Jsoup.parse(response.getEntity().getContent(), "UTF-8", ""); final Element errorDiv = responseDoc.select("div[id~=(error|errormsg)").first(); if (errorDiv != null) { return new IOException(getDivText(errorDiv)); } } catch (final Exception e) { } // Break is not needed here: we want to fallback to the 'default' case // if no error div is found default: return new IOException("Google Authentication FAILED"); } }