List of usage examples for org.jsoup.nodes Document getElementsByAttributeValueContaining
public Elements getElementsByAttributeValueContaining(String key, String match)
From source file:ph.fingra.statisticsweb.service.DashBoardServiceImpl.java
private void checkAppIcon(App app) { if ((app.getAppInfo() != null && app.getAppInfo().getSmallicon() != null) || !app.hasValidAppId()) return;//from w w w . ja v a 2 s . c om if (app.getAppInfo() == null) { AppInfo appInfo = new AppInfo(); appInfo.setAppkey(app.getAppkey()); app.setAppInfo(appInfo); } if (AppPlatform.valueOf(app.getPlatform()) == AppPlatform.IPHONE) { ResponseEntity<String> response = restTemplate .getForEntity("https://itunes.apple.com/lookup?id={appId}", String.class, app.getAppid()); if (response.getStatusCode() != HttpStatus.OK) return; JsonObject result = (JsonObject) new JsonParser().parse(response.getBody()); JsonArray arr = result.getAsJsonArray("results"); if (result.getAsJsonPrimitive("resultCount").getAsInt() != 1) return; JsonPrimitive smallIconUrl = arr.get(0).getAsJsonObject().getAsJsonPrimitive("artworkUrl60"); app.getAppInfo().setSmallicon(smallIconUrl.getAsString()); } else { Document d = null; try { d = Jsoup.connect("https://play.google.com/store/apps/details?id=" + app.getAppid()).get(); Elements div = d.getElementsByAttributeValueContaining("class", "cover-container"); //System.out.println(div.hasClass("cover-container")); if (div.size() == 0) return; String path = div.get(0).getElementsByTag("img").attr("src"); app.getAppInfo().setSmallicon(path); } catch (IOException e) { e.printStackTrace(); return; } } appDao.updateAppInfo(app.getAppInfo()); }
From source file:com.mycompany.parcinghtml.ParsingClassPlayers.java
public void downloadSource() throws SQLException { //ds = prepareDataSource(); String sql = "INSERT INTO PLAYERS(NAME,AGE,HEIGHT,WEIGHT,PLAYERNUM,POSITION,PLAYERID) VALUES(?,?,?,?,?,?,?)"; ArrayList<String> duplicity = new ArrayList<>(); int playerID = 1; for (int i = 2015; i > 2004; i--) { Document doc = null; try {//from w w w .j a va 2 s.c om doc = Jsoup.connect("http://www.hcsparta.cz/soupiska.asp?sezona=" + Integer.toString(i)).get(); } catch (IOException e) { System.out.println(e.getMessage()); } if (doc == null) { System.out.println("doc is null"); return; } Elements posNum; Elements elList; posNum = doc.getElementsByAttributeValueContaining("class", "soupiska"); //elList = doc.getElementsByAttributeValueContaining("id", "soupiska"); for (int j = 0; j < 3; j++) { elList = posNum.get(j).getElementsByAttributeValueContaining("id", "soupiska"); for (Element item : elList) { String[] secondName = item.child(2).text().split(" "); if (duplicity.contains(item.child(2).text())) continue; duplicity.add(item.child(2).text()); try (Connection conn = ds.getConnection()) { try (PreparedStatement st = conn.prepareStatement(sql)) { st.setString(1, item.child(2).text()); String[] age = item.child(4).text().split(" "); st.setInt(2, Integer.parseInt(age[0])); String[] height = item.child(5).text().split(" "); st.setInt(3, Integer.parseInt(height[0])); String[] weight = item.child(6).text().split(" "); st.setInt(4, Integer.parseInt(weight[0])); try { st.setInt(5, Integer.parseInt(item.child(0).text())); } catch (NumberFormatException ex) { st.setInt(5, 0); } st.setInt(6, j); st.setInt(7, playerID); int addedRows = st.executeUpdate(); playerID++; } } catch (SQLException ex) { throw new SQLException(ex.getMessage(), ex.fillInStackTrace()); } } } } }
From source file:com.thesmartweb.swebrank.WebParser.java
/** Class-name fragments of the classic microformats that are counted into {@code micron1}. */
private static final String[] MICROFORMAT_V1_CLASS_FRAGMENTS = {
    "vcard", "hreview", "vevent", "vcalendar", "geo", "adr"
};

/** {@code rel} value fragments that are counted into {@code micron1}. */
private static final String[] MICROFORMAT_REL_FRAGMENTS = {
    "acquaintance", "alternate", "appendix", "bookmark", "chapter", "child", "colleague", "contact",
    "contents", "copyright", "co-resident", "co-worker", "crush", "date", "friend", "glossary", "help",
    "its-rules", "kin", "license", "me", "met", "muse", "neighbor", "next", "nofollow", "parent", "prev",
    "previous", "section", "sibling", "spouse", "start", "stylesheet", "subsection", "sweetheart", "tag",
    "toc", "transformation", "apple-touch-icon", "apple-touch-icon-precomposed",
    "apple-touch-startup-image", "attachment", "canonical", "category", "component",
    "chrome-webstore-item", "disclosure", "discussion", "dns-prefetch", "edit", "EditURI",
    "entry-content", "external", "home", "hub", "in-reply-to", "index", "indieauth", "issues",
    "lightbox", "meta", "openid", "p3pv1", "pgpkey", "pingback", "prerender", "profile", "rendition",
    "service", "shortlink", "sidebar", "sitemap", "subresource", "syndication", "timesheet",
    "webmention", "widget", "wlwmanifest", "image_src",
    "http://docs.oasis-open.org/ns/cmis/link/200908/acl", "stylesheet/less", "accessibility",
    "bibliography", "cite", "group", "jslicense", "longdesc", "map", "member", "source", "status",
    "archive", "archives", "comment", "contribution", "endorsed", "fan", "feed", "footnote", "icon",
    "kinetic-stylesheet", "prettyPhoto", "clearbox", "made", "microsummary", "noreferrer", "permalink",
    "popover", "prefetch", "publickey", "publisher", "referral", "related", "replies", "resource",
    "search", "sponsor", "tooltip", "trackback", "unendorsed", "user"
};

/** Class-name fragments of microformats2 root classes that are counted into {@code micron2}. */
private static final String[] MICROFORMAT_V2_CLASS_FRAGMENTS = {
    "h-adr", "h-card", "h-entry", "h-event", "h-geo", "h-item", "h-product", "h-recipe", "h-resume",
    "h-review", "h-review-aggregate"
};

/**
 * Downloads a page and records its semantic-markup statistics in the fields
 * of this object: number of {@code embed}, {@code script} and {@code iframe}
 * elements, schema.org item types, elements carrying an {@code itemtype},
 * {@code link}/{@code a} elements with a {@code rel} attribute, FOAF
 * references and the microformats / microformats2 counters.
 *
 * <p>The microformat counters add up, for every listed fragment, the number
 * of elements whose attribute value <em>contains</em> that fragment. An
 * element is therefore counted once per fragment it matches (for example
 * {@code rel="previous"} matches both {@code prev} and {@code previous}).
 * Each fragment is queried exactly once; the former lookups for
 * {@code "link_html"}, {@code "opendid"} and {@code "ardn"} are corrected to
 * {@code "acquaintance"}, {@code "openid"} and {@code "adr"}.
 *
 * @param link_html the url to analyze
 * @return {@code true} if the page was fetched and all stats were set,
 *         {@code false} if fetching or decoding the page failed
 */
public boolean gethtmlstats(String link_html) {
    try {
        Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get();

        number_embeded_videos = doc.select("embed").size();
        scripts_number = doc.select("script").size();
        frames_number = doc.select("iframe").size();
        nschem = doc.getElementsByAttributeValueContaining("itemtype", "schema.org").size();
        hreln = doc.select("link[rel]").size() + doc.select("a[rel]").size();
        foaf = doc.getElementsByAttributeValueContaining("href", "foaf").size()
                + doc.getElementsByAttributeValueContaining("type", "foaf").size();

        micron1 = countAttributeMatches(doc, "class", MICROFORMAT_V1_CLASS_FRAGMENTS)
                + countAttributeMatches(doc, "rel", MICROFORMAT_REL_FRAGMENTS)
                + foaf;
        micron2 = countAttributeMatches(doc, "class", MICROFORMAT_V2_CLASS_FRAGMENTS);
        total_micron = micron1 + micron2;
        microd = doc.getElementsByAttribute("itemtype").size();
        return true;
    } catch (IOException | IllegalCharsetNameException ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        return false;
    }
}

/**
 * Sums, over all {@code fragments}, the number of elements of {@code doc}
 * whose {@code attribute} value contains the fragment.
 */
private static int countAttributeMatches(Document doc, String attribute, String[] fragments) {
    int total = 0;
    for (String fragment : fragments) {
        total += doc.getElementsByAttributeValueContaining(attribute, fragment).size();
    }
    return total;
}
From source file:com.quarterfull.newsAndroid.NewsDetailFragment.java
public void onCreateContextMenu(ContextMenu menu, View v, ContextMenu.ContextMenuInfo menuInfo) { if (v instanceof WebView) { WebView.HitTestResult result = ((WebView) v).getHitTestResult(); if (result != null) { int type = result.getType(); Document htmldoc = Jsoup.parse(html); FragmentTransaction ft = getFragmentManager().beginTransaction(); if (type == WebView.HitTestResult.IMAGE_TYPE || type == WebView.HitTestResult.SRC_IMAGE_ANCHOR_TYPE) { String imageUrl = result.getExtra(); if (imageUrl.startsWith("http") || imageUrl.startsWith("file")) { URL mImageUrl; String imgtitle; String imgaltval; String imgsrcval; imgsrcval = imageUrl.substring(imageUrl.lastIndexOf('/') + 1, imageUrl.length()); Elements imgtag = htmldoc.getElementsByAttributeValueContaining("src", imageUrl); try { imgtitle = imgtag.first().attr("title"); } catch (NullPointerException e) { imgtitle = ""; }/*from w w w.j a v a2 s. co m*/ try { imgaltval = imgtag.first().attr("alt"); } catch (NullPointerException e) { imgaltval = ""; } try { mImageUrl = new URL(imageUrl); } catch (MalformedURLException e) { return; } String title = imgsrcval; int titleIcon = android.R.drawable.ic_menu_gallery; String text = (imgtitle.isEmpty()) ? imgaltval : imgtitle; // Create and show the dialog. DialogFragment newFragment = NewsDetailImageDialogFragment.newInstanceImage(title, titleIcon, text, mImageUrl); newFragment.show(ft, "menu_fragment_dialog"); } } else if (type == WebView.HitTestResult.SRC_ANCHOR_TYPE) { String url = result.getExtra(); URL mUrl; String text; try { Elements urltag = htmldoc.getElementsByAttributeValueContaining("href", url); text = urltag.text(); mUrl = new URL(url); } catch (MalformedURLException e) { return; } // Create and show the dialog. 
DialogFragment newFragment = NewsDetailImageDialogFragment.newInstanceUrl(text, mUrl.toString()); newFragment.show(ft, "menu_fragment_dialog"); } //else if (type == WebView.HitTestResult.EMAIL_TYPE) { } //else if (type == WebView.HitTestResult.GEO_TYPE) { } //else if (type == WebView.HitTestResult.PHONE_TYPE) { } //else if (type == WebView.HitTestResult.EDIT_TEXT_TYPE) { } } } }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); }//ww w . ja va2 s . c o m // we have 3 entry points here // a) getMetadata has been called with an ofdbId // b) getMetadata has been called with an imdbId // c) getMetadata has been called from a previous search String detailUrl = ""; // case a) and c) if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) { if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) { detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId()); } else { detailUrl = options.getResult().getUrl(); } } // case b) if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) { MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE); searchOptions.setImdbId(options.getId(MediaMetadata.IMDB)); try { List<MediaSearchResult> results = search(searchOptions); if (results != null && !results.isEmpty()) { options.setResult(results.get(0)); detailUrl = options.getResult().getUrl(); } } catch (Exception e) { LOGGER.warn("failed IMDB search: " + e.getMessage()); } } // we can only work further if we got a search result on ofdb.de if (StringUtils.isBlank(detailUrl)) { throw new Exception("We did not get any useful movie url"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),"); if (StringUtils.isBlank(ofdbId)) { ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)"); } Url url; try { LOGGER.trace("get details page"); url = new Url(detailUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); if (doc.getAllElements().size() < 10) { throw new 
Exception("meh - we did not receive a valid web page"); } // parse details // IMDB ID "http://www.imdb.com/Title?1194173" el = doc.getElementsByAttributeValueContaining("href", "imdb.com"); if (!el.isEmpty()) { md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)")); } // title / year // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" /> el = doc.getElementsByAttributeValue("property", "og:title"); if (!el.isEmpty()) { String[] ty = parseTitle(el.first().attr("content")); md.setTitle(StrgUtils.removeCommonSortableName(ty[0])); try { md.setYear(Integer.parseInt(ty[1])); } catch (Exception ignored) { } } // another year position if (md.getYear() == 0) { // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a> el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr"); try { md.setYear(Integer.parseInt(el.first().text())); } catch (Exception ignored) { } } // original title (has to be searched with a regexp) // <tr valign="top"> // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif" // size="2">Originaltitel:</font></td> // <td> </td> // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif" // size="2"><b>Brave</b></font></td> // </tr> String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>"); if (!originalTitle.isEmpty()) { md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle)); } // Genre: <a href="view.php?page=genre&Genre=Action">Action</a> el = doc.getElementsByAttributeValueContaining("href", "page=genre"); for (Element g : el) { md.addGenre(getTmmGenre(g.text())); } // rating // <div itemtype="http://schema.org/AggregateRating" itemscope // itemprop="aggregateRating">Note: <span // itemprop="ratingValue">6.73</span><meta // itemprop="worstRating" content="1" /> el = doc.getElementsByAttributeValue("itemprop", "ratingValue"); if (!el.isEmpty()) { String r = el.text(); if (!r.isEmpty()) { try { 
md.setRating(Float.parseFloat(r)); } catch (Exception e) { LOGGER.debug("could not parse rating"); } } } // get PlotLink; open url and parse // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a> LOGGER.trace("parse plot"); el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,"); if (!el.isEmpty()) { String plotUrl = BASE_URL + "/" + el.first().attr("href"); try { url = new Url(plotUrl); in = url.getInputStream(); Document plot = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements block = plot.getElementsByClass("Blocksatz"); // first // Blocksatz // is plot String p = block.first().text(); // remove all html stuff p = p.substring(p.indexOf("Mal gelesen") + 12); // remove "header" md.setPlot(p); } catch (Exception e) { LOGGER.error("failed to get plot page: " + e.getMessage()); } } // http://www.ofdb.de/view.php?page=film_detail&fid=226745 LOGGER.debug("parse actor detail"); String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId; doc = null; try { url = new Url(movieDetail); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get detail page: " + e.getMessage()); } if (doc != null) { parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md); parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER, md); parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md); } } catch (Exception e) { LOGGER.error("Error parsing " + detailUrl); throw e; } return md; }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // we can only work further if we got a search result on zelluloid.de if (options.getResult() == null) { throw new Exception("Scrape with Zelluloid.de without prior search is not supported"); }/*from w w w .ja v a 2 s . com*/ MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; // preset values from searchresult (if we have them) md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, Utils.removeSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle())); md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear()); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle()); String id = ""; if (StringUtils.isEmpty(options.getResult().getId())) { id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)"); } else { id = options.getResult().getId(); } String detailurl = options.getResult().getUrl(); if (StringUtils.isEmpty(detailurl)) { detailurl = BASE_URL + "/filme/index.php3?id=" + id; } Url url; try { LOGGER.debug("get details page"); url = new CachedUrl(detailurl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); // parse plot String plot = doc.getElementsByAttributeValue("class", "bigtext").text(); md.storeMetadata(MediaMetadata.PLOT, plot); md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? 
plot.substring(0, 150) : plot); // parse poster el = doc.getElementsByAttributeValueStarting("src", "/images/poster"); if (el.size() == 1) { md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + el.get(0).attr("src")); } // parse year if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) { el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { md.storeMetadata(MediaMetadata.YEAR, el.get(0).text()); } } // parse cinema release el = doc.getElementsByAttributeValueContaining("href", "?v=w"); if (el.size() > 0) { try { SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy"); Date d = sdf.parse(el.get(0).text()); sdf = new SimpleDateFormat("yyyy-MM-dd"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(d)); } catch (Exception e) { LOGGER.warn("cannot parse cinema release date: " + el.get(0).text()); } } // parse original title if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<")); } if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } // parse runtime String rt = (StrgUtils.substr(doc.toString(), "ca. (.*?) min")); if (!rt.isEmpty()) { try { md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt)); } catch (Exception e2) { LOGGER.warn("cannot convert runtime: " + rt); } } // parse genres el = doc.getElementsByAttributeValueContaining("href", "az.php3?g="); for (Element g : el) { String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1); md.addGenre(getTmmGenre(gid)); } // parse cert // FSK: ab 12, $230 Mio. 
Budget String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]"); if (!fsk.isEmpty()) { md.addCertification(Certification.findCertification(fsk)); } // parse rating Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable"); if (ratings.size() == 2) { // get user rating Element e = ratings.get(1); // <div>87%</div> String r = e.getElementsByTag("div").text().replace("%", ""); try { md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10 } catch (Exception e2) { LOGGER.warn("cannot convert rating: " + r); } } // details page doc = null; String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id; try { url = new CachedUrl(detailsUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get details: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(detailsUrl); } if (doc != null) { Element tab = doc.getElementById("ccdetails"); int header = 0; String lastRole = ""; for (Element tr : tab.getElementsByTag("tr")) { if (tr.toString().contains("dyngfx")) { // header gfx if (tr.toString().contains("Besetzung")) { header = 1; } else if (tr.toString().contains("Crew")) { header = 2; } else if (tr.toString().contains("Produktion")) { header = 3; } else if (tr.toString().contains("Verleih")) { header = 4; } else if (tr.toString().contains("Alternativtitel")) { header = 5; } continue; } else { // no header gfx, so data MediaCastMember mcm = new MediaCastMember(); el = tr.getElementsByTag("td"); if (header == 1) { // actors if (el.size() == 2) { mcm.setCharacter(el.get(0).text()); mcm.setName(el.get(1).getElementsByTag("a").text()); mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); mcm.setType(MediaCastMember.CastType.ACTOR); // System.out.println("Cast: " + mcm.getCharacter() + " - " + // mcm.getName()); md.addCastMember(mcm); // TODO: parse actor detail pages :/ } } else if (header == 2) { // crew 
if (el.size() == 2) { String crewrole = el.get(0).html().trim(); mcm.setName(el.get(1).getElementsByTag("a").text()); if (crewrole.equals(" ")) { mcm.setPart(lastRole); } else { mcm.setPart(crewrole); lastRole = crewrole; } if (crewrole.equals("Regie")) { mcm.setType(MediaCastMember.CastType.DIRECTOR); } else if (crewrole.equals("Drehbuch")) { mcm.setType(MediaCastMember.CastType.WRITER); } else { mcm.setType(MediaCastMember.CastType.OTHER); } mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); // System.out.println("Crew: " + mcm.getPart() + " - " + // mcm.getName()); md.addCastMember(mcm); } } else if (header == 3) { // production md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, el.get(0).text()); } } } } // get links page doc = null; String linksUrl = BASE_URL + "/filme/links.php3?id=" + id; try { url = new CachedUrl(linksUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get links page: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(linksUrl); } if (doc != null) { el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com"); if (el != null && el.size() > 0) { String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})"); if (imdb.isEmpty()) { imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)"); } md.setId(MediaMetadata.IMDBID, imdb); } } } catch (Exception e) { LOGGER.error("Error parsing " + options.getResult().getUrl()); // clear cache CachedUrl.removeCachedFileForUrl(detailurl); throw e; } return md; }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for everything: " + searchTerm); } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search with title: " + searchTerm); } else {//from w w w . ja v a2s .co m LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(searchUrl); } if (doc == null) { return resultList; } // only look for movie links Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); LOGGER.debug("found " + filme.size() + " search results"); if (filme.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } 
msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { msr.setYear(el.get(0).text()); } resultList.add(msr); } return resultList; } // <a // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1" // class="normLight">Avatar - Aufbruch nach Pandora</B> // <nobr>(2009)</nobr><br /><span class="smallLight" // style="color:#ccc;">Avatar</span></a> // map to merge 2 results :/ Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>(); for (Element a : filme) { try { String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); if (res.containsKey(id)) { LOGGER.debug("dupe found; merging with previous searchresult"); sr = res.get(id); } if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } if (StringUtils.isEmpty(sr.getId())) { sr.setId(id); } if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); if (StringUtils.isEmpty(sr.getOriginalTitle())) { sr.setOriginalTitle(a.getElementsByTag("span").text()); } if (StringUtils.isEmpty(sr.getYear())) { sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*")); // any // 4 // digit } sr.setMediaType(MediaType.MOVIE); sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle())); } // populate extra args MetadataUtil.copySearchQueryToSearchResult(options, sr); res.put(id, sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } for (String r : 
res.keySet()) { resultList.add(res.get(r)); } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }