List of usage examples for org.jsoup.nodes Document getElementsByClass
public Elements getElementsByClass(String className)
From source file:ca.appvelopers.mcgillmobile.model.retrofit.CourseResultConverter.java
@Override public List<CourseResult> convert(ResponseBody value) throws IOException { String html = value.string(); List<CourseResult> courses = new ArrayList<>(); Document document = Jsoup.parse(html, "UTF-8"); //Parse the response body into a list of rows Elements rows = document.getElementsByClass("dddefault"); // Parse the term from the page header Element header = document.getElementsByClass("staticheaders").get(0); Term term = Term.parseTerm(header.childNode(2).toString()); // Get the table in the form of a set of rows Element table = document.getElementsByClass("datadisplaytable").get(0).select("tbody").get(0); // Go through the rows in the table for (Element row : table.select("tr")) { // Check that there at least 19 elements in the row Elements rowElements = row.select("td"); if (rowElements.size() < 19) { // If there aren't, it must not be a course row continue; }/* ww w . j av a2 s . co m*/ // Create a new course object with the default values double credits = 99; String subject = null; String number = null; String title = ""; String type = ""; List<DayOfWeek> days = new ArrayList<>(); int crn = 0; String instructor = ""; String location = ""; //So that the rounded start time will be 0 LocalTime startTime = ScheduleConverter.getDefaultStartTime(); LocalTime endTime = ScheduleConverter.getDefaultEndTime(); int capacity = 0; int seatsRemaining = 0; int waitlistRemaining = 0; LocalDate startDate = LocalDate.now(); LocalDate endDate = LocalDate.now(); try { for (int i = 0; i < rowElements.size(); i++) { if (rowElements.get(i).toString().contains(" ")) { // Empty row: continue continue; } String rowString = rowElements.get(i).text(); switch (i) { // CRN case 1: crn = Integer.parseInt(rowString); break; // Subject case 2: subject = rowString; break; // Number case 3: number = rowString; break; // Type case 5: type = rowString; break; // Number of credits case 6: credits = Double.parseDouble(rowString); break; // Course title case 7: //Remove the extra period at 
the end of the course title title = rowString.substring(0, rowString.length() - 1); break; // Days of the week case 8: if (rowString.equals("TBA")) { // TBA Stuff: no time associated so skip the next one // and add a dummy to keep the index correct rowElements.add(9, null); i++; } else { // Day Parsing rowString = rowString.replace('\u00A0', ' ').trim(); for (int k = 0; k < rowString.length(); k++) { days.add(DayUtils.getDay(rowString.charAt(k))); } } break; // Time case 9: String[] times = rowString.split("-"); try { int startHour = Integer.parseInt(times[0].split(" ")[0].split(":")[0]); int startMinute = Integer.parseInt(times[0].split(" ")[0].split(":")[1]); int endHour = Integer.parseInt(times[1].split(" ")[0].split(":")[0]); int endMinute = Integer.parseInt(times[1].split(" ")[0].split(":")[1]); //If it's PM, then add 12 hours to the hours for 24 hours format //Make sure it isn't noon String startPM = times[0].split(" ")[1]; if (startPM.equals("PM") && startHour != 12) { startHour += 12; } String endPM = times[1].split(" ")[1]; if (endPM.equals("PM") && endHour != 12) { endHour += 12; } startTime = LocalTime.of(startHour, startMinute); endTime = LocalTime.of(endHour, endMinute); } catch (NumberFormatException e) { //Courses sometimes don't have assigned times startTime = ScheduleConverter.getDefaultStartTime(); endTime = ScheduleConverter.getDefaultEndTime(); } break; // Capacity case 10: capacity = Integer.parseInt(rowString); break; // Seats remaining case 12: seatsRemaining = Integer.parseInt(rowString); break; // Waitlist remaining case 15: waitlistRemaining = Integer.parseInt(rowString); break; // Instructor case 16: instructor = rowString; break; // Start/end date case 17: Pair<LocalDate, LocalDate> dates = parseDateRange(term, rowString); startDate = dates.first; endDate = dates.second; break; // Location case 18: location = rowString; break; } } } catch (Exception e) { Timber.e(e, "Course Results Parser Error"); } // Don't add any courses with errors 
if (subject != null && number != null) { // Create a new course object and add it to list // TODO Should we be parsing the course section? courses.add(new CourseResult(term, subject, number, title, crn, "", startTime, endTime, days, type, location, instructor, credits, startDate, endDate, capacity, seatsRemaining, waitlistRemaining)); } } return courses; }
From source file:ca.zadrox.dota2esportticker.service.UpdateMatchService.java
/**
 * Downloads the match listing page, collects the match links it contains, fetches every match
 * through {@code MatchGetter} tasks on a fixed thread pool and bulk-inserts the results.
 *
 * <p>Broadcasts {@code UPDATE_NO_CONNECTIVITY} and returns early when there is no connectivity;
 * otherwise broadcasts {@code UPDATE_STARTED} before and {@code UPDATE_COMPLETE} after the work,
 * whether or not the work succeeded.
 *
 * @param doResults when {@code false}, links from the last collected table (treated as the
 *                  results table) are not fetched and no result rows are inserted
 */
private void updateMatches(boolean doResults) {
    if (!checkForConnectivity()) {
        LocalBroadcastManager.getInstance(this).sendBroadcast(new Intent(UPDATE_NO_CONNECTIVITY));
        return;
    }

    LocalBroadcastManager.getInstance(this).sendBroadcast(new Intent(UPDATE_STARTED));

    final String BASE_URL = "http://www.gosugamers.net/dota2/gosubet";
    final String MATCH_LINK_URL_BASE = "http://www.gosugamers.net";

    try {
        String rawHtml = new OkHttpClient().newCall(new Request.Builder().url(BASE_URL).build()).execute()
                .body().string();

        // Fail through the IOException path instead of an unchecked substring error when the
        // page layout changes and the section markers are missing.
        int sectionStart = rawHtml.indexOf("<div id=\"col1\" class=\"rows\">");
        int sectionEnd = rawHtml.indexOf("<div id=\"col2\" class=\"rows\">");
        if (sectionStart < 0 || sectionEnd < sectionStart) {
            throw new IOException("Match listing section not found in response from " + BASE_URL);
        }
        rawHtml = rawHtml.substring(sectionStart, sectionEnd);

        Document doc = Jsoup.parse(rawHtml);

        Elements tables = doc.getElementsByClass("matches");

        ArrayList<ArrayList<String>> matchLinks = new ArrayList<ArrayList<String>>(tables.size());

        int numSeries = 0;
        for (Element table : tables) {
            Elements links = table.getElementsByClass("match");
            if (links.size() != 0) {
                ArrayList<String> innerMatchLink = new ArrayList<String>(links.size());
                for (Element link : links) {
                    String linkHref = link.attr("href");
                    innerMatchLink.add(MATCH_LINK_URL_BASE + linkHref);
                    numSeries++;
                }
                matchLinks.add(innerMatchLink);
            }
        }

        // needed if there are massive reschedules to update content properly.
        Uri resultsUri = MatchContract.SeriesEntry.buildSeriesUriWithAfterTime(TimeUtils.getUTCTime());

        Cursor c = getContentResolver().query(resultsUri,
                new String[] { MatchContract.SeriesEntry.COLUMN_GG_MATCH_PAGE }, null, null, null);
        if (c != null) {
            try {
                // Stored pages can only be merged when the page yielded at least one list.
                if (!matchLinks.isEmpty()) {
                    ArrayList<String> firstList = matchLinks.get(0);
                    while (c.moveToNext()) {
                        String storedPage = c.getString(0);
                        if (!firstList.contains(storedPage)) {
                            firstList.add(storedPage);
                        }
                    }
                }
            } finally {
                // The cursor was previously leaked.
                c.close();
            }
        }

        Iterator<ArrayList<String>> iterator = matchLinks.iterator();
        ExecutorService executorService = Executors.newFixedThreadPool(10);

        ArrayList<Future<BundledMatchItem>> seriesItemFutures = new ArrayList<Future<BundledMatchItem>>(
                numSeries);

        LogUtils.LOGD(TAG, "Starting Retrieval, num elements gathered: " + numSeries);
        int i = 0;
        while (iterator.hasNext()) {
            ArrayList<String> matchList = iterator.next();
            for (String matchUrl : matchList) {
                // Entries of the last list are the ones fetched with results.
                boolean hasResult = !iterator.hasNext();
                if (!doResults && hasResult) {
                    continue;
                }
                seriesItemFutures.add(executorService.submit(new MatchGetter(matchUrl, hasResult)));
                i++;
            }
        }
        executorService.shutdown();
        executorService.awaitTermination(20L, TimeUnit.SECONDS);
        LogUtils.LOGD(TAG, "Stopping Retrieval, elements submitted for fetching: " + i);

        // Collect into lists so that failed or null items do not leave null slots in the
        // arrays handed to bulkInsert.
        ArrayList<ContentValues> seriesEntries = new ArrayList<ContentValues>(i);
        ArrayList<ContentValues> resultEntries = new ArrayList<ContentValues>();

        for (Future<BundledMatchItem> seriesItemFuture : seriesItemFutures) {
            try {
                BundledMatchItem seriesItem = seriesItemFuture.get();
                if (seriesItem != null) {
                    seriesEntries.add(seriesItem.mMatch);
                    if (seriesItem.hasResult) {
                        resultEntries.add(seriesItem.mResult);
                    }
                }
            } catch (ExecutionException e) {
                Log.e(TAG, "Should never get here", e);
            }
        }

        this.getContentResolver().bulkInsert(MatchContract.SeriesEntry.CONTENT_URI,
                seriesEntries.toArray(new ContentValues[seriesEntries.size()]));

        if (doResults) {
            this.getContentResolver().bulkInsert(MatchContract.ResultEntry.CONTENT_URI,
                    resultEntries.toArray(new ContentValues[resultEntries.size()]));
        }

        PrefUtils.setLastUpdateTime(this, TimeUtils.getUTCTime());

    } catch (IOException e) {
        Log.e(TAG, e.getMessage(), e);
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the owning thread can observe the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }

    LocalBroadcastManager.getInstance(this).sendBroadcast(new Intent(UPDATE_COMPLETE));

    PrefUtils.setLastResultsUpdateTime(this, TimeUtils.getUTCTime());
}
From source file:im.ene.lab.attiq.ui.activities.ItemDetailActivity.java
/**
 * Renders the comments carried by the event into the comments web view.
 *
 * <p>When the event has comments, the count and info labels are updated, every comment is
 * rendered from the {@code html/comment.html} asset template and appended to the
 * {@code content} element of {@code html/comments.html}, and the resulting page is loaded into
 * the web view. When there are no comments, the labels show zero and the web view is hidden.
 *
 * @param event the event holding the comments of the current item
 */
@SuppressWarnings("unused")
public void onEventMainThread(ItemCommentsEvent event) {
    if (!UIUtil.isEmpty(event.comments)) {
        mCommentsView.setVisibility(View.VISIBLE);
        List<Comment> comments = event.comments;

        mCommentCount.setText(comments.size() + "");

        String info = comments.size() == 1 ? getString(R.string.comment_singular)
                : getString(R.string.comment_plural);
        // FIXME should use plural strings
        mCommentInfo.setText(getString(R.string.article_comment, comments.size(), info));

        try {
            String html = IOUtil.readAssets("html/comments.html");
            // The per-comment template does not change between comments: read it once
            // instead of hitting the assets on every iteration.
            String commentTemplate = IOUtil.readAssets("html/comment.html");

            Document fullBody = Jsoup.parse(html);
            Element content = fullBody.getElementById("content");

            for (Comment comment : comments) {
                String commentHtml = commentTemplate
                        .replace("{user_icon_url}", comment.getUser().getProfileImageUrl())
                        .replace("{user_name}", comment.getUser().getId())
                        .replace("{comment_time}", TimeUtil.commentTime(comment.getCreatedAt()))
                        .replace("{article_uuid}", mItemUuid).replace("{comment_id}", comment.getId());

                Document commentDoc = Jsoup.parse(commentHtml);
                Element eComment = commentDoc.getElementsByClass("comment-box").first();
                if (eComment == null) {
                    // first() returns null when the template has no comment box: nothing to show
                    continue;
                }

                Element message = eComment.getElementsByClass("message").first();
                if (message != null) {
                    message.append(comment.getRenderedBody());
                }

                // remove comment edit block if it is not from current user
                if (mMyProfile == null || !mMyProfile.getId().equals(comment.getUser().getId())) {
                    String commentId = "comment_{comment_id}_{user_name}"
                            .replace("{comment_id}", comment.getId())
                            .replace("{user_name}", comment.getUser().getId());
                    Element commentEditor = commentDoc.getElementById(commentId);
                    // getElementById returns null when no such element exists
                    if (commentEditor != null) {
                        commentEditor.remove();
                    }
                }

                content.appendChild(eComment);
            }

            String result = fullBody.outerHtml();
            mCommentsView.loadDataWithBaseURL("http://qiita.com/", result, null, null, null);
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        mCommentCount.setText("0");
        mCommentInfo.setText(getString(R.string.article_comment, 0, getString(R.string.comment_plural)));
        mCommentsView.setVisibility(View.GONE);
    }
}
From source file:net.yoomai.virgo.spider.parsers.ICISParser.java
/**
 * Parses the given HTML file and logs every element with class {@code price-datalist}.
 *
 * <p>No estimates are built yet: this method always returns {@code null}. A failure to read or
 * parse the file is logged and otherwise ignored.
 *
 * @param htmlFile the HTML file to parse, read as UTF-8
 * @return always {@code null}
 */
@Override
public List<Estimate> parser(File htmlFile) {
    Document parsed = null;
    try {
        parsed = Jsoup.parse(htmlFile, "UTF-8");
    } catch (IOException ioe) {
        log.error(ioe.getMessage() + " : " + ioe.getCause());
    }

    log.info("doc: " + parsed);

    // Nothing to inspect when the file could not be parsed
    if (parsed == null) {
        return null;
    }

    for (Element priceList : parsed.getElementsByClass("price-datalist")) {
        log.info(priceList.toString());
    }

    return null;
}
From source file:org.keionline.keionline.ArticleView.java
private String getContent(String url) throws IOException { Document doc = Jsoup.connect(url).userAgent("Mozilla").get(); Element data = doc.getElementsByClass("node").first();// get the third content div, Elements select = data.select("img"); // Change the links to absolute!! so that images work for (Element e : select) { e.attr("src", e.absUrl("src")); }/*w ww .ja v a 2 s .c o m*/ select = data.select("a"); for (Element e : select) { e.attr("href", e.absUrl("href")); } Element info = data.getElementsByClass("submitted").first(); info.after("<hr>"); String cont = data.toString(); cont = CSS + cont + "</body>"; content = cont; return cont; }
From source file:org.loklak.api.search.EventBriteCrawlerService.java
public static SusiThought crawlEventBrite(String url) { Document htmlPage = null; try {/*from ww w. j a va 2s. c o m*/ htmlPage = Jsoup.connect(url).get(); } catch (Exception e) { e.printStackTrace(); } String eventID = null; String eventName = null; String eventDescription = null; // TODO Fetch Event Color String eventColor = null; String imageLink = null; String eventLocation = null; String startingTime = null; String endingTime = null; String ticketURL = null; Elements tagSection = null; Elements tagSpan = null; String[][] tags = new String[5][2]; String topic = null; // By default String closingDateTime = null; String schedulePublishedOn = null; JSONObject creator = new JSONObject(); String email = null; Float latitude = null; Float longitude = null; String privacy = "public"; // By Default String state = "completed"; // By Default String eventType = ""; String temp; Elements t; eventID = htmlPage.getElementsByTag("body").attr("data-event-id"); eventName = htmlPage.getElementsByClass("listing-hero-body").text(); eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text(); eventColor = null; imageLink = htmlPage.getElementsByTag("picture").attr("content"); eventLocation = htmlPage.select("p.listing-map-card-street-address.text-default").text(); temp = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content"); if (temp.length() >= 20) { startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content") .substring(0, 19); } else { startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content"); } temp = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content"); if (temp.length() >= 20) { endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content") .substring(0, 19); } else { endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content"); } 
ticketURL = url + "#tickets"; // TODO Tags to be modified to fit in the format of Open Event "topic" tagSection = htmlPage.getElementsByAttributeValue("data-automation", "ListingsBreadcrumbs"); tagSpan = tagSection.select("span"); topic = ""; int iterator = 0, k = 0; for (Element e : tagSpan) { if (iterator % 2 == 0) { tags[k][1] = "www.eventbrite.com" + e.select("a.js-d-track-link.badge.badge--tag.l-mar-top-2").attr("href"); } else { tags[k][0] = e.text(); k++; } iterator++; } creator.put("email", ""); creator.put("id", "1"); // By Default temp = htmlPage.getElementsByAttributeValue("property", "event:location:latitude").attr("content"); if (temp.length() > 0) { latitude = Float.valueOf( htmlPage.getElementsByAttributeValue("property", "event:location:latitude").attr("content")); } temp = htmlPage.getElementsByAttributeValue("property", "event:location:longitude").attr("content"); if (temp.length() > 0) { longitude = Float.valueOf( htmlPage.getElementsByAttributeValue("property", "event:location:longitude").attr("content")); } // TODO This returns: "events.event" which is not supported by Open // Event Generator // eventType = htmlPage.getElementsByAttributeValue("property", // "og:type").attr("content"); String organizerName = null; String organizerLink = null; String organizerProfileLink = null; String organizerWebsite = null; String organizerContactInfo = null; String organizerDescription = null; String organizerFacebookFeedLink = null; String organizerTwitterFeedLink = null; String organizerFacebookAccountLink = null; String organizerTwitterAccountLink = null; temp = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text(); if (temp.length() >= 5) { organizerName = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text() .substring(4); } else { organizerName = ""; } organizerLink = url + "#listing-organizer"; organizerProfileLink = htmlPage .getElementsByAttributeValue("class", "js-follow js-follow-target follow-me 
fx--fade-in is-hidden") .attr("href"); organizerContactInfo = url + "#lightbox_contact"; Document orgProfilePage = null; try { orgProfilePage = Jsoup.connect(organizerProfileLink).get(); } catch (Exception e) { e.printStackTrace(); } if (orgProfilePage != null) { t = orgProfilePage.getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website"); if (t != null) { organizerWebsite = orgProfilePage .getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website").text(); } else { organizerWebsite = ""; } t = orgProfilePage.select("div.js-long-text.organizer-description"); if (t != null) { organizerDescription = orgProfilePage.select("div.js-long-text.organizer-description").text(); } else { organizerDescription = ""; } organizerFacebookFeedLink = organizerProfileLink + "#facebook_feed"; organizerTwitterFeedLink = organizerProfileLink + "#twitter_feed"; t = orgProfilePage.getElementsByAttributeValue("class", "fb-page"); if (t != null) { organizerFacebookAccountLink = orgProfilePage.getElementsByAttributeValue("class", "fb-page") .attr("data-href"); } else { organizerFacebookAccountLink = ""; } t = orgProfilePage.getElementsByAttributeValue("class", "twitter-timeline"); if (t != null) { organizerTwitterAccountLink = orgProfilePage .getElementsByAttributeValue("class", "twitter-timeline").attr("href"); } else { organizerTwitterAccountLink = ""; } } JSONArray socialLinks = new JSONArray(); JSONObject fb = new JSONObject(); fb.put("id", "1"); fb.put("name", "Facebook"); fb.put("link", organizerFacebookAccountLink); socialLinks.put(fb); JSONObject tw = new JSONObject(); tw.put("id", "2"); tw.put("name", "Twitter"); tw.put("link", organizerTwitterAccountLink); socialLinks.put(tw); JSONArray jsonArray = new JSONArray(); JSONObject event = new JSONObject(); event.put("event_url", url); event.put("id", eventID); event.put("name", eventName); event.put("description", eventDescription); event.put("color", eventColor); event.put("background_url", imageLink); 
event.put("closing_datetime", closingDateTime); event.put("creator", creator); event.put("email", email); event.put("location_name", eventLocation); event.put("latitude", latitude); event.put("longitude", longitude); event.put("start_time", startingTime); event.put("end_time", endingTime); event.put("logo", imageLink); event.put("organizer_description", organizerDescription); event.put("organizer_name", organizerName); event.put("privacy", privacy); event.put("schedule_published_on", schedulePublishedOn); event.put("state", state); event.put("type", eventType); event.put("ticket_url", ticketURL); event.put("social_links", socialLinks); event.put("topic", topic); jsonArray.put(event); JSONObject org = new JSONObject(); org.put("organizer_name", organizerName); org.put("organizer_link", organizerLink); org.put("organizer_profile_link", organizerProfileLink); org.put("organizer_website", organizerWebsite); org.put("organizer_contact_info", organizerContactInfo); org.put("organizer_description", organizerDescription); org.put("organizer_facebook_feed_link", organizerFacebookFeedLink); org.put("organizer_twitter_feed_link", organizerTwitterFeedLink); org.put("organizer_facebook_account_link", organizerFacebookAccountLink); org.put("organizer_twitter_account_link", organizerTwitterAccountLink); jsonArray.put(org); JSONArray microlocations = new JSONArray(); jsonArray.put(new JSONObject().put("microlocations", microlocations)); JSONArray customForms = new JSONArray(); jsonArray.put(new JSONObject().put("customForms", customForms)); JSONArray sessionTypes = new JSONArray(); jsonArray.put(new JSONObject().put("sessionTypes", sessionTypes)); JSONArray sessions = new JSONArray(); jsonArray.put(new JSONObject().put("sessions", sessions)); JSONArray sponsors = new JSONArray(); jsonArray.put(new JSONObject().put("sponsors", sponsors)); JSONArray speakers = new JSONArray(); jsonArray.put(new JSONObject().put("speakers", speakers)); JSONArray tracks = new JSONArray(); 
jsonArray.put(new JSONObject().put("tracks", tracks)); String userHome = System.getProperty("user.home"); String path = userHome + "/Downloads/EventBriteInfo"; new File(path).mkdir(); try (FileWriter file = new FileWriter(path + "/event.json")) { file.write(event.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/org.json")) { file.write(org.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/social_links.json")) { file.write(socialLinks.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/microlocations.json")) { file.write(microlocations.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/custom_forms.json")) { file.write(customForms.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/session_types.json")) { file.write(sessionTypes.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/sessions.json")) { file.write(sessions.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/sponsors.json")) { file.write(sponsors.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/speakers.json")) { file.write(speakers.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/tracks.json")) { file.write(tracks.toString()); } catch (IOException e1) { e1.printStackTrace(); } SusiThought json = new SusiThought(); json.setData(jsonArray); return json; }
From source file:org.loklak.api.search.QuoraProfileScraper.java
/**
 * Scrapes the public Quora profile page of the given user.
 *
 * <p>The returned data array holds a single object with the keys {@code bio},
 * {@code profileImage}, {@code user} and {@code rss_feed_link}.
 *
 * @param profile the profile name as it appears after {@code https://www.quora.com/profile/}
 * @return the scraped profile; the data array is empty when the page cannot be fetched
 */
public static SusiThought scrapeQuora(String profile) {
    Document userHTML = null;
    String url = "https://www.quora.com/profile/" + profile;

    try {
        userHTML = Jsoup.connect(url).get();
    } catch (IOException e) {
        e.printStackTrace();
    }

    JSONArray jsonArray = new JSONArray();

    // A failed fetch leaves the document null; skip the scrape instead of dereferencing it.
    if (userHTML != null) {
        JSONObject quoraProfile = new JSONObject();

        String bio = userHTML.getElementsByClass("qtext_para").text();
        quoraProfile.put("bio", bio);

        String profileImage = userHTML.select("img.profile_photo_img").attr("data-src");
        quoraProfile.put("profileImage", profileImage);

        String userName = userHTML.select("img.profile_photo_img").attr("alt");
        quoraProfile.put("user", userName);

        String rssFeedLink = url + "/rss";
        quoraProfile.put("rss_feed_link", rssFeedLink);

        jsonArray.put(quoraProfile);
    }

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;
}
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMetadata(); }/*w ww . j ava 2 s.c om*/ MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(MediaMetadata.IMDBID, imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>( executor); // worker for imdb request (/combined) (everytime from akas.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/combined"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); Future<Document> futureCombined = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary = null; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (options.isScrapeImdbForeignLanguage() || 
options.isScrapeCollectionInfo()) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureCombined.get(); /* * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // parse title and year Element title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title Elements elements = title.getElementsByTag("h1"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // year elements = title.getElementsByTag("span"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); // search year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); Matcher matcher = yearPattern.matcher(content); while (matcher.find()) { if (matcher.group(1) != null) { String movieYear = matcher.group(1); md.storeMetadata(MediaMetadata.YEAR, movieYear); break; } } } // original title elements = title.getElementsByAttributeValue("class", "title-extra"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); content = content.replaceAll("\\(original title\\)", "").trim(); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content); } } // poster Element poster = doc.getElementById("primary-poster"); if (poster != null) { String posterUrl = poster.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating 
count Element ratingElement = doc.getElementById("tn15rating"); if (ratingElement != null) { Elements elements = ratingElement.getElementsByClass("starbar-meta"); if (elements.size() > 0) { Element div = elements.get(0); // rating comes in <b> tag Elements b = div.getElementsByTag("b"); if (b.size() == 1) { String ratingAsString = b.text(); Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10"); Matcher matcher = ratingPattern.matcher(ratingAsString); while (matcher.find()) { if (matcher.group(1) != null) { float rating = 0; try { rating = Float.valueOf(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.RATING, rating); break; } } } // count Elements a = div.getElementsByAttributeValue("href", "ratings"); if (a.size() == 1) { String countAsString = a.text().replaceAll("[.,]|votes", "").trim(); int voteCount = 0; try { voteCount = Integer.parseInt(countAsString); } catch (Exception e) { } md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount); } } // top250 elements = ratingElement.getElementsByClass("starbar-special"); if (elements.size() > 0) { Elements a = elements.get(0).getElementsByTag("a"); if (a.size() > 0) { Element anchor = a.get(0); Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(anchor.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { int top250 = 0; try { top250 = Integer.parseInt(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.TOP_250, top250); } } } } } // parse all items coming by <div class="info"> Elements elements = doc.getElementsByClass("info"); for (Element element : elements) { // only parse divs if (!"div".equals(element.tag().getName())) { continue; } // elements with h5 are the titles of the values Elements h5 = element.getElementsByTag("h5"); if (h5.size() > 0) { Element firstH5 = h5.first(); String h5Title = firstH5.text(); // release date /* * <div class="info"><h5>Release Date:</h5><div 
class="info-content">5 January 1996 (USA)<a class="tn15more inline" * href="/title/tt0114746/releaseinfo" * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a> </div></div> */ if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element releaseDateElement = div.first(); String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", "")); Pattern pattern = Pattern.compile("(.*)\\(.*\\)"); Matcher matcher = pattern.matcher(releaseDate); if (matcher.find()) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy"); Date parsedDate = sdf.parse(matcher.group(1)); sdf = new SimpleDateFormat("dd-MM-yyyy"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate)); } catch (Exception e) { } } } } /* * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline" * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See * more</a> » </div></div> */ // tagline if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*") && !options.isScrapeImdbForeignLanguage()) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.storeMetadata(MediaMetadata.TAGLINE, tagline); } } /* * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" 
href="/title/tt0472033/keywords" onClick= * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a> » </div> */ // genres are only scraped from akas.imdb.com if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Elements a = div.first().getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/Sections/Genres/.*")) { md.addGenre(getTmmGenre(anchor.ownText())); } } } } // } /* * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div> */ // runtime // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String first = taglineElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.storeMetadata(MediaMetadata.RUNTIME, runtime); } } /* * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div> */ // country if (h5Title.matches("(?i)Country.*")) { Elements a = element.getElementsByTag("a"); String countries = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/country/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String country = matcher.group(1); 
if (StringUtils.isNotEmpty(countries)) { countries += ", "; } countries += country.toUpperCase(); } } md.storeMetadata(MediaMetadata.COUNTRY, countries); } /* * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div> */ // Spoken languages if (h5Title.matches("(?i)Language.*")) { Elements a = element.getElementsByTag("a"); String spokenLanguages = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/language/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String langu = matcher.group(1); if (StringUtils.isNotEmpty(spokenLanguages)) { spokenLanguages += ", "; } spokenLanguages += langu; } } md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages); } /* * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div> */ // certification // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { // certification for the right country if (anchor.attr("href").matches( "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) { Pattern certificationPattern = Pattern.compile(".*:(.*)"); Matcher matcher = certificationPattern.matcher(anchor.ownText()); Certification certification = null; while (matcher.find()) { if (matcher.group(1) != null) { certification = Certification.getCertification(options.getCountry(), 
matcher.group(1)); } } if (certification != null) { md.addCertification(certification); break; } } } } } /* * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick= * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div> */ // director if ("director-info".equals(element.id())) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/name/nm.*")) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } } /* * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick= * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src= * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick= * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... 
</td><td * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick= * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick= * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table> */ // cast elements = doc.getElementsByClass("cast"); if (elements.size() > 0) { Elements tr = elements.get(0).getElementsByTag("tr"); for (Element row : tr) { Elements td = row.getElementsByTag("td"); MediaCastMember cm = new MediaCastMember(); for (Element column : td) { // actor thumb if (column.hasClass("hs")) { Elements img = column.getElementsByTag("img"); if (img.size() > 0) { String thumbUrl = img.get(0).attr("src"); if (thumbUrl.contains("no_photo.png")) { cm.setImageUrl(""); } else { thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", ""); cm.setImageUrl(thumbUrl); } } } // actor name if (column.hasClass("nm")) { cm.setName(cleanString(column.text())); } // character if (column.hasClass("char")) { cm.setCharacter(cleanString(column.text())); } } if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(CastType.ACTOR); md.addCastMember(cm); } } } Element content = doc.getElementById("tn15content"); if (content != null) { elements = content.getElementsByTag("table"); for (Element table : elements) { // writers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) { Elements anchors = table.getElementsByTag("a"); for (Element anchor : anchors) { if (anchor.attr("href").matches("/name/nm.*")) { 
MediaCastMember cm = new MediaCastMember(CastType.WRITER); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } // producers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { continue; } Elements columns = row.children(); if (columns.size() == 0) { continue; } MediaCastMember cm = new MediaCastMember(CastType.PRODUCER); String name = cleanString(columns.get(0).text()); if (StringUtils.isBlank(name)) { continue; } cm.setName(name); if (columns.size() >= 3) { cm.setPart(cleanString(columns.get(2).text())); } md.addCastMember(cm); } } } } // Production companies elements = doc.getElementsByClass("blackcatheader"); for (Element blackcatheader : elements) { if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) { Elements a = blackcatheader.nextElementSibling().getElementsByTag("a"); StringBuilder productionCompanies = new StringBuilder(); for (Element anchor : a) { if (StringUtils.isNotEmpty(productionCompanies)) { productionCompanies.append(", "); } productionCompanies.append(anchor.ownText()); } md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString()); break; } } /* * plot from /plotsummary */ // build the url doc = null; doc = futurePlotsummary.get(); // imdb.com has another site structure if (imdbSite == ImdbSiteDefinition.IMDB_COM) { Elements zebraList = doc.getElementsByClass("zebraList"); if (zebraList != null && !zebraList.isEmpty()) { Elements odd = zebraList.get(0).getElementsByClass("odd"); if (odd.isEmpty()) { odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even } if (odd.size() > 0) { Elements p = odd.get(0).getElementsByTag("p"); if (p.size() > 0) { String plot = cleanString(p.get(0).ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); 
if (wiki != null) { String plot = cleanString(wiki.ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } } } // } // get data from tmdb? if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) { MediaMetadata tmdbMd = futureTmdb.get(); if (options.isScrapeImdbForeignLanguage() && tmdbMd != null && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) { // tmdbid md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID)); // title md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE)); // original title md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE)); // tagline md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE)); // plot md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT)); // collection info md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); } if (options.isScrapeCollectionInfo() && tmdbMd != null) { md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { LOGGER.debug("search() " + query.toString()); /*/*w ww. ja va 2 s . c o m*/ * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/"); List<MediaSearchResult> result = new ArrayList<MediaSearchResult>(); String searchTerm = ""; if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) { searchTerm = query.get(SearchParam.IMDBID); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.QUERY); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.TITLE); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper options String language = query.get(SearchParam.LANGUAGE); String myear = query.get(SearchParam.YEAR); String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(imdbSite.getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! 
LOGGER.debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(CAT_TITLE); LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { CachedUrl url = new CachedUrl(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { LOGGER.debug("tried to fetch search response", e); // clear Cache CachedUrl.removeCachedFileForUrl(sb.toString()); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(); options.setImdbId(movieId); options.setLanguage(MediaLanguages.valueOf(language)); options.setCountry(CountryCode.valueOf(country)); options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO))); options.setScrapeImdbForeignLanguage( Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE))); md = getMetadata(options); if (!StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) { movieName = md.getStringValue(MediaMetadata.TITLE); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getStringValue(MediaMetadata.YEAR)); sr.setMetadata(md); sr.setScore(1); // and parse out the poster 
String posterUrl = ""; Element td = doc.getElementById("img_primary"); if (td != null) { Elements imgs = td.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; String year = ""; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); // stripped out .*\\(Video\\).*| Matcher matcher = unwanted.matcher(element.text()); if (matcher.find()) { continue; } // is there a localized name? 
(aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { year = matcher.group(1); break; } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); // populate extra args MetadataUtil.copySearchQueryToSearchResult(query, sr); if (movieId.equals(query.get(SearchParam.IMDBID))) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) { 
LOGGER.debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) { LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01"); score = score - 0.01f; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { Date releaseDate = null;/* w ww . j ava 2 s . c o m*/ Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})"); // old way Element tableReleaseDates = doc.getElementById("release_dates"); if (tableReleaseDates != null) { Elements rows = tableReleaseDates.getElementsByTag("tr"); // first round: check the release date for the first one with the requested country for (Element row : rows) { // get the anchor Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release_date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException ignored) { } } } } } } } // new way; iterating over class name items if (releaseDate == null) { Elements rows = doc.getElementsByClass("release-date-item"); for (Element row : rows) { Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release-date-item__date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); 
break; } catch (ParseException ignored) { } } } } else { LOGGER.trace("country {} does not match ours {}", matcher.group(1), options.getCountry().getAlpha2()); } } } } // no matching local release date found; take the first one if (releaseDate == null) { Element column = doc.getElementsByClass("release_date").first(); if (column == null) { column = doc.getElementsByClass("release-date-item__date").first(); } if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException ignored) { } } } } if (releaseDate != null) { md.setReleaseDate(releaseDate); } return md; }