List of usage examples for org.jsoup.nodes.Document.getElementById
public Element getElementById(String id)
From source file:me.postar.postarv2.LocalService.java
@Override public int onStartCommand(Intent intent, int flags, int startId) { Functions.getParcels(parcels, this); PowerManager pm = (PowerManager) getSystemService(Context.POWER_SERVICE); wl = pm.newWakeLock(PowerManager.PARTIAL_WAKE_LOCK, "Postar"); if (Functions.isConnectedToInternet(LocalService.this)) { Ion.with(this).load("GET", "https://e-racuni.postacg.me/PracenjePosiljaka/").asString().withResponse() .setCallback(new FutureCallback<Response<String>>() { @Override//w w w . j a va 2 s .c om public void onCompleted(Exception e, Response<String> result) { Document html = Jsoup.parse(result.getResult()); Element viewState = html.getElementById("__VIEWSTATE"); Element eventValidation = html.getElementById("__EVENTVALIDATION"); Element btnPronadji = html.getElementById("btnPronadji"); for (final PostParcel parcel : parcels) { if (parcel.isAlarmOn()) { Ion.with(LocalService.this) .load("POST", "https://e-racuni.postacg.me/PracenjePosiljaka/") .setBodyParameter("__VIEWSTATE", viewState.val()) .setBodyParameter("__EVENTVALIDATION", eventValidation.val()) .setBodyParameter("btnPronadji", btnPronadji.val()) .setBodyParameter("txtPrijemniBroj", parcel.getParcelNo()).asString() .withResponse().setCallback(new FutureCallback<Response<String>>() { @Override public void onCompleted(Exception e, final Response<String> result) { Document html = Jsoup.parse(result.getResult()); Element table = html.getElementById("dgInfo"); if (table != null) { NotificationCompat.Builder mBuilder = new NotificationCompat.Builder( LocalService.this) .setSmallIcon(R.drawable.ic_mail_outline) .setLargeIcon(BitmapFactory.decodeResource( getResources(), R.drawable.ic_mail_outline)) .setAutoCancel(true); mBuilder.setContentTitle(getString(R.string.message_title)); mBuilder.setContentText( getString(R.string.message_content)); Intent activityIntent = new Intent(LocalService.this, StatusActivity.class); activityIntent.putExtra("parcel", parcel); PendingIntent resultPendingIntent = PendingIntent 
.getActivity(LocalService.this, 0, activityIntent, PendingIntent.FLAG_UPDATE_CURRENT); mBuilder.setContentIntent(resultPendingIntent); NotificationManager mNotificationManager = (NotificationManager) getSystemService( Context.NOTIFICATION_SERVICE); mNotificationManager.notify(12, mBuilder.build()); stopSelf(); wl.release(); } } }); } } } }); } return START_NOT_STICKY; }
From source file:org.apache.nutch.protocol.httpclient.HttpFormAuthentication.java
private List<NameValuePair> getLoginFormParams(String pageContent) throws UnsupportedEncodingException { List<NameValuePair> params = new ArrayList<NameValuePair>(); Document doc = Jsoup.parse(pageContent); Element loginform = doc.getElementById(authConfigurer.getLoginFormId()); if (loginform == null) { LOG.debug("No form element found with 'id' = {}, trying 'name'.", authConfigurer.getLoginFormId()); loginform = doc.select("form[name=" + authConfigurer.getLoginFormId() + "]").first(); if (loginform == null) { LOG.debug("No form element found with 'name' = {}", authConfigurer.getLoginFormId()); throw new IllegalArgumentException("No form exists: " + authConfigurer.getLoginFormId()); }/* w w w . j av a2 s .c om*/ } Elements inputElements = loginform.getElementsByTag("input"); // skip fields in removedFormFields or loginPostData for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (authConfigurer.getLoginPostData().containsKey(key) || authConfigurer.getRemovedFormFields().contains(key)) { // value = loginPostData.get(key); continue; } params.add(new NameValuePair(key, value)); } // add key and value in loginPostData for (Entry<String, String> entry : authConfigurer.getLoginPostData().entrySet()) { params.add(new NameValuePair(entry.getKey(), entry.getValue())); } return params; }
From source file:org.apache.nutch.protocol.httpclient.proxy.HttpFormAuthentication.java
private List<NameValuePair> getLoginFormParams(String pageContent) throws UnsupportedEncodingException { List<NameValuePair> params = new ArrayList<NameValuePair>(); Document doc = Jsoup.parse(pageContent); Element loginform = doc.getElementById(authConfigurer.getLoginFormId()); if (loginform == null) { LOGGER.debug("No form element found with 'id' = {}, trying 'name'.", authConfigurer.getLoginFormId()); loginform = doc.select("form[name=" + authConfigurer.getLoginFormId() + "]").first(); if (loginform == null) { LOGGER.debug("No form element found with 'name' = {}", authConfigurer.getLoginFormId()); throw new IllegalArgumentException("No form exists: " + authConfigurer.getLoginFormId()); }// w ww . j a v a 2s.c o m } Elements inputElements = loginform.getElementsByTag("input"); // skip fields in removedFormFields or loginPostData for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (authConfigurer.getLoginPostData().containsKey(key) || authConfigurer.getRemovedFormFields().contains(key)) { // value = loginPostData.get(key); continue; } params.add(new NameValuePair(key, value)); } // add key and value in loginPostData for (Entry<String, String> entry : authConfigurer.getLoginPostData().entrySet()) { params.add(new NameValuePair(entry.getKey(), entry.getValue())); } return params; }
From source file:org.cellcore.code.engine.page.extractor.starcity.STCPageDataExtractor.java
/**
 * Reads the card name from the page.
 *
 * <p>Pages with an {@code h3} containing the text "Foil" are rejected. Otherwise the part of
 * the document's base URI after its last {@code '='} is passed to {@code jsonProc} together
 * with the document, and the text of the element with id {@code custom_card_name_STC} is
 * returned.
 *
 * @param doc parsed card page
 * @return the card name, or {@code null} if the name element is absent
 * @throws UnsupportedCardException if the page describes a foil card
 */
@Override
protected String getName(Document doc) throws UnsupportedCardException {
    final boolean foil = !doc.select("h3").select(":contains(Foil)").isEmpty();
    if (foil) {
        throw new UnsupportedCardException("foil");
    }

    final String pageUri = doc.baseUri();
    final String code = pageUri.substring(pageUri.lastIndexOf("=") + 1);
    jsonProc(code, doc);

    // The name element is looked up only after jsonProc has run.
    if (doc.getElementById("custom_card_name_STC") == null) {
        return null;
    }
    return doc.getElementById("custom_card_name_STC").text();
}
From source file:org.cellcore.code.engine.page.extractor.starcity.STCPageDataExtractor.java
/**
 * Reads the card price from the element with id {@code custom_card_price_STC}.
 *
 * @param doc parsed card page
 * @return the element text parsed as a float, or {@code -1} if the element is absent
 * @throws NumberFormatException if the element text is not a parsable float
 */
@Override
protected float getPrice(Document doc) {
    final String priceElementId = "custom_card_price_STC";
    if (doc.getElementById(priceElementId) == null) {
        return -1;
    }
    final String priceText = doc.getElementById(priceElementId).text();
    return Float.parseFloat(priceText);
}
From source file:org.craftercms.social.migration.controllers.MainController.java
protected void getHtml(final FileWriter writer) throws TransformerException, IOException { final URL in = getClass().getResource( MigrationTool.systemProperties.getString("crafter" + ".migration" + "" + ".loggerTemplate")); if (in == null) { log.error("Unable to find {} " + MigrationTool.systemProperties.getString("crafter" + ".migration" + "" + ".loggerTemplate")); }/*from w w w. j ava2s. c o m*/ final Document loggingDoc = Jsoup.parse(IOUtils.toString(in)); final Element logs = loggingDoc.getElementById("logs"); for (Object o : logTable.getItems()) { if (o instanceof UserLogEntry) { UserLogEntry userLogEntry = (UserLogEntry) o; String dateFormat = new SimpleDateFormat("yyyy MM dd hh:mm:ss zzz").format(userLogEntry.getDate()); final Element tr = loggingDoc.createElement("tr"); tr.attr("class", userLogEntry.getLevel().getCssClass()); final Element tmigrator = loggingDoc.createElement("td"); final Element tdate = loggingDoc.createElement("td"); final Element tmessage = loggingDoc.createElement("td"); tmessage.attr("class", "text-center"); tmessage.text(userLogEntry.getMessage()); tdate.text(dateFormat); tmigrator.text(userLogEntry.getSource()); tr.appendChild(tmigrator); tr.appendChild(tdate); tr.appendChild(tmessage); logs.appendChild(tr); } } IOUtils.write(loggingDoc.toString(), writer); // Transformer transformer = TransformerFactory.newInstance().newTransformer(); // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no"); // transformer.setOutputProperty(OutputKeys.METHOD, "xml"); // transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); // transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); // transformer.transform(new DOMSource(loggingDoc), new StreamResult(writer)); writer.flush(); writer.close(); }
From source file:org.dataconservancy.ui.it.UiConfigurationActionBeanIT.java
/** * Insures that an XSD schema, composed of a single schema document (no <xsd:include> statements), can be * added to the system.//from w ww . j a va 2 s . c om * * @throws Exception */ @Test public void testAddMavenPomMetadataFormat() throws Exception { // Get a count of the current number of metadata formats in the system List<UiConfigurationActionBean.MetaDataFormatTransport> mdfts = getMdfs(); int mdfCount = mdfts.size(); // Compose the mdft to add final UiConfigurationActionBean.MetaDataFormatTransport mdft = new UiConfigurationActionBean() .getNewMetadataFormatTransport(); final AddMetadataFormatRequest req = new AddMetadataFormatRequest(urlConfig); // A unique name insures that this Metadata Format doesn't exist yet in the system (but we verify this // assumption anyway) final String name = UUID.randomUUID().toString(); final boolean validates = true; final String version = this.getClass().getSimpleName() + " Maven 4.0.0 POM"; final boolean appliesToCollection = false; final boolean appliesToProject = false; final boolean appliesToItem = true; final List<String> disciplineIds = Arrays.asList("dc:discipline:Biology"); mdft.setName(name); mdft.setVersion(version); mdft.setSchemaURL(MAVEN_MODEL_4_0_0_SCHEMA_URL); mdft.setSchemaSource(MAVEN_MODEL_4_0_0_SCHEMA_URL); mdft.setValidates(validates); mdft.setAppliesToCollection(appliesToCollection); mdft.setAppliesToProject(appliesToProject); mdft.setAppliesToItem(appliesToItem); mdft.setDisciplineIds(disciplineIds); // Insure that the new Metadata Format being added isn't in the list of existing metadata formats assertFalse(mdfts.contains(mdft)); // Add the metadata format HttpAssert.ResponseHolder holder = new HttpAssert.ResponseHolder(); HttpAssert.assertStatus(hc, req.asHttpPost(mdft), 200, holder); final String html = IOUtils.toString(holder.getBody()); assertNotNull(html); final Document dom = Jsoup.parse(html); assertNotNull(dom); Element nameElement = dom.getElementById("schemaName"); 
assertNotNull(nameElement); String testText = nameElement.text(); assertTrue(nameElement.text().equalsIgnoreCase("Schema Name: " + name)); Element versionElement = dom.getElementById("schemaVersion"); assertNotNull(versionElement); assertTrue(versionElement.text().equalsIgnoreCase("Version: " + version)); Element namespacesElement = dom.getElementById("namespaces"); //assertEquals(2, namespacesElement.childNodeSize()); Elements namespaceElements = namespacesElement.children(); boolean foundPrefixedNamespace = false; boolean foundNamespace = false; for (Element namespaceElement : namespaceElements) { String namespaceText = namespaceElement.text(); if (namespaceText.contains("Namespace:")) { if (namespaceText.contains("Prefix")) { assertTrue(namespaceText .equalsIgnoreCase("Namespace: http://www.w3.org/2001/XMLSchema Prefix: xs")); foundPrefixedNamespace = true; } else { assertTrue(namespaceText.equalsIgnoreCase("Namespace: http://maven.apache.org/POM/4.0.0")); foundNamespace = true; } } } assertTrue(foundPrefixedNamespace); assertTrue(foundNamespace); // Now we need to persist the format in the system by emulating a click on the "save" button HttpAssert.assertStatus(hc, new SaveMetadataFormatRequest(urlConfig).asHttpPost(), 200); // insure that the format we've added was added properly (all the values for table columns were // persisted properly) mdfts = getMdfs(); assertTrue(mdfts.contains(mdft)); assertEquals(mdfCount + 1, mdfts.size()); }
From source file:org.fcrepo.apix.integration.LoaderIT.java
@Test public void htmlMinimalTest() throws Exception { final String SERVICE_RESPONSE_BODY = "BODY"; optionsResponse.set(// w w w .jav a 2s. com IOUtils.toString(testResource("objects/options_LoaderIT_minimal.ttl").representation(), "utf8")); serviceResponse.set(SERVICE_RESPONSE_BODY); final Document html = attempt(60, () -> Jsoup.connect(LOADER_URI).method(Method.GET).timeout(1000).execute().parse()); final FormElement form = ((FormElement) html.getElementById("uriForm")); form.getElementById("uri").val(serviceEndpoint); final Response response = form.submit().ignoreHttpErrors(true).followRedirects(false).execute(); update(); assertEquals("OPTIONS", requestToService.getHeader(Exchange.HTTP_METHOD)); assertEquals(303, response.statusCode()); assertNotNull(response.header("Location")); // Verify that extension works! // Get the intercept/proxy URI for a fedora container final URI container = routing.of(REQUEST_URI).interceptUriFor(objectContainer); // Deposit an object into the container final URI deposited = client.post(container).slug("LoaderIT_htmlMinimalTest") .body(IOUtils.toInputStream("<> a <test:LoaderIT#minimal> .", "utf8"), "text/turtle").perform() .getLocation(); // Get the service discovery document final URI discoveryDoc = client.options(deposited).perform().getLinkHeaders("service").get(0); // Invoke the "minimal" service, and verify that the response body is as expected final String body = attempt(10, () -> IOUtils.toString( client.get(serviceEndpoints(discoveryDoc).get(SERVICE_MINIMAL)).perform().getBody(), "utf8")); assertEquals(SERVICE_RESPONSE_BODY, body); }
From source file:org.loklak.api.search.MeetupsCrawlerService.java
/**
 * Fetches a meetup group page and returns the scraped fields wrapped in a {@link SusiThought}.
 *
 * <p>The returned data is a one-element array holding an object with the keys
 * {@code group_name}, {@code meetup_type}, {@code group_description}, {@code group_locality},
 * {@code group_country_code}, {@code group_latitude}, {@code group_longitude},
 * {@code group_imageLink}, {@code group_topics} and {@code recent_meetups}.
 *
 * <p>If the page cannot be fetched, the stack trace is printed and the returned data is an
 * empty array. Page sections that are missing yield an empty string or an empty array
 * instead of a {@code NullPointerException}, and the number of topics and meetups is no
 * longer capped by fixed-size buffers.
 *
 * @param url address of the meetup group page
 * @return the scraped data; never {@code null}
 */
public static SusiThought crawlMeetups(String url) {
    SusiThought json = new SusiThought();
    JSONArray meetupsCrawlerResultArray = new JSONArray();

    Document meetupHTML;
    try {
        meetupHTML = Jsoup.connect(url).userAgent("Mozilla)").get();
    } catch (Exception e) {
        // Without a document there is nothing to scrape; report an empty result.
        e.printStackTrace();
        json.setData(meetupsCrawlerResultArray);
        return json;
    }

    JSONObject result = new JSONObject();
    result.put("group_name", ogContent(meetupHTML, "og:title"));
    result.put("meetup_type", ogContent(meetupHTML, "og:type"));

    Element descriptionElement = meetupHTML.getElementById("groupDesc");
    result.put("group_description", descriptionElement == null ? "" : descriptionElement.text());

    result.put("group_locality", ogContent(meetupHTML, "og:locality"));
    result.put("group_country_code", ogContent(meetupHTML, "og:country-name"));
    result.put("group_latitude", ogContent(meetupHTML, "og:latitude"));
    result.put("group_longitude", ogContent(meetupHTML, "og:longitude"));
    result.put("group_imageLink", ogContent(meetupHTML, "og:image"));

    // Topics: text of every link inside the topic box, in document order.
    JSONArray groupTopics = new JSONArray();
    Element topicBox = meetupHTML.getElementById("topic-box-2012");
    if (topicBox != null) {
        for (Element topicLink : topicBox.getElementsByTag("a")) {
            groupTopics.put(topicLink.text());
        }
    }
    result.put("group_topics", groupTopics);

    // Recent meetups: paragraphs are read in groups of three
    // (date/time, attendance/review, information).
    JSONArray recentMeetups = new JSONArray();
    Element recentSection = meetupHTML.getElementById("recentMeetups");
    if (recentSection != null) {
        Elements paragraphs = recentSection.getElementsByTag("p");
        int groupCount = (paragraphs.size() + 2) / 3;
        // NOTE(review): as in the previous implementation, the last group is not emitted even
        // when it is complete. This looks like an off-by-one; confirm before changing output.
        for (int group = 0; group < groupCount - 1; group++) {
            int first = group * 3;
            JSONObject obj = new JSONObject();
            obj.put("recent_meetup_number", group + 1);
            obj.put("date_time", paragraphs.get(first).text());
            obj.put("attendance", paragraphs.get(first + 1).text());
            obj.put("information", paragraphs.get(first + 2).text());
            recentMeetups.put(obj);
        }
    }
    result.put("recent_meetups", recentMeetups);

    meetupsCrawlerResultArray.put(result);
    json.setData(meetupsCrawlerResultArray);
    return json;
}

/** Returns the {@code content} attribute of the first element whose {@code property} attribute equals the given value, or "" if none matches. */
private static String ogContent(Document page, String property) {
    return page.getElementsByAttributeValue("property", property).attr("content");
}
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMetadata(); }//from w w w.j a v a2s .c o m MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(MediaMetadata.IMDBID, imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>( executor); // worker for imdb request (/combined) (everytime from akas.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/combined"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); Future<Document> futureCombined = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary = null; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (options.isScrapeImdbForeignLanguage() || 
options.isScrapeCollectionInfo()) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureCombined.get(); /* * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // parse title and year Element title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title Elements elements = title.getElementsByTag("h1"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // year elements = title.getElementsByTag("span"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); // search year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); Matcher matcher = yearPattern.matcher(content); while (matcher.find()) { if (matcher.group(1) != null) { String movieYear = matcher.group(1); md.storeMetadata(MediaMetadata.YEAR, movieYear); break; } } } // original title elements = title.getElementsByAttributeValue("class", "title-extra"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); content = content.replaceAll("\\(original title\\)", "").trim(); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content); } } // poster Element poster = doc.getElementById("primary-poster"); if (poster != null) { String posterUrl = poster.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating 
count Element ratingElement = doc.getElementById("tn15rating"); if (ratingElement != null) { Elements elements = ratingElement.getElementsByClass("starbar-meta"); if (elements.size() > 0) { Element div = elements.get(0); // rating comes in <b> tag Elements b = div.getElementsByTag("b"); if (b.size() == 1) { String ratingAsString = b.text(); Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10"); Matcher matcher = ratingPattern.matcher(ratingAsString); while (matcher.find()) { if (matcher.group(1) != null) { float rating = 0; try { rating = Float.valueOf(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.RATING, rating); break; } } } // count Elements a = div.getElementsByAttributeValue("href", "ratings"); if (a.size() == 1) { String countAsString = a.text().replaceAll("[.,]|votes", "").trim(); int voteCount = 0; try { voteCount = Integer.parseInt(countAsString); } catch (Exception e) { } md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount); } } // top250 elements = ratingElement.getElementsByClass("starbar-special"); if (elements.size() > 0) { Elements a = elements.get(0).getElementsByTag("a"); if (a.size() > 0) { Element anchor = a.get(0); Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(anchor.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { int top250 = 0; try { top250 = Integer.parseInt(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.TOP_250, top250); } } } } } // parse all items coming by <div class="info"> Elements elements = doc.getElementsByClass("info"); for (Element element : elements) { // only parse divs if (!"div".equals(element.tag().getName())) { continue; } // elements with h5 are the titles of the values Elements h5 = element.getElementsByTag("h5"); if (h5.size() > 0) { Element firstH5 = h5.first(); String h5Title = firstH5.text(); // release date /* * <div class="info"><h5>Release Date:</h5><div 
class="info-content">5 January 1996 (USA)<a class="tn15more inline" * href="/title/tt0114746/releaseinfo" * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a> </div></div> */ if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element releaseDateElement = div.first(); String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", "")); Pattern pattern = Pattern.compile("(.*)\\(.*\\)"); Matcher matcher = pattern.matcher(releaseDate); if (matcher.find()) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy"); Date parsedDate = sdf.parse(matcher.group(1)); sdf = new SimpleDateFormat("dd-MM-yyyy"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate)); } catch (Exception e) { } } } } /* * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline" * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See * more</a> » </div></div> */ // tagline if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*") && !options.isScrapeImdbForeignLanguage()) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.storeMetadata(MediaMetadata.TAGLINE, tagline); } } /* * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" 
href="/title/tt0472033/keywords" onClick= * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a> » </div> */ // genres are only scraped from akas.imdb.com if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Elements a = div.first().getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/Sections/Genres/.*")) { md.addGenre(getTmmGenre(anchor.ownText())); } } } } // } /* * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div> */ // runtime // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String first = taglineElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.storeMetadata(MediaMetadata.RUNTIME, runtime); } } /* * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div> */ // country if (h5Title.matches("(?i)Country.*")) { Elements a = element.getElementsByTag("a"); String countries = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/country/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String country = matcher.group(1); 
if (StringUtils.isNotEmpty(countries)) { countries += ", "; } countries += country.toUpperCase(); } } md.storeMetadata(MediaMetadata.COUNTRY, countries); } /* * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div> */ // Spoken languages if (h5Title.matches("(?i)Language.*")) { Elements a = element.getElementsByTag("a"); String spokenLanguages = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/language/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String langu = matcher.group(1); if (StringUtils.isNotEmpty(spokenLanguages)) { spokenLanguages += ", "; } spokenLanguages += langu; } } md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages); } /* * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div> */ // certification // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { // certification for the right country if (anchor.attr("href").matches( "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) { Pattern certificationPattern = Pattern.compile(".*:(.*)"); Matcher matcher = certificationPattern.matcher(anchor.ownText()); Certification certification = null; while (matcher.find()) { if (matcher.group(1) != null) { certification = Certification.getCertification(options.getCountry(), 
matcher.group(1)); } } if (certification != null) { md.addCertification(certification); break; } } } } } /* * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick= * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div> */ // director if ("director-info".equals(element.id())) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/name/nm.*")) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } } /* * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick= * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src= * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick= * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... 
</td><td * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick= * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick= * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table> */ // cast elements = doc.getElementsByClass("cast"); if (elements.size() > 0) { Elements tr = elements.get(0).getElementsByTag("tr"); for (Element row : tr) { Elements td = row.getElementsByTag("td"); MediaCastMember cm = new MediaCastMember(); for (Element column : td) { // actor thumb if (column.hasClass("hs")) { Elements img = column.getElementsByTag("img"); if (img.size() > 0) { String thumbUrl = img.get(0).attr("src"); if (thumbUrl.contains("no_photo.png")) { cm.setImageUrl(""); } else { thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", ""); cm.setImageUrl(thumbUrl); } } } // actor name if (column.hasClass("nm")) { cm.setName(cleanString(column.text())); } // character if (column.hasClass("char")) { cm.setCharacter(cleanString(column.text())); } } if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(CastType.ACTOR); md.addCastMember(cm); } } } Element content = doc.getElementById("tn15content"); if (content != null) { elements = content.getElementsByTag("table"); for (Element table : elements) { // writers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) { Elements anchors = table.getElementsByTag("a"); for (Element anchor : anchors) { if (anchor.attr("href").matches("/name/nm.*")) { 
MediaCastMember cm = new MediaCastMember(CastType.WRITER); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } // producers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { continue; } Elements columns = row.children(); if (columns.size() == 0) { continue; } MediaCastMember cm = new MediaCastMember(CastType.PRODUCER); String name = cleanString(columns.get(0).text()); if (StringUtils.isBlank(name)) { continue; } cm.setName(name); if (columns.size() >= 3) { cm.setPart(cleanString(columns.get(2).text())); } md.addCastMember(cm); } } } } // Production companies elements = doc.getElementsByClass("blackcatheader"); for (Element blackcatheader : elements) { if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) { Elements a = blackcatheader.nextElementSibling().getElementsByTag("a"); StringBuilder productionCompanies = new StringBuilder(); for (Element anchor : a) { if (StringUtils.isNotEmpty(productionCompanies)) { productionCompanies.append(", "); } productionCompanies.append(anchor.ownText()); } md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString()); break; } } /* * plot from /plotsummary */ // build the url doc = null; doc = futurePlotsummary.get(); // imdb.com has another site structure if (imdbSite == ImdbSiteDefinition.IMDB_COM) { Elements zebraList = doc.getElementsByClass("zebraList"); if (zebraList != null && !zebraList.isEmpty()) { Elements odd = zebraList.get(0).getElementsByClass("odd"); if (odd.isEmpty()) { odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even } if (odd.size() > 0) { Elements p = odd.get(0).getElementsByTag("p"); if (p.size() > 0) { String plot = cleanString(p.get(0).ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); 
if (wiki != null) { String plot = cleanString(wiki.ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } } } // } // get data from tmdb? if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) { MediaMetadata tmdbMd = futureTmdb.get(); if (options.isScrapeImdbForeignLanguage() && tmdbMd != null && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) { // tmdbid md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID)); // title md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE)); // original title md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE)); // tagline md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE)); // plot md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT)); // collection info md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); } if (options.isScrapeCollectionInfo() && tmdbMd != null) { md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } return md; }