List of usage examples for org.jsoup.nodes.Document#getElementsByClass
public Elements getElementsByClass(String className)
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) { // <table id="akas" class="subpage_data spEven2Col"> // <tr class="even"> // <td>(original title)</td> // <td>Intouchables</td> // </tr> // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p for (Element table : doc.getElementsByTag("table")) { if (table.id().equalsIgnoreCase("akas")) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { Element c1 = row.getElementsByTag("td").get(0); Element c2 = row.getElementsByTag("td").get(1); if (c1 != null && c1.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(c2.text()); break; }/*from w ww . jav a2s . c om*/ } } } // alternative; new way with table classes // <tr class="ipl-zebra-list__item aka-item"> // <td class="aka-item__name">Germany</td> // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td> // </tr> if (md.getOriginalTitle().isEmpty()) { Elements rows = doc.getElementsByClass("aka-item"); for (Element row : rows) { Element country = row.getElementsByClass("aka-item__name").first(); Element title = row.getElementsByClass("aka-item__title").first(); if (country != null && country.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(title.text()); break; } } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
/** * do the search according to the type//from www . j av a 2 s . co m * * @param query * the search params * @return the found results */ protected List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { List<MediaSearchResult> result = new ArrayList<>(); /* * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ String searchTerm = ""; if (StringUtils.isNotEmpty(query.getImdbId())) { searchTerm = query.getImdbId(); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.getQuery(); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper query String language = query.getLanguage().getLanguage(); int myear = query.getYear(); String country = query.getCountry().getAlpha2(); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(getImdbSite().getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! 
getLogger().debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(getSearchCategory()); getLogger().debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { Url url = new Url(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { getLogger().debug("tried to fetch search response", e); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = IMDB_ID_PATTERN.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(type); options.setImdbId(movieId); options.setLanguage(query.getLanguage()); options.setCountry(CountryCode.valueOf(country)); md = getMetadata(options); if (!StringUtils.isEmpty(md.getTitle())) { movieName = md.getTitle(); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(), query.getMediaType()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getYear()); sr.setMetadata(md); sr.setScore(1); // and parse out the poster String posterUrl = ""; Elements posters = doc.getElementsByClass("poster"); if (posters != null && !posters.isEmpty()) { Elements imgs = posters.get(0).getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_"); 
posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; int year = 0; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwantedSearchResultPattern = getUnwantedSearchResultPattern(); if (unwantedSearchResultPattern != null) { Matcher matcher = unwantedSearchResultPattern.matcher(element.text()); if (matcher.find()) { continue; } } // is there a localized name? 
(aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); Matcher matcher = IMDB_ID_PATTERN.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { try { year = Integer.parseInt(matcher.group(1)); break; } catch (Exception ignored) { } } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_"); posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(), query.getMediaType()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); if (movieId.equals(query.getImdbId())) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || 
posterUrl.contains("nopicture")) { getLogger().debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (yearDiffers(myear, year)) { float diff = (float) Math.abs(year - myear) / 100; getLogger() .debug("parsed year does not match search result year - downgrading score by " + diff); score -= diff; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) { /*//from w w w .ja v a2 s .c om * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // title Element title = doc.getElementsByAttributeValue("name", "title").first(); if (title != null) { String movieTitle = cleanString(title.attr("content")); int yearStart = movieTitle.lastIndexOf("("); if (yearStart > 0) { movieTitle = movieTitle.substring(0, yearStart - 1).trim(); md.setTitle(movieTitle); } } // original title and year Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first(); if (originalTitleYear != null) { String content = originalTitleYear.attr("content"); int startOfYear = content.lastIndexOf("("); if (startOfYear > 0) { // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page... 
// String originalTitle = content.substring(0, startOfYear - 1).trim(); // md.setOriginalTitle(originalTitle); String yearText = content.substring(startOfYear); // search year Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}"); Matcher matcher = yearPattern.matcher(yearText); while (matcher.find()) { if (matcher.group(0) != null) { String movieYear = matcher.group(0); try { md.setYear(Integer.parseInt(movieYear)); break; } catch (Exception ignored) { } } } } } // poster Element poster = doc.getElementsByAttributeValue("property", "og:image").first(); if (poster != null) { String posterUrl = poster.attr("content"); int fileStart = posterUrl.lastIndexOf("/"); if (fileStart > 0) { int parameterStart = posterUrl.indexOf("_", fileStart); if (parameterStart > 0) { int startOfExtension = posterUrl.lastIndexOf("."); if (startOfExtension > parameterStart) { posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension); } } } processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating count Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first(); if (ratingElement != null) { String ratingAsString = ratingElement.ownText().replace(",", "."); try { md.setRating(Float.valueOf(ratingAsString)); } catch (Exception ignored) { } Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first(); if (votesElement != null) { String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim(); try { md.setVoteCount(Integer.parseInt(countAsString)); } catch (Exception ignored) { } } } // top250 Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first(); if (topRatedElement != null) { Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(topRatedElement.ownText()); 
while (matcher.find()) { if (matcher.group(1) != null) { try { String top250Text = matcher.group(1); md.setTop250(Integer.parseInt(top250Text)); } catch (Exception ignored) { } } } } // releasedate Element releaseDateElement = doc .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo") .first(); if (releaseDateElement != null) { String releaseDateText = releaseDateElement.ownText(); int startOfCountry = releaseDateText.indexOf("("); if (startOfCountry > 0) { releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim(); } try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException ignored) { } } } Elements elements = doc.getElementsByClass("ipl-zebra-list__label"); for (Element element : elements) { // only parse tds if (!"td".equals(element.tag().getName())) { continue; } String elementText = element.ownText(); if (elementText.equals("Taglines")) { if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) { Element taglineElement = element.nextElementSibling(); if (taglineElement != null) { String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.setTagline(tagline); } } } if (elementText.equals("Genres")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/"); for (Element genreElement : genreElements) { String genreText = genreElement.ownText(); md.addGenre(getTmmGenre(genreText)); } } } /* * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) * | 178 min 
(extended cut)</div></div> */ if (elementText.equals("Runtime")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first(); if (runtimeElement != null) { String first = runtimeElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.setRuntime(runtime); } } } if (elementText.equals("Country")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/"); Pattern pattern = Pattern.compile("/country/(.*)"); for (Element countryElement : countryElements) { Matcher matcher = pattern.matcher(countryElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addCountry(LanguageUtils.getLocalizedCountryForLanguage( options.getLanguage().getLanguage(), countryElement.text(), matcher.group(1))); } else { md.addCountry(matcher.group(1)); } } } } } if (elementText.equals("Language")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements languageElements = nextElement.getElementsByAttributeValueStarting("href", "/language/"); Pattern pattern = Pattern.compile("/language/(.*)"); for (Element languageElement : languageElements) { Matcher matcher = pattern.matcher(languageElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { 
md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString( options.getLanguage(), languageElement.text(), matcher.group(1))); } else { md.addSpokenLanguage(matcher.group(1)); } } } } } if (elementText.equals("Certification")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { String languageCode = options.getCountry().getAlpha2(); Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=" + languageCode); boolean done = false; for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); done = true; break; } } if (!done && languageCode.equals("DE")) { certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=XWG"); for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); break; } } } } } } // director Element directorsElement = doc.getElementById("directors"); while (directorsElement != null && directorsElement.tag().getName() != "header") { directorsElement = directorsElement.parent(); } if (directorsElement != null) { directorsElement = directorsElement.nextElementSibling(); } if (directorsElement != null) { for (Element directorElement : directorsElement.getElementsByClass("name")) { String 
director = directorElement.text().trim(); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director); md.addCastMember(cm); } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } // writers Element writersElement = doc.getElementById("writers"); while (writersElement != null && writersElement.tag().getName() != "header") { writersElement = writersElement.parent(); } if (writersElement != null) { writersElement = writersElement.nextElementSibling(); } if (writersElement != null) { Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element writerElement : writersElements) { String writer = cleanString(writerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer); md.addCastMember(cm); } } // producers Element producersElement = doc.getElementById("producers"); while (producersElement != null && producersElement.tag().getName() != "header") { producersElement = producersElement.parent(); } if (producersElement != null) { producersElement = producersElement.nextElementSibling(); } if (producersElement != null) { Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element producerElement : producersElements) { String producer = cleanString(producerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER); cm.setName(producer); md.addCastMember(cm); } } // producers Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title"); Element prodCompHeaderElement = null; for 
(Element possibleProdCompHeaderEl : prodCompHeaderElements) { if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) { prodCompHeaderElement = possibleProdCompHeaderEl; break; } } while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") { prodCompHeaderElement = prodCompHeaderElement.parent(); } if (prodCompHeaderElement != null) { prodCompHeaderElement = prodCompHeaderElement.nextElementSibling(); } if (prodCompHeaderElement != null) { Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href", "/company/"); for (Element prodCompElement : prodCompElements) { String prodComp = prodCompElement.ownText(); md.addProductionCompany(prodComp); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java
/** * get the episode metadata./*www.ja va2 s .c om*/ * * @param options * the scrape options * @return the MediaMetaData * @throws Exception */ MediaMetadata getEpisodeMetadata(MediaScrapeOptions options) throws Exception { MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = options.getImdbId(); if (StringUtils.isBlank(imdbId)) { return md; } // get episode number and season number int seasonNr = -1; int episodeNr = -1; try { seasonNr = Integer.parseInt(options.getId(MediaMetadata.SEASON_NR)); episodeNr = Integer.parseInt(options.getId(MediaMetadata.EPISODE_NR)); } catch (Exception e) { LOGGER.warn("error parsing season/episode number"); } if (seasonNr == -1 || episodeNr == -1) { return md; } // first get the base episode metadata which can be gathered via // getEpisodeList() List<MediaEpisode> episodes = getEpisodeList(options); MediaEpisode wantedEpisode = null; for (MediaEpisode episode : episodes) { if (episode.season == seasonNr && episode.episode == episodeNr) { wantedEpisode = episode; break; } } // we did not find the episode; return if (wantedEpisode == null) { return md; } md.setId(providerInfo.getId(), wantedEpisode.ids.get(providerInfo.getId())); md.setEpisodeNumber(wantedEpisode.episode); md.setSeasonNumber(wantedEpisode.season); md.setTitle(wantedEpisode.title); md.setPlot(wantedEpisode.plot); md.setRating(wantedEpisode.rating); md.setVoteCount(wantedEpisode.voteCount); try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM. 
yyyy", Locale.US); md.setReleaseDate(sdf.parse(wantedEpisode.firstAired)); } catch (ParseException e) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy", Locale.US); // without "dot" - "May" for example md.setReleaseDate(sdf.parse(wantedEpisode.firstAired)); } catch (ParseException ign) { LOGGER.warn("Could not parse date format: {}", wantedEpisode.firstAired); } } // and finally the cast which needed to be fetched from the fullcredits page if (wantedEpisode.ids.get(providerInfo.getId()) instanceof String && StringUtils.isNotBlank((String) wantedEpisode.ids.get(providerInfo.getId()))) { Url url = new Url( imdbSite.getSite() + "/title/" + wantedEpisode.ids.get(providerInfo.getId()) + "/fullcredits"); url.addHeader("Accept-Language", "en"); // force EN for parsing by HTMl texts Document doc = Jsoup.parse(url.getInputStream(), imdbSite.getCharset().displayName(), ""); // director & writer Element fullcredits = doc.getElementById("fullcredits_content"); if (fullcredits != null) { Elements tables = fullcredits.getElementsByTag("table"); // first table are directors if (tables.get(0) != null) { for (Element director : tables.get(0).getElementsByClass("name")) { MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director.text()); md.addCastMember(cm); } } // second table are writers if (tables.get(1) != null) { for (Element writer : tables.get(1).getElementsByClass("name")) { MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer.text()); md.addCastMember(cm); } } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java
/**
 * Reads the episodes of one season from an IMDb episode list page and appends them to the given list.
 * <p>
 * For season 0 every list item containing the text "Unknown" counts as an episode and the episodes are numbered
 * consecutively; for every other season the season and episode number are taken from the "S…, Ep…" text of the
 * list item. A list item which fails to parse is logged and skipped.
 *
 * @param season
 *          the season the page was requested for
 * @param episodes
 *          the list the parsed episodes are appended to
 * @param doc
 *          the parsed episode list page
 * @return {@code false} as soon as an item of another season is found for a requested season greater than 0,
 *         {@code true} otherwise
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
  Pattern specialsMarker = Pattern.compile("Unknown");
  Pattern numberingMarker = Pattern.compile("S([0-9]*), Ep([0-9]*)");
  boolean specials = season == 0;
  int specialsSeen = 0;

  for (Element listing : doc.getElementsByClass("eplist")) {
    for (Element item : listing.getElementsByClass("list_item")) {
      Pattern marker = specials ? specialsMarker : numberingMarker;
      Matcher numbering = marker.matcher(item.text());

      // skip items which do not carry episode data
      if (!numbering.find() || (!specials && numbering.groupCount() < 2)) {
        continue;
      }

      try {
        MediaEpisode ep = new MediaEpisode(providerInfo.getId());

        // season and episode number
        if (specials) {
          ep.season = season;
          specialsSeen++;
          ep.episode = specialsSeen;
        } else {
          ep.season = Integer.parseInt(numbering.group(1));
          ep.episode = Integer.parseInt(numbering.group(2));
        }

        // the page switched to another season - the data is no longer valid
        if (season > 0 && season != ep.season) {
          return false;
        }

        // title: the first title link flagged as itemprop="name"
        Elements titleLinks = item.getElementsByAttributeValueStarting("href", "/title/tt");
        for (Element link : titleLinks) {
          if ("name".equals(link.attr("itemprop"))) {
            ep.title = link.text();
            break;
          }
        }

        // id: taken from the first title link (the last pattern match wins)
        String episodeId = "";
        for (Matcher ids = IMDB_ID_PATTERN.matcher(titleLinks.get(0).attr("href")); ids.find();) {
          if (ids.group(1) != null) {
            episodeId = ids.group(1);
          }
        }
        if (StringUtils.isNotBlank(episodeId)) {
          ep.ids.put(providerInfo.getId(), episodeId);
        }

        // plot
        Element description = item.getElementsByClass("item_description").first();
        if (description != null) {
          ep.plot = description.ownText();
        }

        // rating and rating count
        Element stars = item.getElementsByClass("ipl-rating-star__rating").first();
        if (stars != null) {
          try {
            ep.rating = Float.valueOf(stars.ownText().replace(",", "."));
          } catch (Exception ignored) {
            // rating stays at its default
          }

          Element votes = item.getElementsByClass("ipl-rating-star__total-votes").first();
          if (votes != null) {
            String digits = votes.ownText().replaceAll("[.,()]", "").trim();
            try {
              ep.voteCount = Integer.parseInt(digits);
            } catch (Exception ignored) {
              // vote count stays at its default
            }
          }
        }

        // release date
        Element airdate = item.getElementsByClass("airdate").first();
        if (airdate != null) {
          ep.firstAired = airdate.ownText();
        }

        // thumb: the image url with its size/crop tokens removed
        Element thumbnail = item.getElementsByTag("img").first();
        if (thumbnail != null) {
          String thumbUrl = thumbnail.attr("src")
              .replaceAll("UX[0-9]{2,4}_", "")
              .replaceAll("UY[0-9]{2,4}_", "")
              .replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
          if (StringUtils.isNotBlank(thumbUrl)) {
            MediaArtwork artwork = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(),
                MediaArtwork.MediaArtworkType.THUMB);
            artwork.setPreviewUrl(thumbUrl);
            artwork.setDefaultUrl(thumbUrl);
            ep.artwork.add(artwork);
          }
        }

        episodes.add(ep);
      } catch (Exception e) {
        LOGGER.warn("failed parsing: " + item.text() + " for ep data; " + e.getMessage());
      }
    }
  }

  return true;
}
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); }//from ww w.j av a 2 s. c om // we have 3 entry points here // a) getMetadata has been called with an ofdbId // b) getMetadata has been called with an imdbId // c) getMetadata has been called from a previous search String detailUrl = ""; // case a) and c) if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) { if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) { detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId()); } else { detailUrl = options.getResult().getUrl(); } } // case b) if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) { MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE); searchOptions.setImdbId(options.getId(MediaMetadata.IMDB)); try { List<MediaSearchResult> results = search(searchOptions); if (results != null && !results.isEmpty()) { options.setResult(results.get(0)); detailUrl = options.getResult().getUrl(); } } catch (Exception e) { LOGGER.warn("failed IMDB search: " + e.getMessage()); } } // we can only work further if we got a search result on ofdb.de if (StringUtils.isBlank(detailUrl)) { throw new Exception("We did not get any useful movie url"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),"); if (StringUtils.isBlank(ofdbId)) { ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)"); } Url url; try { LOGGER.trace("get details page"); url = new Url(detailUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); if (doc.getAllElements().size() < 10) { throw new 
Exception("meh - we did not receive a valid web page"); } // parse details // IMDB ID "http://www.imdb.com/Title?1194173" el = doc.getElementsByAttributeValueContaining("href", "imdb.com"); if (!el.isEmpty()) { md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)")); } // title / year // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" /> el = doc.getElementsByAttributeValue("property", "og:title"); if (!el.isEmpty()) { String[] ty = parseTitle(el.first().attr("content")); md.setTitle(StrgUtils.removeCommonSortableName(ty[0])); try { md.setYear(Integer.parseInt(ty[1])); } catch (Exception ignored) { } } // another year position if (md.getYear() == 0) { // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a> el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr"); try { md.setYear(Integer.parseInt(el.first().text())); } catch (Exception ignored) { } } // original title (has to be searched with a regexp) // <tr valign="top"> // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif" // size="2">Originaltitel:</font></td> // <td> </td> // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif" // size="2"><b>Brave</b></font></td> // </tr> String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>"); if (!originalTitle.isEmpty()) { md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle)); } // Genre: <a href="view.php?page=genre&Genre=Action">Action</a> el = doc.getElementsByAttributeValueContaining("href", "page=genre"); for (Element g : el) { md.addGenre(getTmmGenre(g.text())); } // rating // <div itemtype="http://schema.org/AggregateRating" itemscope // itemprop="aggregateRating">Note: <span // itemprop="ratingValue">6.73</span><meta // itemprop="worstRating" content="1" /> el = doc.getElementsByAttributeValue("itemprop", "ratingValue"); if (!el.isEmpty()) { String r = el.text(); if (!r.isEmpty()) { try { 
md.setRating(Float.parseFloat(r)); } catch (Exception e) { LOGGER.debug("could not parse rating"); } } } // get PlotLink; open url and parse // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a> LOGGER.trace("parse plot"); el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,"); if (!el.isEmpty()) { String plotUrl = BASE_URL + "/" + el.first().attr("href"); try { url = new Url(plotUrl); in = url.getInputStream(); Document plot = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements block = plot.getElementsByClass("Blocksatz"); // first // Blocksatz // is plot String p = block.first().text(); // remove all html stuff p = p.substring(p.indexOf("Mal gelesen") + 12); // remove "header" md.setPlot(p); } catch (Exception e) { LOGGER.error("failed to get plot page: " + e.getMessage()); } } // http://www.ofdb.de/view.php?page=film_detail&fid=226745 LOGGER.debug("parse actor detail"); String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId; doc = null; try { url = new Url(movieDetail); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get detail page: " + e.getMessage()); } if (doc != null) { parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md); parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER, md); parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md); } } catch (Exception e) { LOGGER.error("Error parsing " + detailUrl); throw e; } return md; }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception { LOGGER.debug("getTrailers() " + options.toString()); List<MediaTrailer> trailers = new ArrayList<>(); if (!MetadataUtil.isValidImdbId(options.getImdbId())) { LOGGER.debug("IMDB id not found"); return trailers; }//from w w w .j a v a2 s . c o m /* * function getTrailerData(ci) { switch (ci) { case 'http://de.clip-1.filmtrailer.com/9507_31566_a_1.flv?log_var=72|491100001 -1|-' : return * '<b>Trailer 1</b><br><i>(small)</i><br><br>» 160px<br><br>Download:<br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_1.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_2.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(medium)</i><br><br>» * 240px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_2.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_3.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(large)</i><br><br>» * 320px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_3.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_3.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_3.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_4.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(xlarge)</i><br><br>» * 400px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_4.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_4.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_4.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_5.flv?log_var=72|491100001 -1|-' : 
return '<b>Trailer 1</b><br><i>(xxlarge)</i><br><br>» * 640px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_1.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(small)</i><br><br>» * 160px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_1.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_2.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(medium)</i><br><br>» * 240px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_2.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_3.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(large)</i><br><br>» * 320px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_3.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_3.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_3.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_4.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(xlarge)</i><br><br>» * 400px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_4.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_4.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_4.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_5.flv?log_var=72|491100001 
-1|-' : return '<b>Trailer 2</b><br><i>(xxlarge)</i><br><br>» * 640px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>'; } } */ Url url = null; String searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=IMDb&SText=" + options.getImdbId(); try { // search with IMDB url = new Url(searchString); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements filme = doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,"); if (filme == null || filme.isEmpty()) { LOGGER.debug("found no search results"); return trailers; } LOGGER.debug("found " + filme.size() + " search results"); // hopefully // only one LOGGER.debug("get (trailer) details page"); url = new Url(BASE_URL + "/" + StrgUtils.substr(filme.first().toString(), "href=\\\"(.*?)\\\"")); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // OLD STYLE // <b>Trailer 1</b><br><i>(xxlarge)</i><br><br>» 640px<br><br>Download:<br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br> Pattern regex = Pattern.compile("return '(.*?)';"); Matcher m = regex.matcher(doc.toString()); while (m.find()) { String s = m.group(1); String tname = StrgUtils.substr(s, "<b>(.*?)</b>"); String tpix = StrgUtils.substr(s, "raquo; (.*?)x<br>"); // String tqual = StrgUtils.substr(s, "<i>\\((.*?)\\)</i>"); // url + format Pattern lr = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>"); Matcher lm = lr.matcher(s); while (lm.find()) 
{ String turl = lm.group(1); // String tformat = lm.group(2); MediaTrailer trailer = new MediaTrailer(); trailer.setName(tname); // trailer.setQuality(tpix + " (" + tformat + ")"); trailer.setQuality(tpix); trailer.setProvider("filmtrailer"); trailer.setUrl(turl); LOGGER.debug(trailer.toString()); trailers.add(trailer); } } // NEW STYLE (additional!) // <div class="clips" id="clips2" style="display: none;"> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 1:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 2:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" 
align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 3:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <br> // </div> // new style size // 1 = 160 x 90 = small // 2 = 240 x 136 = medium // 3 = 320 x 180 = large // 4 = 400 x 226 = xlarge // 5 = 640 x 360 = xxlarge ; regex = Pattern.compile("<i>(.*?)</i>(.*?)<br>", Pattern.DOTALL); // get them as single trailer line m = regex.matcher(doc.getElementsByClass("clips").html()); while (m.find()) { // LOGGER.info(doc.getElementsByClass("clips").html()); // parse each line with 5 qualities String tname = m.group(1).trim(); tname = tname.replaceFirst(":$", ""); // replace ending colon String urls = m.group(2); // url + format Pattern lr = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>"); Matcher lm = lr.matcher(urls); while (lm.find()) { String turl = lm.group(1); String tpix = ""; String tformat = lm.group(2).replaceAll(" ", "").trim(); switch (tformat) { case "small": tpix = "90p"; break; case "medium": tpix = "136p"; break; case "large": tpix = "180p"; break; case "xlarge": tpix = "226p"; break; case "xxlarge": tpix = "360p"; break; default: break; } MediaTrailer trailer = new MediaTrailer(); trailer.setName(tname); // trailer.setQuality(tpix + " (" + tformat + ")"); trailer.setQuality(tpix); trailer.setProvider("filmtrailer"); trailer.setUrl(turl); LOGGER.debug(trailer.toString()); trailers.add(trailer); } } } catch (Exception e) { if (url != null) { LOGGER.error("Error parsing {}", url.toString()); } else { 
LOGGER.error("Error parsing {}", searchString); } throw e; } return trailers; }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void processTranslationTable(@NotNull String queryString, @NotNull Document document, @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage, @NotNull Language targetLanguage) { // Find main table (german to X) String languageIdentifier = sourceLanguage.getIdentifier().toLowerCase() + "-" + targetLanguage.getIdentifier().toLowerCase(); Element translationTable = document.getElementById("dictionary-" + languageIdentifier); // Process the main table with its entries if (translationTable != null) { // Find all relevant entries, filter them by class and process them translationTable.getElementsByTag("tr").stream().filter(e -> e.getElementsByTag("th").size() == 0) .forEach(e -> processEntry(queryString, e, resultBuilder, sourceLanguage, targetLanguage)); // Extract synonyms Elements synonymTableCandidates = document.getElementsByClass("dictionary-synonyms-table"); if (synonymTableCandidates.size() > 0) { extractBilingualSynonyms(queryString, synonymTableCandidates.get(0), resultBuilder, sourceLanguage); }/* w w w . ja v a 2 s .c om*/ } else { LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", languageIdentifier, targetLanguage.getIdentifier(), queryString); } }
From source file:poe.trade.assist.SearchForm.java
private String removeAllExceptSearchForm(String html) { String htmlDirectory = htmlDirectory(); Document doc = Jsoup.parse(html); // Remove stuff outside of id="main" // doc.body().children().stream().filter(e -> !"main".equalsIgnoreCase(e.id())).forEach(e -> e.remove()); Element head = doc.head();/*from www . ja v a 2s. c om*/ // Replace everthing in the <head> head.children().stream().forEach(e -> e.remove()); head.appendElement("meta").attr("charset", "utf-8"); head.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width"); head.appendElement("title").text("poe.trade.assist"); head.appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "packed.js"); head.appendElement("link").attr("rel", "stylesheet").attr("href", htmlDirectory + "packed_dark.css"); // Show search form Optional.ofNullable(doc.getElementById("search-form")).ifPresent(e -> e.attr("style", "")); Optional.ofNullable(doc.getElementById("search")) .ifPresent(e -> e.attr("action", "http://poe.trade/search")); // // Element mainElement = doc.getElementById("main"); // Element topDivContainer = mainElement.child(0); // // // Remove everthing that is not id="content" or h2 // topDivContainer.children().stream() // .filter(e -> !"content".equalsIgnoreCase(e.id())) // .filter(e -> !e.tag().getName().equalsIgnoreCase("h2")) // .forEach(e -> e.remove()); // // // Clean up stuff inside id="content" // // // Remove "Show search form", "search/import" // Optional<Element> searchFormElem = doc.getElementsByTag("a").stream().filter(e -> e.hasClass("button") && e.hasClass("secondary") && e.hasClass("expand")).findFirst(); // searchFormElem.ifPresent(e -> e.remove()); // // Optional<Element> searchOrImportDiv = doc.getElementsByTag("div").stream().filter(e -> e.hasClass("row") && e.hasClass("form-choose-action")).findFirst(); // searchOrImportDiv.ifPresent(e -> e.remove()); // // // Remove search results Elements searchResultBlocks = 
doc.getElementsByClass("search-results-block"); if (searchResultBlocks.size() > 0) { searchResultBlocks.get(0).remove(); } // append assist as the last element in body // doc.body().appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "assist.js"); String cleanHtml = doc.toString(); return cleanHtml; }
From source file:Search.DataManipulation.DataParser.java
public String getIcon(Document dom) throws IOException { Elements iconClass = dom.getElementsByClass("cover-container"); Elements iconClass1 = iconClass.select("img.cover-image[alt=Cover art]"); String iconUrl = iconClass1.first().attr("src"); byte[] iconByte = dataHandler.imageDownloader(iconUrl); if (iconByte.length == 0) { log.warn("Invalid Icon url found by Search.DataManipulation.DataValidator, not adding to appData"); return null; } else {//from w w w .j av a 2s . co m String icon = Base64.getEncoder().encodeToString(iconByte); return icon; } }