List of usage examples for org.jsoup.nodes Document getElementById
public Element getElementById(String id)
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { LOGGER.debug("search() " + query.toString()); /*/*from ww w . j av a2 s . co m*/ * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/"); List<MediaSearchResult> result = new ArrayList<MediaSearchResult>(); String searchTerm = ""; if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) { searchTerm = query.get(SearchParam.IMDBID); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.QUERY); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.TITLE); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper options String language = query.get(SearchParam.LANGUAGE); String myear = query.get(SearchParam.YEAR); String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(imdbSite.getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! LOGGER.debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(CAT_TITLE); LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { CachedUrl url = new CachedUrl(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { LOGGER.debug("tried to fetch search response", e); // clear Cache CachedUrl.removeCachedFileForUrl(sb.toString()); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(); options.setImdbId(movieId); options.setLanguage(MediaLanguages.valueOf(language)); options.setCountry(CountryCode.valueOf(country)); options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO))); options.setScrapeImdbForeignLanguage( Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE))); md = getMetadata(options); if (!StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) { movieName = md.getStringValue(MediaMetadata.TITLE); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getStringValue(MediaMetadata.YEAR)); sr.setMetadata(md); sr.setScore(1); // and parse out the poster String posterUrl = ""; Element td = doc.getElementById("img_primary"); if (td != null) { Elements imgs = td.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; String year = ""; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); // stripped out .*\\(Video\\).*| Matcher matcher = unwanted.matcher(element.text()); if (matcher.find()) { continue; } // is there a localized name? (aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { year = matcher.group(1); break; } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); // populate extra args MetadataUtil.copySearchQueryToSearchResult(query, sr); if (movieId.equals(query.get(SearchParam.IMDBID))) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) { LOGGER.debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) { LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01"); score = score - 0.01f; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception { MediaMetadata md = new MediaMetadata(providerInfo.getId()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMediaMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMediaMetadata(); }// w ww .j a v a2 s .c om String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(providerInfo.getId(), imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor); // worker for imdb request (/reference) (everytime from www.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/reference"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); Future<Document> futureReference = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureReference.get(); parseReferencePage(doc, options, md); /* * plot from /plotsummary */ // build the url doc = futurePlotsummary.get(); parsePlotsummaryPage(doc, options, md); // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { Element title = doc.getElementById("tn15title"); if (title != null) { Element element; // title Elements elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.setTitle(movieTitle); } } } // get the release info page Future<Document> futureReleaseinfo; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/releaseinfo"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futureReleaseinfo = compSvcImdb.submit(worker); doc = futureReleaseinfo.get(); // parse original title here!! parseReleaseinfoPageAKAs(doc, options, md); // did we get a release date? if (md.getReleaseDate() == null || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) { parseReleaseinfoPage(doc, options, md); } // get data from tmdb? if (futureTmdb != null && (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo"))) { try { MediaMetadata tmdbMd = futureTmdb.get(); if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") && tmdbMd != null) { // tmdbid md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB)); // title if (StringUtils.isNotBlank(tmdbMd.getTitle())) { md.setTitle(tmdbMd.getTitle()); } // original title if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) { md.setOriginalTitle(tmdbMd.getOriginalTitle()); } // tagline if (StringUtils.isNotBlank(tmdbMd.getTagline())) { md.setTagline(tmdbMd.getTagline()); } // plot if (StringUtils.isNotBlank(tmdbMd.getPlot())) { md.setPlot(tmdbMd.getPlot()); } // collection info if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) { md.setCollectionName(tmdbMd.getCollectionName()); md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); } } if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo") && tmdbMd != null) { md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); md.setCollectionName(tmdbMd.getCollectionName()); } md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId())); } catch (Exception ignored) { } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getOriginalTitle())) { md.setOriginalTitle(md.getTitle()); } // populate id md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId); return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { Date releaseDate = null;/*from w w w . j a v a 2s . c om*/ Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})"); // old way Element tableReleaseDates = doc.getElementById("release_dates"); if (tableReleaseDates != null) { Elements rows = tableReleaseDates.getElementsByTag("tr"); // first round: check the release date for the first one with the requested country for (Element row : rows) { // get the anchor Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release_date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException ignored) { } } } } } } } // new way; iterating over class name items if (releaseDate == null) { Elements rows = doc.getElementsByClass("release-date-item"); for (Element row : rows) { Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release-date-item__date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException ignored) { } } } } else { LOGGER.trace("country {} does not match ours {}", matcher.group(1), options.getCountry().getAlpha2()); } } } } // no matching local release date found; take the first one if (releaseDate == null) { Element column = doc.getElementsByClass("release_date").first(); if (column == null) { column = doc.getElementsByClass("release-date-item__date").first(); } if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException ignored) { } } } } if (releaseDate != null) { md.setReleaseDate(releaseDate); } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) { /*//from w ww. ja v a 2 s.com * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // title Element title = doc.getElementsByAttributeValue("name", "title").first(); if (title != null) { String movieTitle = cleanString(title.attr("content")); int yearStart = movieTitle.lastIndexOf("("); if (yearStart > 0) { movieTitle = movieTitle.substring(0, yearStart - 1).trim(); md.setTitle(movieTitle); } } // original title and year Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first(); if (originalTitleYear != null) { String content = originalTitleYear.attr("content"); int startOfYear = content.lastIndexOf("("); if (startOfYear > 0) { // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page... // String originalTitle = content.substring(0, startOfYear - 1).trim(); // md.setOriginalTitle(originalTitle); String yearText = content.substring(startOfYear); // search year Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}"); Matcher matcher = yearPattern.matcher(yearText); while (matcher.find()) { if (matcher.group(0) != null) { String movieYear = matcher.group(0); try { md.setYear(Integer.parseInt(movieYear)); break; } catch (Exception ignored) { } } } } } // poster Element poster = doc.getElementsByAttributeValue("property", "og:image").first(); if (poster != null) { String posterUrl = poster.attr("content"); int fileStart = posterUrl.lastIndexOf("/"); if (fileStart > 0) { int parameterStart = posterUrl.indexOf("_", fileStart); if (parameterStart > 0) { int startOfExtension = posterUrl.lastIndexOf("."); if (startOfExtension > parameterStart) { posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension); } } } processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating count Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first(); if (ratingElement != null) { String ratingAsString = ratingElement.ownText().replace(",", "."); try { md.setRating(Float.valueOf(ratingAsString)); } catch (Exception ignored) { } Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first(); if (votesElement != null) { String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim(); try { md.setVoteCount(Integer.parseInt(countAsString)); } catch (Exception ignored) { } } } // top250 Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first(); if (topRatedElement != null) { Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(topRatedElement.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { try { String top250Text = matcher.group(1); md.setTop250(Integer.parseInt(top250Text)); } catch (Exception ignored) { } } } } // releasedate Element releaseDateElement = doc .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo") .first(); if (releaseDateElement != null) { String releaseDateText = releaseDateElement.ownText(); int startOfCountry = releaseDateText.indexOf("("); if (startOfCountry > 0) { releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim(); } try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException ignored) { } } } Elements elements = doc.getElementsByClass("ipl-zebra-list__label"); for (Element element : elements) { // only parse tds if (!"td".equals(element.tag().getName())) { continue; } String elementText = element.ownText(); if (elementText.equals("Taglines")) { if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) { Element taglineElement = element.nextElementSibling(); if (taglineElement != null) { String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.setTagline(tagline); } } } if (elementText.equals("Genres")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/"); for (Element genreElement : genreElements) { String genreText = genreElement.ownText(); md.addGenre(getTmmGenre(genreText)); } } } /* * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) * | 178 min (extended cut)</div></div> */ if (elementText.equals("Runtime")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first(); if (runtimeElement != null) { String first = runtimeElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.setRuntime(runtime); } } } if (elementText.equals("Country")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/"); Pattern pattern = Pattern.compile("/country/(.*)"); for (Element countryElement : countryElements) { Matcher matcher = pattern.matcher(countryElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addCountry(LanguageUtils.getLocalizedCountryForLanguage( options.getLanguage().getLanguage(), countryElement.text(), matcher.group(1))); } else { md.addCountry(matcher.group(1)); } } } } } if (elementText.equals("Language")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements languageElements = nextElement.getElementsByAttributeValueStarting("href", "/language/"); Pattern pattern = Pattern.compile("/language/(.*)"); for (Element languageElement : languageElements) { Matcher matcher = pattern.matcher(languageElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString( options.getLanguage(), languageElement.text(), matcher.group(1))); } else { md.addSpokenLanguage(matcher.group(1)); } } } } } if (elementText.equals("Certification")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { String languageCode = options.getCountry().getAlpha2(); Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=" + languageCode); boolean done = false; for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); done = true; break; } } if (!done && languageCode.equals("DE")) { certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=XWG"); for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); break; } } } } } } // director Element directorsElement = doc.getElementById("directors"); while (directorsElement != null && directorsElement.tag().getName() != "header") { directorsElement = directorsElement.parent(); } if (directorsElement != null) { directorsElement = directorsElement.nextElementSibling(); } if (directorsElement != null) { for (Element directorElement : directorsElement.getElementsByClass("name")) { String director = directorElement.text().trim(); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director); md.addCastMember(cm); } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } // writers Element writersElement = doc.getElementById("writers"); while (writersElement != null && writersElement.tag().getName() != "header") { writersElement = writersElement.parent(); } if (writersElement != null) { writersElement = writersElement.nextElementSibling(); } if (writersElement != null) { Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element writerElement : writersElements) { String writer = cleanString(writerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer); md.addCastMember(cm); } } // producers Element producersElement = doc.getElementById("producers"); while (producersElement != null && producersElement.tag().getName() != "header") { producersElement = producersElement.parent(); } if (producersElement != null) { producersElement = producersElement.nextElementSibling(); } if (producersElement != null) { Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element producerElement : producersElements) { String producer = cleanString(producerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER); cm.setName(producer); md.addCastMember(cm); } } // producers Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title"); Element prodCompHeaderElement = null; for (Element possibleProdCompHeaderEl : prodCompHeaderElements) { if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) { prodCompHeaderElement = possibleProdCompHeaderEl; break; } } while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") { prodCompHeaderElement = prodCompHeaderElement.parent(); } if (prodCompHeaderElement != null) { prodCompHeaderElement = prodCompHeaderElement.nextElementSibling(); } if (prodCompHeaderElement != null) { Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href", "/company/"); for (Element prodCompElement : prodCompElements) { String prodComp = prodCompElement.ownText(); md.addProductionCompany(prodComp); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { // imdb.com has another site structure if (getImdbSite() == ImdbSiteDefinition.IMDB_COM) { // first check synopsis content // Element zebraList = doc.getElementById("plot-synopsis-content"); // if (zebraList != null) { // Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); // if (!p.isEmpty()) { // Element em = p.get(0); // if (!"no-synopsis-content".equals(em.id())) { // String plot = cleanString(em.text()); // md.setPlot(plot); // }//from w ww . j a va 2 s .c o m // } // } // NOPE: synopsis contains spoilers // just take first summary // <li class="ipl-zebra-list__item" id="summary-ps21700000"> // <p>text text text text </p> // <div class="author-container"> // <em>—<a href="/search/title?plot_author=author">Author Name</a></em> // </div> // </li> Element zebraList = doc.getElementById("plot-summaries-content"); if (zebraList != null) { Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); if (!p.isEmpty()) { Element em = p.get(0); // remove author Elements authors = em.getElementsByClass("author-container"); if (!authors.isEmpty()) { authors.get(0).remove(); } if (!"no-summary-content".equals(em.id())) { String plot = cleanString(em.text()); md.setPlot(plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); if (wiki != null) { String plot = cleanString(wiki.ownText()); md.setPlot(plot); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java
/** * get the episode metadata./*from w w w.ja v a 2 s . c o m*/ * * @param options * the scrape options * @return the MediaMetaData * @throws Exception */ MediaMetadata getEpisodeMetadata(MediaScrapeOptions options) throws Exception { MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = options.getImdbId(); if (StringUtils.isBlank(imdbId)) { return md; } // get episode number and season number int seasonNr = -1; int episodeNr = -1; try { seasonNr = Integer.parseInt(options.getId(MediaMetadata.SEASON_NR)); episodeNr = Integer.parseInt(options.getId(MediaMetadata.EPISODE_NR)); } catch (Exception e) { LOGGER.warn("error parsing season/episode number"); } if (seasonNr == -1 || episodeNr == -1) { return md; } // first get the base episode metadata which can be gathered via // getEpisodeList() List<MediaEpisode> episodes = getEpisodeList(options); MediaEpisode wantedEpisode = null; for (MediaEpisode episode : episodes) { if (episode.season == seasonNr && episode.episode == episodeNr) { wantedEpisode = episode; break; } } // we did not find the episode; return if (wantedEpisode == null) { return md; } md.setId(providerInfo.getId(), wantedEpisode.ids.get(providerInfo.getId())); md.setEpisodeNumber(wantedEpisode.episode); md.setSeasonNumber(wantedEpisode.season); md.setTitle(wantedEpisode.title); md.setPlot(wantedEpisode.plot); md.setRating(wantedEpisode.rating); md.setVoteCount(wantedEpisode.voteCount); try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM. yyyy", Locale.US); md.setReleaseDate(sdf.parse(wantedEpisode.firstAired)); } catch (ParseException e) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy", Locale.US); // without "dot" - "May" for example md.setReleaseDate(sdf.parse(wantedEpisode.firstAired)); } catch (ParseException ign) { LOGGER.warn("Could not parse date format: {}", wantedEpisode.firstAired); } } // and finally the cast which needed to be fetched from the fullcredits page if (wantedEpisode.ids.get(providerInfo.getId()) instanceof String && StringUtils.isNotBlank((String) wantedEpisode.ids.get(providerInfo.getId()))) { Url url = new Url( imdbSite.getSite() + "/title/" + wantedEpisode.ids.get(providerInfo.getId()) + "/fullcredits"); url.addHeader("Accept-Language", "en"); // force EN for parsing by HTMl texts Document doc = Jsoup.parse(url.getInputStream(), imdbSite.getCharset().displayName(), ""); // director & writer Element fullcredits = doc.getElementById("fullcredits_content"); if (fullcredits != null) { Elements tables = fullcredits.getElementsByTag("table"); // first table are directors if (tables.get(0) != null) { for (Element director : tables.get(0).getElementsByClass("name")) { MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director.text()); md.addCastMember(cm); } } // second table are writers if (tables.get(1) != null) { for (Element writer : tables.get(1).getElementsByClass("name")) { MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer.text()); md.addCastMember(cm); } } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } } return md; }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // we can only work further if we got a search result on zelluloid.de if (options.getResult() == null) { throw new Exception("Scrape with Zelluloid.de without prior search is not supported"); }//from w ww.j a v a 2s . c o m MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; // preset values from searchresult (if we have them) md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, Utils.removeSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle())); md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear()); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle()); String id = ""; if (StringUtils.isEmpty(options.getResult().getId())) { id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)"); } else { id = options.getResult().getId(); } String detailurl = options.getResult().getUrl(); if (StringUtils.isEmpty(detailurl)) { detailurl = BASE_URL + "/filme/index.php3?id=" + id; } Url url; try { LOGGER.debug("get details page"); url = new CachedUrl(detailurl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); // parse plot String plot = doc.getElementsByAttributeValue("class", "bigtext").text(); md.storeMetadata(MediaMetadata.PLOT, plot); md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? plot.substring(0, 150) : plot); // parse poster el = doc.getElementsByAttributeValueStarting("src", "/images/poster"); if (el.size() == 1) { md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + el.get(0).attr("src")); } // parse year if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) { el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { md.storeMetadata(MediaMetadata.YEAR, el.get(0).text()); } } // parse cinema release el = doc.getElementsByAttributeValueContaining("href", "?v=w"); if (el.size() > 0) { try { SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy"); Date d = sdf.parse(el.get(0).text()); sdf = new SimpleDateFormat("yyyy-MM-dd"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(d)); } catch (Exception e) { LOGGER.warn("cannot parse cinema release date: " + el.get(0).text()); } } // parse original title if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<")); } if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } // parse runtime String rt = (StrgUtils.substr(doc.toString(), "ca. (.*?) min")); if (!rt.isEmpty()) { try { md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt)); } catch (Exception e2) { LOGGER.warn("cannot convert runtime: " + rt); } } // parse genres el = doc.getElementsByAttributeValueContaining("href", "az.php3?g="); for (Element g : el) { String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1); md.addGenre(getTmmGenre(gid)); } // parse cert // FSK: ab 12, $230 Mio. Budget String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]"); if (!fsk.isEmpty()) { md.addCertification(Certification.findCertification(fsk)); } // parse rating Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable"); if (ratings.size() == 2) { // get user rating Element e = ratings.get(1); // <div>87%</div> String r = e.getElementsByTag("div").text().replace("%", ""); try { md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10 } catch (Exception e2) { LOGGER.warn("cannot convert rating: " + r); } } // details page doc = null; String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id; try { url = new CachedUrl(detailsUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get details: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(detailsUrl); } if (doc != null) { Element tab = doc.getElementById("ccdetails"); int header = 0; String lastRole = ""; for (Element tr : tab.getElementsByTag("tr")) { if (tr.toString().contains("dyngfx")) { // header gfx if (tr.toString().contains("Besetzung")) { header = 1; } else if (tr.toString().contains("Crew")) { header = 2; } else if (tr.toString().contains("Produktion")) { header = 3; } else if (tr.toString().contains("Verleih")) { header = 4; } else if (tr.toString().contains("Alternativtitel")) { header = 5; } continue; } else { // no header gfx, so data MediaCastMember mcm = new MediaCastMember(); el = tr.getElementsByTag("td"); if (header == 1) { // actors if (el.size() == 2) { mcm.setCharacter(el.get(0).text()); mcm.setName(el.get(1).getElementsByTag("a").text()); mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); mcm.setType(MediaCastMember.CastType.ACTOR); // System.out.println("Cast: " + mcm.getCharacter() + " - " + // mcm.getName()); md.addCastMember(mcm); // TODO: parse actor detail pages :/ } } else if (header == 2) { // crew if (el.size() == 2) { String crewrole = el.get(0).html().trim(); mcm.setName(el.get(1).getElementsByTag("a").text()); if (crewrole.equals(" ")) { mcm.setPart(lastRole); } else { mcm.setPart(crewrole); lastRole = crewrole; } if (crewrole.equals("Regie")) { mcm.setType(MediaCastMember.CastType.DIRECTOR); } else if (crewrole.equals("Drehbuch")) { mcm.setType(MediaCastMember.CastType.WRITER); } else { mcm.setType(MediaCastMember.CastType.OTHER); } mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); // System.out.println("Crew: " + mcm.getPart() + " - " + // mcm.getName()); md.addCastMember(mcm); } } else if (header == 3) { // production md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, el.get(0).text()); } } } } // get links page doc = null; String linksUrl = BASE_URL + "/filme/links.php3?id=" + id; try { url = new CachedUrl(linksUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get links page: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(linksUrl); } if (doc != null) { el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com"); if (el != null && el.size() > 0) { String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})"); if (imdb.isEmpty()) { imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)"); } md.setId(MediaMetadata.IMDBID, imdb); } } } catch (Exception e) { LOGGER.error("Error parsing " + options.getResult().getUrl()); // clear cache CachedUrl.removeCachedFileForUrl(detailurl); throw e; } return md; }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void processTranslationTable(@NotNull String queryString, @NotNull Document document, @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage, @NotNull Language targetLanguage) { // Find main table (german to X) String languageIdentifier = sourceLanguage.getIdentifier().toLowerCase() + "-" + targetLanguage.getIdentifier().toLowerCase(); Element translationTable = document.getElementById("dictionary-" + languageIdentifier); // Process the main table with its entries if (translationTable != null) { // Find all relevant entries, filter them by class and process them translationTable.getElementsByTag("tr").stream().filter(e -> e.getElementsByTag("th").size() == 0) .forEach(e -> processEntry(queryString, e, resultBuilder, sourceLanguage, targetLanguage)); // Extract synonyms Elements synonymTableCandidates = document.getElementsByClass("dictionary-synonyms-table"); if (synonymTableCandidates.size() > 0) { extractBilingualSynonyms(queryString, synonymTableCandidates.get(0), resultBuilder, sourceLanguage); }//from w w w . j a va2 s . c o m } else { LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", languageIdentifier, targetLanguage.getIdentifier(), queryString); } }
From source file:poe.trade.assist.SearchForm.java
private String removeAllExceptSearchForm(String html) { String htmlDirectory = htmlDirectory(); Document doc = Jsoup.parse(html); // Remove stuff outside of id="main" // doc.body().children().stream().filter(e -> !"main".equalsIgnoreCase(e.id())).forEach(e -> e.remove()); Element head = doc.head();/* w ww .jav a 2 s .c o m*/ // Replace everthing in the <head> head.children().stream().forEach(e -> e.remove()); head.appendElement("meta").attr("charset", "utf-8"); head.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width"); head.appendElement("title").text("poe.trade.assist"); head.appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "packed.js"); head.appendElement("link").attr("rel", "stylesheet").attr("href", htmlDirectory + "packed_dark.css"); // Show search form Optional.ofNullable(doc.getElementById("search-form")).ifPresent(e -> e.attr("style", "")); Optional.ofNullable(doc.getElementById("search")) .ifPresent(e -> e.attr("action", "http://poe.trade/search")); // // Element mainElement = doc.getElementById("main"); // Element topDivContainer = mainElement.child(0); // // // Remove everthing that is not id="content" or h2 // topDivContainer.children().stream() // .filter(e -> !"content".equalsIgnoreCase(e.id())) // .filter(e -> !e.tag().getName().equalsIgnoreCase("h2")) // .forEach(e -> e.remove()); // // // Clean up stuff inside id="content" // // // Remove "Show search form", "search/import" // Optional<Element> searchFormElem = doc.getElementsByTag("a").stream().filter(e -> e.hasClass("button") && e.hasClass("secondary") && e.hasClass("expand")).findFirst(); // searchFormElem.ifPresent(e -> e.remove()); // // Optional<Element> searchOrImportDiv = doc.getElementsByTag("div").stream().filter(e -> e.hasClass("row") && e.hasClass("form-choose-action")).findFirst(); // searchOrImportDiv.ifPresent(e -> e.remove()); // // // Remove search results Elements searchResultBlocks = doc.getElementsByClass("search-results-block"); if (searchResultBlocks.size() > 0) { searchResultBlocks.get(0).remove(); } // append assist as the last element in body // doc.body().appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "assist.js"); String cleanHtml = doc.toString(); return cleanHtml; }
From source file:wo.trade.SearchPageScraper.java
public List<TradeItem> parse() { List<TradeItem> tradeItems = new LinkedList<>(); Document doc = Jsoup.parse(page, "UTF-8"); Element content = doc.getElementById("content"); Elements items = null;// w w w. jav a2s . co m if (content == null) { items = doc.getElementsByClass("item"); } else { items = content.getElementsByClass("item"); } for (Element element : items) { TradeItem item = new TradeItem(); item.id = element.attr("id"); item.id = StringUtils.remove(item.id, "item-container-"); item.seller = element.attr("data-seller"); item.thread = element.attr("data-thread"); item.sellerid = element.attr("data-sellerid"); item.buyout = element.attr("data-buyout"); item.ign = element.attr("data-ign"); item.league = element.attr("data-league"); item.name = element.attr("data-name"); item.corrupted = element.getElementsByClass("corrupted").size() > 0; item.identified = element.getElementsByClass("item-unid").size() == 0; // System.out.println(String.format("Now parsing item id %s name %s", item.id, item.name)); Element sockElem = element.getElementsByClass("sockets-raw").get(0); item.socketsRaw = sockElem.text(); Elements accntAgeElement = element.getElementsByAttributeValue("title", "account age and highest level"); if (accntAgeElement != null && !accntAgeElement.isEmpty()) { item.ageAndHighLvl = accntAgeElement.get(0).text(); } // ----- Requirements ----- // Element reqElem = element.getElementsByClass("requirements").get(0); List<TextNode> reqNodes = reqElem.textNodes(); for (TextNode reqNode : reqNodes) { // sample [ Level: 37 , Strength: 42 , Intelligence: 42 ] String req = StringUtils.trimToEmpty(reqNode.getWholeText()); req = req.replaceAll(regex_horizontal_whitespace, ""); req = Util.removeThoseDamnWhiteSpace(req); String separator = ":"; String reqType = trim(substringBefore(req, separator)); switch (reqType) { case "Level": item.reqLvl = trim(substringAfter(req, separator)); break; case "Strength": item.reqStr = trim(substringAfter(req, separator)); break; case "Intelligence": item.reqInt = trim(substringAfter(req, separator)); break; case "Dexterity": item.reqDex = trim(substringAfter(req, separator)); break; } } item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream().findFirst() .map(n -> n.text()).map(s -> substringAfter(s, "Item quantity:")) .map(s -> StringUtils.removePattern(s, "[^\\d]")).orElse("") .replaceAll(regex_horizontal_whitespace, "").trim(); // ----- Rarity by checking the item name link class ----- // // itemframe0 - normal // itemframe1 - magic // itemframe2 - rare // itemframe3 - unique // itemframe4 - gems // itemframe5 - currency // itemframe6 - divination card String itemframeStr = element.getElementsByClass("title").stream().findFirst().map(n -> n.attr("class")) .orElse(null); itemframeStr = Util.regexMatch("itemframe(\\d)", itemframeStr, 1); if (itemframeStr != null) { int frame = Integer.parseInt(itemframeStr); item.rarity = Rarity.valueOf(frame); } else { item.rarity = Rarity.unknown; } // ----- Verify ----- // item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream() .findFirst().map(n -> n.attr("data-hash")).orElse("").trim(); // ----- Mods ----- // Elements itemModsElements = element.getElementsByClass("item-mods"); if (itemModsElements != null && itemModsElements.size() > 0) { Element itemMods = itemModsElements.get(0); if (itemMods.getElementsByClass("bullet-item").size() != 0) { Element bulletItem = itemMods.getElementsByClass("bullet-item").get(0); Elements ulMods = bulletItem.getElementsByTag("ul"); if (ulMods.size() == 2) { // implicit mod Elements implicitLIs = ulMods.get(0).getElementsByTag("li"); Element implicitLi = implicitLIs.last(); Mod impMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value")); item.implicitMod = impMod; } int indexOfExplicitMods = ulMods.size() - 1; Elements modsLi = ulMods.get(indexOfExplicitMods).getElementsByTag("li"); for (Element modLi : modsLi) { // explicit mods Mod mod = new Mod(modLi.attr("data-name"), modLi.attr("data-value")); item.explicitMods.add(mod); } } } // ----- Properties ----- // // this is the third column data (the first col is the image, second is the mods, reqs) item.quality = element.getElementsByAttributeValue("data-name", "q").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgRangeAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pd").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmgRange = element.getElementsByAttributeValue("data-name", "ed").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.attackSpeed = element.getElementsByAttributeValue("data-name", "aps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.dmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_dps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pdps").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmg = element.getElementsByAttributeValue("data-name", "edps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.armourAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_armour").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.evasionAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_evasion").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.energyShieldAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_shield") .get(0).text().replaceAll(regex_horizontal_whitespace, "").trim(); item.block = element.getElementsByAttributeValue("data-name", "block").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.crit = element.getElementsByAttributeValue("data-name", "crit").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.level = element.getElementsByAttributeValue("data-name", "level").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").get(0).attr("src"); item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream() .filter(t -> t.startsWith("stackSize=")).findFirst().map(s -> substringAfter(s, "=")) .orElse(null); Elements onlineSpans = element.getElementsMatchingText("online"); if (!onlineSpans.isEmpty()) { item.online = "Online"; } else { item.online = ""; } tradeItems.add(item); } // System.out.println("DONE --- Items"); return tradeItems; }