Example usage of org.jsoup.nodes.Document.getElementById

List of usage examples for org.jsoup.nodes.Document.getElementById

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.getElementById.

Prototype

public Element getElementById(String id) 

Source Link

Document

Find an element by ID, including or under this element.

Usage

From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java

@Override
public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception {
    LOGGER.debug("search() " + query.toString());
    /*/*from  ww  w  .  j av a2 s .  co m*/
     * IMDb matches seem to come in several "flavours".
     * 
     * Firstly, if there is one exact match it returns the matching IMDb page.
     * 
     * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles
     * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results)
     * 
     * We should check the Exact match section first, then the poplar titles and finally the partial matches.
     * 
     * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek"
     */

    Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/");

    List<MediaSearchResult> result = new ArrayList<MediaSearchResult>();

    String searchTerm = "";

    if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) {
        searchTerm = query.get(SearchParam.IMDBID);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.get(SearchParam.QUERY);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.get(SearchParam.TITLE);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        return result;
    }

    // parse out language and coutry from the scraper options
    String language = query.get(SearchParam.LANGUAGE);
    String myear = query.get(SearchParam.YEAR);
    String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    StringBuilder sb = new StringBuilder(imdbSite.getSite());
    sb.append("find?q=");
    try {
        // search site was everytime in UTF-8
        sb.append(URLEncoder.encode(searchTerm, "UTF-8"));
    } catch (UnsupportedEncodingException ex) {
        // Failed to encode the movie name for some reason!
        LOGGER.debug("Failed to encode search term: " + searchTerm);
        sb.append(searchTerm);
    }

    // we need to search for all - otherwise we do not find TV movies
    sb.append(CAT_TITLE);

    LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString());
    Document doc;
    try {
        CachedUrl url = new CachedUrl(sb.toString());
        url.addHeader("Accept-Language", getAcceptLanguage(language, country));
        doc = Jsoup.parse(url.getInputStream(), "UTF-8", "");
    } catch (Exception e) {
        LOGGER.debug("tried to fetch search response", e);

        // clear Cache
        CachedUrl.removeCachedFileForUrl(sb.toString());

        return result;
    }

    // check if it was directly redirected to the site
    Elements elements = doc.getElementsByAttributeValue("rel", "canonical");
    for (Element element : elements) {
        MediaMetadata md = null;
        // we have been redirected to the movie site
        String movieName = null;
        String movieId = null;

        String href = element.attr("href");
        Matcher matcher = imdbIdPattern.matcher(href);
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                movieId = matcher.group(1);
            }
        }

        // get full information
        if (!StringUtils.isEmpty(movieId)) {
            MediaScrapeOptions options = new MediaScrapeOptions();
            options.setImdbId(movieId);
            options.setLanguage(MediaLanguages.valueOf(language));
            options.setCountry(CountryCode.valueOf(country));
            options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO)));
            options.setScrapeImdbForeignLanguage(
                    Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE)));
            md = getMetadata(options);
            if (!StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) {
                movieName = md.getStringValue(MediaMetadata.TITLE);
            }
        }

        // if a movie name/id was found - return it
        if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) {
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            sr.setTitle(movieName);
            sr.setIMDBId(movieId);
            sr.setYear(md.getStringValue(MediaMetadata.YEAR));
            sr.setMetadata(md);
            sr.setScore(1);

            // and parse out the poster
            String posterUrl = "";
            Element td = doc.getElementById("img_primary");
            if (td != null) {
                Elements imgs = td.getElementsByTag("img");
                for (Element img : imgs) {
                    posterUrl = img.attr("src");
                    posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                    posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
                    posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
                }
            }
            if (StringUtils.isNotBlank(posterUrl)) {
                sr.setPosterUrl(posterUrl);
            }

            result.add(sr);
            return result;
        }
    }

    // parse results
    // elements = doc.getElementsByClass("result_text");
    elements = doc.getElementsByClass("findResult");
    for (Element tr : elements) {
        // we only want the tr's
        if (!"tr".equalsIgnoreCase(tr.tagName())) {
            continue;
        }

        // find the id / name
        String movieName = "";
        String movieId = "";
        String year = "";
        Elements tds = tr.getElementsByClass("result_text");
        for (Element element : tds) {
            // we only want the td's
            if (!"td".equalsIgnoreCase(element.tagName())) {
                continue;
            }

            // filter out unwanted results
            Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); // stripped out .*\\(Video\\).*|
            Matcher matcher = unwanted.matcher(element.text());
            if (matcher.find()) {
                continue;
            }

            // is there a localized name? (aka)
            String localizedName = "";
            Elements italics = element.getElementsByTag("i");
            if (italics.size() > 0) {
                localizedName = italics.text().replace("\"", "");
            }

            // get the name inside the link
            Elements anchors = element.getElementsByTag("a");
            for (Element a : anchors) {
                if (StringUtils.isNotEmpty(a.text())) {
                    // movie name
                    if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) {
                        // take AKA as title, but only if not EN
                        movieName = localizedName;
                    } else {
                        movieName = a.text();
                    }

                    // parse id
                    String href = a.attr("href");
                    matcher = imdbIdPattern.matcher(href);
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            movieId = matcher.group(1);
                        }
                    }

                    // try to parse out the year
                    Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
                    matcher = yearPattern.matcher(element.text());
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            year = matcher.group(1);
                            break;
                        }
                    }
                    break;
                }
            }
        }

        // if an id/name was found - parse the poster image
        String posterUrl = "";
        tds = tr.getElementsByClass("primary_photo");
        for (Element element : tds) {
            Elements imgs = element.getElementsByTag("img");
            for (Element img : imgs) {
                posterUrl = img.attr("src");
                posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
                posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
            }
        }

        // if no movie name/id was found - continue
        if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) {
            continue;
        }

        MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
        sr.setTitle(movieName);
        sr.setIMDBId(movieId);
        sr.setYear(year);
        sr.setPosterUrl(posterUrl);

        // populate extra args
        MetadataUtil.copySearchQueryToSearchResult(query, sr);

        if (movieId.equals(query.get(SearchParam.IMDBID))) {
            // perfect match
            sr.setScore(1);
        } else {
            // compare score based on names
            float score = MetadataUtil.calculateScore(searchTerm, movieName);
            if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) {
                LOGGER.debug("no poster - downgrading score by 0.01");
                score = score - 0.01f;
            }
            if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) {
                LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01");
                score = score - 0.01f;
            }
            sr.setScore(score);
        }

        result.add(sr);

        // only get 40 results
        if (result.size() >= 40) {
            break;
        }
    }
    Collections.sort(result);
    Collections.reverse(result);

    return result;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Scrapes the metadata of a single movie.
 * <p>
 * If the given options already carry a search result with metadata, that metadata is returned as is.
 * Otherwise the IMDb id is taken from the search result or (as fallback) from the options, and the
 * /reference, /plotsummary and /releaseinfo pages are fetched and parsed. Depending on the provider
 * config ("useTmdb", "scrapeCollectionInfo") the data is enriched with the result of a TMDB lookup; a
 * failing TMDB lookup is logged and otherwise ignored.
 *
 * @param options
 *          the scrape options (search result and/or IMDb id, language, country)
 * @return the scraped metadata; a metadata object without scraped data if no valid IMDb id is available
 * @throws Exception
 *           if fetching one of the IMDb pages fails
 */
MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    // check if there is a md in the result
    if (options.getResult() != null && options.getResult().getMediaMetadata() != null) {
        LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult());
        return options.getResult().getMediaMetadata();
    }

    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
        imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        imdbId = options.getImdbId();
    }

    if (!MetadataUtil.isValidImdbId(imdbId)) {
        return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId);
    md.setId(providerInfo.getId(), imdbId);

    // read the TMDB related settings once; they drive both the request and the merge below
    boolean useTmdb = ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb");
    boolean scrapeCollectionInfo = ImdbMetadataProvider.providerInfo.getConfig()
            .getValueAsBool("scrapeCollectionInfo");

    ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor);
    ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor);

    // worker for imdb request (/reference) (always from www.imdb.com)
    // StringBuilder sb = new StringBuilder(imdbSite.getSite());
    StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/reference");
    Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futureReference = compSvcImdb.submit(worker);

    // worker for imdb request (/plotsummary) (from chosen site)
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/plotsummary");

    worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futurePlotsummary = compSvcImdb.submit(worker);

    // worker for tmdb request
    Future<MediaMetadata> futureTmdb = null;
    if (useTmdb || scrapeCollectionInfo) {
        Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry());
        futureTmdb = compSvcTmdb.submit(worker2);
    }

    Document doc;
    doc = futureReference.get();
    parseReferencePage(doc, options, md);

    /*
     * plot from /plotsummary
     */
    doc = futurePlotsummary.get();
    parsePlotsummaryPage(doc, options, md);

    // title also from chosen site if we are not scraping akas.imdb.com
    if (imdbSite != ImdbSiteDefinition.IMDB_COM) {
        Element title = doc.getElementById("tn15title");
        if (title != null) {
            // title
            Elements elements = title.getElementsByClass("main");
            if (elements.size() > 0) {
                String movieTitle = cleanString(elements.first().ownText());
                md.setTitle(movieTitle);
            }
        }
    }

    // get the release info page
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/releaseinfo");
    worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futureReleaseinfo = compSvcImdb.submit(worker);
    doc = futureReleaseinfo.get();
    // parse original title here!!
    parseReleaseinfoPageAKAs(doc, options, md);

    // did we get a release date?
    if (md.getReleaseDate() == null
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) {
        parseReleaseinfoPage(doc, options, md);
    }

    // get data from tmdb? (the future only exists if one of the two settings is active)
    if (futureTmdb != null) {
        try {
            MediaMetadata tmdbMd = futureTmdb.get();
            // a missing TMDB result is skipped explicitly instead of running into a swallowed NullPointerException
            if (tmdbMd != null) {
                if (useTmdb) {
                    // tmdbid
                    md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB));
                    // title
                    if (StringUtils.isNotBlank(tmdbMd.getTitle())) {
                        md.setTitle(tmdbMd.getTitle());
                    }
                    // original title
                    if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) {
                        md.setOriginalTitle(tmdbMd.getOriginalTitle());
                    }
                    // tagline
                    if (StringUtils.isNotBlank(tmdbMd.getTagline())) {
                        md.setTagline(tmdbMd.getTagline());
                    }
                    // plot
                    if (StringUtils.isNotBlank(tmdbMd.getPlot())) {
                        md.setPlot(tmdbMd.getPlot());
                    }
                    // collection info
                    if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) {
                        md.setCollectionName(tmdbMd.getCollectionName());
                        md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET));
                    }
                }
                if (scrapeCollectionInfo) {
                    md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET));
                    md.setCollectionName(tmdbMd.getCollectionName());
                }
                md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId()));
            }
        } catch (InterruptedException e) {
            // keep the interrupt visible to the caller; the TMDB data is optional
            Thread.currentThread().interrupt();
            LOGGER.debug("IMDB: interrupted while waiting for TMDB data", e);
        } catch (Exception e) {
            // the TMDB data is optional - continue with what IMDb delivered
            LOGGER.debug("IMDB: could not get data from TMDB", e);
        }
    }

    // if we have still no original title, take the title
    if (StringUtils.isBlank(md.getOriginalTitle())) {
        md.setOriginalTitle(md.getTitle());
    }

    // populate id
    md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId);

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Parses the release date out of the release info page and stores it in the given metadata.
 * <p>
 * The release date of the country from the scrape options is preferred: first the rows of the old
 * layout (table with id "release_dates") are searched, then the items of the new layout (class
 * "release-date-item"). If no local date is found, the first date on the page is taken. The metadata is
 * left untouched if no date could be parsed.
 *
 * @param doc
 *          the parsed release info page
 * @param options
 *          the scrape options; the country is used to find the local release date
 * @param md
 *          the metadata to store the release date in
 * @return the given metadata
 */
private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})");
    Date releaseDate = null;

    // old way
    Element tableReleaseDates = doc.getElementById("release_dates");
    if (tableReleaseDates != null) {
        releaseDate = findLocalReleaseDate(tableReleaseDates.getElementsByTag("tr"), "release_date", pattern,
                options);
    }

    // new way; iterating over class name items
    if (releaseDate == null) {
        releaseDate = findLocalReleaseDate(doc.getElementsByClass("release-date-item"),
                "release-date-item__date", pattern, options);
    }

    // no matching local release date found; take the first one
    if (releaseDate == null) {
        Element column = doc.getElementsByClass("release_date").first();
        if (column == null) {
            column = doc.getElementsByClass("release-date-item__date").first();
        }
        if (column != null) {
            releaseDate = parseReleaseDateText(column.text());
        }
    }

    if (releaseDate != null) {
        md.setReleaseDate(releaseDate);
    }
    return md;
}

/**
 * Searches the given rows for the first parseable release date of the country from the scrape options.
 *
 * @param rows
 *          the rows/items to inspect
 * @param dateClass
 *          the class name of the element holding the date inside a row
 * @param regionPattern
 *          pattern extracting the two character region (group 1) out of the calendar link
 * @param options
 *          the scrape options providing the wanted country
 * @return the release date or {@code null} if no row of the wanted country carries a parseable date
 */
private Date findLocalReleaseDate(Elements rows, String dateClass, Pattern regionPattern,
        MediaScrapeOptions options) {
    for (Element row : rows) {
        // get the anchor
        Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first();
        if (anchor == null) {
            continue;
        }

        Matcher matcher = regionPattern.matcher(anchor.attr("href"));
        if (!matcher.find()) {
            // calendar link without region; calling group(1) here would throw an IllegalStateException
            continue;
        }

        String region = matcher.group(1);
        String ourCountry = options.getCountry().getAlpha2();
        if (!ourCountry.equalsIgnoreCase(region)) {
            LOGGER.trace("country {} does not match ours {}", region, ourCountry);
            continue;
        }

        Element column = row.getElementsByClass(dateClass).first();
        if (column == null) {
            continue;
        }

        // an unparseable date does not end the search - a later row of the same country may still match
        Date parsed = parseReleaseDateText(column.text());
        if (parsed != null) {
            return parsed;
        }
    }
    return null;
}

/**
 * Parses a release date in the form "d MMMM yyyy" or (as fallback) "MMMM yyyy", using US month names.
 *
 * @param text
 *          the text to parse
 * @return the parsed date or {@code null} if the text matches neither format
 */
private static Date parseReleaseDateText(String text) {
    // a fresh SimpleDateFormat per call, because SimpleDateFormat is not thread-safe
    for (String format : new String[] { "d MMMM yyyy", "MMMM yyyy" }) {
        try {
            return new SimpleDateFormat(format, Locale.US).parse(text);
        } catch (ParseException ignored) {
            // not in this format - try the next one
        }
    }
    return null;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    /*//from w ww. ja  v a 2 s.com
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // title
    Element title = doc.getElementsByAttributeValue("name", "title").first();
    if (title != null) {
        String movieTitle = cleanString(title.attr("content"));
        int yearStart = movieTitle.lastIndexOf("(");
        if (yearStart > 0) {
            movieTitle = movieTitle.substring(0, yearStart - 1).trim();
            md.setTitle(movieTitle);
        }
    }

    // original title and year
    Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first();
    if (originalTitleYear != null) {
        String content = originalTitleYear.attr("content");
        int startOfYear = content.lastIndexOf("(");
        if (startOfYear > 0) {
            // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page...
            // String originalTitle = content.substring(0, startOfYear - 1).trim();
            // md.setOriginalTitle(originalTitle);

            String yearText = content.substring(startOfYear);

            // search year
            Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}");
            Matcher matcher = yearPattern.matcher(yearText);
            while (matcher.find()) {
                if (matcher.group(0) != null) {
                    String movieYear = matcher.group(0);
                    try {
                        md.setYear(Integer.parseInt(movieYear));
                        break;
                    } catch (Exception ignored) {
                    }
                }
            }
        }
    }

    // poster
    Element poster = doc.getElementsByAttributeValue("property", "og:image").first();
    if (poster != null) {
        String posterUrl = poster.attr("content");

        int fileStart = posterUrl.lastIndexOf("/");
        if (fileStart > 0) {
            int parameterStart = posterUrl.indexOf("_", fileStart);
            if (parameterStart > 0) {
                int startOfExtension = posterUrl.lastIndexOf(".");
                if (startOfExtension > parameterStart) {
                    posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension);

                }
            }
        }
        processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first();
    if (ratingElement != null) {
        String ratingAsString = ratingElement.ownText().replace(",", ".");
        try {
            md.setRating(Float.valueOf(ratingAsString));
        } catch (Exception ignored) {
        }

        Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first();
        if (votesElement != null) {
            String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim();
            try {
                md.setVoteCount(Integer.parseInt(countAsString));
            } catch (Exception ignored) {
            }
        }
    }
    // top250
    Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first();
    if (topRatedElement != null) {
        Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})");
        Matcher matcher = topPattern.matcher(topRatedElement.ownText());
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                try {
                    String top250Text = matcher.group(1);
                    md.setTop250(Integer.parseInt(top250Text));
                } catch (Exception ignored) {
                }
            }
        }
    }

    // releasedate
    Element releaseDateElement = doc
            .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo")
            .first();
    if (releaseDateElement != null) {
        String releaseDateText = releaseDateElement.ownText();
        int startOfCountry = releaseDateText.indexOf("(");
        if (startOfCountry > 0) {
            releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim();
        }
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US);
            Date parsedDate = sdf.parse(releaseDateText);
            md.setReleaseDate(parsedDate);
        } catch (ParseException otherformat) {
            try {
                SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US);
                Date parsedDate = sdf.parse(releaseDateText);
                md.setReleaseDate(parsedDate);
            } catch (ParseException ignored) {
            }
        }
    }

    Elements elements = doc.getElementsByClass("ipl-zebra-list__label");
    for (Element element : elements) {
        // only parse tds
        if (!"td".equals(element.tag().getName())) {
            continue;
        }

        String elementText = element.ownText();

        if (elementText.equals("Taglines")) {
            if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                Element taglineElement = element.nextElementSibling();
                if (taglineElement != null) {
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.setTagline(tagline);
                }
            }
        }

        if (elementText.equals("Genres")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/");

                for (Element genreElement : genreElements) {
                    String genreText = genreElement.ownText();
                    md.addGenre(getTmmGenre(genreText));
                }
            }
        }

        /*
         * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition)
         * | 178 min (extended cut)</div></div>
         */
        if (elementText.equals("Runtime")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first();
                if (runtimeElement != null) {
                    String first = runtimeElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.setRuntime(runtime);
                }
            }
        }

        if (elementText.equals("Country")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/");
                Pattern pattern = Pattern.compile("/country/(.*)");

                for (Element countryElement : countryElements) {
                    Matcher matcher = pattern.matcher(countryElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addCountry(LanguageUtils.getLocalizedCountryForLanguage(
                                    options.getLanguage().getLanguage(), countryElement.text(),
                                    matcher.group(1)));
                        } else {
                            md.addCountry(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Language")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements languageElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/language/");
                Pattern pattern = Pattern.compile("/language/(.*)");

                for (Element languageElement : languageElements) {
                    Matcher matcher = pattern.matcher(languageElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString(
                                    options.getLanguage(), languageElement.text(), matcher.group(1)));
                        } else {
                            md.addSpokenLanguage(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Certification")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                String languageCode = options.getCountry().getAlpha2();
                Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/search/title?certificates=" + languageCode);
                boolean done = false;
                for (Element certificationElement : certificationElements) {
                    String certText = certificationElement.ownText();
                    int startOfCert = certText.indexOf(":");
                    if (startOfCert > 0 && certText.length() > startOfCert + 1) {
                        certText = certText.substring(startOfCert + 1);
                    }

                    Certification certification = Certification.getCertification(options.getCountry(),
                            certText);
                    if (certification != null) {
                        md.addCertification(certification);
                        done = true;
                        break;
                    }
                }

                if (!done && languageCode.equals("DE")) {
                    certificationElements = nextElement.getElementsByAttributeValueStarting("href",
                            "/search/title?certificates=XWG");
                    for (Element certificationElement : certificationElements) {
                        String certText = certificationElement.ownText();
                        int startOfCert = certText.indexOf(":");
                        if (startOfCert > 0 && certText.length() > startOfCert + 1) {
                            certText = certText.substring(startOfCert + 1);
                        }

                        Certification certification = Certification.getCertification(options.getCountry(),
                                certText);
                        if (certification != null) {
                            md.addCertification(certification);
                            break;
                        }
                    }
                }

            }
        }
    }

    // director
    Element directorsElement = doc.getElementById("directors");
    while (directorsElement != null && directorsElement.tag().getName() != "header") {
        directorsElement = directorsElement.parent();
    }
    if (directorsElement != null) {
        directorsElement = directorsElement.nextElementSibling();
    }
    if (directorsElement != null) {
        for (Element directorElement : directorsElement.getElementsByClass("name")) {
            String director = directorElement.text().trim();

            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
            cm.setName(director);
            md.addCastMember(cm);
        }
    }

    // actors
    Element castTableElement = doc.getElementsByClass("cast_list").first();
    if (castTableElement != null) {
        Elements tr = castTableElement.getElementsByTag("tr");
        for (Element row : tr) {
            MediaCastMember cm = parseCastMember(row);
            if (cm != null && StringUtils.isNotEmpty(cm.getName())
                    && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    // writers
    Element writersElement = doc.getElementById("writers");
    while (writersElement != null && writersElement.tag().getName() != "header") {
        writersElement = writersElement.parent();
    }
    if (writersElement != null) {
        writersElement = writersElement.nextElementSibling();
    }
    if (writersElement != null) {
        Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element writerElement : writersElements) {
            String writer = cleanString(writerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
            cm.setName(writer);
            md.addCastMember(cm);
        }
    }

    // producers
    Element producersElement = doc.getElementById("producers");
    while (producersElement != null && producersElement.tag().getName() != "header") {
        producersElement = producersElement.parent();
    }
    if (producersElement != null) {
        producersElement = producersElement.nextElementSibling();
    }
    if (producersElement != null) {
        Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element producerElement : producersElements) {
            String producer = cleanString(producerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER);
            cm.setName(producer);
            md.addCastMember(cm);
        }
    }

    // producers
    Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title");
    Element prodCompHeaderElement = null;

    for (Element possibleProdCompHeaderEl : prodCompHeaderElements) {
        if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) {
            prodCompHeaderElement = possibleProdCompHeaderEl;
            break;
        }
    }

    while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") {
        prodCompHeaderElement = prodCompHeaderElement.parent();
    }
    if (prodCompHeaderElement != null) {
        prodCompHeaderElement = prodCompHeaderElement.nextElementSibling();
    }
    if (prodCompHeaderElement != null) {
        Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href",
                "/company/");

        for (Element prodCompElement : prodCompElements) {
            String prodComp = prodCompElement.ownText();
            md.addProductionCompany(prodComp);
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Reads the plot from an IMDb plot summary page and stores it in the given metadata.
 *
 * @param doc
 *          the parsed plot summary page
 * @param options
 *          the scrape options (not read by this method)
 * @param md
 *          the metadata object which receives the plot
 * @return the metadata object that was passed in
 */
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    if (getImdbSite() != ImdbSiteDefinition.IMDB_COM) {
        // every site except imdb.com: the plot is the own text of the "swiki.2.1" element
        Element wikiElement = doc.getElementById("swiki.2.1");
        if (wikiElement != null) {
            md.setPlot(cleanString(wikiElement.ownText()));
        }
        return md;
    }

    // imdb.com has another site structure.
    // The synopsis block ("plot-synopsis-content") is deliberately NOT used: it contains spoilers.
    // Instead the first summary is taken, which looks like:
    // <li class="ipl-zebra-list__item" id="summary-ps21700000">
    // <p>text text text text </p>
    // <div class="author-container">
    // <em>&mdash;<a href="/search/title?plot_author=author">Author Name</a></em>
    // </div>
    // </li>
    Element summaries = doc.getElementById("plot-summaries-content");
    if (summaries == null) {
        return md;
    }

    Elements summaryItems = summaries.getElementsByClass("ipl-zebra-list__item");
    if (summaryItems.isEmpty()) {
        return md;
    }
    Element firstSummary = summaryItems.get(0);

    // strip the author line so that it does not end up in the plot text
    Elements authorContainers = firstSummary.getElementsByClass("author-container");
    if (!authorContainers.isEmpty()) {
        authorContainers.get(0).remove();
    }

    // the placeholder item "no-summary-content" carries no real plot
    boolean isPlaceholder = "no-summary-content".equals(firstSummary.id());
    if (!isPlaceholder) {
        md.setPlot(cleanString(firstSummary.text()));
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java

/**
 * get the episode metadata./*from  w  w  w.ja v a 2 s  . c  o m*/
 * 
 * @param options
 *          the scrape options
 * @return the MediaMetaData
 * @throws Exception
 */
MediaMetadata getEpisodeMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    String imdbId = options.getImdbId();
    if (StringUtils.isBlank(imdbId)) {
        return md;
    }

    // get episode number and season number
    int seasonNr = -1;
    int episodeNr = -1;

    try {
        seasonNr = Integer.parseInt(options.getId(MediaMetadata.SEASON_NR));
        episodeNr = Integer.parseInt(options.getId(MediaMetadata.EPISODE_NR));
    } catch (Exception e) {
        LOGGER.warn("error parsing season/episode number");
    }

    if (seasonNr == -1 || episodeNr == -1) {
        return md;
    }

    // first get the base episode metadata which can be gathered via
    // getEpisodeList()
    List<MediaEpisode> episodes = getEpisodeList(options);

    MediaEpisode wantedEpisode = null;
    for (MediaEpisode episode : episodes) {
        if (episode.season == seasonNr && episode.episode == episodeNr) {
            wantedEpisode = episode;
            break;
        }
    }

    // we did not find the episode; return
    if (wantedEpisode == null) {
        return md;
    }

    md.setId(providerInfo.getId(), wantedEpisode.ids.get(providerInfo.getId()));
    md.setEpisodeNumber(wantedEpisode.episode);
    md.setSeasonNumber(wantedEpisode.season);
    md.setTitle(wantedEpisode.title);
    md.setPlot(wantedEpisode.plot);
    md.setRating(wantedEpisode.rating);
    md.setVoteCount(wantedEpisode.voteCount);

    try {
        SimpleDateFormat sdf = new SimpleDateFormat("d MMM. yyyy", Locale.US);
        md.setReleaseDate(sdf.parse(wantedEpisode.firstAired));
    } catch (ParseException e) {
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy", Locale.US); // without "dot" - "May" for example
            md.setReleaseDate(sdf.parse(wantedEpisode.firstAired));
        } catch (ParseException ign) {
            LOGGER.warn("Could not parse date format: {}", wantedEpisode.firstAired);
        }
    }

    // and finally the cast which needed to be fetched from the fullcredits page
    if (wantedEpisode.ids.get(providerInfo.getId()) instanceof String
            && StringUtils.isNotBlank((String) wantedEpisode.ids.get(providerInfo.getId()))) {
        Url url = new Url(
                imdbSite.getSite() + "/title/" + wantedEpisode.ids.get(providerInfo.getId()) + "/fullcredits");
        url.addHeader("Accept-Language", "en"); // force EN for parsing by HTMl texts
        Document doc = Jsoup.parse(url.getInputStream(), imdbSite.getCharset().displayName(), "");

        // director & writer
        Element fullcredits = doc.getElementById("fullcredits_content");
        if (fullcredits != null) {
            Elements tables = fullcredits.getElementsByTag("table");

            // first table are directors
            if (tables.get(0) != null) {
                for (Element director : tables.get(0).getElementsByClass("name")) {
                    MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
                    cm.setName(director.text());
                    md.addCastMember(cm);
                }
            }

            // second table are writers
            if (tables.get(1) != null) {
                for (Element writer : tables.get(1).getElementsByClass("name")) {
                    MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
                    cm.setName(writer.text());
                    md.addCastMember(cm);
                }
            }
        }

        // actors
        Element castTableElement = doc.getElementsByClass("cast_list").first();
        if (castTableElement != null) {
            Elements tr = castTableElement.getElementsByTag("tr");
            for (Element row : tr) {
                MediaCastMember cm = parseCastMember(row);
                if (cm != null && StringUtils.isNotEmpty(cm.getName())
                        && StringUtils.isNotEmpty(cm.getCharacter())) {
                    cm.setType(MediaCastMember.CastType.ACTOR);
                    md.addCastMember(cm);
                }
            }
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Scrapes the metadata of a movie from zelluloid.de.
 * <p>
 * Three pages are read: the main page (plot, poster, year, release date, original title, runtime,
 * genres, certification, rating), the details page (cast, crew, production company) and the links
 * page (IMDb id). A failure to load the details or links page is only logged; any other failure
 * clears the cached main page and is rethrown.
 *
 * @param options
 *          the scrape options; they must contain the search result of a prior search
 * @return the scraped metadata
 * @throws Exception
 *           if the options contain no search result or the scraping of the main page fails
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    // we can only work further if we got a search result on zelluloid.de
    if (options.getResult() == null) {
        throw new Exception("Scrape with Zelluloid.de without prior search is not supported");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    // preset values from searchresult (if we have them)
    // NOTE(review): ORIGINAL_TITLE is stored twice; the second (raw) value is the one which is kept.
    // Confirm which of the two is intended.
    md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
            Utils.removeSortableName(options.getResult().getOriginalTitle()));
    md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle()));
    md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear());
    md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle());

    String id;
    if (StringUtils.isEmpty(options.getResult().getId())) {
        // a trailing reluctant group "(.*?)" always captures the empty string,
        // so the numeric id has to be matched explicitly
        id = StrgUtils.substr(options.getResult().getUrl(), "id=(\\d+)");
    } else {
        id = options.getResult().getId();
    }

    String detailurl = options.getResult().getUrl();
    if (StringUtils.isEmpty(detailurl)) {
        detailurl = BASE_URL + "/filme/index.php3?id=" + id;
    }

    try {
        LOGGER.debug("get details page");
        Document doc = loadAndParsePage(detailurl);
        // the serialized page is used for several regular expression lookups below
        String pageHtml = doc.toString();

        // parse plot
        String plot = doc.getElementsByAttributeValue("class", "bigtext").text();
        md.storeMetadata(MediaMetadata.PLOT, plot);
        md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? plot.substring(0, 150) : plot);

        // parse poster
        Elements posters = doc.getElementsByAttributeValueStarting("src", "/images/poster");
        if (posters.size() == 1) {
            md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + posters.get(0).attr("src"));
        }

        // parse year
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) {
            Elements years = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (years.size() == 1) {
                md.storeMetadata(MediaMetadata.YEAR, years.get(0).text());
            }
        }

        // parse cinema release
        Elements releases = doc.getElementsByAttributeValueContaining("href", "?v=w");
        if (!releases.isEmpty()) {
            String releaseText = releases.get(0).text();
            try {
                Date released = new SimpleDateFormat("dd.MM.yyyy").parse(releaseText);
                md.storeMetadata(MediaMetadata.RELEASE_DATE, new SimpleDateFormat("yyyy-MM-dd").format(released));
            } catch (Exception e) {
                LOGGER.warn("cannot parse cinema release date: " + releaseText);
            }
        }

        // parse original title
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.substr(pageHtml, "Originaltitel: (.*?)\\<"));
        }
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }

        // parse runtime
        String rt = StrgUtils.substr(pageHtml, "ca.&nbsp;(.*?)&nbsp;min");
        if (!rt.isEmpty()) {
            try {
                md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt));
            } catch (Exception e2) {
                LOGGER.warn("cannot convert runtime: " + rt);
            }
        }

        // parse genres
        for (Element g : doc.getElementsByAttributeValueContaining("href", "az.php3?g=")) {
            String href = g.attr("href");
            String gid = href.substring(href.lastIndexOf('=') + 1);
            md.addGenre(getTmmGenre(gid));
        }

        // parse cert
        // FSK: ab 12, $230 Mio. Budget
        String fsk = StrgUtils.substr(pageHtml, "FSK: (.*?)[,<]");
        if (!fsk.isEmpty()) {
            md.addCertification(Certification.findCertification(fsk));
        }

        // parse rating
        Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable");
        if (ratings.size() == 2) { // get user rating
            // <div>87%</div>
            String r = ratings.get(1).getElementsByTag("div").text().replace("%", "");
            try {
                md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10
            } catch (Exception e2) {
                LOGGER.warn("cannot convert rating: " + r);
            }
        }

        // details page
        Document detailsDoc = null;
        String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id;
        try {
            detailsDoc = loadAndParsePage(detailsUrl);
        } catch (Exception e) {
            LOGGER.error("failed to get details: " + e.getMessage());

            // clear cache
            CachedUrl.removeCachedFileForUrl(detailsUrl);
        }

        if (detailsDoc != null) {
            Element detailsTable = detailsDoc.getElementById("ccdetails");
            // the table may be missing; getElementById returns null in that case
            if (detailsTable != null) {
                parseDetailsTable(detailsTable, md);
            }
        }

        // get links page
        Document linksDoc = null;
        String linksUrl = BASE_URL + "/filme/links.php3?id=" + id;
        try {
            linksDoc = loadAndParsePage(linksUrl);
        } catch (Exception e) {
            LOGGER.error("failed to get links page: " + e.getMessage());

            // clear cache
            CachedUrl.removeCachedFileForUrl(linksUrl);
        }

        if (linksDoc != null) {
            Elements imdbLinks = linksDoc.getElementsByAttributeValueContaining("href", "german.imdb.com");
            if (!imdbLinks.isEmpty()) {
                String href = imdbLinks.get(0).attr("href");
                String imdb = StrgUtils.substr(href, "(tt\\d{7})");
                if (imdb.isEmpty()) {
                    // old style link which only carries the number after the question mark
                    String number = StrgUtils.substr(href, "\\?(\\d+)");
                    if (!number.isEmpty()) {
                        imdb = "tt" + number;
                    }
                }
                // do not store an empty/incomplete id
                if (!imdb.isEmpty()) {
                    md.setId(MediaMetadata.IMDBID, imdb);
                }
            }
        }
    } catch (Exception e) {
        LOGGER.error("Error parsing " + options.getResult().getUrl(), e);

        // clear cache
        CachedUrl.removeCachedFileForUrl(detailurl);

        throw e;
    }

    return md;
}

/**
 * Loads the page behind the given address via a {@code CachedUrl} and parses it with the page encoding.
 * The input stream is closed in any case.
 */
private Document loadAndParsePage(String address) throws Exception {
    Url url = new CachedUrl(address);
    InputStream in = url.getInputStream();
    try {
        return Jsoup.parse(in, PAGE_ENCODING, "");
    } finally {
        if (in != null) {
            in.close();
        }
    }
}

/**
 * Walks over the rows of the "ccdetails" table and adds actors, crew members and the production
 * company to the metadata. Rows containing "dyngfx" are section headers which switch the section.
 */
private void parseDetailsTable(Element detailsTable, MediaMetadata md) {
    // current section: 1 = cast, 2 = crew, 3 = production, 4 = distributor, 5 = alternative titles
    int header = 0;
    String lastRole = "";

    for (Element tr : detailsTable.getElementsByTag("tr")) {
        String rowHtml = tr.toString();

        if (rowHtml.contains("dyngfx")) { // header gfx
            if (rowHtml.contains("Besetzung")) {
                header = 1;
            } else if (rowHtml.contains("Crew")) {
                header = 2;
            } else if (rowHtml.contains("Produktion")) {
                header = 3;
            } else if (rowHtml.contains("Verleih")) {
                header = 4;
            } else if (rowHtml.contains("Alternativtitel")) {
                header = 5;
            }
            continue;
        }

        // no header gfx, so data
        Elements cells = tr.getElementsByTag("td");
        if (header == 1) {
            // actors
            if (cells.size() == 2) {
                Elements link = cells.get(1).getElementsByTag("a");
                MediaCastMember mcm = new MediaCastMember();
                mcm.setCharacter(cells.get(0).text());
                mcm.setName(link.text());
                mcm.setId(StrgUtils.substr(link.attr("href"), "id=(\\d+)"));
                mcm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(mcm);
                // TODO: parse actor detail pages :/
            }
        } else if (header == 2) {
            // crew
            if (cells.size() == 2) {
                String crewrole = cells.get(0).html().trim();
                // a row with an empty role cell continues the role of the row above
                if (crewrole.equals("&nbsp;")) {
                    crewrole = lastRole;
                } else {
                    lastRole = crewrole;
                }

                Elements link = cells.get(1).getElementsByTag("a");
                MediaCastMember mcm = new MediaCastMember();
                mcm.setName(link.text());
                mcm.setPart(crewrole);
                // the type is derived from the effective role, so that continuation rows
                // (e.g. a second director) get the same type as the first row
                if (crewrole.equals("Regie")) {
                    mcm.setType(MediaCastMember.CastType.DIRECTOR);
                } else if (crewrole.equals("Drehbuch")) {
                    mcm.setType(MediaCastMember.CastType.WRITER);
                } else {
                    mcm.setType(MediaCastMember.CastType.OTHER);
                }
                mcm.setId(StrgUtils.substr(link.attr("href"), "id=(\\d+)"));
                md.addCastMember(mcm);
            }
        } else if (header == 3) {
            // production; rows without any cell are skipped
            if (!cells.isEmpty()) {
                md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, cells.get(0).text());
            }
        }
    }
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Processes the main translation table of a result page: every table row without a {@code th} cell is
 * handed to {@code processEntry}, afterwards the first synonym table (if any) is handed to
 * {@code extractBilingualSynonyms}. If the page contains no translation table for the language pair,
 * only a debug message is logged.
 *
 * @param queryString
 *            the query the page was requested with
 * @param document
 *            the parsed result page
 * @param resultBuilder
 *            the builder which collects the results
 * @param sourceLanguage
 *            the language to translate from
 * @param targetLanguage
 *            the language to translate to
 */
private void processTranslationTable(@NotNull String queryString, @NotNull Document document,
        @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage,
        @NotNull Language targetLanguage) {
    // The main table carries the id "dictionary-<source>-<target>" (lower case identifiers)
    String sourceId = sourceLanguage.getIdentifier().toLowerCase();
    String targetId = targetLanguage.getIdentifier().toLowerCase();
    String languageIdentifier = sourceId + "-" + targetId;

    Element translationTable = document.getElementById("dictionary-" + languageIdentifier);

    if (translationTable == null) {
        // NOTE(review): the first placeholder receives the combined "<source>-<target>" identifier,
        // not the source language alone - confirm whether this is intended
        LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", languageIdentifier,
                targetLanguage.getIdentifier(), queryString);
        return;
    }

    // Rows containing a header cell are no entries and therefore skipped
    for (Element row : translationTable.getElementsByTag("tr")) {
        if (row.getElementsByTag("th").isEmpty()) {
            processEntry(queryString, row, resultBuilder, sourceLanguage, targetLanguage);
        }
    }

    // Extract synonyms from the first synonym table
    Elements synonymTableCandidates = document.getElementsByClass("dictionary-synonyms-table");
    if (!synonymTableCandidates.isEmpty()) {
        Element synonymTable = synonymTableCandidates.get(0);
        extractBilingualSynonyms(queryString, synonymTable, resultBuilder, sourceLanguage);
    }
}

From source file:poe.trade.assist.SearchForm.java

/**
 * Rewrites the given search page: the child elements of {@code <head>} are replaced by a fixed set of
 * meta/title/script/link elements pointing into the local html directory, the search form is made
 * visible, the form action is set to the poe.trade search address and the first search result block
 * is removed.
 *
 * @param html
 *            the HTML of the search page
 * @return the HTML of the rewritten document
 */
private String removeAllExceptSearchForm(String html) {
    String htmlDirectory = htmlDirectory();
    Document doc = Jsoup.parse(html);

    // Replace every element in the <head>
    Element head = doc.head();
    for (Element headChild : head.children()) {
        headChild.remove();
    }
    head.appendElement("meta").attr("charset", "utf-8");
    head.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width");
    head.appendElement("title").text("poe.trade.assist");
    head.appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "packed.js");
    head.appendElement("link").attr("rel", "stylesheet").attr("href", htmlDirectory + "packed_dark.css");

    // Show search form by clearing its inline style
    Element searchForm = doc.getElementById("search-form");
    if (searchForm != null) {
        searchForm.attr("style", "");
    }

    // Let the form post to the absolute search address
    Element search = doc.getElementById("search");
    if (search != null) {
        search.attr("action", "http://poe.trade/search");
    }

    // Remove search results (only the first block). Everything else in the body is left in place;
    // an earlier, more aggressive clean-up of the body is no longer done here.
    Elements searchResultBlocks = doc.getElementsByClass("search-results-block");
    if (!searchResultBlocks.isEmpty()) {
        searchResultBlocks.get(0).remove();
    }

    return doc.toString();
}

From source file:wo.trade.SearchPageScraper.java

/**
 * Parses {@code page} with jsoup and builds one {@link TradeItem} per element of class {@code item}.
 *
 * <p>Items are looked up under the element with id {@code content} when it exists, otherwise in the
 * whole document. Optional child elements that are missing yield an empty string (or leave the field
 * untouched) instead of throwing, so a single incomplete item does not abort the whole page.
 *
 * @return the parsed items, in the order jsoup returns them; empty when no {@code item} element exists
 */
public List<TradeItem> parse() {
    List<TradeItem> tradeItems = new LinkedList<>();
    // NOTE(review): if `page` is a String, jsoup treats the second argument as the base URI,
    // not as a charset; only the File overload takes a charset name - confirm the type of `page`.
    Document doc = Jsoup.parse(page, "UTF-8");

    // Prefer the items inside id="content"; fall back to the whole document when it is absent.
    Element content = doc.getElementById("content");
    Elements items = content == null ? doc.getElementsByClass("item") : content.getElementsByClass("item");

    for (Element element : items) {
        TradeItem item = new TradeItem();

        item.id = StringUtils.remove(element.attr("id"), "item-container-");
        item.seller = element.attr("data-seller");
        item.thread = element.attr("data-thread");
        item.sellerid = element.attr("data-sellerid");
        item.buyout = element.attr("data-buyout");
        item.ign = element.attr("data-ign");
        item.league = element.attr("data-league");
        item.name = element.attr("data-name");
        item.corrupted = !element.getElementsByClass("corrupted").isEmpty();
        item.identified = element.getElementsByClass("item-unid").isEmpty();

        item.socketsRaw = element.getElementsByClass("sockets-raw").stream().findFirst()
                .map(n -> n.text()).orElse("");

        // Only set when the seller info is present; otherwise the TradeItem default is kept.
        Elements accountAge = element.getElementsByAttributeValue("title", "account age and highest level");
        if (!accountAge.isEmpty()) {
            item.ageAndHighLvl = accountAge.get(0).text();
        }

        // ----- Requirements ----- //
        parseRequirements(element, item);

        // Keep only the digits that follow "Item quantity:".
        item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream().findFirst()
                .map(n -> n.text()).map(s -> substringAfter(s, "Item quantity:"))
                .map(s -> StringUtils.removePattern(s, "[^\\d]")).orElse("")
                .replaceAll(regex_horizontal_whitespace, "").trim();

        // ----- Rarity by checking the item name link class ----- //
        // itemframe0 - normal
        // itemframe1 - magic
        // itemframe2 - rare
        // itemframe3 - unique
        // itemframe4 - gems
        // itemframe5 - currency
        // itemframe6 - divination card
        String itemframeStr = element.getElementsByClass("title").stream().findFirst().map(n -> n.attr("class"))
                .orElse(null);
        itemframeStr = Util.regexMatch("itemframe(\\d)", itemframeStr, 1);
        if (itemframeStr != null) {
            item.rarity = Rarity.valueOf(Integer.parseInt(itemframeStr));
        } else {
            item.rarity = Rarity.unknown;
        }

        // ----- Verify ----- //
        item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream()
                .findFirst().map(n -> n.attr("data-hash")).orElse("").trim();

        // ----- Mods ----- //
        parseMods(element, item);

        // ----- Properties ----- //
        // this is the third column data (the first col is the image, second is the mods, reqs)
        item.quality = dataNameText(element, "q");
        item.physDmgRangeAtMaxQuality = dataNameText(element, "quality_pd");
        item.eleDmgRange = dataNameText(element, "ed");
        item.attackSpeed = dataNameText(element, "aps");
        item.dmgAtMaxQuality = dataNameText(element, "quality_dps");
        item.physDmgAtMaxQuality = dataNameText(element, "quality_pdps");
        item.eleDmg = dataNameText(element, "edps");
        item.armourAtMaxQuality = dataNameText(element, "quality_armour");
        item.evasionAtMaxQuality = dataNameText(element, "quality_evasion");
        item.energyShieldAtMaxQuality = dataNameText(element, "quality_shield");
        item.block = dataNameText(element, "block");
        item.crit = dataNameText(element, "crit");
        item.level = dataNameText(element, "level");

        item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").stream().findFirst()
                .map(n -> n.attr("src")).orElse("");
        // The stack size travels as a "stackSize=<n>" token of the icon URL; null when absent.
        item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream()
                .filter(t -> t.startsWith("stackSize=")).findFirst().map(s -> substringAfter(s, "="))
                .orElse(null);

        // The item counts as online when any element inside it has text matching the regex "online".
        Elements onlineSpans = element.getElementsMatchingText("online");
        item.online = onlineSpans.isEmpty() ? "" : "Online";

        tradeItems.add(item);
    }

    return tradeItems;
}

/**
 * Returns the text of the first descendant of {@code element} whose {@code data-name} attribute
 * equals {@code dataName}, with {@code regex_horizontal_whitespace} matches removed and trimmed.
 *
 * @return the cleaned text, or an empty string when no such descendant exists
 */
private String dataNameText(Element element, String dataName) {
    return element.getElementsByAttributeValue("data-name", dataName).stream().findFirst()
            .map(n -> n.text().replaceAll(regex_horizontal_whitespace, "").trim()).orElse("");
}

/**
 * Copies the level/attribute requirements found in the first {@code requirements} element of
 * {@code element} into {@code item}; does nothing when the element is missing.
 */
private void parseRequirements(Element element, TradeItem item) {
    Elements reqElems = element.getElementsByClass("requirements");
    if (reqElems.isEmpty()) {
        return;
    }
    List<TextNode> reqNodes = reqElems.get(0).textNodes();
    for (TextNode reqNode : reqNodes) {
        // sample [ Level:&nbsp;37 ,  Strength:&nbsp;42 ,  Intelligence:&nbsp;42 ]
        String req = StringUtils.trimToEmpty(reqNode.getWholeText());
        req = req.replaceAll(regex_horizontal_whitespace, "");
        // presumably strips the remaining non-breaking spaces - verify against Util
        req = Util.removeThoseDamnWhiteSpace(req);
        String separator = ":";
        String reqType = trim(substringBefore(req, separator));
        String reqValue = trim(substringAfter(req, separator));
        // Text nodes carrying any other label are ignored.
        switch (reqType) {
        case "Level":
            item.reqLvl = reqValue;
            break;
        case "Strength":
            item.reqStr = reqValue;
            break;
        case "Intelligence":
            item.reqInt = reqValue;
            break;
        case "Dexterity":
            item.reqDex = reqValue;
            break;
        }
    }
}

/**
 * Reads the implicit and explicit mods from the first {@code bullet-item} inside the first
 * {@code item-mods} element of {@code element}. With exactly two {@code ul} lists the first one
 * holds the implicit mod (its last {@code li}); the last list always holds the explicit mods.
 * Does nothing when any of these containers is missing.
 */
private void parseMods(Element element, TradeItem item) {
    Elements itemModsElements = element.getElementsByClass("item-mods");
    if (itemModsElements.isEmpty()) {
        return;
    }
    Elements bulletItems = itemModsElements.get(0).getElementsByClass("bullet-item");
    if (bulletItems.isEmpty()) {
        return;
    }
    Elements ulMods = bulletItems.get(0).getElementsByTag("ul");
    if (ulMods.isEmpty()) {
        // Without this guard the explicit-mods lookup below would index -1.
        return;
    }
    if (ulMods.size() == 2) {
        // implicit mod; last() yields null for an empty list, so guard before dereferencing
        Element implicitLi = ulMods.get(0).getElementsByTag("li").last();
        if (implicitLi != null) {
            item.implicitMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value"));
        }
    }
    // explicit mods
    for (Element modLi : ulMods.get(ulMods.size() - 1).getElementsByTag("li")) {
        item.explicitMods.add(new Mod(modLi.attr("data-name"), modLi.attr("data-value")));
    }
}