Example usage for org.jsoup.nodes Document getElementsByAttributeValueStarting

List of usage examples for org.jsoup.nodes Document getElementsByAttributeValueStarting

Introduction

On this page you can find usage examples for org.jsoup.nodes Document getElementsByAttributeValueStarting.

Prototype

public Elements getElementsByAttributeValueStarting(String key, String valuePrefix) 

Source Link

Document

Finds elements that have an attribute with the given key whose value starts with the given prefix.

Usage

From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java

/**
 * Extracts the alternative card name from the product page.
 *
 * The name text is read from the {@code font} element inside the second
 * {@code td} of the sixth child node of the grandparent of the first element
 * whose {@code src} starts with {@code ./docs/illustrations/}. Everything up
 * to (but excluding) the character just before the first {@code "("} is
 * returned as the single entry of the result array.
 *
 * @param doc the parsed product page
 * @return a one-element array holding the text that precedes the parenthesis
 */
@Override
protected String[] getOtherNames(Document doc) {
    Element illustration = doc.getElementsByAttributeValueStarting("src", "./docs/illustrations/").get(0);
    Element detailNode = (Element) illustration.parent().parent().childNodes().get(5);
    String names = detailNode.select("td").get(1).select("font").text();

    // stop one character before the opening parenthesis
    int cut = names.indexOf("(") - 1;
    String[] otherNames = new String[1];
    otherNames[0] = names.substring(0, cut);
    return otherNames;
}

From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java

/**
 * Reads the card price from the product page.
 *
 * The primary location is the first {@code b} inside the first {@code font}
 * of the second {@code tbody} found below the sixth child node of the
 * grandparent of the illustration image. When that text is empty, the first
 * {@code b} of the third {@code form} on the page is used instead. The raw
 * text is passed through {@code cleanPriceString} before being parsed.
 *
 * @param doc the parsed product page
 * @return the price parsed as a float
 */
@Override
protected float getPrice(Document doc) {
    Element illustration = doc.getElementsByAttributeValueStarting("src", "./docs/illustrations/").get(0);
    Element detailNode = (Element) illustration.parent().parent().childNodes().get(5);

    String price = detailNode.select("tbody").get(1).select("font").get(0).select("b").text();
    boolean missing = price == null || "".equals(price);
    if (missing) {
        // fall back to the price shown in the third form of the page
        price = doc.select("form").get(2).select("b").get(0).text();
    }

    return Float.parseFloat(cleanPriceString(price));
}

From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java

/**
 * Extracts the card name that is written in parentheses on the product page.
 *
 * The name text is read from the {@code font} element inside the second
 * {@code td} of the sixth child node of the grandparent of the first element
 * whose {@code src} starts with {@code ./docs/illustrations/}.
 *
 * @param doc the parsed product page
 * @return the text between the first {@code "("} and the first {@code ")"},
 *         with one extra character dropped on each side
 * @throws UnsupportedCardException if the name text contains {@code "Foil"}
 */
@Override
protected String getName(Document doc) throws UnsupportedCardException {
    String names = ((Element) doc.getElementsByAttributeValueStarting("src", "./docs/illustrations/").get(0)
            .parent().parent().childNodes().get(5)).select("td").get(1).select("font").text();

    // Reject unsupported cards before slicing the string, so that a foil card
    // with an unexpected name layout is reported as unsupported instead of
    // failing with a StringIndexOutOfBoundsException.
    if (names.contains("Foil")) {
        throw new UnsupportedCardException(names);
    }

    // The +2 / -1 offsets skip one character next to each parenthesis
    // (presumably a padding space - confirm against the page markup).
    return names.substring(names.indexOf("(") + 2, names.indexOf(")") - 1);
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Scrapes the metadata of one movie from zelluloid.de.
 *
 * Three pages are read: the main movie page (plot, poster, year, release
 * date, original title, runtime, genres, certification, rating), the details
 * page (cast, crew, production company) and the links page (IMDb id). A
 * failure on the details or links page is logged and that page is skipped; a
 * failure on the main page removes the cached page and is rethrown.
 *
 * @param options the scrape options; must carry a search result
 * @return the collected metadata
 * @throws Exception if no search result is present in {@code options} or the
 *                   main page cannot be fetched or parsed
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    // we can only work further if we got a search result on zelluloid.de
    if (options.getResult() == null) {
        throw new Exception("Scrape with Zelluloid.de without prior search is not supported");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    // generic Elements used all over
    Elements el = null;
    // preset values from searchresult (if we have them)
    md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
            Utils.removeSortableName(options.getResult().getOriginalTitle()));
    md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle()));
    md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear());
    // NOTE(review): ORIGINAL_TITLE is stored a second time here with the raw
    // value, after the removeSortableName variant above - confirm which one
    // is intended; both calls are kept to preserve the current result.
    md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle());

    String id = options.getResult().getId();
    if (StringUtils.isEmpty(id)) {
        // A trailing reluctant group ("id=(.*?)") can only capture an empty
        // string; use the same digit pattern as the other id extractions.
        id = StrgUtils.substr(options.getResult().getUrl(), "id=(\\d+)");
    }

    String detailurl = options.getResult().getUrl();
    if (StringUtils.isEmpty(detailurl)) {
        detailurl = BASE_URL + "/filme/index.php3?id=" + id;
    }

    Url url;
    try {
        LOGGER.debug("get details page");
        url = new CachedUrl(detailurl);
        InputStream in = url.getInputStream();
        Document doc;
        try {
            doc = Jsoup.parse(in, PAGE_ENCODING, "");
        } finally {
            // close the stream even when parsing fails
            in.close();
        }

        // parse plot
        String plot = doc.getElementsByAttributeValue("class", "bigtext").text();
        md.storeMetadata(MediaMetadata.PLOT, plot);
        // the tagline is the first 150 characters of the plot
        md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? plot.substring(0, 150) : plot);

        // parse poster
        el = doc.getElementsByAttributeValueStarting("src", "/images/poster");
        if (el.size() == 1) {
            md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + el.get(0).attr("src"));
        }

        // parse year
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) {
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (el.size() == 1) {
                md.storeMetadata(MediaMetadata.YEAR, el.get(0).text());
            }
        }

        // parse cinema release
        el = doc.getElementsByAttributeValueContaining("href", "?v=w");
        if (el.size() > 0) {
            try {
                // convert dd.MM.yyyy from the page into yyyy-MM-dd
                SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy");
                Date d = sdf.parse(el.get(0).text());
                sdf = new SimpleDateFormat("yyyy-MM-dd");
                md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(d));
            } catch (Exception e) {
                LOGGER.warn("cannot parse cinema release date: " + el.get(0).text());
            }
        }

        // parse original title
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
                    StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<"));
        }
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            // still nothing found: fall back to the title
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }

        // parse runtime
        String rt = (StrgUtils.substr(doc.toString(), "ca.&nbsp;(.*?)&nbsp;min"));
        if (!rt.isEmpty()) {
            try {
                md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt));
            } catch (Exception e2) {
                LOGGER.warn("cannot convert runtime: " + rt);
            }
        }

        // parse genres
        el = doc.getElementsByAttributeValueContaining("href", "az.php3?g=");
        for (Element g : el) {
            // the genre id is whatever follows the last '=' of the link
            String href = g.attr("href");
            String gid = href.substring(href.lastIndexOf('=') + 1);
            md.addGenre(getTmmGenre(gid));
        }

        // parse cert
        // FSK: ab 12, $230 Mio. Budget
        String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]");
        if (!fsk.isEmpty()) {
            md.addCertification(Certification.findCertification(fsk));
        }

        // parse rating
        Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable");
        if (ratings.size() == 2) { // get user rating
            Element e = ratings.get(1);
            // <div>87%</div>
            String r = e.getElementsByTag("div").text().replace("%", "");
            try {
                md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10
            } catch (Exception e2) {
                LOGGER.warn("cannot convert rating: " + r);
            }
        }

        // details page
        doc = null;
        String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id;
        try {
            url = new CachedUrl(detailsUrl);
            in = url.getInputStream();
            try {
                doc = Jsoup.parse(in, PAGE_ENCODING, "");
            } finally {
                in.close();
            }
        } catch (Exception e) {
            LOGGER.error("failed to get details: " + e.getMessage());

            // clear cache
            CachedUrl.removeCachedFileForUrl(detailsUrl);
        }

        // getElementById returns null when the table is missing; skip the
        // cast/crew section then instead of failing the whole scrape
        Element tab = doc == null ? null : doc.getElementById("ccdetails");
        if (tab != null) {
            int header = 0;
            String lastRole = "";
            for (Element tr : tab.getElementsByTag("tr")) {
                String rowHtml = tr.toString();
                if (rowHtml.contains("dyngfx")) { // header gfx
                    if (rowHtml.contains("Besetzung")) {
                        header = 1;
                    } else if (rowHtml.contains("Crew")) {
                        header = 2;
                    } else if (rowHtml.contains("Produktion")) {
                        header = 3;
                    } else if (rowHtml.contains("Verleih")) {
                        header = 4;
                    } else if (rowHtml.contains("Alternativtitel")) {
                        header = 5;
                    }
                    continue;
                }

                // no header gfx, so data
                MediaCastMember mcm = new MediaCastMember();
                el = tr.getElementsByTag("td");
                if (header == 1) {
                    // actors
                    if (el.size() == 2) {
                        mcm.setCharacter(el.get(0).text());
                        mcm.setName(el.get(1).getElementsByTag("a").text());
                        mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"),
                                "id=(\\d+)"));
                        mcm.setType(MediaCastMember.CastType.ACTOR);
                        md.addCastMember(mcm);
                        // TODO: parse actor detail pages :/
                    }
                } else if (header == 2) {
                    // crew
                    if (el.size() == 2) {
                        String crewrole = el.get(0).html().trim();
                        mcm.setName(el.get(1).getElementsByTag("a").text());
                        if (crewrole.equals("&nbsp;")) {
                            // blank role cell: this row continues the role of
                            // the row above, so it inherits part AND type
                            crewrole = lastRole;
                        } else {
                            lastRole = crewrole;
                        }
                        mcm.setPart(crewrole);
                        if (crewrole.equals("Regie")) {
                            mcm.setType(MediaCastMember.CastType.DIRECTOR);
                        } else if (crewrole.equals("Drehbuch")) {
                            mcm.setType(MediaCastMember.CastType.WRITER);
                        } else {
                            mcm.setType(MediaCastMember.CastType.OTHER);
                        }
                        mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"),
                                "id=(\\d+)"));
                        md.addCastMember(mcm);
                    }
                } else if (header == 3) {
                    // production; rows without any cell carry no company name
                    if (!el.isEmpty()) {
                        md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, el.get(0).text());
                    }
                }
            }
        }

        // get links page
        doc = null;
        String linksUrl = BASE_URL + "/filme/links.php3?id=" + id;
        try {
            url = new CachedUrl(linksUrl);
            in = url.getInputStream();
            try {
                doc = Jsoup.parse(in, PAGE_ENCODING, "");
            } finally {
                in.close();
            }
        } catch (Exception e) {
            LOGGER.error("failed to get links page: " + e.getMessage());

            // clear cache
            CachedUrl.removeCachedFileForUrl(linksUrl);
        }

        if (doc != null) {
            el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com");
            if (el != null && el.size() > 0) {
                String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})");
                if (imdb.isEmpty()) {
                    // link without a tt-id: build one from the numeric query string
                    imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)");
                }
                md.setId(MediaMetadata.IMDBID, imdb);
            }
        }
    } catch (Exception e) {
        // keep the cause in the log; the message alone hides where it failed
        LOGGER.error("Error parsing " + options.getResult().getUrl(), e);

        // clear cache
        CachedUrl.removeCachedFileForUrl(detailurl);

        throw e;
    }

    return md;
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Searches zelluloid.de for movies by title.
 *
 * The QUERY search parameter is preferred over TITLE; when neither is set an
 * empty list is returned. If the site answers with a movie page instead of a
 * result list, a single result built from that page is returned. Otherwise
 * every {@code hit.php} link becomes a result; links that share a movie id
 * are merged into one result. A failure to fetch or parse the search page is
 * logged and yields an empty list.
 *
 * @param options the search options
 * @return the results after {@code Collections.sort} followed by
 *         {@code Collections.reverse}; possibly empty, never null
 * @throws Exception declared by the overridden method
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    String searchUrl = "";
    String searchTerm = "";
    // never populated in this method: only a title search is performed
    String imdb = "";

    // only title search
    if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search for everything: " + searchTerm);
    } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search with title: " + searchTerm);
    } else {
        LOGGER.debug("empty searchString");
        return resultList;
    }

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    Document doc = null;
    try {
        Url url = new CachedUrl(searchUrl);
        InputStream in = url.getInputStream();
        try {
            doc = Jsoup.parse(in, PAGE_ENCODING, "");
        } finally {
            // close the stream even when parsing fails
            in.close();
        }
    } catch (Exception e) {
        LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage());

        // clear cache
        CachedUrl.removeCachedFileForUrl(searchUrl);
    }

    if (doc == null) {
        return resultList;
    }

    // only look for movie links
    Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php");
    LOGGER.debug("found " + filme.size() + " search results");
    if (filme.isEmpty()) {
        if (!doc.getElementsByTag("title").text().contains("Suche nach")) {
            // redirected to detail page
            MediaSearchResult msr = new MediaSearchResult(providerInfo.getId());
            Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id=");
            if (el.size() > 0) {
                msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)"));
            }
            msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim());
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (el.size() == 1) {
                msr.setYear(el.get(0).text());
            }
            resultList.add(msr);
        }
        return resultList;
    }

    // <a
    // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1"
    // class="normLight">Avatar - Aufbruch nach Pandora</B>
    // <nobr>(2009)</nobr><br /><span class="smallLight"
    // style="color:#ccc;">Avatar</span></a>

    // map to merge 2 results :/
    Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>();

    for (Element a : filme) {
        try {
            String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-");
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            if (res.containsKey(id)) {
                LOGGER.debug("dupe found; merging with previous searchresult");
                sr = res.get(id);
            }

            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            if (StringUtils.isEmpty(sr.getId())) {
                sr.setId(id);
            }
            if (StringUtils.isEmpty(sr.getTitle())) {
                if (a.html().contains("nobr")) {
                    // the year sits in a nested <nobr>; take the link's own text only
                    sr.setTitle(a.ownText());
                } else {
                    sr.setTitle(a.text());
                }
            }
            LOGGER.debug("found movie " + sr.getTitle());
            if (StringUtils.isEmpty(sr.getOriginalTitle())) {
                sr.setOriginalTitle(a.getElementsByTag("span").text());
            }
            if (StringUtils.isEmpty(sr.getYear())) {
                // any 4 digit
                sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*"));
            }
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id);

            // An empty imdb must not count as a match: otherwise a result
            // whose IMDb id is the empty string would be scored as perfect.
            if (StringUtils.isNotEmpty(imdb) && imdb.equals(sr.getIMDBId())) {
                // perfect match
                sr.setScore(1);
            } else {
                // compare score based on names
                sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle()));
            }

            // populate extra args
            MetadataUtil.copySearchQueryToSearchResult(options, sr);
            res.put(id, sr);
        } catch (Exception e) {
            LOGGER.warn("error parsing movie result: " + e.getMessage());
        }
    }

    resultList.addAll(res.values());
    Collections.sort(resultList);
    Collections.reverse(resultList);
    return resultList;
}