Java tutorial
/* * Copyright 2012 - 2015 Manuel Laggner * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.tinymediamanager.scraper.zelluloid; import java.io.InputStream; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.tinymediamanager.core.Constants; import org.tinymediamanager.core.Utils; import org.tinymediamanager.scraper.Certification; import org.tinymediamanager.scraper.IMediaMetadataProvider; import org.tinymediamanager.scraper.IMediaTrailerProvider; import org.tinymediamanager.scraper.MediaCastMember; import org.tinymediamanager.scraper.MediaGenres; import org.tinymediamanager.scraper.MediaMetadata; import org.tinymediamanager.scraper.MediaProviderInfo; import org.tinymediamanager.scraper.MediaScrapeOptions; import org.tinymediamanager.scraper.MediaSearchOptions; import org.tinymediamanager.scraper.MediaSearchResult; import org.tinymediamanager.scraper.MediaTrailer; import org.tinymediamanager.scraper.MediaType; import org.tinymediamanager.scraper.MetadataUtil; import org.tinymediamanager.scraper.util.CachedUrl; import org.tinymediamanager.scraper.util.StrgUtils; import org.tinymediamanager.scraper.util.Url; /** * The Class ZelluloidMetadataProvider. A meta data provider for the site zelluloid.de * * @author Myron Boyle (myron0815@gmx.net) */ public class ZelluloidMetadataProvider implements IMediaMetadataProvider, IMediaTrailerProvider { private static final Logger LOGGER = LoggerFactory.getLogger(ZelluloidMetadataProvider.class); private static final String BASE_URL = "http://www.zelluloid.de"; private static final String PAGE_ENCODING = "ISO-8859-1"; private static ZelluloidMetadataProvider instance; private static MediaProviderInfo providerInfo = new MediaProviderInfo(Constants.ZELLULOIDID, "zelluloid.de", "Scraper for german zelluloid.de which is able to scrape movie metadata"); public static synchronized ZelluloidMetadataProvider getInstance() { if (instance == null) { instance = new ZelluloidMetadataProvider(); } return instance; } public ZelluloidMetadataProvider() { } @Override public MediaProviderInfo getProviderInfo() { return providerInfo; } @Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // we can only work further if we got a search result on zelluloid.de if (options.getResult() == null) { throw new Exception("Scrape with Zelluloid.de without prior search is not supported"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; // preset values from searchresult (if we have them) md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, Utils.removeSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle())); md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear()); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle()); String id = ""; if (StringUtils.isEmpty(options.getResult().getId())) { id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)"); } else { id = options.getResult().getId(); } String detailurl = options.getResult().getUrl(); if (StringUtils.isEmpty(detailurl)) { detailurl = BASE_URL + "/filme/index.php3?id=" + id; } Url url; try { LOGGER.debug("get details page"); url = new CachedUrl(detailurl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); // parse plot String plot = doc.getElementsByAttributeValue("class", "bigtext").text(); md.storeMetadata(MediaMetadata.PLOT, plot); md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? plot.substring(0, 150) : plot); // parse poster el = doc.getElementsByAttributeValueStarting("src", "/images/poster"); if (el.size() == 1) { md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + el.get(0).attr("src")); } // parse year if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) { el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { md.storeMetadata(MediaMetadata.YEAR, el.get(0).text()); } } // parse cinema release el = doc.getElementsByAttributeValueContaining("href", "?v=w"); if (el.size() > 0) { try { SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy"); Date d = sdf.parse(el.get(0).text()); sdf = new SimpleDateFormat("yyyy-MM-dd"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(d)); } catch (Exception e) { LOGGER.warn("cannot parse cinema release date: " + el.get(0).text()); } } // parse original title if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<")); } if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } // parse runtime String rt = (StrgUtils.substr(doc.toString(), "ca. (.*?) min")); if (!rt.isEmpty()) { try { md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt)); } catch (Exception e2) { LOGGER.warn("cannot convert runtime: " + rt); } } // parse genres el = doc.getElementsByAttributeValueContaining("href", "az.php3?g="); for (Element g : el) { String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1); md.addGenre(getTmmGenre(gid)); } // parse cert // FSK: ab 12, $230 Mio. Budget String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]"); if (!fsk.isEmpty()) { md.addCertification(Certification.findCertification(fsk)); } // parse rating Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable"); if (ratings.size() == 2) { // get user rating Element e = ratings.get(1); // <div>87%</div> String r = e.getElementsByTag("div").text().replace("%", ""); try { md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10 } catch (Exception e2) { LOGGER.warn("cannot convert rating: " + r); } } // details page doc = null; String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id; try { url = new CachedUrl(detailsUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get details: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(detailsUrl); } if (doc != null) { Element tab = doc.getElementById("ccdetails"); int header = 0; String lastRole = ""; for (Element tr : tab.getElementsByTag("tr")) { if (tr.toString().contains("dyngfx")) { // header gfx if (tr.toString().contains("Besetzung")) { header = 1; } else if (tr.toString().contains("Crew")) { header = 2; } else if (tr.toString().contains("Produktion")) { header = 3; } else if (tr.toString().contains("Verleih")) { header = 4; } else if (tr.toString().contains("Alternativtitel")) { header = 5; } continue; } else { // no header gfx, so data MediaCastMember mcm = new MediaCastMember(); el = tr.getElementsByTag("td"); if (header == 1) { // actors if (el.size() == 2) { mcm.setCharacter(el.get(0).text()); mcm.setName(el.get(1).getElementsByTag("a").text()); mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); mcm.setType(MediaCastMember.CastType.ACTOR); // System.out.println("Cast: " + mcm.getCharacter() + " - " + // mcm.getName()); md.addCastMember(mcm); // TODO: parse actor detail pages :/ } } else if (header == 2) { // crew if (el.size() == 2) { String crewrole = el.get(0).html().trim(); mcm.setName(el.get(1).getElementsByTag("a").text()); if (crewrole.equals(" ")) { mcm.setPart(lastRole); } else { mcm.setPart(crewrole); lastRole = crewrole; } if (crewrole.equals("Regie")) { mcm.setType(MediaCastMember.CastType.DIRECTOR); } else if (crewrole.equals("Drehbuch")) { mcm.setType(MediaCastMember.CastType.WRITER); } else { mcm.setType(MediaCastMember.CastType.OTHER); } mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); // System.out.println("Crew: " + mcm.getPart() + " - " + // mcm.getName()); md.addCastMember(mcm); } } else if (header == 3) { // production md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, el.get(0).text()); } } } } // get links page doc = null; String linksUrl = BASE_URL + "/filme/links.php3?id=" + id; try { url = new CachedUrl(linksUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get links page: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(linksUrl); } if (doc != null) { el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com"); if (el != null && el.size() > 0) { String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})"); if (imdb.isEmpty()) { imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)"); } md.setId(MediaMetadata.IMDBID, imdb); } } } catch (Exception e) { LOGGER.error("Error parsing " + options.getResult().getUrl()); // clear cache CachedUrl.removeCachedFileForUrl(detailurl); throw e; } return md; } private String cleanSearch(String q) { q = " " + q.toLowerCase() + " "; // easier regex // TODO: doppelte hintereinander funzen so nicht q = q.replaceAll("(?i)( a | the | der | die | das |\\(\\d+\\))", " "); q = q.replaceAll("[^A-Za-z0-9 ]", " "); q = q.replaceAll(" ", ""); return q.trim(); } @Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for everything: " + searchTerm); } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search with title: " + searchTerm); } else { LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(searchUrl); } if (doc == null) { return resultList; } // only look for movie links Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); LOGGER.debug("found " + filme.size() + " search results"); if (filme.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { msr.setYear(el.get(0).text()); } resultList.add(msr); } return resultList; } // <a // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1" // class="normLight">Avatar - Aufbruch nach Pandora</B> // <nobr>(2009)</nobr><br /><span class="smallLight" // style="color:#ccc;">Avatar</span></a> // map to merge 2 results :/ Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>(); for (Element a : filme) { try { String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); if (res.containsKey(id)) { LOGGER.debug("dupe found; merging with previous searchresult"); sr = res.get(id); } if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } if (StringUtils.isEmpty(sr.getId())) { sr.setId(id); } if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); if (StringUtils.isEmpty(sr.getOriginalTitle())) { sr.setOriginalTitle(a.getElementsByTag("span").text()); } if (StringUtils.isEmpty(sr.getYear())) { sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*")); // any // 4 // digit } sr.setMediaType(MediaType.MOVIE); sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle())); } // populate extra args MetadataUtil.copySearchQueryToSearchResult(options, sr); res.put(id, sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } for (String r : res.keySet()) { resultList.add(res.get(r)); } Collections.sort(resultList); Collections.reverse(resultList); return resultList; } @Override public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception { // http://www.zelluloid.de/filme/trailer.php3?id=7614 return null; } private MediaGenres getTmmGenre(String genre) { MediaGenres g = null; if (genre.isEmpty()) { return g; } try { int gid = Integer.parseInt(genre); // @formatter:off switch (gid) { case 2: g = MediaGenres.COMEDY; break; // Komdie case 3: g = MediaGenres.ACTION; break; // Action case 4: g = MediaGenres.THRILLER; break; // Thriller case 5: g = MediaGenres.WAR; break; // Krieg case 6: g = MediaGenres.SCIENCE_FICTION; break; // Science-Fiction case 7: g = MediaGenres.FANTASY; break; // Fantasy case 9: g = MediaGenres.ANIMATION; break; // Zeichentrick case 10: g = MediaGenres.ANIMATION; break; // Computeranim... case 11: g = null; break; // Remake case 13: g = MediaGenres.ANIMATION; break; // Anime case 14: g = MediaGenres.DRAMA; break; // Drama case 15: g = MediaGenres.DOCUMENTARY; break; // Dokumentation case 16: g = MediaGenres.ADVENTURE; break; // Abenteuer case 17: g = MediaGenres.ROMANCE; break; // Lovestory case 18: g = MediaGenres.ANIMATION; break; // Comicverfilmung case 19: g = MediaGenres.ROAD_MOVIE; break; // Roadmovie case 22: g = MediaGenres.HORROR; break; // Horror case 23: g = MediaGenres.EROTIC; break; // Erotik case 25: g = MediaGenres.DISASTER; break; // Katastrophe case 26: g = MediaGenres.THRILLER; break; // Spionage case 27: g = MediaGenres.SPORT; break; // Kampfsport case 28: g = MediaGenres.BIOGRAPHY; break; // Biografie case 29: g = MediaGenres.HISTORY; break; // Ritter case 30: g = MediaGenres.SCIENCE_FICTION; break; // Endzeit case 31: g = MediaGenres.SCIENCE_FICTION; break; // Cyberspace case 32: g = MediaGenres.SCIENCE_FICTION; break; // Computer case 33: g = MediaGenres.WESTERN; break; // Western case 34: g = MediaGenres.CRIME; break; // Gericht case 35: g = MediaGenres.WAR; break; // U-Boot case 36: g = MediaGenres.CRIME; break; // Krimi case 37: g = MediaGenres.HORROR; break; // Splatter case 38: g = MediaGenres.MUSICAL; break; // Musical case 39: g = MediaGenres.MUSIC; break; // Musik case 40: g = MediaGenres.FAMILY; break; // Familie case 42: g = MediaGenres.MYSTERY; break; // Mystery case 43: g = MediaGenres.SPORT; break; // Sport case 44: g = MediaGenres.REALITY_TV; break; // Schule case 45: g = MediaGenres.WAR; break; // Militr case 46: g = MediaGenres.ANIMATION; break; // Trick case 47: g = MediaGenres.INDIE; break; // Experimental... case 48: g = MediaGenres.HORROR; break; // Vampire case 49: g = MediaGenres.SCIENCE_FICTION; break; // Zeitreise case 50: g = MediaGenres.FANTASY; break; // Mrchen case 51: g = MediaGenres.CRIME; break; // Serienkiller case 52: g = MediaGenres.SILENT_MOVIE; break; // Stummfilm case 53: g = MediaGenres.SHORT; break; // Kurzfilm case 54: g = MediaGenres.INDIE; break; // Blaxploitation case 55: g = MediaGenres.FAMILY; break; // Heimat case 56: g = MediaGenres.SCIENCE_FICTION; break; // Spielverfilmung case 59: g = MediaGenres.FAMILY; break; // Weihnachten case 61: g = MediaGenres.SERIES; break; // Soap case 62: g = MediaGenres.HISTORY; break; // Piraten case 63: g = MediaGenres.FOREIGN; break; // Bollywood case 64: g = MediaGenres.GAME_SHOW; break; // Show case 65: g = null; break; // 3D case 68: g = MediaGenres.HORROR; break; // Zombies } // @formatter:on } catch (Exception e) { g = MediaGenres.getGenre(genre); } return g; } }