List of usage examples for org.jsoup.select.Elements.select
public Elements select(String query)
From source file:org.loklak.api.search.EventBriteCrawlerService.java
public static SusiThought crawlEventBrite(String url) { Document htmlPage = null;/*from w w w. j a va 2 s .c o m*/ try { htmlPage = Jsoup.connect(url).get(); } catch (Exception e) { e.printStackTrace(); } String eventID = null; String eventName = null; String eventDescription = null; // TODO Fetch Event Color String eventColor = null; String imageLink = null; String eventLocation = null; String startingTime = null; String endingTime = null; String ticketURL = null; Elements tagSection = null; Elements tagSpan = null; String[][] tags = new String[5][2]; String topic = null; // By default String closingDateTime = null; String schedulePublishedOn = null; JSONObject creator = new JSONObject(); String email = null; Float latitude = null; Float longitude = null; String privacy = "public"; // By Default String state = "completed"; // By Default String eventType = ""; String temp; Elements t; eventID = htmlPage.getElementsByTag("body").attr("data-event-id"); eventName = htmlPage.getElementsByClass("listing-hero-body").text(); eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text(); eventColor = null; imageLink = htmlPage.getElementsByTag("picture").attr("content"); eventLocation = htmlPage.select("p.listing-map-card-street-address.text-default").text(); temp = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content"); if (temp.length() >= 20) { startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content") .substring(0, 19); } else { startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content"); } temp = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content"); if (temp.length() >= 20) { endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content") .substring(0, 19); } else { endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content"); 
} ticketURL = url + "#tickets"; // TODO Tags to be modified to fit in the format of Open Event "topic" tagSection = htmlPage.getElementsByAttributeValue("data-automation", "ListingsBreadcrumbs"); tagSpan = tagSection.select("span"); topic = ""; int iterator = 0, k = 0; for (Element e : tagSpan) { if (iterator % 2 == 0) { tags[k][1] = "www.eventbrite.com" + e.select("a.js-d-track-link.badge.badge--tag.l-mar-top-2").attr("href"); } else { tags[k][0] = e.text(); k++; } iterator++; } creator.put("email", ""); creator.put("id", "1"); // By Default temp = htmlPage.getElementsByAttributeValue("property", "event:location:latitude").attr("content"); if (temp.length() > 0) { latitude = Float.valueOf( htmlPage.getElementsByAttributeValue("property", "event:location:latitude").attr("content")); } temp = htmlPage.getElementsByAttributeValue("property", "event:location:longitude").attr("content"); if (temp.length() > 0) { longitude = Float.valueOf( htmlPage.getElementsByAttributeValue("property", "event:location:longitude").attr("content")); } // TODO This returns: "events.event" which is not supported by Open // Event Generator // eventType = htmlPage.getElementsByAttributeValue("property", // "og:type").attr("content"); String organizerName = null; String organizerLink = null; String organizerProfileLink = null; String organizerWebsite = null; String organizerContactInfo = null; String organizerDescription = null; String organizerFacebookFeedLink = null; String organizerTwitterFeedLink = null; String organizerFacebookAccountLink = null; String organizerTwitterAccountLink = null; temp = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text(); if (temp.length() >= 5) { organizerName = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text() .substring(4); } else { organizerName = ""; } organizerLink = url + "#listing-organizer"; organizerProfileLink = htmlPage .getElementsByAttributeValue("class", "js-follow js-follow-target follow-me 
fx--fade-in is-hidden") .attr("href"); organizerContactInfo = url + "#lightbox_contact"; Document orgProfilePage = null; try { orgProfilePage = Jsoup.connect(organizerProfileLink).get(); } catch (Exception e) { e.printStackTrace(); } if (orgProfilePage != null) { t = orgProfilePage.getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website"); if (t != null) { organizerWebsite = orgProfilePage .getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website").text(); } else { organizerWebsite = ""; } t = orgProfilePage.select("div.js-long-text.organizer-description"); if (t != null) { organizerDescription = orgProfilePage.select("div.js-long-text.organizer-description").text(); } else { organizerDescription = ""; } organizerFacebookFeedLink = organizerProfileLink + "#facebook_feed"; organizerTwitterFeedLink = organizerProfileLink + "#twitter_feed"; t = orgProfilePage.getElementsByAttributeValue("class", "fb-page"); if (t != null) { organizerFacebookAccountLink = orgProfilePage.getElementsByAttributeValue("class", "fb-page") .attr("data-href"); } else { organizerFacebookAccountLink = ""; } t = orgProfilePage.getElementsByAttributeValue("class", "twitter-timeline"); if (t != null) { organizerTwitterAccountLink = orgProfilePage .getElementsByAttributeValue("class", "twitter-timeline").attr("href"); } else { organizerTwitterAccountLink = ""; } } JSONArray socialLinks = new JSONArray(); JSONObject fb = new JSONObject(); fb.put("id", "1"); fb.put("name", "Facebook"); fb.put("link", organizerFacebookAccountLink); socialLinks.put(fb); JSONObject tw = new JSONObject(); tw.put("id", "2"); tw.put("name", "Twitter"); tw.put("link", organizerTwitterAccountLink); socialLinks.put(tw); JSONArray jsonArray = new JSONArray(); JSONObject event = new JSONObject(); event.put("event_url", url); event.put("id", eventID); event.put("name", eventName); event.put("description", eventDescription); event.put("color", eventColor); event.put("background_url", imageLink); 
event.put("closing_datetime", closingDateTime); event.put("creator", creator); event.put("email", email); event.put("location_name", eventLocation); event.put("latitude", latitude); event.put("longitude", longitude); event.put("start_time", startingTime); event.put("end_time", endingTime); event.put("logo", imageLink); event.put("organizer_description", organizerDescription); event.put("organizer_name", organizerName); event.put("privacy", privacy); event.put("schedule_published_on", schedulePublishedOn); event.put("state", state); event.put("type", eventType); event.put("ticket_url", ticketURL); event.put("social_links", socialLinks); event.put("topic", topic); jsonArray.put(event); JSONObject org = new JSONObject(); org.put("organizer_name", organizerName); org.put("organizer_link", organizerLink); org.put("organizer_profile_link", organizerProfileLink); org.put("organizer_website", organizerWebsite); org.put("organizer_contact_info", organizerContactInfo); org.put("organizer_description", organizerDescription); org.put("organizer_facebook_feed_link", organizerFacebookFeedLink); org.put("organizer_twitter_feed_link", organizerTwitterFeedLink); org.put("organizer_facebook_account_link", organizerFacebookAccountLink); org.put("organizer_twitter_account_link", organizerTwitterAccountLink); jsonArray.put(org); JSONArray microlocations = new JSONArray(); jsonArray.put(new JSONObject().put("microlocations", microlocations)); JSONArray customForms = new JSONArray(); jsonArray.put(new JSONObject().put("customForms", customForms)); JSONArray sessionTypes = new JSONArray(); jsonArray.put(new JSONObject().put("sessionTypes", sessionTypes)); JSONArray sessions = new JSONArray(); jsonArray.put(new JSONObject().put("sessions", sessions)); JSONArray sponsors = new JSONArray(); jsonArray.put(new JSONObject().put("sponsors", sponsors)); JSONArray speakers = new JSONArray(); jsonArray.put(new JSONObject().put("speakers", speakers)); JSONArray tracks = new JSONArray(); 
jsonArray.put(new JSONObject().put("tracks", tracks)); String userHome = System.getProperty("user.home"); String path = userHome + "/Downloads/EventBriteInfo"; new File(path).mkdir(); try (FileWriter file = new FileWriter(path + "/event.json")) { file.write(event.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/org.json")) { file.write(org.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/social_links.json")) { file.write(socialLinks.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/microlocations.json")) { file.write(microlocations.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/custom_forms.json")) { file.write(customForms.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/session_types.json")) { file.write(sessionTypes.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/sessions.json")) { file.write(sessions.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/sponsors.json")) { file.write(sponsors.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/speakers.json")) { file.write(speakers.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/tracks.json")) { file.write(tracks.toString()); } catch (IOException e1) { e1.printStackTrace(); } SusiThought json = new SusiThought(); json.setData(jsonArray); return json; }
From source file:org.metaservice.demo.wordpress.WordpressParser.java
@Override public List<VersionEntry> parse(Reader s, ArchiveAddress archiveParameters) throws ParserException { try {/*from w ww .j a v a 2s .c o m*/ Document document = Jsoup.parse(IOUtils.toString(s), "http://wordpress.org/download/release-archive/"); ArrayList<VersionEntry> result = new ArrayList<>(); Elements tables = document.select("table.widefat"); for (Element table : tables) { Elements rows = table.select("tr"); // System.err.println(rows); for (Element row : rows) { Elements columns = row.select("td"); if (columns.size() > 0) { VersionEntry versionEntry = new VersionEntry(); versionEntry.setName(columns.get(0).text().trim()); versionEntry.setZip(columns.select("a[href$=zip]").attr("href")); versionEntry.setTar(columns.select("a[href$=tar.gz]").attr("href")); versionEntry.setIis(columns.select("a[href$=IIS.zip]").attr("href")); result.add(versionEntry); } } } return result; } catch (IOException e) { throw new ParserException(e); } }
From source file:org.opennms.protocols.http.collector.HttpCollectionHandler.java
/**
 * Fills the collection set with one resource per XML group of the source.
 *
 * <p>The document behind {@code urlString} is fetched once. For each group the
 * group's resource selector is applied, the resource is resolved (the shared
 * node resource for node-type groups, otherwise one looked up by name, type
 * and timestamp), and every object of the group is stored as an attribute
 * whose value is the HTML of the elements matched by the object's selector.
 *
 * @param urlString the URL to collect from
 * @param request the request settings used to fetch the document
 * @param agent the collection agent
 * @param collectionSet the set receiving the resources
 * @param source the source describing the groups to collect
 * @throws Exception if fetching or processing fails
 */
@Override
protected void fillCollectionSet(String urlString, Request request, CollectionAgent agent,
        XmlCollectionSet collectionSet, XmlSource source) throws Exception {
    XmlCollectionResource sharedNodeResource = new XmlSingleInstanceCollectionResource(agent);
    Document page = getJsoupDocument(urlString, request);
    for (XmlGroup group : source.getXmlGroups()) {
        LOG.debug("fillCollectionSet: getting resources for XML group {} using selector {}", group.getName(),
                group.getResourceXpath());
        Date timestamp = getTimeStamp(page, group);
        Elements matched = page.select(group.getResourceXpath());
        LOG.debug("fillCollectionSet: {} => {}", group.getResourceXpath(), matched);
        String resourceName = getResourceName(matched, group);
        LOG.debug("fillCollectionSet: processing XML resource {}", resourceName);

        boolean isNodeLevel = group.getResourceType().equalsIgnoreCase(CollectionResource.RESOURCE_TYPE_NODE);
        XmlCollectionResource target = isNodeLevel
                ? sharedNodeResource
                : getCollectionResource(agent, resourceName, group.getResourceType(), timestamp);
        LOG.debug("fillCollectionSet: processing resource {}", target);

        AttributeGroupType groupType = new AttributeGroupType(group.getName(), group.getIfType());
        for (XmlObject object : group.getXmlObjects()) {
            Elements valueElements = matched.select(object.getXpath());
            XmlCollectionAttributeType attributeType = new XmlCollectionAttributeType(object, groupType);
            String value = null;
            if (valueElements != null) {
                value = valueElements.html();
            }
            target.setAttributeValue(attributeType, value);
        }
        processXmlResource(target, groupType);
        collectionSet.getCollectionResources().add(target);
    }
}
From source file:org.opennms.protocols.http.collector.HttpCollectionHandler.java
/** * Gets the resource name.//from w w w.j a v a 2 s . c o m * * @param elements the JSoup elements * @param group the group * @return the resource name */ private String getResourceName(Elements elements, XmlGroup group) { // Processing multiple-key resource name. if (group.hasMultipleResourceKey()) { List<String> keys = new ArrayList<String>(); for (String key : group.getXmlResourceKey().getKeyXpathList()) { LOG.debug("getResourceName: getting key for resource's name using selector {}", key); Elements el = elements.select(key); if (el != null) { keys.add(el.html()); } } return StringUtils.join(keys, "_"); } // If key-xpath doesn't exist or not found, a node resource will be assumed. if (group.getKeyXpath() == null) { return "node"; } // Processing single-key resource name. LOG.debug("getResourceName: getting key for resource's name using selector {}", group.getKeyXpath()); Elements el = elements.select(group.getKeyXpath()); return el == null ? null : el.html(); }
From source file:org.sbs.goodcrawler.jobconf.ExtractConfig.java
/** * ????/*from w w w . j a v a 2s .co m*/ * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) throws ConfigurationException { Elements extractElement = doc.select("extract"); super.jobName = doc.select("job").attr("name"); super.indexName = doc.select("job").attr("indexName"); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } this.templates.add(extractTemplate); } return this; }
From source file:org.sbs.goodcrawler.jobconf.FetchConfig.java
/** * ???/*from w w w .j a v a 2 s . c om*/ * @param confFile * @return */ public FetchConfig loadConfig(Document confDoc) throws ConfigurationException { try { Document doc = confDoc; super.jobName = doc.select("job").attr("name"); super.indexName = doc.select("job").attr("indexName"); Elements e = doc.select("fetch"); this.type = e.select("type").text(); this.agent = e.select("agent").text(); String temp = e.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } temp = e.select("delayBetweenRequests").text(); if (StringUtils.isNotBlank(temp)) { this.delayBetweenRequests = Integer.parseInt(temp); } temp = e.select("maxDepthOfCrawling").text(); if (StringUtils.isNotBlank(temp)) { this.maxDepthOfCrawling = Integer.parseInt(temp); } temp = e.select("fetchBinaryContent").text(); if (StringUtils.isNotBlank(temp)) { this.fetchBinaryContent = Boolean.parseBoolean(temp); } if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) { this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text()); } temp = e.select("fileSuffix").text(); if (StringUtils.isNotBlank(temp)) { this.fileSuffix = temp; } temp = e.select("maxDownloadSizePerPage").text(); if (StringUtils.isNotBlank(temp)) { this.maxDownloadSizePerPage = Integer.parseInt(temp); } temp = e.select("https").text(); if (StringUtils.isNotBlank(temp)) { this.https = Boolean.parseBoolean(temp); } temp = e.select("onlyDomain").text(); if (StringUtils.isNotBlank(temp)) { this.onlyDomain = Boolean.parseBoolean(temp); } temp = e.select("socketTimeoutMilliseconds").text(); if (StringUtils.isNotBlank(temp)) { this.socketTimeoutMilliseconds = Integer.parseInt(temp); } temp = e.select("connectionTimeout").text(); if (StringUtils.isNotBlank(temp)) { this.connectionTimeout = Integer.parseInt(temp); } temp = e.select("maxTotalConnections").text(); if (StringUtils.isNotBlank(temp)) { this.maxTotalConnections = Integer.parseInt(temp); } temp 
= e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text()); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(temp); } if (StringUtils.isNotBlank(e.select("proxyHost").text())) { this.proxyHost = e.select("proxyHost").text(); } if (StringUtils.isNotBlank(e.select("proxyPort").text())) { this.proxyPort = Integer.parseInt(e.select("proxyPort").text()); } if (StringUtils.isNotBlank(e.select("proxyUsername").text())) { this.proxyUsername = e.select("proxyUsername").text(); } if (StringUtils.isNotBlank(e.select("proxyPassword").text())) { this.proxyPassword = e.select("proxyPassword").text(); } if (StringUtils.isNotBlank(e.select("proxyHost").text())) { this.proxyHost = e.select("proxyHost").text(); } // seed Elements seeds = doc.select("fetch seeds seed"); for (Element element : seeds) { WebURL url = new WebURL(); String seed = element.text(); this.seeds.add(seed); url.setURL(seed); url.setJobName(jobName); url.setDepth((short) 0); try { PendingManager.getPendingUlr(jobName).addElement(url); BloomfilterHelper.getInstance().add(url.getURL()); } catch (QueueException e1) { e1.printStackTrace(); } } /* * ??Url */ Elements fetchUrlFilters = doc.select("fetchUrlFilters filter"); for (Element element : fetchUrlFilters) { this.fetchUrlFilters.add(element.text()); } /* * ?????Url */ Elements extractUrlfilters = doc.select("extractUrlfilters filter"); for (Element element : extractUrlfilters) { this.extractUrlfilters.add(element.text()); } } catch (NumberFormatException e) { throw new ConfigurationException("?" + e.getMessage()); } return this; }
From source file:org.sbs.goodcrawler.jobconf.StoreConfig.java
public StoreConfig loadConfig(Document confDoc) { Document doc = confDoc;/*from www. jav a 2s. c om*/ jobName = doc.select("job").attr("name"); indexName = doc.select("job").attr("indexName"); Elements e = doc.select("store"); this.type = e.select("type").text(); if (StringUtils.isNotBlank(e.select("threadNum").text())) { this.threadNum = Integer.parseInt(e.select("threadNum").text()); } String className = e.select("plugin").text(); if (StringUtils.isNotBlank(className)) { this.pluginClass = className; } // id? String idPolicy = e.select("idPolicy").text(); if (StringUtils.isNotBlank(idPolicy)) { id = EnumUtils.getEnum(IDPolicy.class, idPolicy); if (!IDPolicy.auto.equals(id)) { String pref = e.select("ref").text(); if (StringUtils.isNotBlank(pref)) { this.policyRef = pref; } if (StringUtils.isBlank(this.policyRef)) { try { throw new ConfigurationException("ID??"); } catch (Exception e2) { e2.printStackTrace(); } } } } return this; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java
/**
 * Extracts movie information and follow-up links from a fetched page.
 *
 * <p>Reading the statements in order, the method:
 * <ol>
 * <li>returns null for a null page, or when the page URL contains "game/";</li>
 * <li>parses the page bytes with the page charset into a jsoup document;</li>
 * <li>adds every absolute link that passes filterUrls to pendingUrls, logging
 * (not propagating) queue errors;</li>
 * <li>returns null when the document has no "#Zoom" element;</li>
 * <li>fills a result map with "movie" (h1 text), "type" (second word of the
 * "h2 a" text, or "unknow") and "url";</li>
 * <li>for the "content" selector, collects image sources ("img"), feeds the
 * pieces of each paragraph to getInfoName, and collects the td texts
 * ("download");</li>
 * <li>converts a non-blank "nd" value to an Integer, stores the map on the
 * extracted page and queues the page on pendingStore.</li>
 * </ol>
 *
 * NOTE(review): this text is collapsed onto two physical lines, so the "//"
 * comments swallow code that originally stood on following lines; the points
 * below assume the original line layout - confirm against the real source.
 * NOTE(review): "selects" is assigned null and the only other assignment is
 * commented out, so selects.entrySet() looks like a guaranteed
 * NullPointerException once the "#Zoom" check passes; only
 * UnsupportedEncodingException is caught here.
 * NOTE(review): the declaration of "imgs" appears directly after "//" and may
 * have been merged into a comment.
 * NOTE(review): several string literals are empty ("") or "?", presumably
 * non-ASCII text lost in extraction; as written, split("?") is not a valid
 * regular expression.
 * NOTE(review): the results of the s.trim() calls are discarded.
 *
 * @param page the fetched page, may be null
 * @return the extracted page, or null when nothing was extracted
 */
@Override public ExtractedPage<?, ?> onExtract(Page page) { if (null != page) { try { Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/")) return null; // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) { try { WebURL url = new WebURL(); url.setURL(linkHref); url.setJobName(conf.jobName); pendingUrls.addUrl(url); } catch (QueueException e) { log.error(e.getMessage()); } catch (Exception e) { log.error(e.getMessage()); } } } } // ?? // Map<String, String> selects = conf.getSelects(); Map<String, String> selects = null; ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); HashMap<String, Object> result = new HashMap<>(); Elements text = doc.select("#Zoom"); if (null == text || text.size() == 0) { return null; } String name = doc.select("h1").text(); name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", ""); result.put("movie", name); // result.put("_id", name); String ts[] = doc.select("h2 a").text().split(" "); if (ts.length >= 2) { result.put("type", ts[1].trim()); } else { result.put("type", "unknow"); } result.put("url", page.getWebURL().getURL()); for (Entry<String, String> entry : selects.entrySet()) { Elements elements = doc.select(entry.getValue()); if (elements.isEmpty()) return null; else { if ("content".equals(entry.getKey())) { for (Element element : elements) { // Elements imgs = element.select("img[src]"); StringBuilder sb = new StringBuilder(); for (Element img : imgs) { sb.append(img.attr("src")).append(";"); } result.put("img", sb.toString()); // ? 
Elements movieInfos = element.select("p"); for (Element info : movieInfos) { String infotext = info.text(); try { String infotext_ = info.html(); int start, end = 0; start = infotext_.indexOf(""); if (start > 0) { end = infotext_.lastIndexOf(""); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } else { end = infotext_.lastIndexOf("."); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } } } infotext_ = null; } catch (Exception e) { e.printStackTrace(); } if (infotext.startsWith("")) { String ss[] = infotext.split(""); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.startsWith("?")) { String ss[] = infotext.split("?"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains("")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains(":")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } } // if(result.size()<5){ // result.put("content", value) // } // ? Elements elements2 = elements.select("td"); sb.setLength(0); for (Element download : elements2) { sb.append(download.text()).append(";"); } result.put("download", sb.toString()); } } } // result.put(entry.getKey(), elements.html()); } if (StringUtils.isNotBlank((String) result.get("nd"))) { result.put("nd", Integer.parseInt((String) result.get("nd"))); } epage.setMessages(result); try { pendingStore.addExtracedPage(epage); } catch (QueueException e) { log.error(e.getMessage()); } return epage; } catch (UnsupportedEncodingException e) { log.error(e.getMessage()); e.printStackTrace(); } } return null; }
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Get movie meta data from aebn.net.//from w w w .ja v a 2 s . c om * */ @Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("AEBN: getMetadata() {}", options); // check if there is already meta data present in the result if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) { LOGGER.debug("AEBN: return metadata from cache"); return options.getResult().getMediaMetadata(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Elements elements = null; Element element = null; Integer aebnId = 0; // get AebnId from previous search result if ((options.getResult() != null) && (options.getResult().getId() != null)) { aebnId = Integer.parseInt(options.getResult().getId()); LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId); // preset some values from search result (if there is one) // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy". 
md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, StrgUtils.removeCommonSortableName(options.getResult().getTitle())); } // or get AebnId from options if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) { LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID)); aebnId = Integer.parseInt(options.getId(AEBNID)); } if (!isValidAebnId(aebnId)) { LOGGER.warn("AEBN: no or incorrect aebnId, aborting"); return md; } // ID md.setId(providerInfo.getId(), aebnId); LOGGER.debug("AEBN: aebnId({})", aebnId); // Base download url for data scraping String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId; String locale = options.getLanguage().name(); if (!StringUtils.isBlank(locale)) { downloadUrl = downloadUrl + "&locale=" + locale; LOGGER.debug("AEBN: used locale({})", locale); } // begin download and scrape try { LOGGER.debug("AEBN: download movie detail page"); Url url = new Url(downloadUrl); InputStream in = url.getInputStream(); Document document = Jsoup.parse(in, "UTF-8", ""); in.close(); // Title // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1> LOGGER.debug("AEBN: parse title"); elements = document.getElementsByAttributeValue("class", "md-movieTitle"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieTitle = cleanString(element.text()); LOGGER.debug("AEBN: title({})", movieTitle); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // Poster // front cover: // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg"; md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl); // Fanart/Background // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg // <img class="sceneThumbnail" alt="Scene Thumbnail" 
title="Scene Thumbnail" onError="..." // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." /> LOGGER.debug("AEBN: parse fanart / scene thumbs"); elements = document.getElementsByAttributeValue("class", "SceneThumbnail"); LOGGER.debug("AEBN: {} elements found", elements.size()); int i = 1; for (Element anchor : elements) { String backgroundUrl = anchor.attr("src"); LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl); md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl); i++; } // Runtime LOGGER.debug("AEBN: parse runtime"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieRuntime = cleanString(element.attr("content")); movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M"); LOGGER.debug("AEBN: runtime({})", movieRuntime); md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime); } // Year LOGGER.debug("AEBN: parse year"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieYear = cleanString(element.attr("content")); movieYear = StrgUtils.substr(movieYear, "(\\d+)-"); LOGGER.debug("AEBN: year({})", movieYear); md.storeMetadata(MediaMetadata.YEAR, movieYear); } // Series (Collection) LOGGER.debug("AEBN: parse collection"); elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieCollection = cleanString(element.text()); // Fake a TMDB_SET based on the hash value of the collection name int movieCollectionHash = movieCollection.hashCode(); 
md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection); md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash); LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash); } // Studio LOGGER.debug("AEBN: parse studio"); elements = document.getElementsByAttributeValue("id", "md-details") .select("[itemprop=productionCompany]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String movieStudio = cleanString(elements.first().text()); LOGGER.debug("AEBN: studio({})", movieStudio); md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio); } // Genre LOGGER.debug("AEBN: parse genre"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]"); for (Element g : elements) { md.addGenre(getTmmGenre(g.text())); } // add basic genre, since all genres at AEBN could be summarised // into this one md.addGenre(MediaGenres.EROTIC); // Certification // no data scrapeable---but obviously it's adult only, so simply // generate it String movieCertification = null; Certification certification = null; String country = options.getCountry().getAlpha2(); LOGGER.debug("AEBN: generate certification for {}", country); // @formatter:off if (country.equals("DE")) { movieCertification = "FSK 18"; } if (country.equals("US")) { movieCertification = "NC-17"; } if (country.equals("GB")) { movieCertification = "R18"; } if (country.equals("FR")) { movieCertification = "18"; } if (country.equals("ES")) { movieCertification = "PX"; } if (country.equals("JP")) { movieCertification = "R18+"; } if (country.equals("IT")) { movieCertification = "V.M.18"; } if (country.equals("NL")) { movieCertification = "16"; } // @formatter:on certification = Certification.getCertification(options.getCountry(), movieCertification); if (certification != null) { LOGGER.debug("AEBN: certification({})", certification); md.addCertification(certification); } // Plot and Tagline 
LOGGER.debug("AEBN: parse plot"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String moviePlot = cleanString(elements.first().text()); md.storeMetadata(MediaMetadata.PLOT, moviePlot); // no separate tagline available, so extract the first sentence // from the movie plot String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])"); LOGGER.debug("AEBN: tagline(" + movieTagline + ")"); md.storeMetadata(MediaMetadata.TAGLINE, movieTagline); } // Actors LOGGER.debug("AEBN: parse actors"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]"); LOGGER.debug("AEBN: {} actors found", elements.size()); for (Element anchor : elements) { String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)"); String actorname = cleanString(anchor.select("[itemprop=name]").first().text()); String actordetailsurl = BASE_DATAURL + anchor.attr("href"); if (!actorname.isEmpty()) { LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname, actordetailsurl); MediaCastMember cm = new MediaCastMember(); cm.setType(MediaCastMember.CastType.ACTOR); cm.setName(actorname); if (!actorid.isEmpty()) { cm.setId(actorid); } // Actor detail page try { Url starurl = new Url(actordetailsurl); InputStream starurlstream = starurl.getInputStream(); Document stardocument = Jsoup.parse(starurlstream, "UTF-8", ""); starurlstream.close(); Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo"); if (elements2.size() == 0) { LOGGER.debug("AEBN: no additional actor details found"); } else { // Actor image String actorimage = elements2.select("[itemprop=image]").first().attr("src"); LOGGER.debug("AEBN: actor image({})", actorimage); if (!actorimage.isEmpty()) { cm.setImageUrl(actorimage); } // Actor 'fanart' images // unsure if this is ever shown in tmm elements2 = 
stardocument.getElementsByAttributeValue("class", "StarDetailGallery") .select("a"); LOGGER.debug("AEBN: {} gallery images found", elements2.size()); for (Element thumbnail : elements2) { LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href")); cm.addFanart(thumbnail.attr("href")); } } } catch (Exception e) { LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e); } md.addCastMember(cm); } } // Director LOGGER.debug("AEBN: parse director"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)"); String directorname = cleanString(elements.select("[itemprop=name]").first().text()); if (!directorname.isEmpty()) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(directorname); if (!directorid.isEmpty()) { cm.setId(directorid); } cm.setImageUrl(""); md.addCastMember(cm); LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname); } } // Original Title // if we have no original title, just copy the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } } catch (Exception e) { LOGGER.error("AEBN: Error parsing {}: {}", options.getResult().getUrl(), e); } return md; }
From source file:perflab.LoadrunnerWrapper.java
/** * @param htmlSummaryFile - load runner analysis html report file to parse * @param summaryFile - location of summary file to be generated out of loadrunner html analysis *//*from w w w . ja va 2 s . c o m*/ protected void parseSummaryFile(String htmlSummaryFile, String summaryFile) { try { File input = new File(htmlSummaryFile); Document document = Jsoup.parse(input, "UTF-8"); Document parse = Jsoup.parse(document.html()); Elements table = parse.select("table").select("[summary=Transactions statistics summary table]"); Elements rows = table.select("tr"); getLog().info("number of rows in summary file=" + rows.size()); for (Element row : rows) { //getLog().info("table element = " + row.toString()); String name = row.select("td[headers=LraTransaction Name]").select("span").text(); if (!name.isEmpty()) { float avgRT = Float.valueOf(row.select("td[headers=LraAverage]").select("span").text()); float minRT = Float.valueOf(row.select("td[headers=LraMinimum]").select("span").text()); float maxRT = Float.valueOf(row.select("td[headers=LraMaximum]").select("span").text()); int passed = Integer.valueOf(row.select("td[headers=LraPass]").select("span").text() .replace(".", "").replace(",", "")); int failed = Integer.valueOf(row.select("td[headers=LraFail]").select("span").text() .replace(".", "").replace(",", "")); int failedPrecentage = failed / (failed + passed) * 100; getLog().info("Saving Transaction [" + name + "]"); this.transactions.add( new LoadRunnerTransaction(name, minRT, avgRT, maxRT, passed, failed, failedPrecentage)); } } } catch (IOException e) { getLog().error("Can't read LoadRunner Analysis html report " + e.getMessage()); } }