List of usage examples for org.jsoup.nodes Document getElementsByAttributeValue
public Elements getElementsByAttributeValue(String key, String value)
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Pulls a text from a Wikipedia URL without images, tags, etc. * // ww w . j a v a 2s. c o m * @param url * Address of the targetted text. * @return * An Article object representing the retrieved object. * * @throws ReaderException * Problem while retrieving the text. */ @Override public Article read(URL url) throws ReaderException { Article result = null; String name = getName(url); try { // get the page String address = url.toString(); logger.log("Retrieving page " + address); long startTime = System.currentTimeMillis(); Document document = retrieveSourceCode(name, url); // get its title Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0); String title = firstHeadingElt.text(); logger.log("Get title: " + title); // get raw and linked texts logger.log("Get raw and linked texts."); StringBuilder rawStr = new StringBuilder(); StringBuilder linkedStr = new StringBuilder(); Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0); // processing each element in the content part boolean ignoringSection = false; boolean first = true; for (Element element : bodyContentElt.children()) { String eltName = element.tag().getName(); String eltClass = element.attr(XmlNames.ATT_CLASS); // section headers if (eltName.equals(XmlNames.ELT_H2)) { first = false; // get section name StringBuilder fakeRaw = new StringBuilder(); StringBuilder fakeLinked = new StringBuilder(); processParagraphElement(element, fakeRaw, fakeLinked); String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH); // check section name if (IGNORED_SECTIONS.contains(str)) ignoringSection = true; else { ignoringSection = false; rawStr.append("\n-----"); linkedStr.append("\n-----"); processParagraphElement(element, rawStr, linkedStr); } } else if (!ignoringSection) { // lower sections if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5) || 
eltName.equals(XmlNames.ELT_H6)) { first = false; processParagraphElement(element, rawStr, linkedStr); } // paragraph else if (eltName.equals(XmlNames.ELT_P)) { String str = element.text(); // ignore possible initial disambiguation link if (!first || !str.startsWith(PARAGRAPH_FORTHE)) { first = false; processParagraphElement(element, rawStr, linkedStr); } } // list else if (eltName.equals(XmlNames.ELT_UL)) { first = false; processListElement(element, rawStr, linkedStr, false); } else if (eltName.equals(XmlNames.ELT_OL)) { first = false; processListElement(element, rawStr, linkedStr, true); } else if (eltName.equals(XmlNames.ELT_DL)) { first = false; processDescriptionListElement(element, rawStr, linkedStr); } // tables else if (eltName.equals(XmlNames.ELT_TABLE)) { first = !processTableElement(element, rawStr, linkedStr); } // divisions else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB)) first = !processDivisionElement(element, rawStr, linkedStr); } // we ignore certain types of span (phonetic trancription, WP buttons...) 
else if (eltName.equals(XmlNames.ELT_SPAN)) { first = !processSpanElement(element, rawStr, linkedStr); } // hyperlinks must be included in the linked string, provided they are not external else if (eltName.equals(XmlNames.ELT_A)) { first = !processHyperlinkElement(element, rawStr, linkedStr); } // quotes are just processed recursively else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) { first = !processQuoteElement(element, rawStr, linkedStr); } // other tags are ignored } } // create article object result = new Article(name); result.setTitle(title); result.setUrl(url); result.initDate(); // clean text String rawText = rawStr.toString(); rawText = cleanText(rawText); // rawText = ArticleCleaning.replaceChars(rawText); result.setRawText(rawText); logger.log("Length of the raw text: " + rawText.length() + " chars."); String linkedText = linkedStr.toString(); linkedText = cleanText(linkedText); // linkedText = ArticleCleaning.replaceChars(linkedText); result.setLinkedText(linkedText); logger.log("Length of the linked text: " + linkedText.length() + " chars."); // get original html source code logger.log("Get original HTML source code."); String originalPage = document.toString(); result.setOriginalPage(originalPage); logger.log("Length of the original page: " + originalPage.length() + " chars."); // get the categories of the article List<ArticleCategory> categories = getArticleCategories(result); result.setCategories(categories); long endTime = System.currentTimeMillis(); logger.log("Total duration: " + (endTime - startTime) + " ms."); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (org.json.simple.parser.ParseException e) { e.printStackTrace(); } return result; }
From source file:autoInsurance.BeiJPiccImpl.java
public String login(String in) { // TODO Auto-generated method stub String out = ""; JSONObject jsonObject = JSONObject.fromObject(in); String ukey = jsonObject.getString("ukey"); String loginName = jsonObject.getString("loginName"); String password = jsonObject.getString("password"); String url = "http://10.134.136.48:8000/prpall/index.jsp"; String httpOrgCreateTestRtn = httpClientUtil.doPost(url, new HashMap<String, String>(), charset); if (httpOrgCreateTestRtn == null) { return "{\"success\": false, \"msg\": \"\"}"; }/*from w w w.j a va 2s . c o m*/ // write2Html(httpOrgCreateTestRtn); Document doc = Jsoup.parse(httpOrgCreateTestRtn); System.out.println(doc.title()); if (doc.title().contains("PICC")) return "{\"success\": false, \"msg\": \"!\"}"; String action = ""; if (doc.getElementById("fm") != null) action = doc.getElementById("fm").attr("action"); url = "https://10.134.136.48:8888" + action; String lt = doc.getElementsByAttributeValue("name", "lt").get(0).attr("value"); String postData = "PTAVersion=&toSign=&Signature=&rememberFlag=0&userMac=&key=yes&errorKey=null&loginMethod=nameAndPwd&username=" + loginName + "&password=" + password + "<=" + lt + "&_eventId=submit&pcguid=&button.x=20&button.y=17"; Map<String, String> map = null; try { map = parse2Map(postData); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } String respStr = httpClientUtil.doPost(url, map, charset); write2Html(respStr); doc = Jsoup.parse(respStr); System.out.println(doc.title()); // httpOrgCreateTestRtn = httpClientUtil.doPost("http://10.134.136.48:8000/prpall/business/quickProposal.do?bizType=PROPOSAL&editType=NEW&is4S=Y",null,charset); // doc = Jsoup.parse(httpOrgCreateTestRtn); // try { // init(doc); // } catch (Exception e1) { // // TODO Auto-generated catch block // e1.printStackTrace(); // } String comCode = templateData.get("prpCmain.comCode"); // String handler1Code = templateData.get("prpCmain.handler1Code");// String agentCode = 
templateData.get("prpCmain.agentCode");// String businessNature = templateData.get("prpCmain.businessNature");// String param = "actionType=query&fieldIndex=206&fieldValue=" + agentCode + "&codeMethod=change&codeType=select&codeRelation=0%2C1%2C2&isClear=Y&otherCondition=operateDate%3D" + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "%2CriskCode%3DDAA%2CcomCode%3D" + comCode + "%2CbusinessNature%3D" + businessNature + "&typeParam=&callBackMethod=MainTotal.setAgentCode()%3BMainTotal.clearForAgentType()%3BItemCar.checkSelectKYFlag()%3B&getDataMethod=getAgents"; respStr = httpClientUtil.doPost("http://10.134.136.48:8000/prpall/common/changeCodeInput.do?" + param, new HashMap<String, String>(), charset); //System.out.println(respStr);// 11003O100375_FIELD_SEPARATOR__FIELD_SEPARATOR_3O1000 String[] _field_separator = respStr.split("_FIELD_SEPARATOR_"); if (_field_separator.length < 3) return "{\"success\": false, \"msg\": \"\"}"; String agentName = _field_separator[1]; String agentType = _field_separator[2]; templateData.put("agentType", agentType); try { param = "comCode=" + URLEncoder.encode(comCode, charset) + "&handler1Code=" + URLEncoder.encode(handler1Code, charset) + "&agentCode=" + URLEncoder.encode(agentCode, charset) + "&businessNature=" + URLEncoder.encode(businessNature, charset); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } respStr = httpClientUtil.doPost("http://10.134.136.48:8000/prpall/business/getCheckUserMsg.do?" 
+ param, new HashMap<String, String>(), charset); Map retMap = JackJson.fromJsonToObject(respStr, Map.class); //System.out.println(retMap); String b = templateData.get("isCqp"); String qualificationName = ""; if (templateData.get("qualificationName") != null) { if (StringUtils.equals("1", b)) { qualificationName = templateData.get("prpQmainVoagentName"); } else { qualificationName = templateData.get("prpCmainagentName"); } } templateData.put("qualificationName", qualificationName); templateData.put("qualificationNo", (String) ((Map) ((List) retMap.get("data")).get(0)).get("permitNo")); templateData.put("prpCmainCommon.queryArea", "110000"); templateData.put("queryArea", ""); templateData.put("prpCinsureds[0].countryCode", "CHN"); templateData.put("resident[0]", "0"); templateData.put("LicenseColorCodeDes", ""); templateData.put("prpCitemCar.licenseColorCode", "01"); // 115192BJ templateData.put("agentCodeValidValue", ukey); templateData.put("agentCodeValidType", "U"); out = "{\"success\": true, \"msg\": \"" + loginName + "," + agentName + ",\"}"; return out; }
From source file:org.fox.ttrss.OnlineActivity.java
@Override public boolean onContextItemSelected(android.view.MenuItem item) { /* AdapterContextMenuInfo info = (AdapterContextMenuInfo) item .getMenuInfo(); *///from ww w . j a v a2 s. c o m final ArticlePager ap = (ArticlePager) getSupportFragmentManager().findFragmentByTag(FRAG_ARTICLE); switch (item.getItemId()) { case R.id.article_img_open: if (getLastContentImageHitTestUrl() != null) { try { Intent intent = new Intent(Intent.ACTION_VIEW, Uri.parse(getLastContentImageHitTestUrl())); startActivity(intent); } catch (Exception e) { e.printStackTrace(); toast(R.string.error_other_error); } } return true; case R.id.article_img_copy: if (getLastContentImageHitTestUrl() != null) { copyToClipboard(getLastContentImageHitTestUrl()); } return true; case R.id.article_img_share: if (getLastContentImageHitTestUrl() != null) { Intent intent = new Intent(Intent.ACTION_SEND); intent.setType("image/png"); intent.putExtra(Intent.EXTRA_SUBJECT, getLastContentImageHitTestUrl()); intent.putExtra(Intent.EXTRA_TEXT, getLastContentImageHitTestUrl()); startActivity(Intent.createChooser(intent, getLastContentImageHitTestUrl())); } return true; case R.id.article_img_view_caption: if (getLastContentImageHitTestUrl() != null) { // Android doesn't give us an easy way to access title tags; // we'll use Jsoup on the body text to grab the title text // from the first image tag with this url. This will show // the wrong text if an image is used multiple times. 
Document doc = Jsoup.parse(ap.getSelectedArticle().content); Elements es = doc.getElementsByAttributeValue("src", getLastContentImageHitTestUrl()); if (es.size() > 0) { if (es.get(0).hasAttr("title")) { Dialog dia = new Dialog(this); if (es.get(0).hasAttr("alt")) { dia.setTitle(es.get(0).attr("alt")); } else { dia.setTitle(es.get(0).attr("title")); } TextView titleText = new TextView(this); if (android.os.Build.VERSION.SDK_INT >= android.os.Build.VERSION_CODES.JELLY_BEAN) { titleText.setPaddingRelative(24, 24, 24, 24); } else { titleText.setPadding(24, 24, 24, 24); } titleText.setTextSize(16); titleText.setText(es.get(0).attr("title")); dia.setContentView(titleText); dia.show(); } else { toast(R.string.no_caption_to_display); } } else { toast(R.string.no_caption_to_display); } } return true; case R.id.article_link_share: if (ap != null && ap.getSelectedArticle() != null) { shareArticle(ap.getSelectedArticle()); } return true; case R.id.article_link_copy: Log.d(TAG, "article_link_copy"); if (ap != null && ap.getSelectedArticle() != null) { copyToClipboard(ap.getSelectedArticle().link); } return true; default: Log.d(TAG, "onContextItemSelected, unhandled id=" + item.getItemId()); return super.onContextItemSelected(item); } }
From source file:org.loklak.api.search.EventBriteCrawlerService.java
public static SusiThought crawlEventBrite(String url) { Document htmlPage = null; try {//from w w w .jav a2 s. c o m htmlPage = Jsoup.connect(url).get(); } catch (Exception e) { e.printStackTrace(); } String eventID = null; String eventName = null; String eventDescription = null; // TODO Fetch Event Color String eventColor = null; String imageLink = null; String eventLocation = null; String startingTime = null; String endingTime = null; String ticketURL = null; Elements tagSection = null; Elements tagSpan = null; String[][] tags = new String[5][2]; String topic = null; // By default String closingDateTime = null; String schedulePublishedOn = null; JSONObject creator = new JSONObject(); String email = null; Float latitude = null; Float longitude = null; String privacy = "public"; // By Default String state = "completed"; // By Default String eventType = ""; String temp; Elements t; eventID = htmlPage.getElementsByTag("body").attr("data-event-id"); eventName = htmlPage.getElementsByClass("listing-hero-body").text(); eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text(); eventColor = null; imageLink = htmlPage.getElementsByTag("picture").attr("content"); eventLocation = htmlPage.select("p.listing-map-card-street-address.text-default").text(); temp = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content"); if (temp.length() >= 20) { startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content") .substring(0, 19); } else { startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content"); } temp = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content"); if (temp.length() >= 20) { endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content") .substring(0, 19); } else { endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content"); } 
ticketURL = url + "#tickets"; // TODO Tags to be modified to fit in the format of Open Event "topic" tagSection = htmlPage.getElementsByAttributeValue("data-automation", "ListingsBreadcrumbs"); tagSpan = tagSection.select("span"); topic = ""; int iterator = 0, k = 0; for (Element e : tagSpan) { if (iterator % 2 == 0) { tags[k][1] = "www.eventbrite.com" + e.select("a.js-d-track-link.badge.badge--tag.l-mar-top-2").attr("href"); } else { tags[k][0] = e.text(); k++; } iterator++; } creator.put("email", ""); creator.put("id", "1"); // By Default temp = htmlPage.getElementsByAttributeValue("property", "event:location:latitude").attr("content"); if (temp.length() > 0) { latitude = Float.valueOf( htmlPage.getElementsByAttributeValue("property", "event:location:latitude").attr("content")); } temp = htmlPage.getElementsByAttributeValue("property", "event:location:longitude").attr("content"); if (temp.length() > 0) { longitude = Float.valueOf( htmlPage.getElementsByAttributeValue("property", "event:location:longitude").attr("content")); } // TODO This returns: "events.event" which is not supported by Open // Event Generator // eventType = htmlPage.getElementsByAttributeValue("property", // "og:type").attr("content"); String organizerName = null; String organizerLink = null; String organizerProfileLink = null; String organizerWebsite = null; String organizerContactInfo = null; String organizerDescription = null; String organizerFacebookFeedLink = null; String organizerTwitterFeedLink = null; String organizerFacebookAccountLink = null; String organizerTwitterAccountLink = null; temp = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text(); if (temp.length() >= 5) { organizerName = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text() .substring(4); } else { organizerName = ""; } organizerLink = url + "#listing-organizer"; organizerProfileLink = htmlPage .getElementsByAttributeValue("class", "js-follow js-follow-target follow-me 
fx--fade-in is-hidden") .attr("href"); organizerContactInfo = url + "#lightbox_contact"; Document orgProfilePage = null; try { orgProfilePage = Jsoup.connect(organizerProfileLink).get(); } catch (Exception e) { e.printStackTrace(); } if (orgProfilePage != null) { t = orgProfilePage.getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website"); if (t != null) { organizerWebsite = orgProfilePage .getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website").text(); } else { organizerWebsite = ""; } t = orgProfilePage.select("div.js-long-text.organizer-description"); if (t != null) { organizerDescription = orgProfilePage.select("div.js-long-text.organizer-description").text(); } else { organizerDescription = ""; } organizerFacebookFeedLink = organizerProfileLink + "#facebook_feed"; organizerTwitterFeedLink = organizerProfileLink + "#twitter_feed"; t = orgProfilePage.getElementsByAttributeValue("class", "fb-page"); if (t != null) { organizerFacebookAccountLink = orgProfilePage.getElementsByAttributeValue("class", "fb-page") .attr("data-href"); } else { organizerFacebookAccountLink = ""; } t = orgProfilePage.getElementsByAttributeValue("class", "twitter-timeline"); if (t != null) { organizerTwitterAccountLink = orgProfilePage .getElementsByAttributeValue("class", "twitter-timeline").attr("href"); } else { organizerTwitterAccountLink = ""; } } JSONArray socialLinks = new JSONArray(); JSONObject fb = new JSONObject(); fb.put("id", "1"); fb.put("name", "Facebook"); fb.put("link", organizerFacebookAccountLink); socialLinks.put(fb); JSONObject tw = new JSONObject(); tw.put("id", "2"); tw.put("name", "Twitter"); tw.put("link", organizerTwitterAccountLink); socialLinks.put(tw); JSONArray jsonArray = new JSONArray(); JSONObject event = new JSONObject(); event.put("event_url", url); event.put("id", eventID); event.put("name", eventName); event.put("description", eventDescription); event.put("color", eventColor); event.put("background_url", imageLink); 
event.put("closing_datetime", closingDateTime); event.put("creator", creator); event.put("email", email); event.put("location_name", eventLocation); event.put("latitude", latitude); event.put("longitude", longitude); event.put("start_time", startingTime); event.put("end_time", endingTime); event.put("logo", imageLink); event.put("organizer_description", organizerDescription); event.put("organizer_name", organizerName); event.put("privacy", privacy); event.put("schedule_published_on", schedulePublishedOn); event.put("state", state); event.put("type", eventType); event.put("ticket_url", ticketURL); event.put("social_links", socialLinks); event.put("topic", topic); jsonArray.put(event); JSONObject org = new JSONObject(); org.put("organizer_name", organizerName); org.put("organizer_link", organizerLink); org.put("organizer_profile_link", organizerProfileLink); org.put("organizer_website", organizerWebsite); org.put("organizer_contact_info", organizerContactInfo); org.put("organizer_description", organizerDescription); org.put("organizer_facebook_feed_link", organizerFacebookFeedLink); org.put("organizer_twitter_feed_link", organizerTwitterFeedLink); org.put("organizer_facebook_account_link", organizerFacebookAccountLink); org.put("organizer_twitter_account_link", organizerTwitterAccountLink); jsonArray.put(org); JSONArray microlocations = new JSONArray(); jsonArray.put(new JSONObject().put("microlocations", microlocations)); JSONArray customForms = new JSONArray(); jsonArray.put(new JSONObject().put("customForms", customForms)); JSONArray sessionTypes = new JSONArray(); jsonArray.put(new JSONObject().put("sessionTypes", sessionTypes)); JSONArray sessions = new JSONArray(); jsonArray.put(new JSONObject().put("sessions", sessions)); JSONArray sponsors = new JSONArray(); jsonArray.put(new JSONObject().put("sponsors", sponsors)); JSONArray speakers = new JSONArray(); jsonArray.put(new JSONObject().put("speakers", speakers)); JSONArray tracks = new JSONArray(); 
jsonArray.put(new JSONObject().put("tracks", tracks)); String userHome = System.getProperty("user.home"); String path = userHome + "/Downloads/EventBriteInfo"; new File(path).mkdir(); try (FileWriter file = new FileWriter(path + "/event.json")) { file.write(event.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/org.json")) { file.write(org.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/social_links.json")) { file.write(socialLinks.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/microlocations.json")) { file.write(microlocations.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/custom_forms.json")) { file.write(customForms.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/session_types.json")) { file.write(sessionTypes.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/sessions.json")) { file.write(sessions.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/sponsors.json")) { file.write(sponsors.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/speakers.json")) { file.write(speakers.toString()); } catch (IOException e1) { e1.printStackTrace(); } try (FileWriter file = new FileWriter(path + "/tracks.json")) { file.write(tracks.toString()); } catch (IOException e1) { e1.printStackTrace(); } SusiThought json = new SusiThought(); json.setData(jsonArray); return json; }
From source file:org.loklak.api.search.GithubProfileScraper.java
/**
 * Scrapes the GitHub page of a user or an organization into a JSON object.
 * <p>
 * When the page cannot be fetched, the user-search API is queried instead
 * and its answer is returned as the only entry of the data array.
 *
 * @param profile
 *      Name of the profile, as it appears in its GitHub address.
 * @return
 *      A thought whose data array holds the scraped profile, or the search
 *      answer in the fall-back case (the array is then empty if the search
 *      failed too).
 */
public static SusiThought scrapeGithub(String profile) {
    Document html = null;
    JSONObject githubProfile = new JSONObject();

    try {
        html = Jsoup.connect("https://github.com/" + profile).get();
    } catch (IOException e) {
        // fall back to the search API; the stream is closed once it has been parsed
        JSONArray arr = new JSONArray();
        try (java.io.InputStream in = new URI("https://api.github.com/search/users?q=" + profile).toURL()
                .openStream()) {
            arr.put(new JSONObject(new JSONTokener(in)));
        } catch (Exception e1) {
            e1.printStackTrace();
        }
        SusiThought json = new SusiThought();
        json.setData(arr);
        return json;
    }

    String avatarUrl = html.getElementsByAttributeValue("class", "avatar rounded-2").attr("src");
    githubProfile.put("avatar_url", avatarUrl);

    String fullName = html.getElementsByAttributeValue("class", "vcard-fullname").text();
    githubProfile.put("full_name", fullName);

    String userName = html.getElementsByAttributeValue("class", "vcard-username").text();
    githubProfile.put("user_name", userName);

    String bio = html.getElementsByAttributeValue("class", "user-profile-bio").text();
    githubProfile.put("bio", bio);

    String atomFeedLink = html.getElementsByAttributeValue("type", "application/atom+xml").attr("href");
    githubProfile.put("atom_feed_link", "https://github.com" + atomFeedLink);

    String worksFor = html.getElementsByAttributeValue("itemprop", "worksFor").text();
    githubProfile.put("works_for", worksFor);

    String homeLocation = html.getElementsByAttributeValue("itemprop", "homeLocation").attr("title");
    githubProfile.put("home_location", homeLocation);

    String email = html.getElementsByAttributeValue("itemprop", "email").text();
    githubProfile.put("email", email);

    String specialLink = html.getElementsByAttributeValue("itemprop", "url").text();
    githubProfile.put("special_link", specialLink);

    String joiningDate = html.getElementsByAttributeValue("class", "join-date").attr("datetime");
    githubProfile.put("joining_date", joiningDate);

    /* If Individual User */
    // The statistics come in the order followers, starred, following. Only
    // those actually present are read. The figure is taken from the <strong>
    // descendant: Element.tagName(String) renames the element rather than
    // selecting a child, so it must not be used for that.
    Elements stats = html.getElementsByAttributeValue("class", "vcard-stat");
    String[] statKeys = { "followers", "starred", "following" };
    for (int i = 0; i < statKeys.length && i < stats.size(); i++) {
        Element stat = stats.get(i);
        githubProfile.put(statKeys[i] + "_url", "https://github.com" + stat.attr("href"));
        githubProfile.put(statKeys[i], stat.getElementsByTag("strong").text());
    }

    String gistsUrl = "https://api.github.com/users/" + profile + "/gists";
    githubProfile.put("gists_url", gistsUrl);

    String subscriptionsUrl = "https://api.github.com/users/" + profile + "/subscriptions";
    githubProfile.put("subscriptions_url", subscriptionsUrl);

    String reposUrl = "https://api.github.com/users/" + profile + "/repos";
    githubProfile.put("repos_url", reposUrl);

    String eventsUrl = "https://api.github.com/users/" + profile + "/events";
    githubProfile.put("events_url", eventsUrl);

    String receivedEventsUrl = "https://api.github.com/users/" + profile + "/received_events";
    githubProfile.put("received_events_url", receivedEventsUrl);

    JSONArray organizations = new JSONArray();
    Elements orgs = html.getElementsByAttributeValue("itemprop", "follows");
    for (Element e : orgs) {
        JSONObject obj = new JSONObject();

        String label = e.attr("aria-label");
        obj.put("label", label);

        String link = e.attr("href");
        obj.put("link", "https://github.com" + link);

        String imgLink = e.children().attr("src");
        obj.put("img_link", imgLink);

        String imgAlt = e.children().attr("alt");
        obj.put("img_Alt", imgAlt);

        organizations.put(obj);
    }
    githubProfile.put("organizations", organizations);

    /* If Organization */
    // The attributes are read directly from the navigation children (no
    // renaming of the elements); missing children are skipped.
    Elements navigation = html.getElementsByAttributeValue("class", "orgnav");
    for (Element e : navigation) {
        Elements items = e.children();
        if (items.size() > 0) {
            String orgRepositoriesLink = items.get(0).attr("href");
            githubProfile.put("organization_respositories_link", "https://github.com" + orgRepositoriesLink);
        }
        if (items.size() > 1) {
            Element peopleItem = items.get(1);

            String orgPeopleLink = peopleItem.attr("href");
            githubProfile.put("organization_people_link", "https://github.com" + orgPeopleLink);

            Elements peopleParts = peopleItem.children();
            if (peopleParts.size() > 1) {
                String orgPeopleNumber = peopleParts.get(1).text();
                githubProfile.put("organization_people_number", orgPeopleNumber);
            }
        }
    }

    JSONArray jsonArray = new JSONArray();
    jsonArray.put(githubProfile);

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;
}
From source file:org.loklak.api.search.MeetupsCrawlerService.java
/**
 * Scrapes a meetup group page into a JSON object: group name, type,
 * description, location, image, topics and recent meetups.
 *
 * @param url
 *      Address of the group page.
 * @return
 *      A thought whose data array holds the scraped group object. The data
 *      array is empty when the page could not be fetched.
 */
public static SusiThought crawlMeetups(String url) {
    Document meetupHTML = null;
    try {
        meetupHTML = Jsoup.connect(url).userAgent("Mozilla)").get();
    } catch (Exception e) {
        e.printStackTrace();
    }

    JSONArray meetupsCrawlerResultArray = new JSONArray();
    SusiThought json = new SusiThought();
    if (meetupHTML == null) {
        // nothing to scrape: answer with an empty result instead of failing on a null page
        json.setData(meetupsCrawlerResultArray);
        return json;
    }

    JSONObject result = new JSONObject();
    result.put("group_name", metaContent(meetupHTML, "og:title"));
    result.put("meetup_type", metaContent(meetupHTML, "og:type"));

    // getElementById gives null when the page has no such element
    Element descriptionElt = meetupHTML.getElementById("groupDesc");
    String groupDescription = (descriptionElt == null) ? "" : descriptionElt.text();
    result.put("group_description", groupDescription);

    result.put("group_locality", metaContent(meetupHTML, "og:locality"));
    result.put("group_country_code", metaContent(meetupHTML, "og:country-name"));
    result.put("group_latitude", metaContent(meetupHTML, "og:latitude"));
    result.put("group_longitude", metaContent(meetupHTML, "og:longitude"));
    result.put("group_imageLink", metaContent(meetupHTML, "og:image"));

    // topics: text of every link of the topic box, in page order
    JSONArray groupTopics = new JSONArray();
    Element topicBox = meetupHTML.getElementById("topic-box-2012");
    if (topicBox != null) {
        for (Element topicLink : topicBox.getElementsByTag("a")) {
            groupTopics.put(topicLink.text());
        }
    }
    result.put("group_topics", groupTopics);

    // Recent meetups: the paragraphs are read by groups of three
    // (date and time / attendance and review / information).
    // NOTE(review): as with the indexing used so far, the last group is not
    // reported, whether it is complete or not - confirm this is intended.
    JSONArray recentMeetups = new JSONArray();
    Element recentMeetupsElt = meetupHTML.getElementById("recentMeetups");
    if (recentMeetupsElt != null) {
        Elements paragraphs = recentMeetupsElt.getElementsByTag("p");
        int groupCount = (paragraphs.size() + 2) / 3;
        for (int k = 1; k < groupCount; k++) {
            int base = 3 * (k - 1);
            JSONObject obj = new JSONObject();
            obj.put("recent_meetup_number", k);
            obj.put("date_time", paragraphs.get(base).text());
            obj.put("attendance", paragraphs.get(base + 1).text());
            obj.put("information", paragraphs.get(base + 2).text());
            recentMeetups.put(obj);
        }
    }
    result.put("recent_meetups", recentMeetups);

    meetupsCrawlerResultArray.put(result);
    json.setData(meetupsCrawlerResultArray);
    return json;
}

/** Returns the {@code content} attribute of the first element whose {@code property} attribute has the given value ("" if none). */
private static String metaContent(Document page, String property) {
    return page.getElementsByAttributeValue("property", property).attr("content");
}
From source file:org.loklak.api.search.WeiboUserInfo.java
@Override protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { Query post = RemoteAccess.evaluate(request); // manage DoS if (post.isDoS_blackout()) { response.sendError(503, "your request frequency is too high"); return;//ww w.j a v a 2 s . c o m } String url = post.get("url", ""); JSONObject obj = new JSONObject(); Document doc = Jsoup.connect(url).get(); Elements infos; infos = doc.getElementsByAttributeValue("class", "li_1 clearfix"); if (infos != null) { Element info; String profile; for (int i = 0; i < infos.size(); i++) { info = infos.get(i); if (info.getElementsByAttributeValueContaining("href", "loc=infblog").size() == 0) { profile = info.getElementsByAttributeValue("class", "pt_detail").first().text().trim(); obj.put("pro", profile); switch (info.getElementsByAttributeValue("class", "pt_title S_txt2").first().text()) { case "Nickname": obj.put("username", profile); break; case "Location": obj.put("Address", profile); break; case "Gender": obj.put("Gender", profile); break; case "??": obj.put("Sexuality", profile.replace("t", "").replace("rn", "")); break; case "": obj.put("Relationship", profile.replace("t", "").replace("rn", "")); break; case "Birthday": obj.put("Birthday", profile); break; case "": obj.put("Blood", profile); break; case "Domain Name": if (info.getElementsByAttributeValueContaining("href", "loc=infdomain").size() != 0) profile = info.select("a").text(); obj.put("Personaldomain", profile); break; case "": obj.put("Profile", profile); break; case "Registration": obj.put("Registertime", profile.replace("t", "").replace("rn", "")); break; case "Email": obj.put("Email", profile); break; case "QQ": obj.put("Qq", profile); break; case "": obj.put("College", profile.replace("t", "").replace("rn", "")); break; case "Tags": obj.put("Tag", profile.replace("t", "").replace("rn", "")); break; } } else { String blogurl = info.select("a").text(); obj.put("Blog", blogurl); } } } //print 
JSON response.setCharacterEncoding("UTF-8"); PrintWriter sos = response.getWriter(); sos.print(obj.toString(2)); sos.println(); }
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Get movie meta data from aebn.net./*from w w w . j a v a 2 s . c o m*/ * */ @Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("AEBN: getMetadata() {}", options); // check if there is already meta data present in the result if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) { LOGGER.debug("AEBN: return metadata from cache"); return options.getResult().getMediaMetadata(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Elements elements = null; Element element = null; Integer aebnId = 0; // get AebnId from previous search result if ((options.getResult() != null) && (options.getResult().getId() != null)) { aebnId = Integer.parseInt(options.getResult().getId()); LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId); // preset some values from search result (if there is one) // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy". 
md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, StrgUtils.removeCommonSortableName(options.getResult().getTitle())); } // or get AebnId from options if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) { LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID)); aebnId = Integer.parseInt(options.getId(AEBNID)); } if (!isValidAebnId(aebnId)) { LOGGER.warn("AEBN: no or incorrect aebnId, aborting"); return md; } // ID md.setId(providerInfo.getId(), aebnId); LOGGER.debug("AEBN: aebnId({})", aebnId); // Base download url for data scraping String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId; String locale = options.getLanguage().name(); if (!StringUtils.isBlank(locale)) { downloadUrl = downloadUrl + "&locale=" + locale; LOGGER.debug("AEBN: used locale({})", locale); } // begin download and scrape try { LOGGER.debug("AEBN: download movie detail page"); Url url = new Url(downloadUrl); InputStream in = url.getInputStream(); Document document = Jsoup.parse(in, "UTF-8", ""); in.close(); // Title // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1> LOGGER.debug("AEBN: parse title"); elements = document.getElementsByAttributeValue("class", "md-movieTitle"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieTitle = cleanString(element.text()); LOGGER.debug("AEBN: title({})", movieTitle); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // Poster // front cover: // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg"; md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl); // Fanart/Background // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg // <img class="sceneThumbnail" alt="Scene Thumbnail" 
title="Scene Thumbnail" onError="..." // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." /> LOGGER.debug("AEBN: parse fanart / scene thumbs"); elements = document.getElementsByAttributeValue("class", "SceneThumbnail"); LOGGER.debug("AEBN: {} elements found", elements.size()); int i = 1; for (Element anchor : elements) { String backgroundUrl = anchor.attr("src"); LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl); md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl); i++; } // Runtime LOGGER.debug("AEBN: parse runtime"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieRuntime = cleanString(element.attr("content")); movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M"); LOGGER.debug("AEBN: runtime({})", movieRuntime); md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime); } // Year LOGGER.debug("AEBN: parse year"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieYear = cleanString(element.attr("content")); movieYear = StrgUtils.substr(movieYear, "(\\d+)-"); LOGGER.debug("AEBN: year({})", movieYear); md.storeMetadata(MediaMetadata.YEAR, movieYear); } // Series (Collection) LOGGER.debug("AEBN: parse collection"); elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieCollection = cleanString(element.text()); // Fake a TMDB_SET based on the hash value of the collection name int movieCollectionHash = movieCollection.hashCode(); 
md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection); md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash); LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash); } // Studio LOGGER.debug("AEBN: parse studio"); elements = document.getElementsByAttributeValue("id", "md-details") .select("[itemprop=productionCompany]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String movieStudio = cleanString(elements.first().text()); LOGGER.debug("AEBN: studio({})", movieStudio); md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio); } // Genre LOGGER.debug("AEBN: parse genre"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]"); for (Element g : elements) { md.addGenre(getTmmGenre(g.text())); } // add basic genre, since all genres at AEBN could be summarised // into this one md.addGenre(MediaGenres.EROTIC); // Certification // no data scrapeable---but obviously it's adult only, so simply // generate it String movieCertification = null; Certification certification = null; String country = options.getCountry().getAlpha2(); LOGGER.debug("AEBN: generate certification for {}", country); // @formatter:off if (country.equals("DE")) { movieCertification = "FSK 18"; } if (country.equals("US")) { movieCertification = "NC-17"; } if (country.equals("GB")) { movieCertification = "R18"; } if (country.equals("FR")) { movieCertification = "18"; } if (country.equals("ES")) { movieCertification = "PX"; } if (country.equals("JP")) { movieCertification = "R18+"; } if (country.equals("IT")) { movieCertification = "V.M.18"; } if (country.equals("NL")) { movieCertification = "16"; } // @formatter:on certification = Certification.getCertification(options.getCountry(), movieCertification); if (certification != null) { LOGGER.debug("AEBN: certification({})", certification); md.addCertification(certification); } // Plot and Tagline 
LOGGER.debug("AEBN: parse plot"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String moviePlot = cleanString(elements.first().text()); md.storeMetadata(MediaMetadata.PLOT, moviePlot); // no separate tagline available, so extract the first sentence // from the movie plot String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])"); LOGGER.debug("AEBN: tagline(" + movieTagline + ")"); md.storeMetadata(MediaMetadata.TAGLINE, movieTagline); } // Actors LOGGER.debug("AEBN: parse actors"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]"); LOGGER.debug("AEBN: {} actors found", elements.size()); for (Element anchor : elements) { String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)"); String actorname = cleanString(anchor.select("[itemprop=name]").first().text()); String actordetailsurl = BASE_DATAURL + anchor.attr("href"); if (!actorname.isEmpty()) { LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname, actordetailsurl); MediaCastMember cm = new MediaCastMember(); cm.setType(MediaCastMember.CastType.ACTOR); cm.setName(actorname); if (!actorid.isEmpty()) { cm.setId(actorid); } // Actor detail page try { Url starurl = new Url(actordetailsurl); InputStream starurlstream = starurl.getInputStream(); Document stardocument = Jsoup.parse(starurlstream, "UTF-8", ""); starurlstream.close(); Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo"); if (elements2.size() == 0) { LOGGER.debug("AEBN: no additional actor details found"); } else { // Actor image String actorimage = elements2.select("[itemprop=image]").first().attr("src"); LOGGER.debug("AEBN: actor image({})", actorimage); if (!actorimage.isEmpty()) { cm.setImageUrl(actorimage); } // Actor 'fanart' images // unsure if this is ever shown in tmm elements2 = 
stardocument.getElementsByAttributeValue("class", "StarDetailGallery") .select("a"); LOGGER.debug("AEBN: {} gallery images found", elements2.size()); for (Element thumbnail : elements2) { LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href")); cm.addFanart(thumbnail.attr("href")); } } } catch (Exception e) { LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e); } md.addCastMember(cm); } } // Director LOGGER.debug("AEBN: parse director"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)"); String directorname = cleanString(elements.select("[itemprop=name]").first().text()); if (!directorname.isEmpty()) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(directorname); if (!directorid.isEmpty()) { cm.setId(directorid); } cm.setImageUrl(""); md.addCastMember(cm); LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname); } } // Original Title // if we have no original title, just copy the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } } catch (Exception e) { LOGGER.error("AEBN: Error parsing {}: {}", options.getResult().getUrl(), e); } return md; }
From source file:org.tinymediamanager.scraper.hdtrailersnet.HDTrailersNet.java
@Override public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception { LOGGER.debug("getTrailers() " + options.toString()); List<MediaTrailer> trailers = new ArrayList<MediaTrailer>(); MediaMetadata md = options.getMetadata(); if (md == null || StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { LOGGER.warn("no originalTitle served"); return trailers; }/*from w ww . j a va2 s .co m*/ String ot = md.getStringValue(MediaMetadata.ORIGINAL_TITLE); // check if the original title is not empty if (StringUtils.isEmpty(ot)) { return trailers; } // best guess String search = "http://www.hd-trailers.net/movie/" + ot.replaceAll("[^a-zA-Z0-9]", "-").replaceAll("--", "-").toLowerCase() + "/"; try { LOGGER.debug("Guessed HD-Trailers Url: " + search); Url url = new CachedUrl(search); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); Elements tr = doc.getElementsByAttributeValue("itemprop", "trailer"); /* * <tr style="" itemprop="trailer" itemscope itemtype="http://schema.org/VideoObject"> <td class="bottomTableDate" rowspan="2">2012-03-30</td> * <td class="bottomTableName" rowspan="2"><span class="standardTrailerName" itemprop="name">Trailer 2</span> <a href= * "http://blog.hd-trailers.net/how-to-download-hd-trailers-from-apple/#workarounds" ><img src="http://static.hd-trailers.net/images/error.png" * width="16" height="16" style="border:0px;vertical-align:middle" alt="Apple Direct Download Unavailable" * title="Apple Direct Download Unavailable" /></a></td> * * <td class="bottomTableResolution"><a href= "http://trailers.apple.com/movies/sony_pictures/meninblack3/meninblack3-tlr2_h480p.mov" * rel="lightbox[res480p 852 480]" title="Men in Black 3 - Trailer 2 - 480p">480p</a></td> <td class="bottomTableResolution"><a href= * "http://trailers.apple.com/movies/sony_pictures/meninblack3/meninblack3-tlr2_h720p.mov" rel="lightbox[res720p 1280 720]" * title="Men in Black 3 - Trailer 2 - 720p">720p</a></td> <td 
class="bottomTableResolution"><a href= * "http://trailers.apple.com/movies/sony_pictures/meninblack3/meninblack3-tlr2_h1080p.mov" rel="lightbox[res1080p 1920 1080]" * title="Men in Black 3 - Trailer 2 - 1080p">1080p</a></td> <td class="bottomTableIcon"> <a * href="http://trailers.apple.com/trailers/sony_pictures/meninblack3/" target="_blank"> <img * src="http://static.hd-trailers.net/images/apple.ico" alt="Apple" height="16px" width="16px"/></a></td> </tr> <tr> <td * class="bottomTableFileSize">36 MB</td> <td class="bottomTableFileSize">111 MB</td> <td class="bottomTableFileSize">181 MB</td> <td * class="bottomTableEmbed"><a href= * "/embed-code.php?movieId=men-in-black-3&source=1&trailerName=Trailer 2&resolutions=480;720;1080" rel="lightbox[embed 600 600]" * title="Embed this video on your website">embed</a></td> </tr> */ for (Element t : tr) { try { String date = t.select("td.bottomTableDate").first().text(); String title = t.select("td.bottomTableName > span").first().text(); // apple.com urls currently not working (according to hd-trailers) String tr0qual = t.select("td.bottomTableResolution > a").get(0).text(); String tr0url = t.select("td.bottomTableResolution > a").get(0).attr("href"); MediaTrailer trailer = new MediaTrailer(); trailer.setName(title + " (" + date + ")"); trailer.setDate(date); trailer.setUrl(tr0url); trailer.setQuality(tr0qual); trailer.setProvider(getProviderFromUrl(tr0url)); LOGGER.debug(trailer.toString()); trailers.add(trailer); String tr1qual = t.select("td.bottomTableResolution > a").get(1).text(); String tr1url = t.select("td.bottomTableResolution > a").get(1).attr("href"); trailer = new MediaTrailer(); trailer.setName(title + " (" + date + ")"); trailer.setDate(date); trailer.setUrl(tr1url); trailer.setQuality(tr1qual); trailer.setProvider(getProviderFromUrl(tr1url)); LOGGER.debug(trailer.toString()); trailers.add(trailer); String tr2qual = t.select("td.bottomTableResolution > a").get(2).text(); String tr2url = 
t.select("td.bottomTableResolution > a").get(2).attr("href"); trailer = new MediaTrailer(); trailer.setName(title + " (" + date + ")"); trailer.setDate(date); trailer.setUrl(tr2url); trailer.setQuality(tr2qual); trailer.setProvider(getProviderFromUrl(tr2url)); LOGGER.debug(trailer.toString()); trailers.add(trailer); } catch (IndexOutOfBoundsException i) { // ignore parse errors per line LOGGER.warn("Error parsing HD-Trailers line. Possible missing quality."); } } } catch (Exception e) { LOGGER.error("cannot parse HD-Trailers movie: " + ot, e); // clear cache CachedUrl.removeCachedFileForUrl(search); } finally { } return trailers; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { LOGGER.debug("search() " + query.toString()); /*// ww w . j a va 2s . c o m * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/"); List<MediaSearchResult> result = new ArrayList<MediaSearchResult>(); String searchTerm = ""; if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) { searchTerm = query.get(SearchParam.IMDBID); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.QUERY); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.TITLE); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper options String language = query.get(SearchParam.LANGUAGE); String myear = query.get(SearchParam.YEAR); String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(imdbSite.getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! 
LOGGER.debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(CAT_TITLE); LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { CachedUrl url = new CachedUrl(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { LOGGER.debug("tried to fetch search response", e); // clear Cache CachedUrl.removeCachedFileForUrl(sb.toString()); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(); options.setImdbId(movieId); options.setLanguage(MediaLanguages.valueOf(language)); options.setCountry(CountryCode.valueOf(country)); options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO))); options.setScrapeImdbForeignLanguage( Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE))); md = getMetadata(options); if (!StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) { movieName = md.getStringValue(MediaMetadata.TITLE); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getStringValue(MediaMetadata.YEAR)); sr.setMetadata(md); sr.setScore(1); // and parse out the poster 
String posterUrl = ""; Element td = doc.getElementById("img_primary"); if (td != null) { Elements imgs = td.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; String year = ""; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); // stripped out .*\\(Video\\).*| Matcher matcher = unwanted.matcher(element.text()); if (matcher.find()) { continue; } // is there a localized name? 
(aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { year = matcher.group(1); break; } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); // populate extra args MetadataUtil.copySearchQueryToSearchResult(query, sr); if (movieId.equals(query.get(SearchParam.IMDBID))) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) { 
LOGGER.debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) { LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01"); score = score - 0.01f; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }