Example usage for org.jsoup.nodes Document getElementsByClass

List of usage examples for org.jsoup.nodes Document getElementsByClass

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.getElementsByClass.

Prototype

public Elements getElementsByClass(String className) 

Source Link

Document

Find elements that have this class, including or under this element.

Usage

From source file:ca.appvelopers.mcgillmobile.model.retrofit.CourseResultConverter.java

/**
 * Converts the HTML of a course search results page into a list of {@link CourseResult}s.
 *
 * <p>The term is read from the first element with class {@code staticheaders}; the courses are
 * read from the rows of the first {@code datadisplaytable} table. Rows with fewer than 19 cells
 * are skipped, and so are rows for which no subject or number could be parsed.
 *
 * @param value response body holding the HTML page
 * @return the parsed courses, possibly empty
 * @throws IOException declared by the converter contract; {@code value.string()} is the only
 *         call here that reads the body
 */
@Override
public List<CourseResult> convert(ResponseBody value) throws IOException {
    String html = value.string();
    List<CourseResult> courses = new ArrayList<>();
    // The second argument of Jsoup.parse(String, String) is a base URI, not a charset. The body
    // has already been decoded into a String, so the single-argument overload is the right one.
    Document document = Jsoup.parse(html);

    // Parse the term from the page header
    Element header = document.getElementsByClass("staticheaders").get(0);
    Term term = Term.parseTerm(header.childNode(2).toString());

    // Get the table in the form of a set of rows
    Element table = document.getElementsByClass("datadisplaytable").get(0).select("tbody").get(0);

    // Go through the rows in the table
    for (Element row : table.select("tr")) {
        // Check that there are at least 19 elements in the row
        Elements rowElements = row.select("td");
        if (rowElements.size() < 19) {
            // If there aren't, it must not be a course row
            continue;
        }

        // Default values, used for every column that is empty or cannot be parsed
        double credits = 99;
        String subject = null;
        String number = null;
        String title = "";
        String type = "";
        List<DayOfWeek> days = new ArrayList<>();
        int crn = 0;
        String instructor = "";
        String location = "";
        // So that the rounded start time will be 0
        LocalTime startTime = ScheduleConverter.getDefaultStartTime();
        LocalTime endTime = ScheduleConverter.getDefaultEndTime();
        int capacity = 0;
        int seatsRemaining = 0;
        int waitlistRemaining = 0;
        LocalDate startDate = LocalDate.now();
        LocalDate endDate = LocalDate.now();

        try {
            for (int i = 0; i < rowElements.size(); i++) {
                if (rowElements.get(i).toString().contains("&nbsp;")) {
                    // Empty cell: keep the default value
                    continue;
                }
                String rowString = rowElements.get(i).text();

                switch (i) {
                // CRN
                case 1:
                    crn = Integer.parseInt(rowString);
                    break;
                // Subject
                case 2:
                    subject = rowString;
                    break;
                // Number
                case 3:
                    number = rowString;
                    break;
                // Type
                case 5:
                    type = rowString;
                    break;
                // Number of credits
                case 6:
                    credits = Double.parseDouble(rowString);
                    break;
                // Course title
                case 7:
                    // Remove the extra period at the end of the course title
                    title = rowString.substring(0, rowString.length() - 1);
                    break;
                // Days of the week
                case 8:
                    if (rowString.equals("TBA")) {
                        // TBA: there is no time cell, so insert a placeholder at the time index
                        // to keep the following column indices correct, and skip over it
                        rowElements.add(9, null);
                        i++;
                    } else {
                        // One character per day, after replacing non-breaking spaces
                        rowString = rowString.replace('\u00A0', ' ').trim();
                        for (int k = 0; k < rowString.length(); k++) {
                            days.add(DayUtils.getDay(rowString.charAt(k)));
                        }
                    }
                    break;
                // Time, parsed as "<h>:<mm> <AM|PM>-<h>:<mm> <AM|PM>"
                case 9:
                    String[] times = rowString.split("-");
                    try {
                        int startHour = Integer.parseInt(times[0].split(" ")[0].split(":")[0]);
                        int startMinute = Integer.parseInt(times[0].split(" ")[0].split(":")[1]);
                        int endHour = Integer.parseInt(times[1].split(" ")[0].split(":")[0]);
                        int endMinute = Integer.parseInt(times[1].split(" ")[0].split(":")[1]);

                        // If it's PM, then add 12 hours to the hours for 24 hours format
                        // Make sure it isn't noon
                        String startPM = times[0].split(" ")[1];
                        if (startPM.equals("PM") && startHour != 12) {
                            startHour += 12;
                        }

                        String endPM = times[1].split(" ")[1];
                        if (endPM.equals("PM") && endHour != 12) {
                            endHour += 12;
                        }

                        startTime = LocalTime.of(startHour, startMinute);
                        endTime = LocalTime.of(endHour, endMinute);
                    } catch (NumberFormatException e) {
                        // Courses sometimes don't have assigned times
                        startTime = ScheduleConverter.getDefaultStartTime();
                        endTime = ScheduleConverter.getDefaultEndTime();
                    }
                    break;
                // Capacity
                case 10:
                    capacity = Integer.parseInt(rowString);
                    break;
                // Seats remaining
                case 12:
                    seatsRemaining = Integer.parseInt(rowString);
                    break;
                // Waitlist remaining
                case 15:
                    waitlistRemaining = Integer.parseInt(rowString);
                    break;
                // Instructor
                case 16:
                    instructor = rowString;
                    break;
                // Start/end date
                case 17:
                    Pair<LocalDate, LocalDate> dates = parseDateRange(term, rowString);
                    startDate = dates.first;
                    endDate = dates.second;
                    break;
                // Location
                case 18:
                    location = rowString;
                    break;
                }
            }
        } catch (Exception e) {
            // Any parsing failure stops processing of this row; whatever was parsed so far is kept
            Timber.e(e, "Course Results Parser Error");
        }

        // Don't add any courses for which the subject or number could not be read
        if (subject != null && number != null) {
            // Create a new course object and add it to list
            // TODO Should we be parsing the course section?
            courses.add(new CourseResult(term, subject, number, title, crn, "", startTime, endTime, days, type,
                    location, instructor, credits, startDate, endDate, capacity, seatsRemaining,
                    waitlistRemaining));
        }
    }

    return courses;
}

From source file:ca.zadrox.dota2esportticker.service.UpdateMatchService.java

/**
 * Downloads the match listing page, fetches every linked match page on a thread pool and
 * bulk-inserts the resulting series (and, if requested, result) rows.
 *
 * <p>Broadcasts {@code UPDATE_NO_CONNECTIVITY} and returns early when there is no connectivity;
 * otherwise broadcasts {@code UPDATE_STARTED} before and {@code UPDATE_COMPLETE} after the work,
 * even when the download fails.
 *
 * @param doResults whether links from the last table on the page (treated as finished matches)
 *                  are fetched and stored as results
 */
private void updateMatches(boolean doResults) {

    if (!checkForConnectivity()) {
        LocalBroadcastManager.getInstance(this).sendBroadcast(new Intent(UPDATE_NO_CONNECTIVITY));
        return;
    }

    LocalBroadcastManager.getInstance(this).sendBroadcast(new Intent(UPDATE_STARTED));

    final String BASE_URL = "http://www.gosugamers.net/dota2/gosubet";
    final String MATCH_LINK_URL_BASE = "http://www.gosugamers.net";

    try {

        String rawHtml = new OkHttpClient().newCall(new Request.Builder().url(BASE_URL).build()).execute()
                .body().string();

        // Only the markup between the two column markers is parsed. If either marker is missing
        // the page layout has changed; report that as an I/O failure instead of letting
        // substring() throw an unchecked exception past the handlers below.
        int sectionStart = rawHtml.indexOf("<div id=\"col1\" class=\"rows\">");
        int sectionEnd = rawHtml.indexOf("<div id=\"col2\" class=\"rows\">");
        if (sectionStart < 0 || sectionEnd < sectionStart) {
            throw new IOException("Unexpected page layout: match columns not found at " + BASE_URL);
        }
        Document doc = Jsoup.parse(rawHtml.substring(sectionStart, sectionEnd));

        Elements tables = doc.getElementsByClass("matches");

        // One list of absolute match links per non-empty table, in page order
        ArrayList<ArrayList<String>> matchLinks = new ArrayList<ArrayList<String>>(tables.size());

        int numSeries = 0;
        for (Element table : tables) {
            Elements links = table.getElementsByClass("match");
            if (links.size() != 0) {
                ArrayList<String> innerMatchLink = new ArrayList<String>(links.size());
                for (Element link : links) {
                    innerMatchLink.add(MATCH_LINK_URL_BASE + link.attr("href"));
                    numSeries++;
                }
                matchLinks.add(innerMatchLink);
            }
        }

        // needed if there are massive reschedules to update content properly.
        Uri resultsUri = MatchContract.SeriesEntry.buildSeriesUriWithAfterTime(TimeUtils.getUTCTime());

        Cursor c = getContentResolver().query(resultsUri,
                new String[] { MatchContract.SeriesEntry.COLUMN_GG_MATCH_PAGE }, null, null, null);

        // query() may return null, and the cursor must always be closed
        if (c != null) {
            try {
                // Stored links are merged into the first list; with no list there is nothing
                // to merge into (previously this threw IndexOutOfBoundsException)
                if (!matchLinks.isEmpty()) {
                    ArrayList<String> firstTableLinks = matchLinks.get(0);
                    while (c.moveToNext()) {
                        String storedLink = c.getString(0);
                        if (!firstTableLinks.contains(storedLink)) {
                            firstTableLinks.add(storedLink);
                        }
                    }
                }
            } finally {
                c.close();
            }
        }

        Iterator<ArrayList<String>> iterator = matchLinks.iterator();
        int numResults = 0;
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        ArrayList<Future<BundledMatchItem>> seriesItemFutures = new ArrayList<Future<BundledMatchItem>>(
                numSeries);

        LogUtils.LOGD(TAG, "Starting Retrieval, num elements gathered: " + numSeries);
        int i = 0;
        while (iterator.hasNext()) {

            ArrayList<String> matchList = iterator.next();
            // Links of the last list are the ones treated as having a result
            boolean hasResult = !iterator.hasNext();
            for (String matchUrl : matchList) {
                if (!doResults && hasResult) {
                    continue;
                } else if (hasResult) {
                    numResults++;
                }
                seriesItemFutures.add(executorService.submit(new MatchGetter(matchUrl, hasResult)));
                i++;
            }
        }
        executorService.shutdown();
        executorService.awaitTermination(20L, TimeUnit.SECONDS);
        LogUtils.LOGD(TAG, "Stopping Retrieval, elements submitted for fetching: " + i);

        ContentValues[] seriesEntries = new ContentValues[i];
        ContentValues[] resultEntries = new ContentValues[numResults];
        int seriesEntryWriteIndex = 0;
        int resultEntryWriteIndex = 0;

        for (Future<BundledMatchItem> seriesItemFuture : seriesItemFutures) {
            try {
                BundledMatchItem seriesItem = seriesItemFuture.get();
                if (seriesItem != null) {
                    seriesEntries[seriesEntryWriteIndex] = seriesItem.mMatch;
                    seriesEntryWriteIndex++;
                    if (seriesItem.hasResult) {
                        resultEntries[resultEntryWriteIndex] = seriesItem.mResult;
                        resultEntryWriteIndex++;
                    }
                }
            } catch (ExecutionException e) {
                Log.e(TAG, "Should never get here", e);
            }
        }

        // Tasks that produced no item leave unused slots at the end of the arrays; drop them so
        // that no null ContentValues are handed to bulkInsert
        if (seriesEntryWriteIndex < seriesEntries.length) {
            seriesEntries = java.util.Arrays.copyOf(seriesEntries, seriesEntryWriteIndex);
        }
        if (resultEntryWriteIndex < resultEntries.length) {
            resultEntries = java.util.Arrays.copyOf(resultEntries, resultEntryWriteIndex);
        }

        this.getContentResolver().bulkInsert(MatchContract.SeriesEntry.CONTENT_URI, seriesEntries);

        if (doResults) {
            this.getContentResolver().bulkInsert(MatchContract.ResultEntry.CONTENT_URI, resultEntries);
        }

        PrefUtils.setLastUpdateTime(this, TimeUtils.getUTCTime());

    } catch (IOException e) {
        Log.e(TAG, e.getMessage(), e);
    } catch (InterruptedException e) {
        Log.e(TAG, "Interrupted while waiting for match retrieval", e);
        // Restore the interrupt flag so that callers can observe the interruption
        Thread.currentThread().interrupt();
    }

    LocalBroadcastManager.getInstance(this).sendBroadcast(new Intent(UPDATE_COMPLETE));

    PrefUtils.setLastResultsUpdateTime(this, TimeUtils.getUTCTime());
}

From source file:im.ene.lab.attiq.ui.activities.ItemDetailActivity.java

/**
 * Renders the comments carried by {@code event} into the comments view.
 *
 * <p>With at least one comment, the count and info labels are updated and an HTML page is built
 * from the {@code html/comments.html} and {@code html/comment.html} assets and loaded into the
 * comments view. Without comments, the labels show zero and the view is hidden.
 *
 * @param event event holding the comments to display
 */
@SuppressWarnings("unused")
public void onEventMainThread(ItemCommentsEvent event) {
    if (!UIUtil.isEmpty(event.comments)) {
        mCommentsView.setVisibility(View.VISIBLE);
        List<Comment> comments = event.comments;

        mCommentCount.setText(String.valueOf(comments.size()));

        String info = comments.size() == 1 ? getString(R.string.comment_singular)
                : getString(R.string.comment_plural);
        // FIXME should use plural strings
        mCommentInfo.setText(getString(R.string.article_comment, comments.size(), info));

        try {
            String html = IOUtil.readAssets("html/comments.html");

            Document fullBody = Jsoup.parse(html);
            Element content = fullBody.getElementById("content");

            // The per-comment template is identical for every comment, so read it only once
            String commentTemplate = IOUtil.readAssets("html/comment.html");

            for (Comment comment : comments) {
                String commentHtml = commentTemplate
                        .replace("{user_icon_url}", comment.getUser().getProfileImageUrl())
                        .replace("{user_name}", comment.getUser().getId())
                        .replace("{comment_time}", TimeUtil.commentTime(comment.getCreatedAt()))
                        .replace("{article_uuid}", mItemUuid).replace("{comment_id}", comment.getId());

                Document commentDoc = Jsoup.parse(commentHtml);
                Element eComment = commentDoc.getElementsByClass("comment-box").first();
                eComment.getElementsByClass("message").first().append(comment.getRenderedBody());
                // remove comment edit block if it is not from current user
                if (mMyProfile == null || !mMyProfile.getId().equals(comment.getUser().getId())) {
                    String commentId = "comment_{comment_id}_{user_name}"
                            .replace("{comment_id}", comment.getId())
                            .replace("{user_name}", comment.getUser().getId());
                    Element commentEditor = commentDoc.getElementById(commentId);
                    // getElementById returns null when no such element exists
                    if (commentEditor != null) {
                        commentEditor.remove();
                    }
                }

                content.appendChild(eComment);
            }

            String result = fullBody.outerHtml();
            mCommentsView.loadDataWithBaseURL("http://qiita.com/", result, null, null, null);
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        mCommentCount.setText("0");
        mCommentInfo.setText(getString(R.string.article_comment, 0, getString(R.string.comment_plural)));
        mCommentsView.setVisibility(View.GONE);
    }
}

From source file:net.yoomai.virgo.spider.parsers.ICISParser.java

/**
 * Parses {@code htmlFile} as UTF-8 HTML and logs every element with class
 * {@code price-datalist}.
 *
 * <p>No estimates are extracted yet: the method always returns {@code null}.
 *
 * @param htmlFile the HTML file to read
 * @return always {@code null}
 */
@Override
public List<Estimate> parser(File htmlFile) {
    Document doc;
    try {
        doc = Jsoup.parse(htmlFile, "UTF-8");
    } catch (IOException e) {
        log.error(e.getMessage() + " : " + e.getCause());
        doc = null;
    }

    log.info("doc: " + doc);
    if (doc == null) {
        // The file could not be read; there is nothing to log
        return null;
    }

    for (Element priceList : doc.getElementsByClass("price-datalist")) {
        log.info(priceList.toString());
    }

    return null;
}

From source file:org.keionline.keionline.ArticleView.java

/**
 * Downloads the page at {@code url} and returns its first element with class {@code node} as
 * HTML, prefixed with {@code CSS} and followed by a closing {@code </body>} tag.
 *
 * <p>Image sources and link targets inside that element are rewritten to absolute URLs, and a
 * horizontal rule is inserted after the first {@code submitted} element if there is one. The
 * result is also stored in the {@code content} field.
 *
 * @param url address of the page to fetch
 * @return the assembled HTML
 * @throws IOException if the page cannot be fetched or contains no {@code node} element
 */
private String getContent(String url) throws IOException {
    Document doc = Jsoup.connect(url).userAgent("Mozilla").get();
    Element data = doc.getElementsByClass("node").first();
    if (data == null) {
        // first() returns null when nothing matches; report it instead of failing with an NPE
        throw new IOException("No element with class \"node\" found at " + url);
    }

    // Change the links to absolute!! so that images work
    for (Element image : data.select("img")) {
        image.attr("src", image.absUrl("src"));
    }
    for (Element link : data.select("a")) {
        link.attr("href", link.absUrl("href"));
    }

    Element info = data.getElementsByClass("submitted").first();
    if (info != null) {
        info.after("<hr>");
    }

    String cont = CSS + data.toString() + "</body>";
    content = cont;
    return cont;
}

From source file:org.loklak.api.search.EventBriteCrawlerService.java

/**
 * Scrapes an Eventbrite event page (and the organizer profile page it links to) and returns the
 * collected data.
 *
 * <p>The returned thought holds a JSON array with the event object, the organizer object and
 * seven objects wrapping empty arrays (microlocations, customForms, sessionTypes, sessions,
 * sponsors, speakers, tracks). As a side effect, the individual JSON documents are written to
 * {@code <user.home>/Downloads/EventBriteInfo}.
 *
 * <p>If the event page cannot be fetched, a thought with an empty data array is returned and no
 * files are written.
 *
 * @param url address of the Eventbrite event page
 * @return the scraped data
 */
public static SusiThought crawlEventBrite(String url) {
    Document htmlPage = null;

    try {
        htmlPage = Jsoup.connect(url).get();
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (htmlPage == null) {
        // Without the page there is nothing to scrape (this used to end in a NullPointerException)
        SusiThought empty = new SusiThought();
        empty.setData(new JSONArray());
        return empty;
    }

    // Values that are not scraped yet
    // TODO Fetch Event Color
    String eventColor = null;
    String closingDateTime = null;
    String schedulePublishedOn = null;
    String email = null;
    String topic = "";

    String privacy = "public"; // By Default
    String state = "completed"; // By Default
    // TODO og:type returns "events.event", which is not supported by Open Event Generator
    String eventType = "";

    String eventID = htmlPage.getElementsByTag("body").attr("data-event-id");
    String eventName = htmlPage.getElementsByClass("listing-hero-body").text();
    String eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text();
    String imageLink = htmlPage.getElementsByTag("picture").attr("content");
    String eventLocation = htmlPage.select("p.listing-map-card-street-address.text-default").text();

    // Start/end times are cut down to their first 19 characters when they are 20 or more long
    String startingTime = htmlPage.getElementsByAttributeValue("property", "event:start_time").attr("content");
    if (startingTime.length() >= 20) {
        startingTime = startingTime.substring(0, 19);
    }

    String endingTime = htmlPage.getElementsByAttributeValue("property", "event:end_time").attr("content");
    if (endingTime.length() >= 20) {
        endingTime = endingTime.substring(0, 19);
    }

    String ticketURL = url + "#tickets";

    // TODO Tags to be modified to fit in the format of Open Event "topic"
    // The breadcrumb spans alternate between link (even index) and label (odd index). The pairs
    // are collected here but not yet part of the output.
    Elements tagSpan = htmlPage.getElementsByAttributeValue("data-automation", "ListingsBreadcrumbs")
            .select("span");
    String[][] tags = new String[5][2];
    int iterator = 0, k = 0;
    for (Element e : tagSpan) {
        if (k >= tags.length) {
            // More breadcrumb entries than slots: stop instead of overrunning the array
            break;
        }
        if (iterator % 2 == 0) {
            tags[k][1] = "www.eventbrite.com"
                    + e.select("a.js-d-track-link.badge.badge--tag.l-mar-top-2").attr("href");
        } else {
            tags[k][0] = e.text();
            k++;
        }
        iterator++;
    }

    JSONObject creator = new JSONObject();
    creator.put("email", "");
    creator.put("id", "1"); // By Default

    Float latitude = null;
    String latitudeText = htmlPage.getElementsByAttributeValue("property", "event:location:latitude")
            .attr("content");
    if (latitudeText.length() > 0) {
        latitude = Float.valueOf(latitudeText);
    }

    Float longitude = null;
    String longitudeText = htmlPage.getElementsByAttributeValue("property", "event:location:longitude")
            .attr("content");
    if (longitudeText.length() > 0) {
        longitude = Float.valueOf(longitudeText);
    }

    // The first four characters of the organizer label are dropped; shorter labels become ""
    String organizerName = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text();
    organizerName = organizerName.length() >= 5 ? organizerName.substring(4) : "";

    String organizerLink = url + "#listing-organizer";
    String organizerProfileLink = htmlPage
            .getElementsByAttributeValue("class", "js-follow js-follow-target follow-me fx--fade-in is-hidden")
            .attr("href");
    String organizerContactInfo = url + "#lightbox_contact";

    // These stay null when the organizer profile page cannot be fetched
    String organizerWebsite = null;
    String organizerDescription = null;
    String organizerFacebookFeedLink = null;
    String organizerTwitterFeedLink = null;
    String organizerFacebookAccountLink = null;
    String organizerTwitterAccountLink = null;

    Document orgProfilePage = null;

    try {
        orgProfilePage = Jsoup.connect(organizerProfileLink).get();
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (orgProfilePage != null) {
        // jsoup queries return an empty Elements (never null) when nothing matches, and
        // text()/attr() on it yield "", so no further checks are needed here
        organizerWebsite = orgProfilePage
                .getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website").text();
        organizerDescription = orgProfilePage.select("div.js-long-text.organizer-description").text();

        organizerFacebookFeedLink = organizerProfileLink + "#facebook_feed";
        organizerTwitterFeedLink = organizerProfileLink + "#twitter_feed";

        organizerFacebookAccountLink = orgProfilePage.getElementsByAttributeValue("class", "fb-page")
                .attr("data-href");
        organizerTwitterAccountLink = orgProfilePage
                .getElementsByAttributeValue("class", "twitter-timeline").attr("href");
    }

    JSONArray socialLinks = new JSONArray();

    JSONObject fb = new JSONObject();
    fb.put("id", "1");
    fb.put("name", "Facebook");
    fb.put("link", organizerFacebookAccountLink);
    socialLinks.put(fb);

    JSONObject tw = new JSONObject();
    tw.put("id", "2");
    tw.put("name", "Twitter");
    tw.put("link", organizerTwitterAccountLink);
    socialLinks.put(tw);

    JSONArray jsonArray = new JSONArray();

    JSONObject event = new JSONObject();
    event.put("event_url", url);
    event.put("id", eventID);
    event.put("name", eventName);
    event.put("description", eventDescription);
    event.put("color", eventColor);
    event.put("background_url", imageLink);
    event.put("closing_datetime", closingDateTime);
    event.put("creator", creator);
    event.put("email", email);
    event.put("location_name", eventLocation);
    event.put("latitude", latitude);
    event.put("longitude", longitude);
    event.put("start_time", startingTime);
    event.put("end_time", endingTime);
    event.put("logo", imageLink);
    event.put("organizer_description", organizerDescription);
    event.put("organizer_name", organizerName);
    event.put("privacy", privacy);
    event.put("schedule_published_on", schedulePublishedOn);
    event.put("state", state);
    event.put("type", eventType);
    event.put("ticket_url", ticketURL);
    event.put("social_links", socialLinks);
    event.put("topic", topic);
    jsonArray.put(event);

    JSONObject org = new JSONObject();
    org.put("organizer_name", organizerName);
    org.put("organizer_link", organizerLink);
    org.put("organizer_profile_link", organizerProfileLink);
    org.put("organizer_website", organizerWebsite);
    org.put("organizer_contact_info", organizerContactInfo);
    org.put("organizer_description", organizerDescription);
    org.put("organizer_facebook_feed_link", organizerFacebookFeedLink);
    org.put("organizer_twitter_feed_link", organizerTwitterFeedLink);
    org.put("organizer_facebook_account_link", organizerFacebookAccountLink);
    org.put("organizer_twitter_account_link", organizerTwitterAccountLink);
    jsonArray.put(org);

    // Sections that are not scraped: emitted as empty arrays
    JSONArray microlocations = new JSONArray();
    jsonArray.put(new JSONObject().put("microlocations", microlocations));

    JSONArray customForms = new JSONArray();
    jsonArray.put(new JSONObject().put("customForms", customForms));

    JSONArray sessionTypes = new JSONArray();
    jsonArray.put(new JSONObject().put("sessionTypes", sessionTypes));

    JSONArray sessions = new JSONArray();
    jsonArray.put(new JSONObject().put("sessions", sessions));

    JSONArray sponsors = new JSONArray();
    jsonArray.put(new JSONObject().put("sponsors", sponsors));

    JSONArray speakers = new JSONArray();
    jsonArray.put(new JSONObject().put("speakers", speakers));

    JSONArray tracks = new JSONArray();
    jsonArray.put(new JSONObject().put("tracks", tracks));

    String userHome = System.getProperty("user.home");
    String path = userHome + "/Downloads/EventBriteInfo";

    new File(path).mkdir();

    writeJsonFile(path, "event.json", event.toString());
    writeJsonFile(path, "org.json", org.toString());
    writeJsonFile(path, "social_links.json", socialLinks.toString());
    writeJsonFile(path, "microlocations.json", microlocations.toString());
    writeJsonFile(path, "custom_forms.json", customForms.toString());
    writeJsonFile(path, "session_types.json", sessionTypes.toString());
    writeJsonFile(path, "sessions.json", sessions.toString());
    writeJsonFile(path, "sponsors.json", sponsors.toString());
    writeJsonFile(path, "speakers.json", speakers.toString());
    writeJsonFile(path, "tracks.json", tracks.toString());

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;

}

/**
 * Writes {@code content} to the file {@code fileName} inside {@code directory}; a failed write
 * only prints the stack trace so that the remaining files are still attempted.
 */
private static void writeJsonFile(String directory, String fileName, String content) {
    try (FileWriter file = new FileWriter(directory + "/" + fileName)) {
        file.write(content);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
}

From source file:org.loklak.api.search.QuoraProfileScraper.java

/**
 * Scrapes the public Quora profile page of {@code profile}.
 *
 * <p>The returned thought holds a JSON array with one object containing the keys {@code bio},
 * {@code profileImage}, {@code user} and {@code rss_feed_link}. If the profile page cannot be
 * fetched, the array is empty.
 *
 * @param profile profile name, appended to {@code https://www.quora.com/profile/}
 * @return the scraped profile data
 */
public static SusiThought scrapeQuora(String profile) {

    String url = "https://www.quora.com/profile/" + profile;
    JSONArray jsonArray = new JSONArray();

    Document userHTML = null;
    try {
        userHTML = Jsoup.connect(url).get();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Only build the profile when the page was fetched (this used to end in a
    // NullPointerException after a failed request)
    if (userHTML != null) {
        JSONObject quoraProfile = new JSONObject();

        String bio = userHTML.getElementsByClass("qtext_para").text();
        quoraProfile.put("bio", bio);

        String profileImage = userHTML.select("img.profile_photo_img").attr("data-src");
        quoraProfile.put("profileImage", profileImage);

        String userName = userHTML.select("img.profile_photo_img").attr("alt");
        quoraProfile.put("user", userName);

        String rssFeedLink = url + "/rss";
        quoraProfile.put("rss_feed_link", rssFeedLink);

        jsonArray.put(quoraProfile);
    }

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java

/**
 * Scrapes the metadata of a single movie from IMDb.
 * <p>
 * If the options already carry a search result with metadata, that metadata is returned
 * unchanged. Otherwise the IMDb id is taken from the search result or, failing that, from the
 * options; without a valid id an otherwise empty {@link MediaMetadata} is returned.
 * <p>
 * Three jobs are submitted to the executor up front: the "/combined" page (always from
 * {@code ImdbSiteDefinition.IMDB_COM}), the "/plotsummary" page (from the configured
 * {@code imdbSite}) and - only if foreign-language or collection scraping is requested - a TMDb
 * lookup. The combined page supplies title, year, original title, poster, rating, vote count,
 * top 250 rank, release date, tagline, genres, runtime, countries, spoken languages,
 * certification, cast and crew and production companies; the plot summary page supplies the plot
 * (and the title when {@code imdbSite} is not IMDB_COM). TMDb values overwrite title, original
 * title, tagline and plot when foreign-language scraping is enabled and TMDb delivered a plot.
 *
 * @param options
 *          the scrape options (search result, IMDb id, language, country, scrape flags)
 * @return the collected metadata
 * @throws Exception
 *           anything raised while waiting for the worker futures or while parsing the pages
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());
    // check if there is a md in the result
    if (options.getResult() != null && options.getResult().getMetadata() != null) {
        LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult());
        return options.getResult().getMetadata();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
        imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        imdbId = options.getImdbId();
    }

    if (!MetadataUtil.isValidImdbId(imdbId)) {
        return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId);
    md.setId(MediaMetadata.IMDBID, imdbId);

    ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor);
    ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>(
            executor);

    // worker for imdb request (/combined) (always from ImdbSiteDefinition.IMDB_COM)
    StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/combined");
    Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(),
            options.getCountry().getAlpha2());
    Future<Document> futureCombined = compSvcImdb.submit(worker);

    // worker for imdb request (/plotsummary) (from chosen site)
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/plotsummary");

    worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2());
    Future<Document> futurePlotsummary = compSvcImdb.submit(worker);

    // worker for tmdb request
    Future<MediaMetadata> futureTmdb = null;
    if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) {
        Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry());
        futureTmdb = compSvcTmdb.submit(worker2);
    }

    // block until the combined page has been fetched
    Document doc = futureCombined.get();

    /*
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // parse title and year
    Element title = doc.getElementById("tn15title");
    if (title != null) {
        Element element = null;
        // title
        Elements elements = title.getElementsByTag("h1");
        if (elements.size() > 0) {
            element = elements.first();
            String movieTitle = cleanString(element.ownText());
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // year
        elements = title.getElementsByTag("span");
        if (elements.size() > 0) {
            element = elements.first();
            String content = element.text();

            // search year: first "(" followed by four digits
            Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
            Matcher matcher = yearPattern.matcher(content);
            while (matcher.find()) {
                if (matcher.group(1) != null) {
                    String movieYear = matcher.group(1);
                    md.storeMetadata(MediaMetadata.YEAR, movieYear);
                    break;
                }
            }
        }

        // original title
        elements = title.getElementsByAttributeValue("class", "title-extra");
        if (elements.size() > 0) {
            element = elements.first();
            String content = element.text();
            content = content.replaceAll("\\(original title\\)", "").trim();
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content);
        }
    }

    // poster: rewrite the size tokens in the image URL to request a 400px variant
    Element poster = doc.getElementById("primary-poster");
    if (poster != null) {
        String posterUrl = poster.attr("src");
        posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
        posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
        processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementById("tn15rating");
    if (ratingElement != null) {
        Elements elements = ratingElement.getElementsByClass("starbar-meta");
        if (elements.size() > 0) {
            Element div = elements.get(0);

            // rating comes in <b> tag
            Elements b = div.getElementsByTag("b");
            if (b.size() == 1) {
                String ratingAsString = b.text();
                Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10");
                Matcher matcher = ratingPattern.matcher(ratingAsString);
                while (matcher.find()) {
                    if (matcher.group(1) != null) {
                        float rating = 0;
                        try {
                            rating = Float.valueOf(matcher.group(1));
                        } catch (Exception ignored) {
                            // best effort: an unparsable rating is stored as 0
                        }
                        md.storeMetadata(MediaMetadata.RATING, rating);
                        break;
                    }
                }
            }

            // count
            Elements a = div.getElementsByAttributeValue("href", "ratings");
            if (a.size() == 1) {
                // strip thousands separators and the "votes" suffix
                String countAsString = a.text().replaceAll("[.,]|votes", "").trim();
                int voteCount = 0;
                try {
                    voteCount = Integer.parseInt(countAsString);
                } catch (Exception ignored) {
                    // best effort: an unparsable vote count is stored as 0
                }
                md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount);
            }
        }

        // top250
        elements = ratingElement.getElementsByClass("starbar-special");
        if (elements.size() > 0) {
            Elements a = elements.get(0).getElementsByTag("a");
            if (a.size() > 0) {
                Element anchor = a.get(0);
                Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})");
                Matcher matcher = topPattern.matcher(anchor.ownText());
                while (matcher.find()) {
                    if (matcher.group(1) != null) {
                        int top250 = 0;
                        try {
                            top250 = Integer.parseInt(matcher.group(1));
                        } catch (Exception ignored) {
                            // best effort: an unparsable rank is stored as 0
                        }
                        md.storeMetadata(MediaMetadata.TOP_250, top250);
                    }
                }
            }
        }
    }

    // parse all items coming by <div class="info">
    Elements elements = doc.getElementsByClass("info");
    for (Element element : elements) {
        // only parse divs
        if (!"div".equals(element.tag().getName())) {
            continue;
        }

        // elements with h5 are the titles of the values
        Elements h5 = element.getElementsByTag("h5");
        if (h5.size() > 0) {
            Element firstH5 = h5.first();
            String h5Title = firstH5.text();

            // release date
            /*
             * <div class="info"><h5>Release Date:</h5><div class="info-content">5 January 1996 (USA)<a class="tn15more inline"
             * href="/title/tt0114746/releaseinfo"
             * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a>&nbsp;</div></div>
             */
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element releaseDateElement = div.first();
                    // NOTE(review): the pattern below is empty, so this replaceAll is a no-op as written;
                    // it looks like a character was lost from the literal - confirm against upstream
                    String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", ""));
                    // everything before the trailing "(country)" is the date
                    Pattern pattern = Pattern.compile("(.*)\\(.*\\)");
                    Matcher matcher = pattern.matcher(releaseDate);
                    if (matcher.find()) {
                        try {
                            // this page is always fetched from IMDB_COM and the sample above shows an
                            // English month name, so parse with a fixed English locale instead of the
                            // JVM default
                            SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy", java.util.Locale.US);
                            Date parsedDate = sdf.parse(matcher.group(1));
                            sdf = new SimpleDateFormat("dd-MM-yyyy");
                            md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate));
                        } catch (Exception ignored) {
                            // best effort: an unparsable date leaves RELEASE_DATE unset
                        }
                    }
                }
            }

            /*
             * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline"
             * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See
             * more</a>&nbsp;&raquo; </div></div>
             */
            // tagline (skipped when foreign-language scraping is enabled)
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*")
                    && !options.isScrapeImdbForeignLanguage()) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element taglineElement = div.first();
                    // NOTE(review): empty pattern, no-op as written - see the release date note above
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.storeMetadata(MediaMetadata.TAGLINE, tagline);
                }
            }

            /*
             * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a
             * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a
             * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a
             * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" href="/title/tt0472033/keywords" onClick=
             * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a>&nbsp;&raquo; </div>
             */
            // genres
            // NOTE(review): this is the only label taken from imdbSite rather than IMDB_COM although the
            // page being parsed comes from IMDB_COM - confirm this is intended
            if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Elements a = div.first().getElementsByTag("a");
                    for (Element anchor : a) {
                        if (anchor.attr("href").matches("/Sections/Genres/.*")) {
                            md.addGenre(getTmmGenre(anchor.ownText()));
                        }
                    }
                }
            }

            /*
             * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div>
             */
            // runtime: only the first "|"-separated entry is used
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element runtimeElement = div.first();
                    String first = runtimeElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.storeMetadata(MediaMetadata.RUNTIME, runtime);
                }
            }

            /*
             * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a
             * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div>
             */
            // country: upper-cased codes from the link targets, joined with ", "
            if (h5Title.matches("(?i)Country.*")) {
                Elements a = element.getElementsByTag("a");
                Pattern countryPattern = Pattern.compile("/country/(.*)");
                String countries = "";
                for (Element anchor : a) {
                    Matcher matcher = countryPattern.matcher(anchor.attr("href"));
                    if (matcher.matches()) {
                        String country = matcher.group(1);
                        if (StringUtils.isNotEmpty(countries)) {
                            countries += ", ";
                        }
                        countries += country.toUpperCase();
                    }
                }
                md.storeMetadata(MediaMetadata.COUNTRY, countries);
            }

            /*
             * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a
             * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div>
             */
            // Spoken languages: codes from the link targets, joined with ", "
            if (h5Title.matches("(?i)Language.*")) {
                Elements a = element.getElementsByTag("a");
                Pattern languagePattern = Pattern.compile("/language/(.*)");
                String spokenLanguages = "";
                for (Element anchor : a) {
                    Matcher matcher = languagePattern.matcher(anchor.attr("href"));
                    if (matcher.matches()) {
                        String langu = matcher.group(1);
                        if (StringUtils.isNotEmpty(spokenLanguages)) {
                            spokenLanguages += ", ";
                        }
                        spokenLanguages += langu;
                    }
                }
                md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages);
            }

            /*
             * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate
             * #47489)</i> | <a href="/search/title?certificates=au:pg">Australia:PG</a> | <a
             * href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a href="/search/title?certificates=in:u">India:U</a> | <a
             * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div>
             */
            // certification: first anchor for the requested country that yields a certification
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) {
                Elements a = element.getElementsByTag("a");
                Pattern certificationPattern = Pattern.compile(".*:(.*)");
                for (Element anchor : a) {
                    // certification for the right country
                    if (anchor.attr("href").matches(
                            "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) {
                        Matcher matcher = certificationPattern.matcher(anchor.ownText());
                        Certification certification = null;
                        while (matcher.find()) {
                            if (matcher.group(1) != null) {
                                certification = Certification.getCertification(options.getCountry(),
                                        matcher.group(1));
                            }
                        }

                        if (certification != null) {
                            md.addCertification(certification);
                            break;
                        }
                    }
                }
            }
        }

        /*
         * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick=
         * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div>
         */
        // director
        if ("director-info".equals(element.id())) {
            Elements a = element.getElementsByTag("a");
            for (Element anchor : a) {
                if (anchor.attr("href").matches("/name/nm.*")) {
                    MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                    cm.setName(anchor.ownText());
                    md.addCastMember(cm);
                }
            }
        }
    }

    /*
     * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" ...><img src=
     * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a
     * href="/name/nm0577828/" ...>Joseph Melito</a></td><td class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young
     * Cole</a></td></tr> <tr class="even"><td class="hs"><a href="/name/nm0000246/" ...><img src=
     * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32"
     * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" ...>Bruce Willis</a></td><td class="ddd"> ... </td><td
     * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr>...</table>
     */
    // cast: one table row per actor; only rows with both a name and a character are kept
    elements = doc.getElementsByClass("cast");
    if (elements.size() > 0) {
        Elements tr = elements.get(0).getElementsByTag("tr");
        for (Element row : tr) {
            Elements td = row.getElementsByTag("td");
            MediaCastMember cm = new MediaCastMember();
            for (Element column : td) {
                // actor thumb
                if (column.hasClass("hs")) {
                    Elements img = column.getElementsByTag("img");
                    if (img.size() > 0) {
                        String thumbUrl = img.get(0).attr("src");
                        if (thumbUrl.contains("no_photo.png")) {
                            cm.setImageUrl("");
                        } else {
                            thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                            thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", "");
                            cm.setImageUrl(thumbUrl);
                        }
                    }
                }
                // actor name
                if (column.hasClass("nm")) {
                    cm.setName(cleanString(column.text()));
                }
                // character
                if (column.hasClass("char")) {
                    cm.setCharacter(cleanString(column.text()));
                }
            }
            if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    Element content = doc.getElementById("tn15content");
    if (content != null) {
        elements = content.getElementsByTag("table");
        for (Element table : elements) {
            // writers
            if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) {
                Elements anchors = table.getElementsByTag("a");
                for (Element anchor : anchors) {
                    if (anchor.attr("href").matches("/name/nm.*")) {
                        MediaCastMember cm = new MediaCastMember(CastType.WRITER);
                        cm.setName(anchor.ownText());
                        md.addCastMember(cm);
                    }
                }
            }

            // producers
            if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) {
                Elements rows = table.getElementsByTag("tr");
                for (Element row : rows) {
                    // skip the row that carries the section label itself
                    if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) {
                        continue;
                    }
                    Elements columns = row.children();
                    if (columns.size() == 0) {
                        continue;
                    }
                    MediaCastMember cm = new MediaCastMember(CastType.PRODUCER);
                    String name = cleanString(columns.get(0).text());
                    if (StringUtils.isBlank(name)) {
                        continue;
                    }
                    cm.setName(name);
                    // first column is the name; the third one, if present, is stored as the part
                    if (columns.size() >= 3) {
                        cm.setPart(cleanString(columns.get(2).text()));
                    }
                    md.addCastMember(cm);
                }
            }
        }
    }

    // Production companies: anchors of the element following the matching header
    elements = doc.getElementsByClass("blackcatheader");
    for (Element blackcatheader : elements) {
        if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) {
            Element companyList = blackcatheader.nextElementSibling();
            if (companyList == null) {
                // header without a following sibling: nothing to read here, keep looking
                continue;
            }
            Elements a = companyList.getElementsByTag("a");
            StringBuilder productionCompanies = new StringBuilder();
            for (Element anchor : a) {
                if (productionCompanies.length() > 0) {
                    productionCompanies.append(", ");
                }
                productionCompanies.append(anchor.ownText());
            }
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString());
            break;
        }
    }

    /*
     * plot from /plotsummary
     */
    // block until the plot summary page has been fetched
    doc = futurePlotsummary.get();

    // imdb.com has another site structure
    if (imdbSite == ImdbSiteDefinition.IMDB_COM) {
        Elements zebraList = doc.getElementsByClass("zebraList");
        if (zebraList != null && !zebraList.isEmpty()) {
            Elements odd = zebraList.get(0).getElementsByClass("odd");
            if (odd.isEmpty()) {
                odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even
            }
            if (odd.size() > 0) {
                Elements p = odd.get(0).getElementsByTag("p");
                if (p.size() > 0) {
                    String plot = cleanString(p.get(0).ownText());
                    md.storeMetadata(MediaMetadata.PLOT, plot);
                }
            }
        }
    } else {
        Element wiki = doc.getElementById("swiki.2.1");
        if (wiki != null) {
            String plot = cleanString(wiki.ownText());
            md.storeMetadata(MediaMetadata.PLOT, plot);
        }
    }

    // title also from chosen site if we are not scraping IMDB_COM
    if (imdbSite != ImdbSiteDefinition.IMDB_COM) {
        title = doc.getElementById("tn15title");
        if (title != null) {
            Element element = null;
            // title
            elements = title.getElementsByClass("main");
            if (elements.size() > 0) {
                element = elements.first();
                String movieTitle = cleanString(element.ownText());
                md.storeMetadata(MediaMetadata.TITLE, movieTitle);
            }
        }
    }

    // get data from tmdb? (futureTmdb was submitted above under this same condition)
    if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) {
        MediaMetadata tmdbMd = futureTmdb.get();
        // foreign-language values are only taken over when tmdb delivered a plot
        if (options.isScrapeImdbForeignLanguage() && tmdbMd != null
                && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) {
            // tmdbid
            md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID));
            // title
            md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE));
            // original title
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE));
            // tagline
            md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE));
            // plot
            md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT));
            // collection info
            md.storeMetadata(MediaMetadata.COLLECTION_NAME,
                    tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME));
            md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET));
        }
        if (options.isScrapeCollectionInfo() && tmdbMd != null) {
            md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET));
            md.storeMetadata(MediaMetadata.COLLECTION_NAME,
                    tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME));
        }
    }

    // if we have still no original title, take the title
    if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java

@Override
public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception {
    LOGGER.debug("search() " + query.toString());
    /*/*w  ww. ja va 2 s  . c  o m*/
     * IMDb matches seem to come in several "flavours".
     * 
     * Firstly, if there is one exact match it returns the matching IMDb page.
     * 
     * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles
     * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results)
     * 
     * We should check the Exact match section first, then the poplar titles and finally the partial matches.
     * 
     * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek"
     */

    Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/");

    List<MediaSearchResult> result = new ArrayList<MediaSearchResult>();

    String searchTerm = "";

    if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) {
        searchTerm = query.get(SearchParam.IMDBID);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.get(SearchParam.QUERY);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.get(SearchParam.TITLE);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        return result;
    }

    // parse out language and coutry from the scraper options
    String language = query.get(SearchParam.LANGUAGE);
    String myear = query.get(SearchParam.YEAR);
    String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    StringBuilder sb = new StringBuilder(imdbSite.getSite());
    sb.append("find?q=");
    try {
        // search site was everytime in UTF-8
        sb.append(URLEncoder.encode(searchTerm, "UTF-8"));
    } catch (UnsupportedEncodingException ex) {
        // Failed to encode the movie name for some reason!
        LOGGER.debug("Failed to encode search term: " + searchTerm);
        sb.append(searchTerm);
    }

    // we need to search for all - otherwise we do not find TV movies
    sb.append(CAT_TITLE);

    LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString());
    Document doc;
    try {
        CachedUrl url = new CachedUrl(sb.toString());
        url.addHeader("Accept-Language", getAcceptLanguage(language, country));
        doc = Jsoup.parse(url.getInputStream(), "UTF-8", "");
    } catch (Exception e) {
        LOGGER.debug("tried to fetch search response", e);

        // clear Cache
        CachedUrl.removeCachedFileForUrl(sb.toString());

        return result;
    }

    // check if it was directly redirected to the site
    Elements elements = doc.getElementsByAttributeValue("rel", "canonical");
    for (Element element : elements) {
        MediaMetadata md = null;
        // we have been redirected to the movie site
        String movieName = null;
        String movieId = null;

        String href = element.attr("href");
        Matcher matcher = imdbIdPattern.matcher(href);
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                movieId = matcher.group(1);
            }
        }

        // get full information
        if (!StringUtils.isEmpty(movieId)) {
            MediaScrapeOptions options = new MediaScrapeOptions();
            options.setImdbId(movieId);
            options.setLanguage(MediaLanguages.valueOf(language));
            options.setCountry(CountryCode.valueOf(country));
            options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO)));
            options.setScrapeImdbForeignLanguage(
                    Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE)));
            md = getMetadata(options);
            if (!StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) {
                movieName = md.getStringValue(MediaMetadata.TITLE);
            }
        }

        // if a movie name/id was found - return it
        if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) {
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            sr.setTitle(movieName);
            sr.setIMDBId(movieId);
            sr.setYear(md.getStringValue(MediaMetadata.YEAR));
            sr.setMetadata(md);
            sr.setScore(1);

            // and parse out the poster
            String posterUrl = "";
            Element td = doc.getElementById("img_primary");
            if (td != null) {
                Elements imgs = td.getElementsByTag("img");
                for (Element img : imgs) {
                    posterUrl = img.attr("src");
                    posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                    posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
                    posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
                }
            }
            if (StringUtils.isNotBlank(posterUrl)) {
                sr.setPosterUrl(posterUrl);
            }

            result.add(sr);
            return result;
        }
    }

    // parse results
    // elements = doc.getElementsByClass("result_text");
    elements = doc.getElementsByClass("findResult");
    for (Element tr : elements) {
        // we only want the tr's
        if (!"tr".equalsIgnoreCase(tr.tagName())) {
            continue;
        }

        // find the id / name
        String movieName = "";
        String movieId = "";
        String year = "";
        Elements tds = tr.getElementsByClass("result_text");
        for (Element element : tds) {
            // we only want the td's
            if (!"td".equalsIgnoreCase(element.tagName())) {
                continue;
            }

            // filter out unwanted results
            Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); // stripped out .*\\(Video\\).*|
            Matcher matcher = unwanted.matcher(element.text());
            if (matcher.find()) {
                continue;
            }

            // is there a localized name? (aka)
            String localizedName = "";
            Elements italics = element.getElementsByTag("i");
            if (italics.size() > 0) {
                localizedName = italics.text().replace("\"", "");
            }

            // get the name inside the link
            Elements anchors = element.getElementsByTag("a");
            for (Element a : anchors) {
                if (StringUtils.isNotEmpty(a.text())) {
                    // movie name
                    if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) {
                        // take AKA as title, but only if not EN
                        movieName = localizedName;
                    } else {
                        movieName = a.text();
                    }

                    // parse id
                    String href = a.attr("href");
                    matcher = imdbIdPattern.matcher(href);
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            movieId = matcher.group(1);
                        }
                    }

                    // try to parse out the year
                    Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
                    matcher = yearPattern.matcher(element.text());
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            year = matcher.group(1);
                            break;
                        }
                    }
                    break;
                }
            }
        }

        // if an id/name was found - parse the poster image
        String posterUrl = "";
        tds = tr.getElementsByClass("primary_photo");
        for (Element element : tds) {
            Elements imgs = element.getElementsByTag("img");
            for (Element img : imgs) {
                posterUrl = img.attr("src");
                posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
                posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
            }
        }

        // if no movie name/id was found - continue
        if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) {
            continue;
        }

        MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
        sr.setTitle(movieName);
        sr.setIMDBId(movieId);
        sr.setYear(year);
        sr.setPosterUrl(posterUrl);

        // populate extra args
        MetadataUtil.copySearchQueryToSearchResult(query, sr);

        if (movieId.equals(query.get(SearchParam.IMDBID))) {
            // perfect match
            sr.setScore(1);
        } else {
            // compare score based on names
            float score = MetadataUtil.calculateScore(searchTerm, movieName);
            if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) {
                LOGGER.debug("no poster - downgrading score by 0.01");
                score = score - 0.01f;
            }
            if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) {
                LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01");
                score = score - 0.01f;
            }
            sr.setScore(score);
        }

        result.add(sr);

        // only get 40 results
        if (result.size() >= 40) {
            break;
        }
    }
    Collections.sort(result);
    Collections.reverse(result);

    return result;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Reads the release date from an IMDb release info page and stores it in the given metadata.
 * <p>
 * The date for the country configured in the options is preferred; it is looked up first in the
 * old table layout (element id {@code release_dates}) and then in the new layout (class
 * {@code release-date-item}). If no date for that country can be parsed, the first release date
 * on the page is used. The metadata is left untouched when no date could be parsed at all.
 *
 * @param doc
 *          the parsed release info page
 * @param options
 *          the scrape options; the country is read from them
 * @param md
 *          the metadata to store the release date in
 * @return the passed metadata object
 */
private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    Date releaseDate = null;
    Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})");

    // old way
    Element tableReleaseDates = doc.getElementById("release_dates");
    if (tableReleaseDates != null) {
        Elements rows = tableReleaseDates.getElementsByTag("tr");
        // first round: check the release date for the first one with the requested country
        for (Element row : rows) {
            // get the anchor
            Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first();
            if (anchor == null) {
                continue;
            }
            Matcher matcher = pattern.matcher(anchor.attr("href"));
            if (!matcher.find() || !options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) {
                continue;
            }
            Element column = row.getElementsByClass("release_date").first();
            if (column != null) {
                Date parsed = parseReleaseDate(column.text());
                if (parsed != null) {
                    releaseDate = parsed;
                    break;
                }
            }
        }
    }

    // new way; iterating over class name items
    if (releaseDate == null) {
        Elements rows = doc.getElementsByClass("release-date-item");
        for (Element row : rows) {
            Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first();
            if (anchor == null) {
                continue;
            }
            Matcher matcher = pattern.matcher(anchor.attr("href"));
            if (!matcher.find()) {
                // link without a region parameter: there is no group to read, and calling
                // group(1) on a failed match would throw an IllegalStateException
                continue;
            }
            String region = matcher.group(1);
            if (!options.getCountry().getAlpha2().equalsIgnoreCase(region)) {
                LOGGER.trace("country {} does not match ours {}", region, options.getCountry().getAlpha2());
                continue;
            }
            Element column = row.getElementsByClass("release-date-item__date").first();
            if (column != null) {
                Date parsed = parseReleaseDate(column.text());
                if (parsed != null) {
                    releaseDate = parsed;
                    break;
                }
            }
        }
    }

    // no matching local release date found; take the first one
    if (releaseDate == null) {
        Element column = doc.getElementsByClass("release_date").first();
        if (column == null) {
            column = doc.getElementsByClass("release-date-item__date").first();
        }
        if (column != null) {
            releaseDate = parseReleaseDate(column.text());
        }
    }

    if (releaseDate != null) {
        md.setReleaseDate(releaseDate);
    }
    return md;
}

/**
 * Parses a release date given either as "d MMMM yyyy" or, failing that, as "MMMM yyyy"
 * (US locale).
 *
 * @param text
 *          the date text taken from the page
 * @return the parsed date or {@code null} if the text matches neither format
 */
private static Date parseReleaseDate(String text) {
    // a fresh SimpleDateFormat per call: the class is not thread-safe, so it is not shared
    for (String format : new String[] { "d MMMM yyyy", "MMMM yyyy" }) {
        try {
            return new SimpleDateFormat(format, Locale.US).parse(text);
        } catch (ParseException ignored) {
            // not in this format; try the next, less specific one
        }
    }
    return null;
}