Example usage for org.jsoup.nodes Document getElementsByAttributeValue

List of usage examples for org.jsoup.nodes Document getElementsByAttributeValue

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.getElementsByAttributeValue.

Prototype

public Elements getElementsByAttributeValue(String key, String value) 

Source Link

Document

Find elements that have an attribute with the specific value.

Usage

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieves a Wikipedia page and converts it into an {@code Article},
 * keeping only its textual content (no images, tags, etc.).
 * <p>
 * The page is obtained through {@code retrieveSourceCode}, its title and
 * content elements are located by their {@code id} attribute, and each direct
 * child of the content element is dispatched to a dedicated
 * {@code process...Element} helper that appends to both a raw text buffer and
 * a linked text buffer. Sections whose lower-cased title belongs to
 * {@code IGNORED_SECTIONS} are skipped entirely.
 * <p>
 * NOTE(review): the exceptions caught at the end of this method are only
 * printed; in that case the method returns whatever {@code result} holds at
 * that point, which is {@code null} if the failure happened before the
 * article object was created.
 *
 * @param url
 *       Address of the targeted text.
 * @return
 *       An Article object representing the retrieved page, or {@code null}
 *       if a caught exception occurred before the article was created.
 * 
 * @throws ReaderException
 *       Problem while retrieving the text.
 */
@Override
public Article read(URL url) throws ReaderException {
    // stays null unless the article object is successfully created below
    Article result = null;
    String name = getName(url);

    try { // get the page
        String address = url.toString();
        logger.log("Retrieving page " + address);
        long startTime = System.currentTimeMillis();
        Document document = retrieveSourceCode(name, url);

        // get its title
        // NOTE(review): get(0) assumes at least one element carries this id; an
        // empty match would raise an unchecked exception not caught below
        Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0);
        String title = firstHeadingElt.text();
        logger.log("Get title: " + title);

        // get raw and linked texts
        logger.log("Get raw and linked texts.");
        StringBuilder rawStr = new StringBuilder();
        StringBuilder linkedStr = new StringBuilder();
        // NOTE(review): same assumption as for the title element
        Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);
        // processing each element in the content part
        // true while we are inside a section listed in IGNORED_SECTIONS
        boolean ignoringSection = false;
        // true until some content has been handled; only used to skip a leading
        // disambiguation paragraph or a leading thumbnail division
        boolean first = true;
        for (Element element : bodyContentElt.children()) {
            String eltName = element.tag().getName();
            String eltClass = element.attr(XmlNames.ATT_CLASS);

            // section headers
            if (eltName.equals(XmlNames.ELT_H2)) {
                first = false;
                // get section name
                // (rendered into throw-away buffers so the real ones stay untouched)
                StringBuilder fakeRaw = new StringBuilder();
                StringBuilder fakeLinked = new StringBuilder();
                processParagraphElement(element, fakeRaw, fakeLinked);
                String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH);
                // check section name
                if (IGNORED_SECTIONS.contains(str))
                    ignoringSection = true;
                else {
                    ignoringSection = false;
                    // visual separator inserted before each kept section title
                    rawStr.append("\n-----");
                    linkedStr.append("\n-----");
                    processParagraphElement(element, rawStr, linkedStr);
                }
            }

            else if (!ignoringSection) { // lower sections
                if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4)
                        || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) {
                    first = false;
                    processParagraphElement(element, rawStr, linkedStr);
                }

                // paragraph
                else if (eltName.equals(XmlNames.ELT_P)) {
                    String str = element.text();
                    // ignore possible initial disambiguation link
                    if (!first || !str.startsWith(PARAGRAPH_FORTHE)) {
                        first = false;
                        processParagraphElement(element, rawStr, linkedStr);
                    }
                }

                // list
                else if (eltName.equals(XmlNames.ELT_UL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, false);
                } else if (eltName.equals(XmlNames.ELT_OL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, true);
                } else if (eltName.equals(XmlNames.ELT_DL)) {
                    first = false;
                    processDescriptionListElement(element, rawStr, linkedStr);
                }

                // tables
                // NOTE(review): for this and the following helpers, 'first' is
                // overwritten with the negated return value, so a 'false' result
                // sets it back to true even after content was already handled;
                // presumably the helpers return true when they appended text - confirm
                else if (eltName.equals(XmlNames.ELT_TABLE)) {
                    first = !processTableElement(element, rawStr, linkedStr);
                }

                // divisions
                else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture 
                    if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB))
                        first = !processDivisionElement(element, rawStr, linkedStr);
                }

                // we ignore certain types of span (phonetic transcription, WP buttons...) 
                else if (eltName.equals(XmlNames.ELT_SPAN)) {
                    first = !processSpanElement(element, rawStr, linkedStr);
                }

                // hyperlinks must be included in the linked string, provided they are not external
                else if (eltName.equals(XmlNames.ELT_A)) {
                    first = !processHyperlinkElement(element, rawStr, linkedStr);
                }

                // quotes are just processed recursively
                else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                    first = !processQuoteElement(element, rawStr, linkedStr);
                }

                // other tags are ignored
            }
        }

        // create article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // clean text
        String rawText = rawStr.toString();
        rawText = cleanText(rawText);
        //         rawText = ArticleCleaning.replaceChars(rawText);
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");
        String linkedText = linkedStr.toString();
        linkedText = cleanText(linkedText);
        //         linkedText = ArticleCleaning.replaceChars(linkedText);
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // get original html source code
        logger.log("Get original HTML source code.");
        String originalPage = document.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // get the categories of the article 
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long endTime = System.currentTimeMillis();
        logger.log("Total duration: " + (endTime - startTime) + " ms.");
    } catch (ClientProtocolException e) {
        // NOTE(review): this and the following handlers only print the trace;
        // execution then falls through to the return statement
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }

    return result;
}

From source file:autoInsurance.BeiJPiccImpl.java

/**
 * Performs the login sequence against the insurance back-end and primes
 * {@code templateData} with the agent-related values needed by later calls.
 * <p>
 * Steps, in order: fetch the index page, post the credentials together with
 * the hidden {@code lt} token found on that page, resolve the agent through
 * {@code changeCodeInput.do}, then fetch the permit number through
 * {@code getCheckUserMsg.do}.
 * <p>
 * Every step whose response is missing or malformed now ends the sequence
 * with the generic failure reply instead of raising an unchecked exception
 * (previously: {@code IndexOutOfBoundsException} on a missing {@code lt}
 * field, {@code NullPointerException}/{@code IllegalArgumentException} on a
 * null response, {@code ClassCastException}/{@code NullPointerException} on
 * an unexpected JSON shape).
 *
 * @param in
 *       JSON object text holding the {@code ukey}, {@code loginName} and
 *       {@code password} entries.
 * @return
 *       A JSON object text with a boolean {@code success} entry and a
 *       {@code msg} entry.
 */
public String login(String in) {
    // generic failure reply, identical to the one the original code already used
    final String failure = "{\"success\": false, \"msg\": \"\"}";

    JSONObject jsonObject = JSONObject.fromObject(in);
    String ukey = jsonObject.getString("ukey");
    String loginName = jsonObject.getString("loginName");
    String password = jsonObject.getString("password");

    String url = "http://10.134.136.48:8000/prpall/index.jsp";
    String httpOrgCreateTestRtn = httpClientUtil.doPost(url, new HashMap<String, String>(), charset);
    if (httpOrgCreateTestRtn == null) {
        return failure;
    }

    //      write2Html(httpOrgCreateTestRtn);

    Document doc = Jsoup.parse(httpOrgCreateTestRtn);
    System.out.println(doc.title());
    if (doc.title().contains("PICC"))
        return "{\"success\": false, \"msg\": \"!\"}";

    String action = "";
    if (doc.getElementById("fm") != null)
        action = doc.getElementById("fm").attr("action");

    url = "https://10.134.136.48:8888" + action;

    // the login form must carry the hidden "lt" token; without it the
    // original get(0) threw IndexOutOfBoundsException
    if (doc.getElementsByAttributeValue("name", "lt").isEmpty()) {
        return failure;
    }
    String lt = doc.getElementsByAttributeValue("name", "lt").get(0).attr("value");
    // NOTE(review): loginName/password are concatenated unescaped; a value
    // containing '&' or '=' will be mis-split - depends on parse2Map, confirm
    String postData = "PTAVersion=&toSign=&Signature=&rememberFlag=0&userMac=&key=yes&errorKey=null&loginMethod=nameAndPwd&username="
            + loginName + "&password=" + password + "&lt=" + lt
            + "&_eventId=submit&pcguid=&button.x=20&button.y=17";

    Map<String, String> map;
    try {
        map = parse2Map(postData);
    } catch (Exception e) {
        // posting without the form data cannot log in, so stop here instead
        // of sending a null parameter map
        e.printStackTrace();
        return failure;
    }
    String respStr = httpClientUtil.doPost(url, map, charset);
    if (respStr == null) {
        return failure;
    }
    write2Html(respStr);
    doc = Jsoup.parse(respStr);
    System.out.println(doc.title());

    //      httpOrgCreateTestRtn = httpClientUtil.doPost("http://10.134.136.48:8000/prpall/business/quickProposal.do?bizType=PROPOSAL&editType=NEW&is4S=Y",null,charset);
    //      doc = Jsoup.parse(httpOrgCreateTestRtn);
    //      try {
    //         init(doc);
    //      } catch (Exception e1) {
    //         e1.printStackTrace();
    //      }

    String comCode = templateData.get("prpCmain.comCode");
    String handler1Code = templateData.get("prpCmain.handler1Code");
    String agentCode = templateData.get("prpCmain.agentCode");
    String businessNature = templateData.get("prpCmain.businessNature");

    String param = "actionType=query&fieldIndex=206&fieldValue=" + agentCode
            + "&codeMethod=change&codeType=select&codeRelation=0%2C1%2C2&isClear=Y&otherCondition=operateDate%3D"
            + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "%2CriskCode%3DDAA%2CcomCode%3D" + comCode
            + "%2CbusinessNature%3D" + businessNature
            + "&typeParam=&callBackMethod=MainTotal.setAgentCode()%3BMainTotal.clearForAgentType()%3BItemCar.checkSelectKYFlag()%3B&getDataMethod=getAgents";
    respStr = httpClientUtil.doPost("http://10.134.136.48:8000/prpall/common/changeCodeInput.do?" + param,
            new HashMap<String, String>(), charset);
    if (respStr == null) {
        return failure;
    }
    // expected shape (from a sample response): 11003O100375_FIELD_SEPARATOR__FIELD_SEPARATOR_3O1000
    String[] _field_separator = respStr.split("_FIELD_SEPARATOR_");
    if (_field_separator.length < 3)
        return failure;

    String agentName = _field_separator[1];
    String agentType = _field_separator[2];
    templateData.put("agentType", agentType);
    try {
        param = "comCode=" + URLEncoder.encode(comCode, charset) + "&handler1Code="
                + URLEncoder.encode(handler1Code, charset) + "&agentCode="
                + URLEncoder.encode(agentCode, charset) + "&businessNature="
                + URLEncoder.encode(businessNature, charset);
    } catch (UnsupportedEncodingException e) {
        // without this, 'param' would still hold the previous request's query
        // string and be sent to the wrong endpoint
        e.printStackTrace();
        return failure;
    }
    respStr = httpClientUtil.doPost("http://10.134.136.48:8000/prpall/business/getCheckUserMsg.do?" + param,
            new HashMap<String, String>(), charset);
    if (respStr == null) {
        return failure;
    }
    Map retMap = JackJson.fromJsonToObject(respStr, Map.class);
    // the permit number is read from data[0].permitNo; validate that shape
    // before casting
    Object data = (retMap == null) ? null : retMap.get("data");
    if (!(data instanceof List) || ((List) data).isEmpty() || !(((List) data).get(0) instanceof Map)) {
        return failure;
    }

    String b = templateData.get("isCqp");
    String qualificationName = "";
    if (templateData.get("qualificationName") != null) {
        if (StringUtils.equals("1", b)) {
            qualificationName = templateData.get("prpQmainVoagentName");
        } else {
            qualificationName = templateData.get("prpCmainagentName");
        }
    }
    templateData.put("qualificationName", qualificationName);
    templateData.put("qualificationNo", (String) ((Map) ((List) data).get(0)).get("permitNo"));

    templateData.put("prpCmainCommon.queryArea", "110000");
    templateData.put("queryArea", "");
    templateData.put("prpCinsureds[0].countryCode", "CHN");
    templateData.put("resident[0]", "0");
    templateData.put("LicenseColorCodeDes", "");
    templateData.put("prpCitemCar.licenseColorCode", "01");
    // 115192BJ
    templateData.put("agentCodeValidValue", ukey);
    templateData.put("agentCodeValidType", "U");

    return "{\"success\": true, \"msg\": \"" + loginName + "," + agentName + ",\"}";
}

From source file:org.fox.ttrss.OnlineActivity.java

/**
 * Handles a selection made in the article context menu (image open / copy /
 * share / caption, article link share / copy).
 * <p>
 * The caption entry now checks that an article pager, a selected article and
 * its content are available before parsing, consistently with the link
 * entries; previously a missing pager or article caused a
 * {@code NullPointerException}.
 *
 * @param item
 *       The selected context menu item.
 * @return
 *       {@code true} for every item handled here, otherwise the result of
 *       the superclass implementation.
 */
@Override
public boolean onContextItemSelected(android.view.MenuItem item) {
    final ArticlePager ap = (ArticlePager) getSupportFragmentManager().findFragmentByTag(FRAG_ARTICLE);

    switch (item.getItemId()) {
    case R.id.article_img_open: {
        String imageUrl = getLastContentImageHitTestUrl();
        if (imageUrl != null) {
            try {
                Intent intent = new Intent(Intent.ACTION_VIEW, Uri.parse(imageUrl));
                startActivity(intent);
            } catch (Exception e) {
                e.printStackTrace();
                toast(R.string.error_other_error);
            }
        }
        return true;
    }
    case R.id.article_img_copy: {
        String imageUrl = getLastContentImageHitTestUrl();
        if (imageUrl != null) {
            copyToClipboard(imageUrl);
        }
        return true;
    }
    case R.id.article_img_share: {
        String imageUrl = getLastContentImageHitTestUrl();
        if (imageUrl != null) {
            Intent intent = new Intent(Intent.ACTION_SEND);

            intent.setType("image/png");
            intent.putExtra(Intent.EXTRA_SUBJECT, imageUrl);
            intent.putExtra(Intent.EXTRA_TEXT, imageUrl);

            startActivity(Intent.createChooser(intent, imageUrl));
        }
        return true;
    }
    case R.id.article_img_view_caption: {
        String imageUrl = getLastContentImageHitTestUrl();
        if (imageUrl != null) {
            // nothing to look the caption up in: report it like a missing caption
            if (ap == null || ap.getSelectedArticle() == null || ap.getSelectedArticle().content == null) {
                toast(R.string.no_caption_to_display);
                return true;
            }

            // Android doesn't give us an easy way to access title tags;
            // we'll use Jsoup on the body text to grab the title text
            // from the first image tag with this url. This will show
            // the wrong text if an image is used multiple times.
            Document doc = Jsoup.parse(ap.getSelectedArticle().content);
            Elements es = doc.getElementsByAttributeValue("src", imageUrl);
            if (es.size() > 0 && es.get(0).hasAttr("title")) {
                Dialog dia = new Dialog(this);
                // dialog title: alt text when present, otherwise the title text
                if (es.get(0).hasAttr("alt")) {
                    dia.setTitle(es.get(0).attr("alt"));
                } else {
                    dia.setTitle(es.get(0).attr("title"));
                }
                TextView titleText = new TextView(this);

                if (android.os.Build.VERSION.SDK_INT >= android.os.Build.VERSION_CODES.JELLY_BEAN) {
                    titleText.setPaddingRelative(24, 24, 24, 24);
                } else {
                    titleText.setPadding(24, 24, 24, 24);
                }

                titleText.setTextSize(16);
                titleText.setText(es.get(0).attr("title"));
                dia.setContentView(titleText);
                dia.show();
            } else {
                toast(R.string.no_caption_to_display);
            }
        }
        return true;
    }
    case R.id.article_link_share:
        if (ap != null && ap.getSelectedArticle() != null) {
            shareArticle(ap.getSelectedArticle());
        }
        return true;
    case R.id.article_link_copy:
        Log.d(TAG, "article_link_copy");
        if (ap != null && ap.getSelectedArticle() != null) {
            copyToClipboard(ap.getSelectedArticle().link);
        }
        return true;
    default:
        Log.d(TAG, "onContextItemSelected, unhandled id=" + item.getItemId());
        return super.onContextItemSelected(item);
    }
}

From source file:org.loklak.api.search.EventBriteCrawlerService.java

/**
 * Scrapes an Eventbrite event page (and, when available, the organizer's
 * profile page) and returns the extracted data.
 * <p>
 * The result data is a JSON array holding, in order: the event object, the
 * organizer object, and one single-key object for each of
 * {@code microlocations}, {@code customForms}, {@code sessionTypes},
 * {@code sessions}, {@code sponsors}, {@code speakers} and {@code tracks}
 * (all currently empty arrays). As a side effect, the individual parts are
 * also written as JSON files under {@code <user.home>/Downloads/EventBriteInfo}.
 * <p>
 * If the event page cannot be fetched, a result with an empty array is
 * returned and no file is written (previously this raised a
 * {@code NullPointerException}).
 *
 * @param url
 *       Address of the Eventbrite event page.
 * @return
 *       A {@code SusiThought} whose data is the JSON array described above.
 */
public static SusiThought crawlEventBrite(String url) {
    Document htmlPage = null;

    try {
        htmlPage = Jsoup.connect(url).get();
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (htmlPage == null) {
        // the page could not be fetched: nothing to scrape
        SusiThought empty = new SusiThought();
        empty.setData(new JSONArray());
        return empty;
    }

    String eventID = htmlPage.getElementsByTag("body").attr("data-event-id");
    String eventName = htmlPage.getElementsByClass("listing-hero-body").text();
    String eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text();

    // TODO Fetch Event Color
    String eventColor = null;

    String imageLink = htmlPage.getElementsByTag("picture").attr("content");

    String eventLocation = htmlPage.select("p.listing-map-card-street-address.text-default").text();

    String startingTime = truncateTimestamp(metaProperty(htmlPage, "event:start_time"));
    String endingTime = truncateTimestamp(metaProperty(htmlPage, "event:end_time"));

    String ticketURL = url + "#tickets";

    // TODO Tags to be modified to fit in the format of Open Event "topic"
    Elements tagSection = htmlPage.getElementsByAttributeValue("data-automation", "ListingsBreadcrumbs");
    Elements tagSpan = tagSection.select("span");
    String[][] tags = new String[5][2];
    String topic = "";

    // spans alternate between link (even position) and label (odd position)
    int iterator = 0, k = 0;
    for (Element e : tagSpan) {
        if (k >= tags.length) {
            // more breadcrumbs than the table can hold: ignore the rest
            break;
        }
        if (iterator % 2 == 0) {
            tags[k][1] = "www.eventbrite.com"
                    + e.select("a.js-d-track-link.badge.badge--tag.l-mar-top-2").attr("href");
        } else {
            tags[k][0] = e.text();
            k++;
        }
        iterator++;
    }

    String closingDateTime = null;
    String schedulePublishedOn = null;
    String email = null;

    JSONObject creator = new JSONObject();
    creator.put("email", "");
    creator.put("id", "1"); // By Default

    Float latitude = parseCoordinate(metaProperty(htmlPage, "event:location:latitude"));
    Float longitude = parseCoordinate(metaProperty(htmlPage, "event:location:longitude"));

    String privacy = "public"; // By Default
    String state = "completed"; // By Default

    // TODO This returns: "events.event" which is not supported by Open
    // Event Generator
    // eventType = htmlPage.getElementsByAttributeValue("property",
    // "og:type").attr("content");
    String eventType = "";

    // the first four characters of the link text are dropped
    // (presumably a "by " style prefix - confirm against the page markup)
    String organizerLabel = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text();
    String organizerName = organizerLabel.length() >= 5 ? organizerLabel.substring(4) : "";

    String organizerLink = url + "#listing-organizer";
    String organizerProfileLink = htmlPage
            .getElementsByAttributeValue("class", "js-follow js-follow-target follow-me fx--fade-in is-hidden")
            .attr("href");
    String organizerContactInfo = url + "#lightbox_contact";

    String organizerWebsite = null;
    String organizerDescription = null;
    String organizerFacebookFeedLink = null;
    String organizerTwitterFeedLink = null;
    String organizerFacebookAccountLink = null;
    String organizerTwitterAccountLink = null;

    Document orgProfilePage = null;
    if (!organizerProfileLink.isEmpty()) {
        try {
            orgProfilePage = Jsoup.connect(organizerProfileLink).get();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    if (orgProfilePage != null) {
        // jsoup queries return an empty Elements (never null), and text()/attr()
        // on an empty Elements yield "", so no extra checks are needed here
        organizerWebsite = orgProfilePage
                .getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website").text();
        organizerDescription = orgProfilePage.select("div.js-long-text.organizer-description").text();

        organizerFacebookFeedLink = organizerProfileLink + "#facebook_feed";
        organizerTwitterFeedLink = organizerProfileLink + "#twitter_feed";

        organizerFacebookAccountLink = orgProfilePage.getElementsByAttributeValue("class", "fb-page")
                .attr("data-href");
        organizerTwitterAccountLink = orgProfilePage
                .getElementsByAttributeValue("class", "twitter-timeline").attr("href");
    }

    JSONArray socialLinks = new JSONArray();

    JSONObject fb = new JSONObject();
    fb.put("id", "1");
    fb.put("name", "Facebook");
    fb.put("link", organizerFacebookAccountLink);
    socialLinks.put(fb);

    JSONObject tw = new JSONObject();
    tw.put("id", "2");
    tw.put("name", "Twitter");
    tw.put("link", organizerTwitterAccountLink);
    socialLinks.put(tw);

    JSONArray jsonArray = new JSONArray();

    JSONObject event = new JSONObject();
    event.put("event_url", url);
    event.put("id", eventID);
    event.put("name", eventName);
    event.put("description", eventDescription);
    event.put("color", eventColor);
    event.put("background_url", imageLink);
    event.put("closing_datetime", closingDateTime);
    event.put("creator", creator);
    event.put("email", email);
    event.put("location_name", eventLocation);
    event.put("latitude", latitude);
    event.put("longitude", longitude);
    event.put("start_time", startingTime);
    event.put("end_time", endingTime);
    event.put("logo", imageLink);
    event.put("organizer_description", organizerDescription);
    event.put("organizer_name", organizerName);
    event.put("privacy", privacy);
    event.put("schedule_published_on", schedulePublishedOn);
    event.put("state", state);
    event.put("type", eventType);
    event.put("ticket_url", ticketURL);
    event.put("social_links", socialLinks);
    event.put("topic", topic);
    jsonArray.put(event);

    JSONObject org = new JSONObject();
    org.put("organizer_name", organizerName);
    org.put("organizer_link", organizerLink);
    org.put("organizer_profile_link", organizerProfileLink);
    org.put("organizer_website", organizerWebsite);
    org.put("organizer_contact_info", organizerContactInfo);
    org.put("organizer_description", organizerDescription);
    org.put("organizer_facebook_feed_link", organizerFacebookFeedLink);
    org.put("organizer_twitter_feed_link", organizerTwitterFeedLink);
    org.put("organizer_facebook_account_link", organizerFacebookAccountLink);
    org.put("organizer_twitter_account_link", organizerTwitterAccountLink);
    jsonArray.put(org);

    JSONArray microlocations = new JSONArray();
    jsonArray.put(new JSONObject().put("microlocations", microlocations));

    JSONArray customForms = new JSONArray();
    jsonArray.put(new JSONObject().put("customForms", customForms));

    JSONArray sessionTypes = new JSONArray();
    jsonArray.put(new JSONObject().put("sessionTypes", sessionTypes));

    JSONArray sessions = new JSONArray();
    jsonArray.put(new JSONObject().put("sessions", sessions));

    JSONArray sponsors = new JSONArray();
    jsonArray.put(new JSONObject().put("sponsors", sponsors));

    JSONArray speakers = new JSONArray();
    jsonArray.put(new JSONObject().put("speakers", speakers));

    JSONArray tracks = new JSONArray();
    jsonArray.put(new JSONObject().put("tracks", tracks));

    String userHome = System.getProperty("user.home");
    String path = userHome + "/Downloads/EventBriteInfo";

    // mkdirs (not mkdir) so the export also works when "Downloads" is missing
    new File(path).mkdirs();

    writeJsonFile(path + "/event.json", event.toString());
    writeJsonFile(path + "/org.json", org.toString());
    writeJsonFile(path + "/social_links.json", socialLinks.toString());
    writeJsonFile(path + "/microlocations.json", microlocations.toString());
    writeJsonFile(path + "/custom_forms.json", customForms.toString());
    writeJsonFile(path + "/session_types.json", sessionTypes.toString());
    writeJsonFile(path + "/sessions.json", sessions.toString());
    writeJsonFile(path + "/sponsors.json", sponsors.toString());
    writeJsonFile(path + "/speakers.json", speakers.toString());
    writeJsonFile(path + "/tracks.json", tracks.toString());

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;

}

/** Returns the {@code content} of the first element whose {@code property} attribute equals the given name, or "" if none. */
private static String metaProperty(Document page, String property) {
    return page.getElementsByAttributeValue("property", property).attr("content");
}

/** Keeps only the first 19 characters of a timestamp of 20 characters or more; shorter values are returned unchanged. */
private static String truncateTimestamp(String timestamp) {
    return timestamp.length() >= 20 ? timestamp.substring(0, 19) : timestamp;
}

/** Parses a coordinate, returning {@code null} when the text is empty or not a valid number. */
private static Float parseCoordinate(String text) {
    if (text.length() == 0) {
        return null;
    }
    try {
        return Float.valueOf(text);
    } catch (NumberFormatException e) {
        e.printStackTrace();
        return null;
    }
}

/** Writes the given text to the given file, printing (not propagating) any I/O failure. */
private static void writeJsonFile(String filePath, String content) {
    // NOTE(review): FileWriter uses the platform default charset
    try (FileWriter file = new FileWriter(filePath)) {
        file.write(content);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
}

From source file:org.loklak.api.search.GithubProfileScraper.java

/**
 * Scrapes the public GitHub page of a user or organization.
 * <p>
 * When the page cannot be fetched ({@code IOException}), the GitHub user
 * search API is queried instead and its response becomes the only element of
 * the result array; if that query fails too, the result array is empty
 * (previously this path raised a {@code NullPointerException}).
 * <p>
 * Changes with respect to the previous implementation:
 * <ul>
 * <li>the follower / starred / following counters are read from the
 * {@code <strong>} element inside each {@code vcard-stat} link;
 * {@code Element.tagName(String)} was used before, which renames the element
 * instead of selecting a child, so the whole link text was returned;</li>
 * <li>fewer than three {@code vcard-stat} links, or organization navigation
 * entries with fewer children than expected, no longer raise
 * {@code IndexOutOfBoundsException};</li>
 * <li>the stream opened on the search API is closed.</li>
 * </ul>
 *
 * @param profile
 *       GitHub login of the user or organization.
 * @return
 *       A {@code SusiThought} whose data is a JSON array with one profile
 *       object (or the search API response, see above).
 */
public static SusiThought scrapeGithub(String profile) {

    Document html;

    JSONObject githubProfile = new JSONObject();

    try {
        html = Jsoup.connect("https://github.com/" + profile).get();
    } catch (IOException e) {
        return searchGithubUsers(profile);
    }

    String avatarUrl = html.getElementsByAttributeValue("class", "avatar rounded-2").attr("src");
    githubProfile.put("avatar_url", avatarUrl);

    String fullName = html.getElementsByAttributeValue("class", "vcard-fullname").text();
    githubProfile.put("full_name", fullName);

    String userName = html.getElementsByAttributeValue("class", "vcard-username").text();
    githubProfile.put("user_name", userName);

    String bio = html.getElementsByAttributeValue("class", "user-profile-bio").text();
    githubProfile.put("bio", bio);

    String atomFeedLink = html.getElementsByAttributeValue("type", "application/atom+xml").attr("href");
    githubProfile.put("atom_feed_link", "https://github.com" + atomFeedLink);

    String worksFor = html.getElementsByAttributeValue("itemprop", "worksFor").text();
    githubProfile.put("works_for", worksFor);

    String homeLocation = html.getElementsByAttributeValue("itemprop", "homeLocation").attr("title");
    githubProfile.put("home_location", homeLocation);

    String email = html.getElementsByAttributeValue("itemprop", "email").text();
    githubProfile.put("email", email);

    String specialLink = html.getElementsByAttributeValue("itemprop", "url").text();
    githubProfile.put("special_link", specialLink);

    String joiningDate = html.getElementsByAttributeValue("class", "join-date").attr("datetime");
    githubProfile.put("joining_date", joiningDate);

    /* If Individual User */
    // the stat links are expected in this order; only those present are exported
    Elements stats = html.getElementsByAttributeValue("class", "vcard-stat");
    String[] statKeys = { "followers", "starred", "following" };
    for (int i = 0; i < statKeys.length && i < stats.size(); i++) {
        Element stat = stats.get(i);
        githubProfile.put(statKeys[i] + "_url", "https://github.com" + stat.attr("href"));
        githubProfile.put(statKeys[i], statCount(stat));
    }

    String gistsUrl = "https://api.github.com/users/" + profile + "/gists";
    githubProfile.put("gists_url", gistsUrl);

    String subscriptionsUrl = "https://api.github.com/users/" + profile + "/subscriptions";
    githubProfile.put("subscriptions_url", subscriptionsUrl);

    String reposUrl = "https://api.github.com/users/" + profile + "/repos";
    githubProfile.put("repos_url", reposUrl);

    String eventsUrl = "https://api.github.com/users/" + profile + "/events";
    githubProfile.put("events_url", eventsUrl);

    String receivedEventsUrl = "https://api.github.com/users/" + profile + "/received_events";
    githubProfile.put("received_events_url", receivedEventsUrl);

    JSONArray organizations = new JSONArray();
    Elements orgs = html.getElementsByAttributeValue("itemprop", "follows");
    for (Element e : orgs) {
        JSONObject obj = new JSONObject();

        String label = e.attr("aria-label");
        obj.put("label", label);

        String link = e.attr("href");
        obj.put("link", "https://github.com" + link);

        String imgLink = e.children().attr("src");
        obj.put("img_link", imgLink);

        String imgAlt = e.children().attr("alt");
        obj.put("img_Alt", imgAlt);

        organizations.put(obj);
    }
    githubProfile.put("organizations", organizations);

    /* If Organization */
    Elements navigation = html.getElementsByAttributeValue("class", "orgnav");
    for (Element e : navigation) {
        Elements items = e.children();

        // first entry: repositories link
        if (items.size() > 0) {
            String orgRepositoriesLink = items.get(0).attr("href");
            githubProfile.put("organization_respositories_link", "https://github.com" + orgRepositoriesLink);
        }

        // second entry: people link, whose second child holds the member count
        if (items.size() > 1) {
            Element people = items.get(1);
            githubProfile.put("organization_people_link", "https://github.com" + people.attr("href"));

            if (people.children().size() > 1) {
                githubProfile.put("organization_people_number", people.child(1).text());
            }
        }
    }

    JSONArray jsonArray = new JSONArray();
    jsonArray.put(githubProfile);

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;
}

/** Returns the text of the first {@code <strong>} element inside the given stat link, or the whole link text if there is none. */
private static String statCount(Element stat) {
    Elements strong = stat.getElementsByTag("strong");
    return strong.isEmpty() ? stat.text() : strong.first().text();
}

/** Queries the GitHub user search API for the given name; the result array is empty when the query fails. */
private static SusiThought searchGithubUsers(String profile) {
    JSONArray arr = new JSONArray();
    try {
        // NOTE(review): 'profile' is not URL-encoded; names with reserved
        // characters end up in the catch block below
        URI uri = new URI("https://api.github.com/search/users?q=" + profile);
        try (java.io.InputStream stream = uri.toURL().openStream()) {
            arr.put(new JSONObject(new JSONTokener(stream)));
        }
    } catch (Exception e1) {
        e1.printStackTrace();
    }

    SusiThought json = new SusiThought();
    json.setData(arr);
    return json;
}

From source file:org.loklak.api.search.MeetupsCrawlerService.java

/**
 * Scrapes a Meetup group page and returns what was found wrapped in a {@code SusiThought}.
 * <p>
 * The returned thought carries a one-element JSON array whose single object holds the
 * Open Graph metadata of the group ({@code group_name}, {@code meetup_type},
 * {@code group_locality}, {@code group_country_code}, {@code group_latitude},
 * {@code group_longitude}, {@code group_imageLink}), the text of the {@code groupDesc}
 * element ({@code group_description}), the link texts found inside {@code topic-box-2012}
 * ({@code group_topics}) and the paragraphs of the {@code recentMeetups} element grouped in
 * threes ({@code recent_meetups}).
 * <p>
 * If the page cannot be fetched the stack trace is printed and the result object is left
 * empty instead of failing with a {@code NullPointerException}. Page sections that are
 * missing yield an empty string / empty array.
 *
 * @param url
 *       Address of the Meetup group page to crawl.
 * @return
 *       A SusiThought whose data is a JSON array containing exactly one result object.
 */
public static SusiThought crawlMeetups(String url) {
    JSONObject result = new JSONObject();

    Document meetupHTML = null;
    try {
        // NOTE(review): the stray ')' in the user agent looks like a typo; kept unchanged
        // because the server may key its response on the exact string.
        meetupHTML = Jsoup.connect(url).userAgent("Mozilla)").get();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Only extract when the download succeeded; otherwise fall through with an empty result.
    if (meetupHTML != null) {
        // JSON key -> Open Graph property carrying its value in the "content" attribute
        final String[][] metaFields = {
                { "group_name", "og:title" },
                { "meetup_type", "og:type" },
                { "group_locality", "og:locality" },
                { "group_country_code", "og:country-name" },
                { "group_latitude", "og:latitude" },
                { "group_longitude", "og:longitude" },
                { "group_imageLink", "og:image" } };
        for (String[] metaField : metaFields) {
            result.put(metaField[0],
                    meetupHTML.getElementsByAttributeValue("property", metaField[1]).attr("content"));
        }

        Element descriptionElt = meetupHTML.getElementById("groupDesc");
        result.put("group_description", descriptionElt == null ? "" : descriptionElt.text());

        // topics: no fixed-size buffer, so pages with many topics cannot overflow
        JSONArray groupTopics = new JSONArray();
        Element topicBox = meetupHTML.getElementById("topic-box-2012");
        if (topicBox != null) {
            for (Element topicLink : topicBox.getElementsByTag("a")) {
                groupTopics.put(topicLink.text());
            }
        }
        result.put("group_topics", groupTopics);

        // recent meetups: consecutive <p> elements come in groups of three
        // (date/time, attendance/review, information)
        JSONArray recentMeetups = new JSONArray();
        Element recentMeetupsElt = meetupHTML.getElementById("recentMeetups");
        if (recentMeetupsElt != null) {
            final String[] fieldNames = { "date_time", "attendance", "information" };
            JSONObject current = null;
            int paragraphIndex = 0;
            for (Element paragraph : recentMeetupsElt.getElementsByTag("p")) {
                int field = paragraphIndex % fieldNames.length;
                if (field == 0) {
                    // start a new meetup entry; numbering is 1-based
                    current = new JSONObject();
                    current.put("recent_meetup_number", paragraphIndex / fieldNames.length + 1);
                    recentMeetups.put(current);
                }
                current.put(fieldNames[field], paragraph.text());
                paragraphIndex++;
            }
        }
        result.put("recent_meetups", recentMeetups);
    }

    JSONArray meetupsCrawlerResultArray = new JSONArray();
    meetupsCrawlerResultArray.put(result);

    SusiThought json = new SusiThought();
    json.setData(meetupsCrawlerResultArray);
    return json;
}

From source file:org.loklak.api.search.WeiboUserInfo.java

/**
 * Scrapes profile fields from the page named by the {@code url} request parameter and
 * writes them to the response as an indented JSON object.
 * <p>
 * Every element whose class attribute is {@code "li_1 clearfix"} is inspected. Rows without a
 * link containing {@code loc=infblog} contribute the text of their {@code pt_detail} element,
 * stored under a key chosen from the text of their {@code "pt_title S_txt2"} element; other
 * rows contribute the text of their anchors under {@code "Blog"}.
 *
 * @param request
 *       Incoming request; evaluated through {@code RemoteAccess.evaluate}.
 * @param response
 *       Receives either a 503 error (DoS blackout) or the JSON text.
 * @throws ServletException
 *       Declared by the overridden method.
 * @throws IOException
 *       Declared by the overridden method; the page download and the response writer can raise it.
 */
@Override
protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    Query post = RemoteAccess.evaluate(request);

    // manage DoS
    if (post.isDoS_blackout()) {
        response.sendError(503, "your request frequency is too high");
        return;
    }

    // NOTE(review): the url is taken straight from the request and fetched server-side
    // without any validation -- confirm this is acceptable for the deployment.
    String url = post.get("url", "");
    JSONObject obj = new JSONObject();
    Document doc = Jsoup.connect(url).get();
    Elements infos;
    infos = doc.getElementsByAttributeValue("class", "li_1 clearfix");

    if (infos != null) {
        Element info;
        String profile;
        for (int i = 0; i < infos.size(); i++) {
            info = infos.get(i);
            // rows that do not link to the blog carry a title/detail pair
            if (info.getElementsByAttributeValueContaining("href", "loc=infblog").size() == 0) {
                profile = info.getElementsByAttributeValue("class", "pt_detail").first().text().trim();
                // "pro" is overwritten on every row, so it ends up holding the last detail seen
                obj.put("pro", profile);
                // NOTE(review): several labels below are "" or "??" and repeat. This looks like
                // non-ASCII label text lost in transcription; duplicate case labels do not
                // compile, so the real labels must be restored from the upstream source.
                // NOTE(review): replace("t", "") / replace("rn", "") strip the literal letters
                // as written; presumably "\t" and "\r\n" were intended -- confirm.
                switch (info.getElementsByAttributeValue("class", "pt_title S_txt2").first().text()) {
                case "Nickname":
                    obj.put("username", profile);
                    break;
                case "Location":
                    obj.put("Address", profile);
                    break;
                case "Gender":
                    obj.put("Gender", profile);
                    break;
                case "??":
                    obj.put("Sexuality", profile.replace("t", "").replace("rn", ""));
                    break;
                case "":
                    obj.put("Relationship", profile.replace("t", "").replace("rn", ""));
                    break;
                case "Birthday":
                    obj.put("Birthday", profile);
                    break;
                case "":
                    obj.put("Blood", profile);
                    break;
                case "Domain Name":
                    // prefer the anchor text when the row links to the personal domain
                    if (info.getElementsByAttributeValueContaining("href", "loc=infdomain").size() != 0)
                        profile = info.select("a").text();
                    obj.put("Personaldomain", profile);
                    break;
                case "":
                    obj.put("Profile", profile);
                    break;
                case "Registration":
                    obj.put("Registertime", profile.replace("t", "").replace("rn", ""));
                    break;
                case "Email":
                    obj.put("Email", profile);
                    break;
                case "QQ":
                    obj.put("Qq", profile);
                    break;
                case "":
                    obj.put("College", profile.replace("t", "").replace("rn", ""));
                    break;
                case "Tags":
                    obj.put("Tag", profile.replace("t", "").replace("rn", ""));
                    break;
                }

            } else {
                // blog row: store the text of its anchors
                String blogurl = info.select("a").text();
                obj.put("Blog", blogurl);
            }
        }
    }

    // print JSON with an indent of 2
    response.setCharacterEncoding("UTF-8");
    PrintWriter sos = response.getWriter();
    sos.print(obj.toString(2));
    sos.println();
}

From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java

/**
 * Get movie meta data from aebn.net./*from  w  w  w .  j  a  v  a  2  s  . c o m*/
 *
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("AEBN: getMetadata() {}", options);

    // check if there is already meta data present in the result
    if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) {
        LOGGER.debug("AEBN: return metadata from cache");
        return options.getResult().getMediaMetadata();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    Elements elements = null;
    Element element = null;
    Integer aebnId = 0;

    // get AebnId from previous search result
    if ((options.getResult() != null) && (options.getResult().getId() != null)) {
        aebnId = Integer.parseInt(options.getResult().getId());
        LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId);
        // preset some values from search result (if there is one)
        // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy".
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle()));
        md.storeMetadata(MediaMetadata.TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getTitle()));
    }

    // or get AebnId from options
    if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) {
        LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID));
        aebnId = Integer.parseInt(options.getId(AEBNID));
    }

    if (!isValidAebnId(aebnId)) {
        LOGGER.warn("AEBN: no or incorrect aebnId, aborting");
        return md;
    }

    // ID
    md.setId(providerInfo.getId(), aebnId);
    LOGGER.debug("AEBN: aebnId({})", aebnId);

    // Base download url for data scraping
    String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId;
    String locale = options.getLanguage().name();
    if (!StringUtils.isBlank(locale)) {
        downloadUrl = downloadUrl + "&locale=" + locale;
        LOGGER.debug("AEBN: used locale({})", locale);
    }

    // begin download and scrape
    try {
        LOGGER.debug("AEBN: download movie detail page");
        Url url = new Url(downloadUrl);
        InputStream in = url.getInputStream();
        Document document = Jsoup.parse(in, "UTF-8", "");
        in.close();

        // Title
        // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1>
        LOGGER.debug("AEBN: parse title");
        elements = document.getElementsByAttributeValue("class", "md-movieTitle");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieTitle = cleanString(element.text());
            LOGGER.debug("AEBN: title({})", movieTitle);
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // Poster
        // front cover:
        // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg
        String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg";
        md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl);

        // Fanart/Background
        // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg
        // <img class="sceneThumbnail" alt="Scene Thumbnail" title="Scene Thumbnail" onError="..."
        // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." />
        LOGGER.debug("AEBN: parse fanart / scene thumbs");
        elements = document.getElementsByAttributeValue("class", "SceneThumbnail");
        LOGGER.debug("AEBN: {} elements found", elements.size());
        int i = 1;
        for (Element anchor : elements) {
            String backgroundUrl = anchor.attr("src");
            LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl);
            md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl);
            i++;
        }

        // Runtime
        LOGGER.debug("AEBN: parse runtime");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)");
            element = elements.first();
            String movieRuntime = cleanString(element.attr("content"));
            movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M");
            LOGGER.debug("AEBN: runtime({})", movieRuntime);
            md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime);
        }

        // Year
        LOGGER.debug("AEBN: parse year");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)");
            element = elements.first();
            String movieYear = cleanString(element.attr("content"));
            movieYear = StrgUtils.substr(movieYear, "(\\d+)-");
            LOGGER.debug("AEBN: year({})", movieYear);
            md.storeMetadata(MediaMetadata.YEAR, movieYear);
        }

        // Series (Collection)
        LOGGER.debug("AEBN: parse collection");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieCollection = cleanString(element.text());

            // Fake a TMDB_SET based on the hash value of the collection name
            int movieCollectionHash = movieCollection.hashCode();

            md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection);
            md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash);
            LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash);
        }

        // Studio
        LOGGER.debug("AEBN: parse studio");
        elements = document.getElementsByAttributeValue("id", "md-details")
                .select("[itemprop=productionCompany]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String movieStudio = cleanString(elements.first().text());
            LOGGER.debug("AEBN: studio({})", movieStudio);
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio);
        }

        // Genre
        LOGGER.debug("AEBN: parse genre");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]");
        for (Element g : elements) {
            md.addGenre(getTmmGenre(g.text()));
        }
        // add basic genre, since all genres at AEBN could be summarised
        // into this one
        md.addGenre(MediaGenres.EROTIC);

        // Certification
        // no data scrapeable---but obviously it's adult only, so simply
        // generate it
        String movieCertification = null;
        Certification certification = null;
        String country = options.getCountry().getAlpha2();
        LOGGER.debug("AEBN: generate certification for {}", country);
        // @formatter:off
        if (country.equals("DE")) {
            movieCertification = "FSK 18";
        }
        if (country.equals("US")) {
            movieCertification = "NC-17";
        }
        if (country.equals("GB")) {
            movieCertification = "R18";
        }
        if (country.equals("FR")) {
            movieCertification = "18";
        }
        if (country.equals("ES")) {
            movieCertification = "PX";
        }
        if (country.equals("JP")) {
            movieCertification = "R18+";
        }
        if (country.equals("IT")) {
            movieCertification = "V.M.18";
        }
        if (country.equals("NL")) {
            movieCertification = "16";
        }
        // @formatter:on
        certification = Certification.getCertification(options.getCountry(), movieCertification);
        if (certification != null) {
            LOGGER.debug("AEBN: certification({})", certification);
            md.addCertification(certification);
        }

        // Plot and Tagline
        LOGGER.debug("AEBN: parse plot");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String moviePlot = cleanString(elements.first().text());
            md.storeMetadata(MediaMetadata.PLOT, moviePlot);
            // no separate tagline available, so extract the first sentence
            // from the movie plot
            String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])");
            LOGGER.debug("AEBN: tagline(" + movieTagline + ")");
            md.storeMetadata(MediaMetadata.TAGLINE, movieTagline);
        }

        // Actors
        LOGGER.debug("AEBN: parse actors");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]");
        LOGGER.debug("AEBN: {} actors found", elements.size());
        for (Element anchor : elements) {
            String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)");
            String actorname = cleanString(anchor.select("[itemprop=name]").first().text());
            String actordetailsurl = BASE_DATAURL + anchor.attr("href");
            if (!actorname.isEmpty()) {
                LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname,
                        actordetailsurl);
                MediaCastMember cm = new MediaCastMember();
                cm.setType(MediaCastMember.CastType.ACTOR);
                cm.setName(actorname);
                if (!actorid.isEmpty()) {
                    cm.setId(actorid);
                }

                // Actor detail page
                try {
                    Url starurl = new Url(actordetailsurl);
                    InputStream starurlstream = starurl.getInputStream();
                    Document stardocument = Jsoup.parse(starurlstream, "UTF-8", "");
                    starurlstream.close();
                    Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo");
                    if (elements2.size() == 0) {
                        LOGGER.debug("AEBN: no additional actor details found");
                    } else {
                        // Actor image
                        String actorimage = elements2.select("[itemprop=image]").first().attr("src");
                        LOGGER.debug("AEBN: actor image({})", actorimage);
                        if (!actorimage.isEmpty()) {
                            cm.setImageUrl(actorimage);
                        }
                        // Actor 'fanart' images
                        // unsure if this is ever shown in tmm
                        elements2 = stardocument.getElementsByAttributeValue("class", "StarDetailGallery")
                                .select("a");
                        LOGGER.debug("AEBN: {} gallery images found", elements2.size());
                        for (Element thumbnail : elements2) {
                            LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href"));
                            cm.addFanart(thumbnail.attr("href"));
                        }
                    }
                } catch (Exception e) {
                    LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e);
                }

                md.addCastMember(cm);
            }
        }

        // Director
        LOGGER.debug("AEBN: parse director");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)");
            String directorname = cleanString(elements.select("[itemprop=name]").first().text());
            if (!directorname.isEmpty()) {
                MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                cm.setName(directorname);
                if (!directorid.isEmpty()) {
                    cm.setId(directorid);
                }
                cm.setImageUrl("");
                md.addCastMember(cm);
                LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname);
            }
        }

        // Original Title
        // if we have no original title, just copy the title
        if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }
    } catch (Exception e) {
        LOGGER.error("AEBN: Error parsing {}: {}", options.getResult().getUrl(), e);
    }

    return md;
}

From source file:org.tinymediamanager.scraper.hdtrailersnet.HDTrailersNet.java

/**
 * Looks up trailers on hd-trailers.net by guessing the movie page url from the original title.
 * <p>
 * For every table row marked {@code itemprop="trailer"} up to three trailers are created, one
 * per resolution link in the row. A row with fewer than three links still contributes the
 * links it has; a row without a date or title cell is skipped. Any other failure is logged,
 * the cached page for the guessed url is removed, and the trailers gathered so far are
 * returned.
 *
 * @param options
 *          scrape options; their metadata must carry an original title
 * @return the trailers found, possibly empty, never {@code null}
 * @throws Exception
 *           declared by the overridden method
 */
@Override
public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getTrailers() " + options.toString());
    List<MediaTrailer> trailers = new ArrayList<MediaTrailer>();
    MediaMetadata md = options.getMetadata();

    if (md == null || StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
        LOGGER.warn("no originalTitle served");
        return trailers;
    }

    String ot = md.getStringValue(MediaMetadata.ORIGINAL_TITLE);

    // best guess
    String search = "http://www.hd-trailers.net/movie/"
            + ot.replaceAll("[^a-zA-Z0-9]", "-").replaceAll("--", "-").toLowerCase() + "/";
    InputStream in = null;
    try {
        LOGGER.debug("Guessed HD-Trailers Url: " + search);

        Url url = new CachedUrl(search);
        in = url.getInputStream();
        Document doc = Jsoup.parse(in, "UTF-8", "");
        Elements tr = doc.getElementsByAttributeValue("itemprop", "trailer");
        /*
         * <tr style="" itemprop="trailer" itemscope itemtype="http://schema.org/VideoObject"> <td class="bottomTableDate" rowspan="2">2012-03-30</td>
         * <td class="bottomTableName" rowspan="2"><span class="standardTrailerName" itemprop="name">Trailer 2</span> <a href=
         * "http://blog.hd-trailers.net/how-to-download-hd-trailers-from-apple/#workarounds" ><img src="http://static.hd-trailers.net/images/error.png"
         * width="16" height="16" style="border:0px;vertical-align:middle" alt="Apple Direct Download Unavailable"
         * title="Apple Direct Download Unavailable" /></a></td>
         * 
         * <td class="bottomTableResolution"><a href= "http://trailers.apple.com/movies/sony_pictures/meninblack3/meninblack3-tlr2_h480p.mov"
         * rel="lightbox[res480p 852 480]" title="Men in Black 3 - Trailer 2 - 480p">480p</a></td> <td class="bottomTableResolution"><a href=
         * "http://trailers.apple.com/movies/sony_pictures/meninblack3/meninblack3-tlr2_h720p.mov" rel="lightbox[res720p 1280 720]"
         * title="Men in Black 3 - Trailer 2 - 720p">720p</a></td> <td class="bottomTableResolution"><a href=
         * "http://trailers.apple.com/movies/sony_pictures/meninblack3/meninblack3-tlr2_h1080p.mov" rel="lightbox[res1080p 1920 1080]"
         * title="Men in Black 3 - Trailer 2 - 1080p">1080p</a></td> <td class="bottomTableIcon"> <a
         * href="http://trailers.apple.com/trailers/sony_pictures/meninblack3/" target="_blank"> <img
         * src="http://static.hd-trailers.net/images/apple.ico" alt="Apple" height="16px" width="16px"/></a></td> </tr> <tr> <td
         * class="bottomTableFileSize">36 MB</td> <td class="bottomTableFileSize">111 MB</td> <td class="bottomTableFileSize">181 MB</td> <td
         * class="bottomTableEmbed"><a href=
         * "/embed-code.php?movieId=men-in-black-3&amp;source=1&amp;trailerName=Trailer 2&amp;resolutions=480;720;1080" rel="lightbox[embed 600 600]"
         * title="Embed this video on your website">embed</a></td> </tr>
         */
        final int expectedQualities = 3;
        for (Element t : tr) {
            Element dateCell = t.select("td.bottomTableDate").first();
            Element nameSpan = t.select("td.bottomTableName > span").first();
            if (dateCell == null || nameSpan == null) {
                // ignore parse errors per line
                LOGGER.warn("Error parsing HD-Trailers line. Missing date or title.");
                continue;
            }
            String date = dateCell.text();
            String title = nameSpan.text();

            // apple.com urls currently not working (according to hd-trailers)
            Elements qualityLinks = t.select("td.bottomTableResolution > a");
            int usable = Math.min(qualityLinks.size(), expectedQualities);
            for (int q = 0; q < usable; q++) {
                Element qualityLink = qualityLinks.get(q);
                String trailerUrl = qualityLink.attr("href");

                MediaTrailer trailer = new MediaTrailer();
                trailer.setName(title + " (" + date + ")");
                trailer.setDate(date);
                trailer.setUrl(trailerUrl);
                trailer.setQuality(qualityLink.text());
                trailer.setProvider(getProviderFromUrl(trailerUrl));
                LOGGER.debug(trailer.toString());
                trailers.add(trailer);
            }
            if (qualityLinks.size() < expectedQualities) {
                // ignore parse errors per line
                LOGGER.warn("Error parsing HD-Trailers line. Possible missing quality.");
            }
        }
    } catch (Exception e) {
        LOGGER.error("cannot parse HD-Trailers movie: " + ot, e);

        // clear cache
        CachedUrl.removeCachedFileForUrl(search);
    } finally {
        // always release the stream; a failing close must not hide the result
        if (in != null) {
            try {
                in.close();
            } catch (Exception closeError) {
                LOGGER.debug("could not close HD-Trailers stream: " + closeError.getMessage());
            }
        }
    }
    return trailers;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java

/**
 * Searches IMDb for titles matching the given query and returns them as scored results.
 *
 * <p>
 * IMDb answers a search in one of two ways:
 * <ul>
 * <li>If there is exactly one match, the response is the matching title page itself. This is
 * detected through the {@code rel="canonical"} link, whose href carries the IMDb id; the full
 * metadata is then scraped and returned as a single result with score 1.</li>
 * <li>Otherwise a list of candidate rows (class {@code findResult}) is returned. Each row is
 * parsed for id, name, year and poster, and scored against the search term. Note that even
 * exact matches can yield more than one hit, for example "Star Trek".</li>
 * </ul>
 *
 * <p>
 * The search term is taken from the first non-empty parameter among IMDBID, QUERY and TITLE.
 *
 * @param query
 *          the search options (search term, language, country, year, ...)
 * @return the results ordered by descending natural order (at most 40 list results), or an empty
 *         list if there is no search term or the search page could not be fetched
 * @throws Exception
 *           propagated from the calls made while resolving a direct hit (e.g. {@code getMetadata})
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception {
    LOGGER.debug("search() " + query.toString());

    // all patterns are loop invariant - compile them once per search
    Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/");
    // stripped out .*\\(Video\\).*|
    Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*");
    // only the first alternative has a capture group; for the "/)" alternative group(1) is null
    Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");

    List<MediaSearchResult> result = new ArrayList<MediaSearchResult>();

    // pick the search term: IMDb id first, then the free query, then the title
    String searchTerm = "";

    if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) {
        searchTerm = query.get(SearchParam.IMDBID);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.get(SearchParam.QUERY);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.get(SearchParam.TITLE);
    }

    if (StringUtils.isEmpty(searchTerm)) {
        return result;
    }

    // parse out language and country from the scraper options
    String language = query.get(SearchParam.LANGUAGE);
    String myear = query.get(SearchParam.YEAR);
    String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    StringBuilder sb = new StringBuilder(imdbSite.getSite());
    sb.append("find?q=");
    try {
        // the search site has always been UTF-8
        sb.append(URLEncoder.encode(searchTerm, "UTF-8"));
    } catch (UnsupportedEncodingException ex) {
        // Failed to encode the movie name for some reason - fall back to the raw term
        LOGGER.debug("Failed to encode search term: " + searchTerm);
        sb.append(searchTerm);
    }

    // we need to search for all - otherwise we do not find TV movies
    sb.append(CAT_TITLE);

    LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString());
    Document doc;
    try {
        CachedUrl url = new CachedUrl(sb.toString());
        url.addHeader("Accept-Language", getAcceptLanguage(language, country));
        doc = Jsoup.parse(url.getInputStream(), "UTF-8", "");
    } catch (Exception e) {
        LOGGER.debug("tried to fetch search response", e);

        // clear cache, so that a broken response is not served again
        CachedUrl.removeCachedFileForUrl(sb.toString());

        return result;
    }

    // check if it was directly redirected to the movie site
    Elements elements = doc.getElementsByAttributeValue("rel", "canonical");
    for (Element element : elements) {
        MediaMetadata md = null;
        String movieName = null;
        String movieId = findLastImdbId(imdbIdPattern, element.attr("href"));

        // get full information
        if (!StringUtils.isEmpty(movieId)) {
            MediaScrapeOptions options = new MediaScrapeOptions();
            options.setImdbId(movieId);
            // NOTE(review): valueOf(...) presumably throws when language/country is null or not a
            // known constant name - confirm that callers always provide both parameters
            options.setLanguage(MediaLanguages.valueOf(language));
            options.setCountry(CountryCode.valueOf(country));
            options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO)));
            options.setScrapeImdbForeignLanguage(
                    Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE)));
            md = getMetadata(options);
            // defensive: do not dereference a missing metadata result
            if (md != null && !StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) {
                movieName = md.getStringValue(MediaMetadata.TITLE);
            }
        }

        // if a movie name/id was found - return it as the only result
        if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) {
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            sr.setTitle(movieName);
            sr.setIMDBId(movieId);
            sr.setYear(md.getStringValue(MediaMetadata.YEAR));
            sr.setMetadata(md);
            sr.setScore(1);

            // and parse out the poster (the last image inside #img_primary wins)
            String posterUrl = "";
            Element td = doc.getElementById("img_primary");
            if (td != null) {
                for (Element img : td.getElementsByTag("img")) {
                    posterUrl = normalizePosterUrl(img.attr("src"));
                }
            }
            if (StringUtils.isNotBlank(posterUrl)) {
                sr.setPosterUrl(posterUrl);
            }

            result.add(sr);
            return result;
        }
    }

    // parse the result list
    elements = doc.getElementsByClass("findResult");
    for (Element tr : elements) {
        // we only want the tr's
        if (!"tr".equalsIgnoreCase(tr.tagName())) {
            continue;
        }

        // find the id / name
        String movieName = "";
        String movieId = "";
        String year = "";
        Elements tds = tr.getElementsByClass("result_text");
        for (Element element : tds) {
            // we only want the td's
            if (!"td".equalsIgnoreCase(element.tagName())) {
                continue;
            }

            // filter out unwanted results
            if (unwanted.matcher(element.text()).find()) {
                continue;
            }

            // is there a localized name? (aka)
            String localizedName = "";
            Elements italics = element.getElementsByTag("i");
            if (italics.size() > 0) {
                localizedName = italics.text().replace("\"", "");
            }

            // get the name inside the first link which carries a text
            for (Element a : element.getElementsByTag("a")) {
                if (StringUtils.isEmpty(a.text())) {
                    continue;
                }

                // movie name: take AKA as title, but only if not EN
                // (constant-first equals, so a missing language parameter does not cause a NPE)
                if (StringUtils.isNotBlank(localizedName) && !"en".equals(language)) {
                    movieName = localizedName;
                } else {
                    movieName = a.text();
                }

                // parse id; keep the previous value if the link carries none
                String id = findLastImdbId(imdbIdPattern, a.attr("href"));
                if (id != null) {
                    movieId = id;
                }

                // try to parse out the year (first match which captured four digits)
                Matcher yearMatcher = yearPattern.matcher(element.text());
                while (yearMatcher.find()) {
                    if (yearMatcher.group(1) != null) {
                        year = yearMatcher.group(1);
                        break;
                    }
                }
                break;
            }
        }

        // parse the poster image (the last image found wins)
        String posterUrl = "";
        for (Element photo : tr.getElementsByClass("primary_photo")) {
            for (Element img : photo.getElementsByTag("img")) {
                posterUrl = normalizePosterUrl(img.attr("src"));
            }
        }

        // if no movie name/id was found - continue
        if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) {
            continue;
        }

        MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
        sr.setTitle(movieName);
        sr.setIMDBId(movieId);
        sr.setYear(year);
        sr.setPosterUrl(posterUrl);

        // populate extra args
        MetadataUtil.copySearchQueryToSearchResult(query, sr);

        if (movieId.equals(query.get(SearchParam.IMDBID))) {
            // perfect match
            sr.setScore(1);
        } else {
            // compare score based on names
            float score = MetadataUtil.calculateScore(searchTerm, movieName);
            if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) {
                LOGGER.debug("no poster - downgrading score by 0.01");
                score = score - 0.01f;
            }
            if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) {
                LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01");
                score = score - 0.01f;
            }
            sr.setScore(score);
        }

        result.add(sr);

        // only get 40 results
        if (result.size() >= 40) {
            break;
        }
    }

    // natural order first, then reversed
    Collections.sort(result);
    Collections.reverse(result);

    return result;
}

/**
 * Returns the IMDb id captured by the last match of the pattern inside the given link.
 *
 * @param imdbIdPattern
 *          the pattern whose first group captures the IMDb id
 * @param href
 *          the link to inspect
 * @return the id of the last match, or {@code null} if the link contains no match
 */
private static String findLastImdbId(Pattern imdbIdPattern, String href) {
    String imdbId = null;
    Matcher matcher = imdbIdPattern.matcher(href);
    while (matcher.find()) {
        if (matcher.group(1) != null) {
            imdbId = matcher.group(1);
        }
    }
    return imdbId;
}

/**
 * Rewrites the size tokens of an image url to 400 (SX/SY) and strips the crop (CR) token.
 *
 * @param src
 *          the image url as found in the page
 * @return the rewritten url
 */
private static String normalizePosterUrl(String src) {
    String posterUrl = src.replaceAll("SX[0-9]{2,4}_", "SX400_");
    posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
    return posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
}