Example usage for org.jsoup.select Elements select

List of usage examples for org.jsoup.select Elements select

Introduction

On this page you can find example usage for org.jsoup.select Elements select.

Prototype

public Elements select(String query) 

Source Link

Document

Find matching elements within this element list.

Usage

From source file:org.loklak.api.search.EventBriteCrawlerService.java

/**
 * Scrapes an Eventbrite event page and, when available, the organizer's profile page,
 * assembles the extracted details as JSON, writes each JSON part to
 * {@code <user.home>/Downloads/EventBriteInfo}, and returns everything wrapped in a
 * {@link SusiThought}.
 *
 * <p>If the event page cannot be fetched, the failure is printed and a thought holding
 * an empty array is returned; no files are written in that case.
 *
 * @param url address of the Eventbrite event page
 * @return a thought whose data array holds, in order: the event object, the organizer
 *         object, and one wrapper object each for microlocations, customForms,
 *         sessionTypes, sessions, sponsors, speakers and tracks
 */
public static SusiThought crawlEventBrite(String url) {
    Document htmlPage = null;

    try {
        htmlPage = Jsoup.connect(url).get();
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (htmlPage == null) {
        // Nothing to scrape: without this guard every lookup below would dereference null.
        SusiThought emptyResult = new SusiThought();
        emptyResult.setData(new JSONArray());
        return emptyResult;
    }

    String eventID = htmlPage.getElementsByTag("body").attr("data-event-id");
    String eventName = htmlPage.getElementsByClass("listing-hero-body").text();
    String eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text();

    // TODO Fetch Event Color
    String eventColor = null;

    String imageLink = htmlPage.getElementsByTag("picture").attr("content");

    String eventLocation = htmlPage.select("p.listing-map-card-street-address.text-default").text();

    String startingTime = eventBriteTimestamp(eventBriteProperty(htmlPage, "event:start_time"));
    String endingTime = eventBriteTimestamp(eventBriteProperty(htmlPage, "event:end_time"));

    String ticketURL = url + "#tickets";

    // TODO Tags to be modified to fit in the format of Open Event "topic"
    // The collected tags are not emitted yet; only the empty topic is written to the event.
    Elements tagSpan = htmlPage.getElementsByAttributeValue("data-automation", "ListingsBreadcrumbs")
            .select("span");
    String[][] tags = new String[5][2];
    String topic = "";

    // Spans are consumed in pairs: even positions supply the link, odd positions the label.
    int spanIndex = 0;
    int tagIndex = 0;
    for (Element span : tagSpan) {
        if (tagIndex >= tags.length) {
            // The table has a fixed number of rows; extra breadcrumb spans are ignored.
            break;
        }
        if (spanIndex % 2 == 0) {
            tags[tagIndex][1] = "www.eventbrite.com"
                    + span.select("a.js-d-track-link.badge.badge--tag.l-mar-top-2").attr("href");
        } else {
            tags[tagIndex][0] = span.text();
            tagIndex++;
        }
        spanIndex++;
    }

    // These values are not scraped; they stay null and are passed to the JSON object as such.
    String closingDateTime = null;
    String schedulePublishedOn = null;
    String email = null;

    JSONObject creator = new JSONObject();
    creator.put("email", "");
    creator.put("id", "1"); // By Default

    Float latitude = null;
    String latitudeText = eventBriteProperty(htmlPage, "event:location:latitude");
    if (latitudeText.length() > 0) {
        latitude = Float.valueOf(latitudeText);
    }

    Float longitude = null;
    String longitudeText = eventBriteProperty(htmlPage, "event:location:longitude");
    if (longitudeText.length() > 0) {
        longitude = Float.valueOf(longitudeText);
    }

    String privacy = "public"; // By Default
    String state = "completed"; // By Default

    // TODO This returns: "events.event" which is not supported by Open
    // Event Generator
    // eventType = htmlPage.getElementsByAttributeValue("property",
    // "og:type").attr("content");
    String eventType = "";

    String organizerText = htmlPage.select("a.js-d-scroll-to.listing-organizer-name.text-default").text();
    // The first four characters are dropped - presumably a leading prefix such as "by "; TODO confirm.
    String organizerName = organizerText.length() >= 5 ? organizerText.substring(4) : "";
    String organizerLink = url + "#listing-organizer";
    String organizerProfileLink = htmlPage
            .getElementsByAttributeValue("class", "js-follow js-follow-target follow-me fx--fade-in is-hidden")
            .attr("href");
    String organizerContactInfo = url + "#lightbox_contact";

    Document orgProfilePage = null;
    if (!organizerProfileLink.isEmpty()) {
        try {
            orgProfilePage = Jsoup.connect(organizerProfileLink).get();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Organizer details stay null when the profile page is unavailable.
    String organizerWebsite = null;
    String organizerDescription = null;
    String organizerFacebookFeedLink = null;
    String organizerTwitterFeedLink = null;
    String organizerFacebookAccountLink = null;
    String organizerTwitterAccountLink = null;

    if (orgProfilePage != null) {
        // jsoup returns an empty Elements (never null) when nothing matches, and text()/attr()
        // on an empty Elements yield "", so no explicit fallback is required.
        organizerWebsite = orgProfilePage
                .getElementsByAttributeValue("class", "l-pad-vert-1 organizer-website").text();
        organizerDescription = orgProfilePage.select("div.js-long-text.organizer-description").text();

        organizerFacebookFeedLink = organizerProfileLink + "#facebook_feed";
        organizerTwitterFeedLink = organizerProfileLink + "#twitter_feed";

        organizerFacebookAccountLink = orgProfilePage.getElementsByAttributeValue("class", "fb-page")
                .attr("data-href");
        organizerTwitterAccountLink = orgProfilePage
                .getElementsByAttributeValue("class", "twitter-timeline").attr("href");
    }

    JSONArray socialLinks = new JSONArray();

    JSONObject fb = new JSONObject();
    fb.put("id", "1");
    fb.put("name", "Facebook");
    fb.put("link", organizerFacebookAccountLink);
    socialLinks.put(fb);

    JSONObject tw = new JSONObject();
    tw.put("id", "2");
    tw.put("name", "Twitter");
    tw.put("link", organizerTwitterAccountLink);
    socialLinks.put(tw);

    JSONArray jsonArray = new JSONArray();

    JSONObject event = new JSONObject();
    event.put("event_url", url);
    event.put("id", eventID);
    event.put("name", eventName);
    event.put("description", eventDescription);
    event.put("color", eventColor);
    event.put("background_url", imageLink);
    event.put("closing_datetime", closingDateTime);
    event.put("creator", creator);
    event.put("email", email);
    event.put("location_name", eventLocation);
    event.put("latitude", latitude);
    event.put("longitude", longitude);
    event.put("start_time", startingTime);
    event.put("end_time", endingTime);
    event.put("logo", imageLink);
    event.put("organizer_description", organizerDescription);
    event.put("organizer_name", organizerName);
    event.put("privacy", privacy);
    event.put("schedule_published_on", schedulePublishedOn);
    event.put("state", state);
    event.put("type", eventType);
    event.put("ticket_url", ticketURL);
    event.put("social_links", socialLinks);
    event.put("topic", topic);
    jsonArray.put(event);

    JSONObject org = new JSONObject();
    org.put("organizer_name", organizerName);
    org.put("organizer_link", organizerLink);
    org.put("organizer_profile_link", organizerProfileLink);
    org.put("organizer_website", organizerWebsite);
    org.put("organizer_contact_info", organizerContactInfo);
    org.put("organizer_description", organizerDescription);
    org.put("organizer_facebook_feed_link", organizerFacebookFeedLink);
    org.put("organizer_twitter_feed_link", organizerTwitterFeedLink);
    org.put("organizer_facebook_account_link", organizerFacebookAccountLink);
    org.put("organizer_twitter_account_link", organizerTwitterAccountLink);
    jsonArray.put(org);

    // The remaining sections are not scraped; they are emitted as empty arrays.
    JSONArray microlocations = new JSONArray();
    jsonArray.put(new JSONObject().put("microlocations", microlocations));

    JSONArray customForms = new JSONArray();
    jsonArray.put(new JSONObject().put("customForms", customForms));

    JSONArray sessionTypes = new JSONArray();
    jsonArray.put(new JSONObject().put("sessionTypes", sessionTypes));

    JSONArray sessions = new JSONArray();
    jsonArray.put(new JSONObject().put("sessions", sessions));

    JSONArray sponsors = new JSONArray();
    jsonArray.put(new JSONObject().put("sponsors", sponsors));

    JSONArray speakers = new JSONArray();
    jsonArray.put(new JSONObject().put("speakers", speakers));

    JSONArray tracks = new JSONArray();
    jsonArray.put(new JSONObject().put("tracks", tracks));

    String userHome = System.getProperty("user.home");
    String path = userHome + "/Downloads/EventBriteInfo";

    // mkdirs() also creates a missing "Downloads" parent, which mkdir() would not.
    new File(path).mkdirs();

    writeEventBriteJson(path, "event.json", event.toString());
    writeEventBriteJson(path, "org.json", org.toString());
    writeEventBriteJson(path, "social_links.json", socialLinks.toString());
    writeEventBriteJson(path, "microlocations.json", microlocations.toString());
    writeEventBriteJson(path, "custom_forms.json", customForms.toString());
    writeEventBriteJson(path, "session_types.json", sessionTypes.toString());
    writeEventBriteJson(path, "sessions.json", sessions.toString());
    writeEventBriteJson(path, "sponsors.json", sponsors.toString());
    writeEventBriteJson(path, "speakers.json", speakers.toString());
    writeEventBriteJson(path, "tracks.json", tracks.toString());

    SusiThought json = new SusiThought();
    json.setData(jsonArray);
    return json;

}

/** Returns the {@code content} attribute of the first element whose {@code property} attribute equals the given value, or "" if none matches. */
private static String eventBriteProperty(Document page, String property) {
    return page.getElementsByAttributeValue("property", property).attr("content");
}

/** Keeps only the first 19 characters of a value that is at least 20 characters long; shorter values are returned unchanged. */
private static String eventBriteTimestamp(String raw) {
    return raw.length() >= 20 ? raw.substring(0, 19) : raw;
}

/** Writes the content to {@code directory/fileName}; an I/O failure is printed and otherwise ignored so the remaining files are still attempted. */
private static void writeEventBriteJson(String directory, String fileName, String content) {
    try (FileWriter writer = new FileWriter(directory + "/" + fileName)) {
        writer.write(content);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:org.metaservice.demo.wordpress.WordpressParser.java

/**
 * Reads the whole input as HTML and turns every table row of each
 * {@code table.widefat} that has at least one {@code td} cell into a version entry.
 *
 * @param s reader supplying the HTML of the release archive page
 * @param archiveParameters not used by this implementation
 * @return the entries in the order the rows were visited
 * @throws ParserException if reading from {@code s} fails
 */
@Override
public List<VersionEntry> parse(Reader s, ArchiveAddress archiveParameters) throws ParserException {
    try {
        final String html = IOUtils.toString(s);
        final Document page = Jsoup.parse(html, "http://wordpress.org/download/release-archive/");
        final List<VersionEntry> entries = new ArrayList<>();
        for (Element releaseTable : page.select("table.widefat")) {
            for (Element tableRow : releaseTable.select("tr")) {
                final Elements cells = tableRow.select("td");
                if (cells.isEmpty()) {
                    // Rows without td cells (e.g. header rows) carry no release data.
                    continue;
                }
                final VersionEntry entry = new VersionEntry();
                // The first cell supplies the name; the links are picked by the ending of their href.
                entry.setName(cells.get(0).text().trim());
                entry.setZip(cells.select("a[href$=zip]").attr("href"));
                entry.setTar(cells.select("a[href$=tar.gz]").attr("href"));
                entry.setIis(cells.select("a[href$=IIS.zip]").attr("href"));
                entries.add(entry);
            }
        }
        return entries;
    } catch (IOException e) {
        throw new ParserException(e);
    }
}

From source file:org.opennms.protocols.http.collector.HttpCollectionHandler.java

/**
 * Fetches the document for the URL and, for every XML group of the source, selects the
 * group's elements, resolves the resource they belong to, stores one attribute value per
 * XML object of the group, and adds the resource to the collection set.
 *
 * @param urlString the URL to fetch
 * @param request the request settings passed on to the document fetch
 * @param agent the collection agent
 * @param collectionSet the set receiving the populated resources
 * @param source the source whose groups are processed
 * @throws Exception if any of the invoked helpers fails
 */
@Override
protected void fillCollectionSet(String urlString, Request request, CollectionAgent agent,
        XmlCollectionSet collectionSet, XmlSource source) throws Exception {
    final XmlCollectionResource sharedNodeResource = new XmlSingleInstanceCollectionResource(agent);
    final Document page = getJsoupDocument(urlString, request);
    for (final XmlGroup group : source.getXmlGroups()) {
        LOG.debug("fillCollectionSet: getting resources for XML group {} using selector {}", group.getName(),
                group.getResourceXpath());
        final Date timestamp = getTimeStamp(page, group);
        final Elements matched = page.select(group.getResourceXpath());
        LOG.debug("fillCollectionSet: {} => {}", group.getResourceXpath(), matched);
        final String resourceName = getResourceName(matched, group);
        LOG.debug("fillCollectionSet: processing XML resource {}", resourceName);

        // Node-type groups all share the single node resource; other types get their own.
        final XmlCollectionResource target;
        if (!group.getResourceType().equalsIgnoreCase(CollectionResource.RESOURCE_TYPE_NODE)) {
            target = getCollectionResource(agent, resourceName, group.getResourceType(), timestamp);
        } else {
            target = sharedNodeResource;
        }
        LOG.debug("fillCollectionSet: processing resource {}", target);

        final AttributeGroupType groupType = new AttributeGroupType(group.getName(), group.getIfType());
        for (final XmlObject object : group.getXmlObjects()) {
            final Elements valueElements = matched.select(object.getXpath());
            final XmlCollectionAttributeType attributeType = new XmlCollectionAttributeType(object, groupType);
            String value = null;
            if (valueElements != null) {
                value = valueElements.html();
            }
            target.setAttributeValue(attributeType, value);
        }
        processXmlResource(target, groupType);
        collectionSet.getCollectionResources().add(target);
    }
}

From source file:org.opennms.protocols.http.collector.HttpCollectionHandler.java

/**
 * Derives the resource name for a group from the selected elements.
 *
 * <p>With a multiple-key group, the HTML of each key selector's matches is joined with
 * {@code "_"}. Without a key selector the name is {@code "node"}. Otherwise the HTML of
 * the single key selector's matches is used.
 *
 * @param elements the JSoup elements
 * @param group the group
 * @return the resource name
 */
private String getResourceName(Elements elements, XmlGroup group) {
    if (group.hasMultipleResourceKey()) {
        // Multiple-key resource name: one part per key selector.
        final List<String> nameParts = new ArrayList<String>();
        for (final String keySelector : group.getXmlResourceKey().getKeyXpathList()) {
            LOG.debug("getResourceName: getting key for resource's name using selector {}", keySelector);
            final Elements keyMatch = elements.select(keySelector);
            if (keyMatch == null) {
                continue;
            }
            nameParts.add(keyMatch.html());
        }
        return StringUtils.join(nameParts, "_");
    }

    if (group.getKeyXpath() == null) {
        // No key selector configured: a node resource is assumed.
        return "node";
    }

    // Single-key resource name.
    LOG.debug("getResourceName: getting key for resource's name using selector {}", group.getKeyXpath());
    final Elements keyMatch = elements.select(group.getKeyXpath());
    if (keyMatch != null) {
        return keyMatch.html();
    }
    return null;
}

From source file:org.sbs.goodcrawler.jobconf.ExtractConfig.java

/**
 * ????/*from   w  w  w .  j a v a 2s  .co  m*/
 * @param doc
 * @return
 * @throws ConfigurationException
 */
public ExtractConfig loadConfig(Document doc) throws ConfigurationException {
    Elements extractElement = doc.select("extract");
    super.jobName = doc.select("job").attr("name");
    super.indexName = doc.select("job").attr("indexName");
    String temp = extractElement.select("threadNum").text();
    if (StringUtils.isNotBlank(temp)) {
        this.threadNum = Integer.parseInt(temp);
    }

    Elements templateElement = extractElement.select("extract").select("template");
    Iterator<Element> it = templateElement.iterator();
    while (it.hasNext()) {
        Element template = it.next();
        ExtractTemplate extractTemplate = new ExtractTemplate();
        // ?Url????
        Elements urlPatternElement = template.select("url");
        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : urlPatternElement) {
            patterns.add(Pattern.compile(urlElement.text()));
        }
        extractTemplate.setUrlPattern(patterns);
        extractTemplate.setName(template.attr("name"));
        // ???
        Elements selectElement = template.select("elements").first().children();
        for (Element element : selectElement) {
            if ("element".equals(element.tagName())) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(element.tagName())) {
                IFConditions ifConditions = IFConditions.create(element);
                extractTemplate.addConditions(ifConditions);
            }
        }
        this.templates.add(extractTemplate);
    }
    return this;
}

From source file:org.sbs.goodcrawler.jobconf.FetchConfig.java

/**
 * ???/*from w w  w .j  a v  a  2 s .  c  om*/
 * @param confFile
 * @return
 */
public FetchConfig loadConfig(Document confDoc) throws ConfigurationException {
    try {
        Document doc = confDoc;
        super.jobName = doc.select("job").attr("name");
        super.indexName = doc.select("job").attr("indexName");
        Elements e = doc.select("fetch");
        this.type = e.select("type").text();
        this.agent = e.select("agent").text();
        String temp = e.select("threadNum").text();
        if (StringUtils.isNotBlank(temp)) {
            this.threadNum = Integer.parseInt(temp);
        }

        temp = e.select("delayBetweenRequests").text();
        if (StringUtils.isNotBlank(temp)) {
            this.delayBetweenRequests = Integer.parseInt(temp);
        }

        temp = e.select("maxDepthOfCrawling").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxDepthOfCrawling = Integer.parseInt(temp);
        }

        temp = e.select("fetchBinaryContent").text();
        if (StringUtils.isNotBlank(temp)) {
            this.fetchBinaryContent = Boolean.parseBoolean(temp);
        }

        if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) {
            this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text());
        }

        temp = e.select("fileSuffix").text();
        if (StringUtils.isNotBlank(temp)) {
            this.fileSuffix = temp;
        }

        temp = e.select("maxDownloadSizePerPage").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxDownloadSizePerPage = Integer.parseInt(temp);
        }

        temp = e.select("https").text();
        if (StringUtils.isNotBlank(temp)) {
            this.https = Boolean.parseBoolean(temp);
        }

        temp = e.select("onlyDomain").text();
        if (StringUtils.isNotBlank(temp)) {
            this.onlyDomain = Boolean.parseBoolean(temp);
        }

        temp = e.select("socketTimeoutMilliseconds").text();
        if (StringUtils.isNotBlank(temp)) {
            this.socketTimeoutMilliseconds = Integer.parseInt(temp);
        }

        temp = e.select("connectionTimeout").text();
        if (StringUtils.isNotBlank(temp)) {
            this.connectionTimeout = Integer.parseInt(temp);
        }

        temp = e.select("maxTotalConnections").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxTotalConnections = Integer.parseInt(temp);
        }

        temp = e.select("maxConnectionsPerHost").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text());
        }

        temp = e.select("maxConnectionsPerHost").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxConnectionsPerHost = Integer.parseInt(temp);
        }

        if (StringUtils.isNotBlank(e.select("proxyHost").text())) {
            this.proxyHost = e.select("proxyHost").text();
        }
        if (StringUtils.isNotBlank(e.select("proxyPort").text())) {
            this.proxyPort = Integer.parseInt(e.select("proxyPort").text());
        }
        if (StringUtils.isNotBlank(e.select("proxyUsername").text())) {
            this.proxyUsername = e.select("proxyUsername").text();
        }
        if (StringUtils.isNotBlank(e.select("proxyPassword").text())) {
            this.proxyPassword = e.select("proxyPassword").text();
        }
        if (StringUtils.isNotBlank(e.select("proxyHost").text())) {
            this.proxyHost = e.select("proxyHost").text();
        }

        // seed
        Elements seeds = doc.select("fetch seeds seed");
        for (Element element : seeds) {
            WebURL url = new WebURL();
            String seed = element.text();
            this.seeds.add(seed);
            url.setURL(seed);
            url.setJobName(jobName);
            url.setDepth((short) 0);
            try {
                PendingManager.getPendingUlr(jobName).addElement(url);
                BloomfilterHelper.getInstance().add(url.getURL());
            } catch (QueueException e1) {
                e1.printStackTrace();
            }
        }

        /*
         * ??Url
         */
        Elements fetchUrlFilters = doc.select("fetchUrlFilters filter");
        for (Element element : fetchUrlFilters) {
            this.fetchUrlFilters.add(element.text());
        }

        /*
         * ?????Url
         */
        Elements extractUrlfilters = doc.select("extractUrlfilters filter");
        for (Element element : extractUrlfilters) {
            this.extractUrlfilters.add(element.text());
        }
    } catch (NumberFormatException e) {
        throw new ConfigurationException("?" + e.getMessage());
    }

    return this;
}

From source file:org.sbs.goodcrawler.jobconf.StoreConfig.java

/**
 * Populates this store configuration from a job configuration document.
 *
 * <p>Reads the job name and index name from the {@code job} element and, from the
 * {@code store} element, the type plus the optional thread count, plugin class and ID
 * policy. For any policy other than {@code auto} the {@code ref} value is taken as the
 * policy reference; if the reference ends up blank, a {@code ConfigurationException}
 * stack trace is printed and loading continues.
 *
 * @param confDoc the parsed job configuration
 * @return this instance
 */
public StoreConfig loadConfig(Document confDoc) {
    final Elements jobNode = confDoc.select("job");
    jobName = jobNode.attr("name");
    indexName = jobNode.attr("indexName");

    final Elements storeNode = confDoc.select("store");
    this.type = storeNode.select("type").text();

    final String threadText = storeNode.select("threadNum").text();
    if (StringUtils.isNotBlank(threadText)) {
        this.threadNum = Integer.parseInt(threadText);
    }

    final String pluginName = storeNode.select("plugin").text();
    if (StringUtils.isNotBlank(pluginName)) {
        this.pluginClass = pluginName;
    }

    // ID policy
    final String policyName = storeNode.select("idPolicy").text();
    if (StringUtils.isBlank(policyName)) {
        return this;
    }
    id = EnumUtils.getEnum(IDPolicy.class, policyName);
    if (IDPolicy.auto.equals(id)) {
        return this;
    }

    final String reference = storeNode.select("ref").text();
    if (StringUtils.isNotBlank(reference)) {
        this.policyRef = reference;
    }
    if (StringUtils.isBlank(this.policyRef)) {
        // The problem is only reported, never propagated: the exception exists solely for its trace.
        new ConfigurationException("ID??").printStackTrace();
    }
    return this;
}

From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java

/**
 * Parses a fetched page, queues the links it contains, and builds an extracted page of
 * movie fields that is handed to the pending store.
 *
 * <p>Returns {@code null} when the page is {@code null}, when its URL contains
 * {@code "game/"}, when the document has no {@code #Zoom} element, when one of the
 * configured selectors matches nothing, or when the content charset is unsupported.
 *
 * <p>NOTE(review): several string literals below are empty ({@code ""}) or a lone
 * {@code "?"}; they look like non-ASCII text that was lost in an encoding conversion.
 * The original values cannot be recovered from this file - confirm against the upstream
 * source before relying on this method.
 *
 * @param page the fetched page, may be {@code null}
 * @return the extracted page, or {@code null} in the cases listed above
 */
@Override
public ExtractedPage<?, ?> onExtract(Page page) {
    if (null != page) {
        try {

            Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                    urlUtils.getBaseUrl(page.getWebURL().getURL()));
            // Pages whose URL contains "game/" are not extracted.
            if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/"))
                return null;
            // Queue every absolute link on the page that passes filterUrls.
            Elements links = doc.getElementsByTag("a");
            if (!links.isEmpty()) {
                for (Element link : links) {
                    String linkHref = link.absUrl("href");
                    if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) {
                        try {
                            WebURL url = new WebURL();

                            url.setURL(linkHref);
                            url.setJobName(conf.jobName);
                            pendingUrls.addUrl(url);
                        } catch (QueueException e) {
                            log.error(e.getMessage());
                        } catch (Exception e) {
                            log.error(e.getMessage());
                        }
                    }
                }
            }
            // Extract the movie information.
            // NOTE(review): with the lookup below commented out, selects is always null, so
            // selects.entrySet() further down throws a NullPointerException, which the catch
            // clause of this try (UnsupportedEncodingException only) does not handle.
            //            Map<String, String> selects = conf.getSelects();
            Map<String, String> selects = null;
            ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
            epage.setUrl(page.getWebURL());
            HashMap<String, Object> result = new HashMap<>();
            // Pages without a #Zoom element are skipped.
            Elements text = doc.select("#Zoom");
            if (null == text || text.size() == 0) {
                return null;
            }
            String name = doc.select("h1").text();
            // Strips "<<" and ">>" from the title; the two replace("", "") calls are no-ops as
            // written - presumably they removed characters lost to encoding; TODO confirm.
            name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", "");
            result.put("movie", name);
            //            result.put("_id", name);
            // The second space-separated token of the "h2 a" text is stored as the type.
            String ts[] = doc.select("h2 a").text().split(" ");
            if (ts.length >= 2) {
                result.put("type", ts[1].trim());
            } else {
                result.put("type", "unknow");
            }
            result.put("url", page.getWebURL().getURL());
            for (Entry<String, String> entry : selects.entrySet()) {
                Elements elements = doc.select(entry.getValue());
                // A selector that matches nothing aborts the whole extraction.
                if (elements.isEmpty())
                    return null;
                else {
                    // Only the "content" selector is processed; other entries are ignored.
                    if ("content".equals(entry.getKey())) {

                        for (Element element : elements) {
                            // Image sources, joined with ";"
                            Elements imgs = element.select("img[src]");
                            StringBuilder sb = new StringBuilder();
                            for (Element img : imgs) {
                                sb.append(img.attr("src")).append(";");
                            }
                            result.put("img", sb.toString());
                            // Movie details, read paragraph by paragraph
                            Elements movieInfos = element.select("p");
                            for (Element info : movieInfos) {
                                String infotext = info.text();
                                try {
                                    String infotext_ = info.html();
                                    int start, end = 0;
                                    // NOTE(review): indexOf("") is always 0 and lastIndexOf("") is the
                                    // string length, so "start > 0" never holds and "jq" is never set
                                    // as written - the markers were presumably lost to encoding.
                                    start = infotext_.indexOf("");
                                    if (start > 0) {
                                        end = infotext_.lastIndexOf("");
                                        if (end > 0 && start < end) {
                                            result.put("jq", infotext_.substring(start, end));
                                        } else {
                                            end = infotext_.lastIndexOf(".");
                                            if (end > 0 && start < end) {
                                                result.put("jq", infotext_.substring(start, end));
                                            }
                                        }
                                    }
                                    infotext_ = null;
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }

                                // NOTE(review): startsWith("") is always true, so only this first
                                // branch can run as written; the branches below are unreachable.
                                // In every branch "s.trim();" discards its result, so s reaches
                                // getInfoName untrimmed.
                                if (infotext.startsWith("")) {
                                    String ss[] = infotext.split("");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.startsWith("?")) {
                                    // NOTE(review): "?" on its own is a regex quantifier, so this
                                    // split pattern looks invalid - verify once the literal is restored.
                                    String ss[] = infotext.split("?");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains("")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains(":")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                }
                            }

                            //                        if(result.size()<5){
                            //                           result.put("content", value)
                            //                        }

                            // Download entries: the text of every td under the matched elements, joined with ";"
                            Elements elements2 = elements.select("td");
                            sb.setLength(0);
                            for (Element download : elements2) {
                                sb.append(download.text()).append(";");
                            }
                            result.put("download", sb.toString());
                        }
                    }
                }
                //               result.put(entry.getKey(), elements.html());
            }
            // A non-blank "nd" value is replaced by its integer form.
            if (StringUtils.isNotBlank((String) result.get("nd"))) {
                result.put("nd", Integer.parseInt((String) result.get("nd")));
            }
            epage.setMessages(result);
            try {
                pendingStore.addExtracedPage(epage);
            } catch (QueueException e) {
                log.error(e.getMessage());
            }
            return epage;
        } catch (UnsupportedEncodingException e) {
            log.error(e.getMessage());
            e.printStackTrace();
        }
    }
    return null;
}

From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java

/**
 * Get movie meta data from aebn.net.
 * <p>
 * The AEBN id is taken from the previous search result if present, otherwise from the
 * {@code AEBNID} entry of the options. The movie detail page is then downloaded and parsed for
 * title, poster, scene thumbnails, runtime, year, collection, studio, genres, plot/tagline,
 * actors (including each actor's detail page) and director. Errors while downloading or parsing
 * are logged and whatever has been collected so far is returned.
 *
 * @param options
 *          the scrape options (search result, ids, language, country)
 * @return the meta data already attached to the search result if there is one, otherwise a new
 *         {@link MediaMetadata}; it contains no scraped values when no valid AEBN id is available
 * @throws Exception
 *           declared by the overridden method; a {@link NumberFormatException} from parsing a
 *           non-numeric id propagates because the id parsing happens outside the try block
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("AEBN: getMetadata() {}", options);

    // check if there is already meta data present in the result
    if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) {
        LOGGER.debug("AEBN: return metadata from cache");
        return options.getResult().getMediaMetadata();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    Elements elements = null;
    Element element = null;
    Integer aebnId = 0;

    // get AebnId from previous search result
    if ((options.getResult() != null) && (options.getResult().getId() != null)) {
        aebnId = Integer.parseInt(options.getResult().getId());
        LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId);
        // preset some values from search result (if there is one)
        // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy".
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle()));
        md.storeMetadata(MediaMetadata.TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getTitle()));
    }

    // or get AebnId from options
    if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) {
        LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID));
        aebnId = Integer.parseInt(options.getId(AEBNID));
    }

    if (!isValidAebnId(aebnId)) {
        LOGGER.warn("AEBN: no or incorrect aebnId, aborting");
        return md;
    }

    // ID
    md.setId(providerInfo.getId(), aebnId);
    LOGGER.debug("AEBN: aebnId({})", aebnId);

    // Base download url for data scraping
    String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId;
    String locale = options.getLanguage().name();
    if (!StringUtils.isBlank(locale)) {
        downloadUrl = downloadUrl + "&locale=" + locale;
        LOGGER.debug("AEBN: used locale({})", locale);
    }

    // begin download and scrape
    try {
        LOGGER.debug("AEBN: download movie detail page");
        Url url = new Url(downloadUrl);
        InputStream in = url.getInputStream();
        Document document;
        try {
            document = Jsoup.parse(in, "UTF-8", "");
        } finally {
            // release the connection even when parsing fails
            if (in != null) {
                in.close();
            }
        }

        // Title
        // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1>
        LOGGER.debug("AEBN: parse title");
        elements = document.getElementsByAttributeValue("class", "md-movieTitle");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieTitle = cleanString(element.text());
            LOGGER.debug("AEBN: title({})", movieTitle);
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // Poster
        // front cover:
        // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg
        String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId + "_xlf.jpg";
        md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl);

        // Fanart/Background
        // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg
        // <img class="sceneThumbnail" alt="Scene Thumbnail" title="Scene Thumbnail" onError="..."
        // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." />
        LOGGER.debug("AEBN: parse fanart / scene thumbs");
        elements = document.getElementsByAttributeValue("class", "SceneThumbnail");
        LOGGER.debug("AEBN: {} elements found", elements.size());
        int i = 1;
        for (Element anchor : elements) {
            String backgroundUrl = anchor.attr("src");
            LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl);
            // thumbnails are stored under numbered keys: backgroundUrl1, backgroundUrl2, ...
            md.storeMetadata("backgroundUrl" + i, backgroundUrl);
            i++;
        }

        // Runtime
        LOGGER.debug("AEBN: parse runtime");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieRuntime = cleanString(element.attr("content"));
            // the content attribute is matched against an ISO-8601 style minutes value, e.g. PT90M
            movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M");
            LOGGER.debug("AEBN: runtime({})", movieRuntime);
            md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime);
        }

        // Year
        LOGGER.debug("AEBN: parse year");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieYear = cleanString(element.attr("content"));
            // keep only the leading digits in front of the first dash
            movieYear = StrgUtils.substr(movieYear, "(\\d+)-");
            LOGGER.debug("AEBN: year({})", movieYear);
            md.storeMetadata(MediaMetadata.YEAR, movieYear);
        }

        // Series (Collection)
        LOGGER.debug("AEBN: parse collection");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieCollection = cleanString(element.text());

            // Fake a TMDB_SET based on the hash value of the collection name
            int movieCollectionHash = movieCollection.hashCode();

            md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection);
            md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash);
            LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash);
        }

        // Studio
        LOGGER.debug("AEBN: parse studio");
        elements = document.getElementsByAttributeValue("id", "md-details")
                .select("[itemprop=productionCompany]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String movieStudio = cleanString(elements.first().text());
            LOGGER.debug("AEBN: studio({})", movieStudio);
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio);
        }

        // Genre
        LOGGER.debug("AEBN: parse genre");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]");
        for (Element g : elements) {
            md.addGenre(getTmmGenre(g.text()));
        }
        // add basic genre, since all genres at AEBN could be summarised
        // into this one
        md.addGenre(MediaGenres.EROTIC);

        // Certification
        // no data scrapeable---but obviously it's adult only, so simply
        // generate it
        String movieCertification = null;
        Certification certification = null;
        String country = options.getCountry().getAlpha2();
        LOGGER.debug("AEBN: generate certification for {}", country);
        // the country codes are mutually exclusive, so at most one branch applies;
        // for any other country movieCertification stays null
        if (country.equals("DE")) {
            movieCertification = "FSK 18";
        } else if (country.equals("US")) {
            movieCertification = "NC-17";
        } else if (country.equals("GB")) {
            movieCertification = "R18";
        } else if (country.equals("FR")) {
            movieCertification = "18";
        } else if (country.equals("ES")) {
            movieCertification = "PX";
        } else if (country.equals("JP")) {
            movieCertification = "R18+";
        } else if (country.equals("IT")) {
            movieCertification = "V.M.18";
        } else if (country.equals("NL")) {
            movieCertification = "16";
        }
        certification = Certification.getCertification(options.getCountry(), movieCertification);
        if (certification != null) {
            LOGGER.debug("AEBN: certification({})", certification);
            md.addCertification(certification);
        }

        // Plot and Tagline
        LOGGER.debug("AEBN: parse plot");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String moviePlot = cleanString(elements.first().text());
            md.storeMetadata(MediaMetadata.PLOT, moviePlot);
            // no separate tagline available, so extract the first sentence
            // from the movie plot
            String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])");
            LOGGER.debug("AEBN: tagline({})", movieTagline);
            md.storeMetadata(MediaMetadata.TAGLINE, movieTagline);
        }

        // Actors
        LOGGER.debug("AEBN: parse actors");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]");
        LOGGER.debug("AEBN: {} actors found", elements.size());
        for (Element anchor : elements) {
            String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)");
            // first() returns null when the actor node has no name child; treat that as
            // "no name" so one malformed entry does not abort the rest of the scrape
            Element actorNameElement = anchor.select("[itemprop=name]").first();
            String actorname = (actorNameElement == null) ? "" : cleanString(actorNameElement.text());
            String actordetailsurl = BASE_DATAURL + anchor.attr("href");
            if (!actorname.isEmpty()) {
                LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname,
                        actordetailsurl);
                MediaCastMember cm = new MediaCastMember();
                cm.setType(MediaCastMember.CastType.ACTOR);
                cm.setName(actorname);
                if (!actorid.isEmpty()) {
                    cm.setId(actorid);
                }

                // Actor detail page; failures here are logged and the actor is still added
                try {
                    Url starurl = new Url(actordetailsurl);
                    InputStream starurlstream = starurl.getInputStream();
                    Document stardocument;
                    try {
                        stardocument = Jsoup.parse(starurlstream, "UTF-8", "");
                    } finally {
                        if (starurlstream != null) {
                            starurlstream.close();
                        }
                    }
                    Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo");
                    if (elements2.size() == 0) {
                        LOGGER.debug("AEBN: no additional actor details found");
                    } else {
                        // Actor image (may be missing even when the StarInfo block exists)
                        Element imageElement = elements2.select("[itemprop=image]").first();
                        String actorimage = (imageElement == null) ? "" : imageElement.attr("src");
                        LOGGER.debug("AEBN: actor image({})", actorimage);
                        if (!actorimage.isEmpty()) {
                            cm.setImageUrl(actorimage);
                        }
                        // Actor 'fanart' images
                        // unsure if this is ever shown in tmm
                        elements2 = stardocument.getElementsByAttributeValue("class", "StarDetailGallery")
                                .select("a");
                        LOGGER.debug("AEBN: {} gallery images found", elements2.size());
                        for (Element thumbnail : elements2) {
                            LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href"));
                            cm.addFanart(thumbnail.attr("href"));
                        }
                    }
                } catch (Exception e) {
                    LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e);
                }

                md.addCastMember(cm);
            }
        }

        // Director
        LOGGER.debug("AEBN: parse director");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)");
            // same null guard as for actors: a director node without a name child is skipped
            Element directorNameElement = elements.select("[itemprop=name]").first();
            String directorname = (directorNameElement == null) ? ""
                    : cleanString(directorNameElement.text());
            if (!directorname.isEmpty()) {
                MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                cm.setName(directorname);
                if (!directorid.isEmpty()) {
                    cm.setId(directorid);
                }
                cm.setImageUrl("");
                md.addCastMember(cm);
                LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname);
            }
        }

        // Original Title
        // if we have no original title, just copy the title
        if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }
    } catch (Exception e) {
        // log the URL that was actually requested; options.getResult() may be null here
        // (the id can come from options.getId(AEBNID)), so it must not be dereferenced
        LOGGER.error("AEBN: Error parsing {}: {}", downloadUrl, e);
    }

    return md;
}

From source file:perflab.LoadrunnerWrapper.java

/**
 * Parses the "Transactions statistics summary table" of a LoadRunner analysis html report and
 * adds one {@code LoadRunnerTransaction} per named table row to {@code this.transactions}.
 * Rows without a transaction name are ignored. An {@link IOException} while reading the report
 * is logged and not rethrown.
 *
 * @param htmlSummaryFile - load runner analysis html report file to parse
 * @param summaryFile     - location of summary file to be generated out of loadrunner html analysis
 *                        (not read or written by this method)
 * @throws NumberFormatException if a numeric cell of a named row does not contain a parsable number
 */
protected void parseSummaryFile(String htmlSummaryFile, String summaryFile) {
    try {

        File input = new File(htmlSummaryFile);
        Document document = Jsoup.parse(input, "UTF-8");
        Document parse = Jsoup.parse(document.html());
        Elements table = parse.select("table").select("[summary=Transactions statistics summary table]");
        Elements rows = table.select("tr");

        getLog().info("number of rows in summary file=" + rows.size());

        for (Element row : rows) {

            String name = summaryCellText(row, "LraTransaction Name");

            if (!name.isEmpty()) {

                float avgRT = Float.valueOf(summaryCellText(row, "LraAverage"));
                float minRT = Float.valueOf(summaryCellText(row, "LraMinimum"));
                float maxRT = Float.valueOf(summaryCellText(row, "LraMaximum"));
                // counts may contain thousands separators ("." or ","), strip them before parsing
                int passed = Integer.valueOf(
                        summaryCellText(row, "LraPass").replace(".", "").replace(",", ""));
                int failed = Integer.valueOf(
                        summaryCellText(row, "LraFail").replace(".", "").replace(",", ""));

                // Multiply before dividing: "failed / total * 100" in int arithmetic is 0 for
                // every ratio below 1. Also guard against a row with no executions at all,
                // which would otherwise divide by zero. The result is truncated to whole percent.
                long total = (long) failed + passed;
                int failedPrecentage = (total == 0) ? 0 : (int) (failed * 100L / total);

                getLog().info("Saving Transaction [" + name + "]");
                this.transactions.add(
                        new LoadRunnerTransaction(name, minRT, avgRT, maxRT, passed, failed, failedPrecentage));
            }
        }

    } catch (IOException e) {
        getLog().error("Can't read LoadRunner Analysis html report " + e.getMessage());
    }

}

/**
 * Returns the combined text of the {@code span} elements inside the row's {@code td} cell whose
 * {@code headers} attribute equals the given value; empty when there is no such cell.
 */
private static String summaryCellText(Element row, String header) {
    return row.select("td[headers=" + header + "]").select("span").text();
}