Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes Document select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:downloadwolkflow.getWorkFlowList.java

/**
 * Fetches a workflow detail page, looks up its download link and, when the link's {@code href}
 * contains {@code "download"}, passes it to {@code downloadFiles}.
 *
 * <p>The link is searched with {@code div#myexp_content ul li a} first and, if nothing matches,
 * with {@code div#myexp_content ul li:nth-child(1) span a}. When neither selector matches, the page
 * is reported as having no valuable resource instead of failing with a
 * {@link NullPointerException}.
 *
 * <p>{@link IOException} and {@link InterruptedException} are logged at {@code SEVERE} level and
 * not propagated.
 *
 * @param detailUrl  the URL of the workflow detail page
 * @param httpclient the HTTP client used to fetch the page and forwarded to {@code downloadFiles}
 */
private static void downloadWorkFlow(String detailUrl, CloseableHttpClient httpclient) {
    try {
        HttpGet httpget = new HttpGet(detailUrl);
        HttpResponse response = httpclient.execute(httpget);
        String page = EntityUtils.toString(response.getEntity());
        Document mainDoc = Jsoup.parse(page);

        Element downloadEle = mainDoc.select("div#myexp_content ul li a").first();
        if (downloadEle == null) {
            downloadEle = mainDoc.select("div#myexp_content ul li:nth-child(1) span a").first();
        }

        // Both selectors may come back empty; treat that like a link without a download target.
        String downloadUrl = (downloadEle == null) ? "" : downloadEle.attr("href");

        // Pause between the page fetch and the next request (kept from the original flow).
        Thread.sleep(500);

        if (downloadUrl.contains("download")) {
            downloadFiles(downloadUrl, httpclient);
        } else {
            System.out.println(detailUrl + " do not contain valuable resource");
        }
    } catch (IOException ex) {
        Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex);
    } catch (InterruptedException ex) {
        // Restore the interrupt flag so callers further up the stack can still observe it.
        Thread.currentThread().interrupt();
        Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:io.sightly.tck.html.HTMLExtractor.java

/**
 * Retrieves the content of an element, without its own markup tags, identified by the {@code selector} from the given {@code markup}.
 * The {@code url} is used only for caching purposes, to avoid parsing multiple times the markup returned for the same resource.
 *
 * @param url      the url that identifies the markup
 * @param markup   the markup/*from  ww w.ja v  a 2 s.co  m*/
 * @param selector the selector used for retrieval
 * @return the contents of the selected element
 */
public static String innerHTML(String url, String markup, String selector) {
    ensureMarkup(url, markup);
    Document document = documents.get(url);
    Elements elements = document.select(selector);
    return elements.html();
}

From source file:io.sightly.tck.html.HTMLExtractor.java

/**
 * Checks if the element from the {@code markup} identified by the {@code selector} contains the text from {@code value}. The
 * {@code url} is used only for caching purposes, to avoid parsing multiple times the markup returned for the same resource.
 *
 * @param url      the url that identifies the markup
 * @param markup   the markup/* w  w w.  j a va  2 s.com*/
 * @param selector the selector used for retrieval
 * @param value    the text that should exist in the markup
 * @return {@code true} if the {@code value} was found in the markup, {@code false} otherwise
 */
public static boolean contains(String url, String markup, String selector, String value) {
    ensureMarkup(url, markup);
    Document document = documents.get(url);
    Elements elements = document.select(selector);
    return elements.outerHtml().contains(value);
}

From source file:io.sightly.tck.html.HTMLExtractor.java

/**
 * Checks if the {@code selector} identifies an element from the {@code markup}. The {@code url} is used only for caching purposes,
 * to avoid parsing multiple times the markup returned for the same resource.
 *
 * @param url      the url that identifies the markup
 * @param markup   the markup/*ww  w .  ja v  a 2 s . c  om*/
 * @param selector the selector used for retrieval
 * @return {@code true} if the element identified by the {@code selector} exists, {@code false} otherwise
 */
public static boolean exists(String url, String markup, String selector) {
    ensureMarkup(url, markup);
    Document document = documents.get(url);
    Elements elements = document.select(selector);
    return elements.size() > 0;
}

From source file:io.sightly.tck.html.HTMLExtractor.java

/**
 * Tells whether the first element matched by the {@code selector} has exactly {@code howMany}
 * child elements.
 *
 * @param url      the url that identifies the markup
 * @param markup   the markup
 * @param selector the selector used for retrieval
 * @param howMany  the number of expected children
 * @return {@code true} if an element was matched and its number of children equals {@code howMany},
 *         {@code false} otherwise (including when nothing matches the selector)
 */
public static boolean hasChildren(String url, String markup, String selector, int howMany) {
    ensureMarkup(url, markup);
    Element firstMatch = documents.get(url).select(selector).first();
    return firstMatch != null && firstMatch.children().size() == howMany;
}

From source file:controllers.BIProxy.java

/**
 * Forwards a search to {@code http://www.businessinsider.com/s} and returns the scraped hits as JSON.
 *
 * <p>The response body is parsed with jsoup. Each {@code div.search-result} element yields one map
 * with the keys {@code image}, {@code title}, {@code content}, {@code date} and {@code url}, in that
 * order. A result without a {@code div.excerpt} element gets an empty {@code content}.
 *
 * @param query the search term, sent as the {@code q} query parameter
 * @return a promise of an OK result carrying the JSON list of hits, or, when
 *         {@code StringUtils.isEmpty(query)} holds, a promise of an OK result carrying a JSON
 *         message saying that the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped, so the remote
        // search was still executed with an empty query.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.businessinsider.com/s")
            .setQueryParameter("q", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("div.search-result");

                // One map per search hit; LinkedHashMap keeps the key order stable in the JSON.
                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // first() returns null when nothing matches, so guard before reading the text.
                    Element excerpt = item.select("div.excerpt").first();

                    keyValue.put("image", item.select("img").attr("src"));
                    keyValue.put("title", item.select("h3").text());
                    keyValue.put("content", excerpt == null ? "" : excerpt.text());
                    keyValue.put("date", item.select("li.date").text());
                    keyValue.put("url", item.select("a").attr("href"));

                    results.add(keyValue);
                }

            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:io.sightly.tck.html.HTMLExtractor.java

/**
 * Checks the attribute {@code attributeName} on the elements matched by the {@code selector},
 * depending on the value of the {@code exists} flag. Additionally, the attribute's value can be
 * checked against {@code attributeValue}.
 *
 * @param url            the url that identifies the markup
 * @param markup         the markup
 * @param selector       the selector used for retrieval
 * @param exists         flag that defines if the attribute is expected to exist or not
 * @param attributeName  the attribute's name
 * @param attributeValue the attribute's value; only compared when {@code exists} is {@code true}
 *                       and the value is not empty
 * @return {@code false} if the selector matches nothing. With {@code exists == true}: {@code true}
 *         if the attribute is present and, when {@code attributeValue} is not empty, its value
 *         equals {@code attributeValue}. With {@code exists == false}: whether the attribute is
 *         present on the matched elements (see the review note in the body).
 */
public static boolean hasAttribute(String url, String markup, String selector, boolean exists,
        String attributeName, String attributeValue) {
    ensureMarkup(url, markup);
    Document document = documents.get(url);
    Elements elements = document.select(selector);
    if (elements.isEmpty()) {
        return false;
    }
    if (!exists) {
        // NOTE(review): this reports presence even though the caller expects absence, which looks
        // inverted relative to the Javadoc. Kept as-is because callers are not visible here and
        // may negate the result themselves; confirm before changing.
        return elements.hasAttr(attributeName);
    }
    // The attribute has to be present; previously a missing attribute passed whenever no
    // expected value was supplied.
    if (!elements.hasAttr(attributeName)) {
        return false;
    }
    if (StringUtils.isNotEmpty(attributeValue)) {
        return attributeValue.equals(elements.attr(attributeName));
    }
    return true;
}

From source file:controllers.FRBProxy.java

/**
 * Forwards a search to {@code http://www.forbes.com/search/} and returns the scraped hits as JSON.
 *
 * <p>The response body is parsed with jsoup. Each {@code li.edittools-contentitem} element yields
 * one map with the keys {@code title}, {@code content}, {@code date} and {@code url}, preceded by
 * {@code image} when the item carries the {@code gallery} class. An item without a {@code <p>}
 * element gets an empty {@code content}.
 *
 * @param query the search term, sent as the {@code q} query parameter
 * @return a promise of an OK result carrying the JSON list of hits, or, when
 *         {@code StringUtils.isEmpty(query)} holds, a promise of an OK result carrying a JSON
 *         message saying that the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped, so the remote
        // search was still executed with an empty query.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.forbes.com/search/")
            .setQueryParameter("q", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("li.edittools-contentitem"); // All articles belong to this class

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // Only items with the "gallery" class get an image entry.
                    if (item.hasClass("gallery")) {
                        keyValue.put("image", item.select("img").attr("src"));
                    }

                    Elements headlineLink = item.select("h2").select("a");
                    // first() returns null when nothing matches, so guard before reading the text.
                    Element firstParagraph = item.select("p").first();

                    keyValue.put("title", headlineLink.text());
                    keyValue.put("content", firstParagraph == null ? "" : firstParagraph.ownText());
                    keyValue.put("date", item.select("time").text());
                    keyValue.put("url", headlineLink.attr("href"));

                    results.add(keyValue);
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:controllers.KWProxy.java

/**
 * Forwards a search to {@code http://knowledge.wharton.upenn.edu/} and returns the scraped hits as
 * JSON.
 *
 * <p>The response body is parsed with jsoup. Each {@code div.article.type-article.status-publish}
 * element yields one map with the keys {@code title}, {@code content}, {@code date} and
 * {@code url}, preceded by {@code image} when the item carries the {@code has-post-thumbnail}
 * class. A missing summary or date element results in an empty string for that key.
 *
 * @param query the search term, sent as the {@code s} query parameter
 * @return a promise of an OK result carrying the JSON list of hits, or, when
 *         {@code StringUtils.isEmpty(query)} holds, a promise of an OK result carrying a JSON
 *         message saying that the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped, so the remote
        // search was still executed with an empty query.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    F.Promise<WSResponse> wsResponsePromise = WS.url("http://knowledge.wharton.upenn.edu/")
            .setQueryParameter("s", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                // All articles carry these classes
                Elements items = doc.select("div.article.type-article.status-publish");

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // Only items with the "has-post-thumbnail" class get an image entry.
                    if (item.hasClass("has-post-thumbnail")) {
                        keyValue.put("image", item.select("img").attr("src"));
                    }

                    Elements headlineLink = item.select("h2").select("a");
                    // first() returns null when nothing matches, so guard before reading the text.
                    Element summary = item.select("div.attribute.categorythumbs").first();
                    Element firstDate = item.select("ul.datestamp").select("li").first();

                    keyValue.put("title", headlineLink.text());
                    keyValue.put("content", summary == null ? "" : summary.text());
                    keyValue.put("date", firstDate == null ? "" : firstDate.text());
                    keyValue.put("url", headlineLink.attr("href"));

                    results.add(keyValue);
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:controllers.NWProxy.java

/**
 * Forwards a search to {@code http://www.newsweek.com/search/site/<query>} and returns the scraped
 * hits as JSON.
 *
 * <p>The response body is parsed with jsoup. Each {@code li.search-result} element yields one map
 * with the keys {@code image}, {@code title}, {@code content}, {@code date} and {@code url}, in
 * that order. The date is not part of the result list: it is read from {@code span.timedate} of
 * the article page, which is loaded through {@code RedirectionHandler} once per hit. A hit without
 * a {@code div.article-summary} element gets an empty {@code content}.
 *
 * @param query the search term, appended to the search URL path
 * @return a promise of an OK result carrying the JSON list of hits, or, when
 *         {@code StringUtils.isEmpty(query)} holds, a promise of an OK result carrying a JSON
 *         message saying that the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped, so the remote
        // search was still executed with an empty query.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    final String officialUrl = "http://www.newsweek.com";

    // NOTE(review): query is concatenated into the path as-is; confirm whether WS.url encodes it,
    // otherwise characters such as '/', '?' or spaces change the requested URL.
    F.Promise<WSResponse> wsResponsePromise = WS.url(officialUrl + "/search/site/" + query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("li.search-result"); // All articles belong to this class

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // The href is site-relative; build the absolute article URL once and reuse it.
                    String articleUrl = officialUrl + item.select("a").attr("href");
                    // first() returns null when nothing matches, so guard before reading the text.
                    Element summary = item.select("div.article-summary").first();

                    keyValue.put("image", item.select("img").attr("src"));
                    keyValue.put("title", item.select("h2").select("a").text());
                    keyValue.put("content", summary == null ? "" : summary.text());

                    // Get date from each article separately
                    org.jsoup.nodes.Document articleDoc = RedirectionHandler(articleUrl);

                    keyValue.put("date", articleDoc.select("span.timedate").text());
                    keyValue.put("url", articleUrl);

                    results.add(keyValue);
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}