Example usage for org.jsoup.select Elements select

Introduction

On this page you can find example usages of the select method of org.jsoup.select.Elements.

Prototype

public Elements select(String query)

Source Link

Documentation

Find matching elements within this element list.

Usage

From source file:gpxparser.GpxParser.java

/**
 * Parses a fixed GPX file with jsoup and copies the track name and every track point
 * (latitude, longitude, elevation) into a {@link Track}.
 *
 * @param args the command line arguments (not read by this method)
 */
public static void main(String[] args) {
    File gpxFile = new File("/home/yonseca/4.gpx");
    Track track = new Track();
    try {
        Document gpx = Jsoup.parse(gpxFile, "UTF-8");
        Elements tracks = gpx.getElementsByTag("trk");

        // The track name is the text of the <name> element(s) found under <trk>.
        track.setName(tracks.select("name").text());

        // Each <trkpt> under a <trkseg> contributes one point: lat/lon come from
        // attributes, the altitude from the text of the nested <ele> element.
        Elements points = tracks.select("trkseg").select("trkpt");
        for (Element point : points) {
            Double lat = NumberUtils.toDouble(point.attr("lat"));
            Double lon = NumberUtils.toDouble(point.attr("lon"));
            Double altitude = NumberUtils.toDouble(point.select("ele").text());
            track.addPoint(lat, lon, altitude);
        }
        System.out.println("");

    } catch (IOException ex) {
        Logger.getLogger(GpxParser.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:Main.java

/**
 * Collects the trimmed text of every element selected by {@code query} that has text,
 * one entry per line.
 *
 * @param elements the elements to run the selector against
 * @param query    the selector query passed to {@code elements.select}
 * @return the collected texts joined with {@code "\n"}, with leading and trailing
 *         whitespace of the whole result trimmed; empty if nothing had text
 */
public static String getScrapeText(Elements elements, String query) {
    // A StringBuilder avoids re-copying the accumulated text on every iteration.
    StringBuilder collected = new StringBuilder();
    for (Element element : elements.select(query)) {
        if (element.hasText()) {
            collected.append(element.text().trim()).append('\n');
        }
    }
    return collected.toString().trim();
}

From source file:com.cbmapi.CbmAPI.java

/**
 * Searches passmark.com for the given CPU name and returns the link to its
 * cpubenchmark.net page.
 *
 * @param cpuName the CPU name to search for; it is passed through {@code encodeToUrl}
 *                before being placed in the query string
 * @return the {@code href} taken from the matching result link, or a human-readable
 *         message when the connection fails or no matching link is found
 */
public static String searchCpuByName(String cpuName) {
    String encodedName = encodeToUrl(cpuName);
    Document html;
    try {
        //Connects to zoom's search engine and looks for given cpu from benchmarks section.
        html = Jsoup.connect("https://www.passmark.com/search/zoomsearch.php?zoom_sort=0&zoom_query="
                + encodedName + "&zoom_cat%5B%5D=5").get();
    } catch (IOException e) {
        System.out.println("Connection throws an exception: " + e);
        //message for connection issues.
        return "Connection to the search engine failed.";
    }

    //Regex check is used to validate correct search result. The dots in the host and file
    //name are escaped so they only match a literal '.', not any character.
    String url = html.select("div.results")
            .select("a[href~=^(https?:\\/\\/www\\.cpubenchmark\\.net/cpu\\.php\\?)]")
            .attr("href");
    if (url.isEmpty()) {
        return "No results found for: " + cpuName;
    }
    return url;
}

From source file:app.data.parse.WebPageUtil.java

/**
 * Fetches a web page and extracts its final URL, title and a short description.
 *
 * <p>The title is taken from the {@code og:title} or {@code twitter:title} meta tag, falling
 * back to the document title. The description is taken from the {@code og:description},
 * {@code twitter:description} or {@code description} meta tag, falling back to paragraph
 * text from the body, and is then passed through {@code ellipsis(..., 140, "...")}.
 *
 * @param url          the page address to fetch
 * @param urlInfoCache optional cache keyed by the (possibly rewritten) request URL; may be
 *                     {@code null}, in which case nothing is looked up or stored
 * @return the cached entry when present, otherwise a freshly built {@link WebPageInfo}
 * @throws IOException declared by this method; presumably raised by {@code requestUrl} — TODO confirm
 */
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException {
    // hit toutiao.io
    // fixme http://toutiao.io/shares/640539/url
    String requestKey = url;
    if (requestKey.startsWith("https://toutiao.io/posts/")) {
        requestKey = requestKey.replace("/posts/", "/k/");
    }

    // check cache
    if (urlInfoCache != null) {
        WebPageInfo cached = urlInfoCache.getIfPresent(requestKey);
        if (cached != null) {
            return cached;
        }
    }

    WebPageInfo info = new WebPageInfo();
    info.url = requestKey;

    // attach url
    Document doc = requestUrl(info.url);
    info.url = doc.baseUri(); // or doc.location()

    // hit gold.xitu.io: follow the share link on the entry page and load that page instead
    if (info.url.startsWith("http://gold.xitu.io/entry/")) {
        info.url = doc.select("div[class=ellipsis]").select("a[class=share-link]").attr("href");

        // reconnect
        doc = requestUrl(info.url);
        info.url = doc.baseUri(); // or doc.location()
    }

    info.url = smartUri(info.url);

    // get title
    String title = firstMetaContent(doc, "meta[property=og:title]", "meta[property=twitter:title]");
    info.title = StringUtils.isEmpty(title) ? doc.title() : title;

    // get desc
    String description = firstMetaContent(doc, "meta[property=og:description]",
            "meta[property=twitter:description]", "meta[name=description]");
    if (StringUtils.isEmpty(description)) {
        // Fall back to body paragraphs: stop at the first one with at least 20 characters;
        // if none is that long, the text of the last paragraph is what remains.
        for (Element paragraph : doc.body().select("p")) {
            description = paragraph.text();
            if (description != null && description.length() >= 20) {
                break;
            }
        }
    }
    info.description = ellipsis(description, 140, "...");

    // cache info
    if (urlInfoCache != null) {
        urlInfoCache.put(requestKey, info);
    }
    return info;
}

/**
 * Tries each selector in order and returns the {@code content} attribute of the first one
 * that yields a non-empty value; later selectors are not evaluated once a value is found.
 * When every selector yields an empty value, the value from the last selector is returned.
 */
private static String firstMetaContent(Document doc, String... cssQueries) {
    String content = "";
    for (String cssQuery : cssQueries) {
        content = doc.select(cssQuery).attr("content");
        if (!StringUtils.isEmpty(content)) {
            break;
        }
    }
    return content;
}

From source file:damo.three.ie.util.HtmlUtilities.java

/**
 * Parses the My3 account usage page to nicer JSON format.
 *
 * @param pageContent Page content as HTML.
 * @return Usage information stripped out and formatted as JSON.
 * @throws JSONException/*from   w  ww.  j a  v  a 2s .c om*/
 */
public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException {
    // The HTML on prepay is pig-ugly, so we will use JSoup to
    // clean and parse it.
    Document doc = Jsoup.parse(pageContent);
    HtmlUtilities.removeComments(doc);

    Elements elements = doc.getElementsByTag("table");

    JSONArray jsonArray = new JSONArray();

    // three don't have a sub label for the 3-to-3 calls, which is not consistent with other items.
    // .. feck them!
    boolean three2threeCallsBug = false;

    for (Element element : elements) {

        for (Element subelement : element.select("tbody > tr")) {

            if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) {
                three2threeCallsBug = true;
            }

            Elements subsubelements = subelement.select("td");

            if (subsubelements.size() == 3) {

                // skip the "total" entries
                if (subsubelements.select("td").get(0).text().contains("Total")) {
                    continue;
                }

                JSONObject currentItem = new JSONObject();

                if (three2threeCallsBug) {
                    currentItem.put("item", "3 to 3 Calls");
                } else {
                    // Get rid of that "non-breaking space" character if it exists
                    String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "")
                            .trim();
                    currentItem.put("item", titleToClean);
                }

                /**
                 * Check if date contains "Today", if so, change it to a date.
                 * Otherwise we will never know when usage ends, unless user refreshes, As 'today'
                 * is 'today', tomorrow.. see!
                 */
                String value1 = subsubelements.select("td").get(1).text();
                if (value1.equals("Today")) {
                    DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK);
                    DateTime dt = new DateTime(); // current datetime
                    value1 = "Expires " + formatter.print(dt);
                }
                currentItem.put("value1", value1);
                currentItem.put("value2", subsubelements.select("td").get(2).text());

                // Out of Bundle charges have an extra property
                if (currentItem.getString("item").startsWith("Internet")) {

                    Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL);
                    Matcher m1 = p1.matcher(pageContent);

                    StringBuilder cleanedDate = new StringBuilder();
                    if (m1.matches()) {
                        cleanedDate.append(m1.group(1));
                        cleanedDate.append(' ');
                        cleanedDate.append(m1.group(2));
                        cleanedDate.append(' ');
                        cleanedDate.append(m1.group(3));
                        currentItem.put("value3", cleanedDate.toString());
                    }

                }
                jsonArray.put(currentItem);
            }

        }

        // reset the 3-to-3 call bug flag for next Element
        if (three2threeCallsBug) {
            three2threeCallsBug = false;
        }
    }

    return jsonArray;
}

From source file:com.greenpepper.maven.plugin.SpecificationRunnerMojo.java

/**
 * Fills in a specification's missing link from the {@code external-link} meta tag of its
 * HTML result output and, when one is found, dumps the repository index.
 *
 * <p>Nothing happens when the specification already has a non-blank link or when
 * {@code htmlString} is blank.
 *
 * @param specification   name used to look the specification up in the index
 * @param htmlString      HTML result output to search for the meta tag
 * @param repositoryIndex index holding the specification info; dumped after an update
 * @throws IOException declared by this method; presumably raised by {@code repositoryIndex.dump()} — TODO confirm
 */
public static void recoverLinkInResult(String specification, String htmlString, RepositoryIndex repositoryIndex)
        throws IOException {
    RepositoryIndex.SpecificationInfo specificationInfo = repositoryIndex.getNameToInfo().get(specification);
    if (!isBlank(specificationInfo.getLink()) || !isNotBlank(htmlString)) {
        return;
    }

    LOGGER.trace("got new missing link in index for '{}'. trying to find it in the result output",
            specification);
    org.jsoup.nodes.Document resultOutput = Jsoup.parse(htmlString);
    String recoveredLink = resultOutput.head()
            .getElementsByTag("meta")
            .select("[name=\"external-link\"]")
            .attr("content");
    if (!isNotBlank(recoveredLink)) {
        return;
    }

    LOGGER.trace("Found {}", recoveredLink);
    specificationInfo.setLink(recoveredLink);
    repositoryIndex.dump();
}

From source file:dev.maisentito.suca.commands.EnitCommandHandler.java

/**
 * Looks the space-joined arguments up on the wordreference.com "enit" page and responds
 * with the text of the selected result row, with tooltip elements removed.
 *
 * @param event the message event to respond to
 * @param args  the words to look up; joined with a single space to form the URL path
 * @throws Throwable declared by this method; any failure while fetching or responding propagates
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    String lookupUrl = "http://www.wordreference.com/enit/" + StringUtils.join(args, ' ');
    Document page = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    Elements resultRow = page.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Drop tooltip markup so its text does not end up in the reply.
    resultRow.select(".tooltip").remove();

    String definition = resultRow.text().trim().replace("\n", "");
    event.respond(definition);
}

From source file:dev.maisentito.suca.commands.ItenCommandHandler.java

/**
 * Looks the space-joined arguments up on the wordreference.com "iten" page and responds
 * with the text of the selected result row, with tooltip elements removed.
 *
 * @param event the message event to respond to
 * @param args  the words to look up; joined with a single space to form the URL path
 * @throws Throwable declared by this method; any failure while fetching or responding propagates
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    String lookupUrl = "http://www.wordreference.com/iten/" + StringUtils.join(args, ' ');
    Document page = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    Elements resultRow = page.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Drop tooltip markup so its text does not end up in the reply.
    resultRow.select(".tooltip").remove();

    String definition = resultRow.text().trim().replace("\n", "");
    event.respond(definition);
}

From source file:com.manisha.allmybooksarepacked.service.BookParser.java

/**
 * Returns the trimmed inner HTML of the elements selected by {@code PathMapping.TITLE},
 * after removing any {@code span} elements from them.
 *
 * <p>Note: the {@code span} elements are removed from {@code doc} itself.
 */
private String findTitle() {
    Elements titleNodes = doc.select(PathMapping.TITLE);
    // Strip nested <span> elements before reading the remaining HTML.
    titleNodes.select("span").remove();
    String titleHtml = titleNodes.html();
    return titleHtml.trim();
}

From source file:com.manisha.allmybooksarepacked.service.BookParser.java

/**
 * Returns the trimmed inner HTML of the elements selected by {@code PathMapping.ISBN_10},
 * after removing any {@code b} elements from them.
 *
 * <p>Note: the {@code b} elements are removed from {@code doc} itself.
 */
private String findIsbn10() {
    Elements isbnNodes = doc.select(PathMapping.ISBN_10);
    // Strip nested <b> elements before reading the remaining HTML.
    isbnNodes.select("b").remove();
    String isbnHtml = isbnNodes.html();
    return isbnHtml.trim();
}