List of usage examples for org.jsoup.select Elements select
public Elements select(String query)
From source file:gpxparser.GpxParser.java
/** * @param args the command line arguments *//* w w w. j a v a 2 s . co m*/ public static void main(String[] args) { File input = new File("/home/yonseca/4.gpx"); Track track = new Track(); try { Document doc = Jsoup.parse(input, "UTF-8"); //System.out.println(doc.text()); Elements trackData = doc.getElementsByTag("trk"); Elements trackName = trackData.select("name"); track.setName(trackName.text()); Elements trkPt = trackData.select("trkseg").select("trkpt"); for (Iterator<Element> iterator = trkPt.iterator(); iterator.hasNext();) { Element dataPoint = iterator.next(); Double lat = NumberUtils.toDouble(dataPoint.attr("lat")); Double lon = NumberUtils.toDouble(dataPoint.attr("lon")); Double altitude = NumberUtils.toDouble(dataPoint.select("ele").text()); track.addPoint(lat, lon, altitude); } System.out.println(""); } catch (IOException ex) { Logger.getLogger(GpxParser.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:Main.java
/**
 * Collects the text of every element matching {@code query} inside
 * {@code elements}, one element per line.
 *
 * <p>Elements without text are skipped. Each element's text is trimmed
 * before it is appended, and the final result is trimmed as well, so the
 * returned string has no leading or trailing whitespace.
 *
 * @param elements the elements to search in
 * @param query    the selector query passed to {@code Elements.select}
 * @return the newline-separated texts, or an empty string when nothing matched
 */
public static String getScrapeText(Elements elements, String query) {
    // StringBuilder avoids re-copying the accumulated text on every match.
    StringBuilder result = new StringBuilder();
    for (Element element : elements.select(query)) {
        if (element.hasText()) {
            result.append(element.text().trim()).append('\n');
        }
    }
    return result.toString().trim();
}
From source file:com.cbmapi.CbmAPI.java
public static String searchCpuByName(String cpuName) { String encodedName = encodeToUrl(cpuName); Document html = null;//from w w w . j av a 2 s .c om String url = null; try { //Connects to zoom's search engine and looks for given cpu from benhmarks section. html = Jsoup.connect("https://www.passmark.com/search/zoomsearch.php?zoom_sort=0&zoom_query=" + encodedName + "&zoom_cat%5B%5D=5").get(); } catch (IOException e) { System.out.println("Connection throws an exception: " + e); } //Regex check is used to validate correct search result. if (html != null) { Elements links = html.select("div.results"); links = links.select("a[href~=^(https?:\\/\\/www.cpubenchmark.net/cpu.php\\?)]"); url = links.attr("href"); if (url.isEmpty()) { return "No results found for: " + cpuName; } } //message for connection issues. else { return "Connection to the search engine failed."; } return url; }
From source file:app.data.parse.WebPageUtil.java
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException { String original = url;//from w ww .j a va 2s . co m // hit toutiao.io // fixme http://toutiao.io/shares/640539/url if (original.startsWith("https://toutiao.io/posts/")) { original = original.replace("/posts/", "/k/"); } // check cache WebPageInfo info = urlInfoCache != null ? urlInfoCache.getIfPresent(original) : null; if (info != null) { return info; } else { info = new WebPageInfo(); info.url = original; } // attach url Document doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() // hit gold.xitu.io if (info.url.startsWith("http://gold.xitu.io/entry/")) { Elements origin = doc.select("div[class=ellipsis]"); Elements originLink = origin.select("a[class=share-link]"); info.url = originLink.attr("href"); // reconnect doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() } info.url = smartUri(info.url); // get title Elements metaTitle = doc.select("meta[property=og:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } if (StringUtils.isEmpty(info.title)) { metaTitle = doc.select("meta[property=twitter:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } info.title = StringUtils.isEmpty(info.title) ? 
doc.title() : info.title; } // get desc Elements metaDesc = doc.select("meta[property=og:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[property=twitter:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[name=description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.body().select("p"); if (metaDesc != null) { for (Element element : metaDesc) { info.description = element.text(); if (info.description != null && info.description.length() >= 20) { break; } } } } } } info.description = ellipsis(info.description, 140, "..."); // cache info if (urlInfoCache != null) { urlInfoCache.put(original, info); } return info; }
From source file:damo.three.ie.util.HtmlUtilities.java
/** * Parses the My3 account usage page to nicer JSON format. * * @param pageContent Page content as HTML. * @return Usage information stripped out and formatted as JSON. * @throws JSONException/*from w ww. j a v a 2s .c om*/ */ public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException { // The HTML on prepay is pig-ugly, so we will use JSoup to // clean and parse it. Document doc = Jsoup.parse(pageContent); HtmlUtilities.removeComments(doc); Elements elements = doc.getElementsByTag("table"); JSONArray jsonArray = new JSONArray(); // three don't have a sub label for the 3-to-3 calls, which is not consistent with other items. // .. feck them! boolean three2threeCallsBug = false; for (Element element : elements) { for (Element subelement : element.select("tbody > tr")) { if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) { three2threeCallsBug = true; } Elements subsubelements = subelement.select("td"); if (subsubelements.size() == 3) { // skip the "total" entries if (subsubelements.select("td").get(0).text().contains("Total")) { continue; } JSONObject currentItem = new JSONObject(); if (three2threeCallsBug) { currentItem.put("item", "3 to 3 Calls"); } else { // Get rid of that "non-breaking space" character if it exists String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "") .trim(); currentItem.put("item", titleToClean); } /** * Check if date contains "Today", if so, change it to a date. * Otherwise we will never know when usage ends, unless user refreshes, As 'today' * is 'today', tomorrow.. see! 
*/ String value1 = subsubelements.select("td").get(1).text(); if (value1.equals("Today")) { DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK); DateTime dt = new DateTime(); // current datetime value1 = "Expires " + formatter.print(dt); } currentItem.put("value1", value1); currentItem.put("value2", subsubelements.select("td").get(2).text()); // Out of Bundle charges have an extra property if (currentItem.getString("item").startsWith("Internet")) { Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL); Matcher m1 = p1.matcher(pageContent); StringBuilder cleanedDate = new StringBuilder(); if (m1.matches()) { cleanedDate.append(m1.group(1)); cleanedDate.append(' '); cleanedDate.append(m1.group(2)); cleanedDate.append(' '); cleanedDate.append(m1.group(3)); currentItem.put("value3", cleanedDate.toString()); } } jsonArray.put(currentItem); } } // reset the 3-to-3 call bug flag for next Element if (three2threeCallsBug) { three2threeCallsBug = false; } } return jsonArray; }
From source file:com.greenpepper.maven.plugin.SpecificationRunnerMojo.java
public static void recoverLinkInResult(String specification, String htmlString, RepositoryIndex repositoryIndex) throws IOException { RepositoryIndex.SpecificationInfo specificationInfo = repositoryIndex.getNameToInfo().get(specification); if (isBlank(specificationInfo.getLink()) && isNotBlank(htmlString)) { LOGGER.trace("got new missing link in index for '{}'. trying to find it in the result output", specification);// w w w . j a va 2 s . co m org.jsoup.nodes.Document resultOutput = Jsoup.parse(htmlString); Elements metaTags = resultOutput.head().getElementsByTag("meta"); String link = metaTags.select("[name=\"external-link\"]").attr("content"); if (isNotBlank(link)) { LOGGER.trace("Found {}", link); specificationInfo.setLink(link); repositoryIndex.dump(); } } }
From source file:dev.maisentito.suca.commands.EnitCommandHandler.java
/**
 * Looks up the joined command arguments on WordReference (English to
 * Italian) and responds with the text of the first result row, with
 * tooltip elements removed.
 *
 * @param event the message event to respond to
 * @param args  the words to look up; joined with single spaces
 * @throws Throwable any failure while fetching or processing the page
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    String lookupUrl = "http://www.wordreference.com/enit/" + StringUtils.join(args, ' ');
    Document doc = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    Elements firstResultRow = doc.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Drop tooltip markup so it does not end up in the response text.
    firstResultRow.select(".tooltip").remove();

    String rowText = firstResultRow.text();
    event.respond(rowText.trim().replace("\n", ""));
}
From source file:dev.maisentito.suca.commands.ItenCommandHandler.java
/**
 * Looks up the joined command arguments on WordReference (Italian to
 * English) and responds with the text of the first result row, with
 * tooltip elements removed.
 *
 * @param event the message event to respond to
 * @param args  the words to look up; joined with single spaces
 * @throws Throwable any failure while fetching or processing the page
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    String lookupUrl = "http://www.wordreference.com/iten/" + StringUtils.join(args, ' ');
    Document doc = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    Elements firstResultRow = doc.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Drop tooltip markup so it does not end up in the response text.
    firstResultRow.select(".tooltip").remove();

    String rowText = firstResultRow.text();
    event.respond(rowText.trim().replace("\n", ""));
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the title markup from the document.
 *
 * @return the trimmed inner HTML of the elements selected by
 *         {@code PathMapping.TITLE}, after their {@code span} elements
 *         have been removed
 */
private String findTitle() {
    Elements titleNodes = doc.select(PathMapping.TITLE);
    // Remove the spans from the selected nodes before reading the markup.
    titleNodes.select("span").remove();
    String markup = titleNodes.html();
    return markup.trim();
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the ISBN-10 markup from the document.
 *
 * @return the trimmed inner HTML of the elements selected by
 *         {@code PathMapping.ISBN_10}, after their {@code b} elements
 *         have been removed
 */
private String findIsbn10() {
    Elements isbnNodes = doc.select(PathMapping.ISBN_10);
    // Remove the bold elements from the selected nodes before reading the markup.
    isbnNodes.select("b").remove();
    String markup = isbnNodes.html();
    return markup.trim();
}