List of usage examples for org.jsoup.nodes.Document.body()
public Element body()
From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSPlatformStatusHtmlParser.java
/** * * @param is/* w ww. j a v a 2s . c o m*/ */ @Override public void getPlatformStatusData(InputStream is) { try { Document doc = DataUtil.load(is, "UTF-8", ""); Element body = doc.body(); // most of the target items are sandwitched by <b> tag // this can be used to reach each target item. String tmpCurrentTime = null; String tmpUpTime = null; String currentTime = null; Elements tags = body.getElementsByTag("b"); for (Element tag : tags) { // get the current-time string: for 1.52.3 or older daemons // this is the ony place to get it. String tagText = tag.text(); logger.log(Level.FINE, "working on tagText={0}", tagText); if (tagText.equals("Daemon Status")) { // find current time and up running currentTime = tag.parent().parent().text(); logger.log(Level.INFO, "currentTime text=[{0}]", currentTime); // "currentTime =Daemon Status lockss.statelib.lib.in.us (usdocspln group) 01:25:55 03/01/12, up 7d5h21m" tmstmpMatcher = currentTimeStampPattern.matcher(currentTime); if (tmstmpMatcher.find()) { logger.log(Level.INFO, "group 0={0}", tmstmpMatcher.group(0)); tmpCurrentTime = tmstmpMatcher.group(1); logger.log(Level.INFO, "Current Time:group 1={0}", tmpCurrentTime); tmpUpTime = tmstmpMatcher.group(2); logger.log(Level.INFO, "UpTime:group 2={0}", tmpUpTime); } } // get the remaining key-value sets if (fieldNameSet.contains(tagText)) { Element parent = tag.parent(); String fieldValue = parent.nextElementSibling().text(); logger.log(Level.FINE, "{0}={1}", new Object[] { tagText, fieldValue }); summaryInfoMap.put(tagText, fieldValue); } } // extract the daemon version and platform info that are located // at the bottom // these data are sandwitched by a <center> tag Elements ctags = body.getElementsByTag("center"); String version = null; String platform = null; for (Element ctag : ctags) { String cText = ctag.text(); logger.log(Level.FINE, "center tag Text={0}", cText); // cText is like this: // Daemon 1.53.3 built 28-Jan-12 01:06:36 on build7.lockss.org, Linux RPM 1 if 
(StringUtils.isNotBlank(cText) && ctag.child(0).nodeName().equals("font")) { String[] versionPlatform = cText.split(", "); if (versionPlatform.length == 2) { logger.log(Level.INFO, "daemon version={0};platform={1}", versionPlatform); version = DaemonStatusDataUtil.getDaemonVersion(versionPlatform[0]); platform = versionPlatform[1]; } else { // the above regex failed logger.log(Level.WARNING, "String-formatting differs; use pattern matching"); version = DaemonStatusDataUtil.getDaemonVersion(cText); int platformOffset = cText.lastIndexOf(", ") + 2; platform = cText.substring(platformOffset); logger.log(Level.INFO, "platform={0}", platform); } } } if (summaryInfoMap.containsKey("V3 Identity")) { String ipAddress = DaemonStatusDataUtil.getPeerIpAddress(summaryInfoMap.get("V3 Identity")); logger.log(Level.INFO, "ipAddress={0}", ipAddress); if (StringUtils.isNotBlank(ipAddress)) { boxInfoMap.put("host", ipAddress); if (!ipAddress.equals(summaryInfoMap.get("IP Address"))) { summaryInfoMap.put("IP Address", ipAddress); } } else { logger.log(Level.WARNING, "host token is blank or null: use IP Address instead"); logger.log(Level.INFO, "IP Address={0}", summaryInfoMap.get("IP Address")); boxInfoMap.put("host", summaryInfoMap.get("IP Address")); } } // for pre-1.53.3 versions boxInfoMap.put("time", tmpCurrentTime); if (!summaryInfoMap.containsKey("Current Time")) { summaryInfoMap.put("Current Time", tmpCurrentTime); } boxInfoMap.put("up", tmpUpTime); if (!summaryInfoMap.containsKey("Uptime")) { summaryInfoMap.put("Uptime", tmpUpTime); } boxInfoMap.put("version", version); if (!summaryInfoMap.containsKey("Daemon Version")) { summaryInfoMap.put("Daemon Version", version); } boxInfoMap.put("platform", platform); if (!summaryInfoMap.containsKey("Platform")) { summaryInfoMap.put("Platform", platform); } } catch (IOException ex) { logger.log(Level.SEVERE, "IO error", ex); } logger.log(Level.INFO, "boxInfoMap={0}", boxInfoMap); logger.log(Level.INFO, "summaryInfo={0}", 
summaryInfoMap); }
From source file:de.dlopes.stocks.facilitator.services.impl.FinanznachrichtenOrderbuchExtractorImpl.java
@Override public List<String> getFinanceData(String url, FinanceDataType dataType) { List<String> list = new ArrayList<String>(); try {//www .j a va 2s . co m Document doc = null; if (url.startsWith("file://")) { File input = new File(url.replaceFirst("file://", "")); doc = Jsoup.parse(input, "UTF-8"); } else { URL input = new URL(url); doc = Jsoup.parse(input, 30000); } Elements elements = doc.body().select("span[id^=productid] > span"); for (Element e : elements) { String text = e.text(); // Guard: move on when the text is empty if (StringUtils.isEmpty(text)) { continue; } text = StringUtils.trimAllWhitespace(text); // Guard: move on when the text does not contain the ISIN or WKN if (!text.startsWith(dataType.name() + ":")) { continue; } text = text.replace(dataType.name() + ":", ""); list.add(text); } } catch (IOException e) { e.printStackTrace(); } return list; }
From source file:com.money.manager.ex.investment.morningstar.MorningstarPriceUpdater.java
/** * Parse Morningstar response into price information. * @param symbol Morningstar symbol/*from w w w. j a va2 s . c o m*/ * @param html Result * @return An object containing price details */ private PriceDownloadedEvent parse(String symbol, String html) { Document doc = Jsoup.parse(html); // symbol String yahooSymbol = symbolConverter.getYahooSymbol(symbol); // price String priceString = doc.body().getElementById("last-price-value").text(); if (TextUtils.isEmpty(priceString)) { throw new RuntimeException("No price available for " + symbol); } Money price = MoneyFactory.fromString(priceString); // currency String currency = doc.body().getElementById("curency").text(); if (currency.equals("GBX")) { price = price.divide(100, MoneyFactory.MAX_ALLOWED_PRECISION); } // date String dateString = doc.body().getElementById("asOfDate").text(); String dateFormat = "MM/dd/yyyy HH:mm:ss"; // DateTimeFormatter formatter = DateTimeFormat.forPattern(dateFormat); // the time zone is EST // DateTime date = formatter.withZone(DateTimeZone.forID("America/New_York")) // .parseDateTime(dateString) // .withZone(DateTimeZone.forID("Europe/Vienna")); // convert time zone MmxDate dateTime = new MmxDate(dateString, dateFormat).setTimeZone("America/New_York") .inTimeZone("Europe/Vienna"); // todo: should this be converted to the exchange time? return new PriceDownloadedEvent(yahooSymbol, price, dateTime.toDate()); }
From source file:me.bramhaag.discordselfbot.commands.fun.CommandLMGTFY.java
/**
 * Edits the invoking message into a "let me google that for you" link.
 *
 * <p>When the first argument is {@code --expanded} or {@code -e} and a query
 * follows it, the full lmgtfy URL is used. Otherwise all arguments form the
 * query and the URL is shortened through the TinyURL API. Failures are reported
 * with {@code Util.sendError} and leave the message unedited.
 *
 * @param message the message to edit with the resulting link
 * @param channel the channel the command was issued in (unused here)
 * @param args    command arguments; at least one is required
 */
@Command(name = "lmgtfy", minArgs = 1)
public void execute(@NonNull Message message, @NonNull TextChannel channel, @NonNull String[] args) {
    String tinyURL = "http://tinyurl.com/api-create.php?url=";
    String lmgtfyURL = "http://lmgtfy.com?q=";

    String url;
    try {
        // Parenthesised on purpose: '&&' binds tighter than '||', so without
        // the grouping "--expanded" alone skipped the length check and
        // produced a link with an empty query.
        boolean expanded = (args[0].equalsIgnoreCase("--expanded") || args[0].equalsIgnoreCase("-e"))
                && args.length >= 2;

        if (expanded) {
            url = lmgtfyURL
                    + URLEncoder.encode(StringUtils.join(Arrays.copyOfRange(args, 1, args.length), " "), "UTF-8");
        } else {
            Document doc;
            try {
                doc = Jsoup
                        .connect(tinyURL + lmgtfyURL + URLEncoder.encode(StringUtils.join(args, " "), "UTF-8"))
                        .get();
            } catch (IOException e) {
                e.printStackTrace();
                Util.sendError(message, e.getMessage());
                return;
            }

            // the TinyURL API answers with the short link as the page text
            url = doc.body().text();
        }
    } catch (UnsupportedEncodingException e) {
        Util.sendError(message, e.getMessage());
        return;
    }

    message.editMessage("<" + url + ">").queue();
}
From source file:ac.simons.oembed.Oembed.java
/**
 * Transforms the given HTML through {@code transformDocument} and returns the
 * inner HTML of the resulting body, with pretty-printing disabled and the
 * XHTML escape mode selected.
 *
 * @param documentHtml the HTML to transform
 * @return the transformed body content as an HTML string
 */
public String transformDocumentString(final String documentHtml) {
    final Document transformed = transformDocument(documentHtml);
    // configure serialisation before rendering the body
    transformed.outputSettings().prettyPrint(false);
    transformed.outputSettings().escapeMode(EscapeMode.xhtml);
    final String bodyHtml = transformed.body().html();
    return bodyHtml;
}
From source file:de.dlopes.stocks.facilitator.services.impl.FinanzenNetIndexHTMLExtractorImpl.java
@Override public List<String> getFinanceData(String url, FinanceDataType dataType) { List<String> list = new ArrayList<String>(); try {/* w w w . ja v a 2 s .c om*/ Document doc = null; if (url.startsWith("file://")) { File input = new File(url.replaceFirst("file://", "")); doc = Jsoup.parse(input, "UTF-8"); } else { URL input = new URL(url); doc = Jsoup.parse(input, 30000); } //String index = doc.body().select("div#mainWrapper > div.main h1 > a").text(); Elements elements = doc.body().select("#fragIndexBarView > table tr"); for (Element e : elements) { String text = e.select("td > div").text(); // Guard: move on when the text is empty if (StringUtils.isEmpty(text)) { continue; } text = StringUtils.trimAllWhitespace(text); list.add(text); } } catch (IOException e) { e.printStackTrace(); } return list; }
From source file:com.aliyun.openservices.odps.console.commands.DescribeResourceCommand.java
/**
 * Appends a description list describing the configured resource to the body of
 * {@code dom} and returns the serialised document.
 *
 * @param dom document that receives a new {@code <div><dl>} block
 * @return {@code dom.toString()} after the entries were appended
 * @throws ODPSConsoleException if the resource does not exist in the project
 * @throws OdpsException        declared for the underlying ODPS calls
 */
@Override
public String runHtml(Document dom) throws ODPSConsoleException, OdpsException {
    Odps odps = getCurrentOdps();

    boolean found = odps.resources().exists(projectName, resourceName);
    if (!found) {
        throw new ODPSConsoleException("Resource not found : " + resourceName);
    }

    Resource resource = odps.resources().get(projectName, resourceName);
    Element list = dom.body().appendElement("div").appendElement("dl");

    appendTerm(list, "Name").text(resource.getName());
    appendTerm(list, "Owner").text(resource.getOwner());
    appendTerm(list, "Type").text(String.valueOf(resource.getType()));

    if (resource.getType() == Resource.Type.TABLE) {
        TableResource tableResource = (TableResource) resource;
        StringBuilder source = new StringBuilder();
        source.append(tableResource.getSourceTable().getProject());
        source.append(".");
        source.append(tableResource.getSourceTable().getName());
        if (tableResource.getSourceTablePartition() != null) {
            source.append(" partition(");
            source.append(tableResource.getSourceTablePartition().toString());
            source.append(")");
        }
        String tableSource = source.toString();
        appendTerm(list, "SourceTableName").text(tableSource);
    }

    appendTerm(list, "Comment").text(resource.getComment());
    appendTerm(list, "CreatedTime").text(DATE_FORMAT.format(resource.getCreatedTime()));
    appendTerm(list, "LastModifiedTime").text(DATE_FORMAT.format(resource.getLastModifiedTime()));

    return dom.toString();
}

/**
 * Appends a {@code <dt>} holding {@code term} to {@code list}, then appends and
 * returns an empty {@code <dd>} for the caller to fill in.
 */
private static Element appendTerm(Element list, String term) {
    list.appendElement("dt").text(term);
    return list.appendElement("dd");
}
From source file:net.groupbuy.entity.Article.java
/** * ?/* w w w. j av a 2 s . co m*/ * * @return */ @Transient public String[] getPageContents() { if (StringUtils.isEmpty(content)) { return new String[] { "" }; } if (content.contains(PAGE_BREAK_SEPARATOR)) { return content.split(PAGE_BREAK_SEPARATOR); } else { List<String> pageContents = new ArrayList<String>(); Document document = Jsoup.parse(content); List<Node> children = document.body().childNodes(); if (children != null) { int textLength = 0; StringBuffer html = new StringBuffer(); for (Node node : children) { if (node instanceof Element) { Element element = (Element) node; html.append(element.outerHtml()); textLength += element.text().length(); if (textLength >= PAGE_CONTENT_LENGTH) { pageContents.add(html.toString()); textLength = 0; html.setLength(0); } } else if (node instanceof TextNode) { TextNode textNode = (TextNode) node; String text = textNode.text(); String[] contents = PARAGRAPH_SEPARATOR_PATTERN.split(text); Matcher matcher = PARAGRAPH_SEPARATOR_PATTERN.matcher(text); for (String content : contents) { if (matcher.find()) { content += matcher.group(); } html.append(content); textLength += content.length(); if (textLength >= PAGE_CONTENT_LENGTH) { pageContents.add(html.toString()); textLength = 0; html.setLength(0); } } } } String pageContent = html.toString(); if (StringUtils.isNotEmpty(pageContent)) { pageContents.add(pageContent); } } return pageContents.toArray(new String[pageContents.size()]); } }
From source file:com.mythesis.userbehaviouranalysis.WebParser.java
/** * Parse the url and get all the content * @param link the url to parse/*from w w w .j av a 2 s . c o m*/ * @return The content parsed */ private String cleanhtml(String link) { try { Document doc = Jsoup.connect(link).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) { link = link.substring(0, link.length() - 1); } if (link.substring(0, 5).equalsIgnoreCase("https")) { link = link.substring(8); } else if (link.substring(0, 4).equalsIgnoreCase("http")) { link = link.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element el : links) { String str_check = el.attr("abs:href"); if (el.attr("abs:href").contains(link) && el.text().length() > 1) { anchortext = anchortext + el.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").contains(link)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } if (medi.getElementsByTag("img").attr("src").startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:me.vertretungsplan.parser.SVPlanParser.java
/**
 * Builds a substitution schedule from a list of SVPlan documents.
 *
 * <p>Each document is handled in one of three ways: every {@code .svp} element
 * is parsed as a day; or, if the document contains {@code .Trennlinie}
 * elements, the body's child nodes are grouped into synthetic {@code div}
 * elements that are cut at each {@code .Trennlinie} once the current group
 * contains a table; or the whole document is parsed as a single day.
 *
 * @param docs the documents to parse
 * @return the schedule, with classes and teachers set
 */
@NotNull
SubstitutionSchedule parseSVPlanSchedule(List<Document> docs) throws IOException, JSONException {
    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    for (Document doc : docs) {
        if (!doc.select(".svp").isEmpty()) {
            for (Element day : doc.select(".svp")) {
                parseSvPlanDay(schedule, day, doc);
            }
            continue;
        }

        if (doc.select(".Trennlinie").isEmpty()) {
            parseSvPlanDay(schedule, doc, doc);
            continue;
        }

        // days are only delimited by separator elements: collect the nodes
        // between separators into a container and parse each container
        Element container = new Element(Tag.valueOf("div"), "");
        for (Node child : doc.body().childNodesCopy()) {
            boolean closesDay = child instanceof Element
                    && ((Element) child).hasClass("Trennlinie")
                    && container.select("table").size() > 0;
            if (closesDay) {
                parseSvPlanDay(schedule, container, doc);
                container = new Element(Tag.valueOf("div"), "");
            } else {
                container.appendChild(child);
            }
        }
        parseSvPlanDay(schedule, container, doc);
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}