List of usage examples for the org.jsoup.nodes.Document method baseUri()
@Override
public String baseUri()
From source file:app.data.parse.WebPageUtil.java
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException { String original = url;// w w w .j a va 2 s . c o m // hit toutiao.io // fixme http://toutiao.io/shares/640539/url if (original.startsWith("https://toutiao.io/posts/")) { original = original.replace("/posts/", "/k/"); } // check cache WebPageInfo info = urlInfoCache != null ? urlInfoCache.getIfPresent(original) : null; if (info != null) { return info; } else { info = new WebPageInfo(); info.url = original; } // attach url Document doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() // hit gold.xitu.io if (info.url.startsWith("http://gold.xitu.io/entry/")) { Elements origin = doc.select("div[class=ellipsis]"); Elements originLink = origin.select("a[class=share-link]"); info.url = originLink.attr("href"); // reconnect doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() } info.url = smartUri(info.url); // get title Elements metaTitle = doc.select("meta[property=og:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } if (StringUtils.isEmpty(info.title)) { metaTitle = doc.select("meta[property=twitter:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } info.title = StringUtils.isEmpty(info.title) ? 
doc.title() : info.title; } // get desc Elements metaDesc = doc.select("meta[property=og:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[property=twitter:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[name=description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.body().select("p"); if (metaDesc != null) { for (Element element : metaDesc) { info.description = element.text(); if (info.description != null && info.description.length() >= 20) { break; } } } } } } info.description = ellipsis(info.description, 140, "..."); // cache info if (urlInfoCache != null) { urlInfoCache.put(original, info); } return info; }
From source file:ac.simons.oembed.Oembed.java
/** * Parses the given html document into a document and processes * all anchor elements. If a valid anchor is found, it tries to * get an oembed response for it's url and than render the result * into the document replacing the given anchor.<br> * It returns the html representation of the new document.<br> * If there's an error or no oembed result for an url, the anchor tag * will be left as it was. //from w ww . ja v a 2s. com * @param document The document that should be checked for links to transform * @return the transformed document */ public Document transformDocument(final Document document) { boolean changedBaseUri = false; if (document.baseUri() == null && this.getBaseUri() != null) { document.setBaseUri(this.getBaseUri()); changedBaseUri = true; } for (Element a : document.getElementsByTag("a")) { final String href = a.absUrl("href"); try { String renderedRespose = null; final OembedResponse oembedResponse = this.transformUrl(href); // There was no response or an exception happened if (oembedResponse == null) continue; // There is a handler for this response else if (this.getHandler().containsKey(oembedResponse.getSource())) this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse); // Try to render the response itself and replace the current anchor else if ((renderedRespose = oembedResponse.render()) != null) { a.before(renderedRespose); a.remove(); } } catch (OembedException e) { logger.warn(String.format("Skipping '%s': %s", href, e.getMessage())); } } if (changedBaseUri) document.setBaseUri(null); return document; }
From source file:de.ncoder.studipsync.studip.jsoup.JsoupStudipAdapter.java
private void setDocument(Document document) throws StudipException { this.document = document; try {//from www .j ava 2s .c o m URL url = new URL(document.baseUri()); log.trace("NAV: " + url); for (NavigationListener listener : listeners) { listener.navigated(url); } } catch (MalformedURLException e) { StudipException ex = new StudipException("Illegal URL " + document.baseUri(), e); ex.put("studip.url", document.baseUri()); ex.put("studip.document", document); throw ex; } }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
/**
 * Splits a monitor page into one shell document per day and adds each parsed day to the
 * schedule. Three layouts are distinguished: several {@code .mon_head} elements, several
 * {@code .mon_title} elements, or a single day (the page is parsed as it is).
 *
 * <p>Note: nodes appended without {@code clone()} are taken out of {@code doc}, so
 * {@code doc} is modified by this method and the loop bounds are re-queried on every pass.
 *
 * @param v    schedule that receives the parsed days
 * @param doc  the monitor page
 * @param data passed through unchanged to {@code parseMonitorDay}
 */
void parseMultipleMonitorDays(SubstitutionSchedule v, Document doc, JSONObject data)
        throws JSONException, CredentialInvalidException {
    if (doc.select(".mon_head").size() > 1) {
        for (int j = 0; j < doc.select(".mon_head").size(); j++) {
            final Document dayDoc = Document.createShell(doc.baseUri());
            final Element head = doc.select(".mon_head").get(j);
            dayDoc.body().appendChild(head.clone());

            final Element next = head.nextElementSibling();
            if (next != null && next.tagName().equals("center")) {
                // Title and tables of this day are wrapped in the <center> following the head
                dayDoc.body().appendChild(next.select(".mon_title").first().clone());
                if (!next.select("table:has(tr.list)").isEmpty()) {
                    dayDoc.body().appendChild(next.select("table:has(tr.list)").first());
                }
                if (!next.select("table.info").isEmpty()) {
                    dayDoc.body().appendChild(next.select("table.info").first());
                }
            } else if (j < doc.select(".mon_title").size()) {
                // No wrapper: pair the j-th title with the j-th list table of the page
                dayDoc.body().appendChild(doc.select(".mon_title").get(j).clone());
                dayDoc.body().appendChild(doc.select("table:has(tr.list)").get(j).clone());
            } else {
                continue;
            }

            v.addDay(parseMonitorDay(dayDoc, data));
        }
    } else if (doc.select(".mon_title").size() > 1) {
        for (int j = 0; j < doc.select(".mon_title").size(); j++) {
            final Document dayDoc = Document.createShell(doc.baseUri());
            final Element title = doc.select(".mon_title").get(j);
            dayDoc.body().appendChild(title.clone());

            // Each appendChild re-parents the sibling into dayDoc, so asking the title for its
            // next sibling again yields the element that followed the one just moved.
            for (Element next = title.nextElementSibling();
                    next != null && !next.tagName().equals("center");
                    next = title.nextElementSibling()) {
                dayDoc.body().appendChild(next);
            }

            v.addDay(parseMonitorDay(dayDoc, data));
        }
    } else {
        v.addDay(parseMonitorDay(doc, data));
    }
}
From source file:de.geeksfactory.opacclient.apis.Zones.java
/**
 * Collects the lent items of the given page and of every following page.
 *
 * @param lentDoc first page of the list
 * @param items   list the parsed items are appended to
 * @throws IOException propagated from loading a following page
 */
private void loadMediaList(Document lentDoc, List<LentItem> items) throws IOException {
    Document page = lentDoc;
    while (true) {
        items.addAll(parseMediaList(page));

        final String nextPageUrl = findNextPageUrl(page);
        if (nextPageUrl == null) {
            return;
        }

        // Following pages are parsed from a string, so they inherit the base URI of the
        // page that linked to them.
        final Document following = Jsoup.parse(httpGet(nextPageUrl, getDefaultEncoding()));
        following.setBaseUri(page.baseUri());
        page = following;
    }
}
From source file:de.geeksfactory.opacclient.apis.Zones.java
private void loadResList(Document lentDoc, List<ReservedItem> items) throws IOException { items.addAll(parseResList(lentDoc)); String nextPageUrl = findNextPageUrl(lentDoc); if (nextPageUrl != null) { Document doc = Jsoup.parse(httpGet(nextPageUrl, getDefaultEncoding())); doc.setBaseUri(lentDoc.baseUri()); loadResList(doc, items);//from w ww .j a va2 s . c o m } }
From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java
@Override public Document runFilter(Document document) { final Document clean = Document.createShell(document.baseUri()); if (document.body() != null) // frameset documents won't have a body. the clean doc will have empty body. copySafeNodes(document.body(), clean.body()); return clean; }
From source file:org.b3log.symphony.service.LinkForgeMgmtService.java
/** * Forges the specified URL.//w ww . j a v a 2 s. c om * * @param url the specified URL * @param userId the specified user id */ public void forge(final String url, final String userId) { String html; String baseURL; try { final Document doc = Jsoup.connect(url).timeout(5000).userAgent(Symphonys.USER_AGENT_BOT).get(); doc.select("body").prepend("<a href=\"" + url + "\">" + url + "</a>"); // Add the specified URL itfself html = doc.html(); baseURL = doc.baseUri(); } catch (final Exception e) { LOGGER.log(Level.ERROR, "Parses link [" + url + "] failed", e); return; } final List<JSONObject> links = Links.getLinks(baseURL, html); final List<JSONObject> cachedTags = tagCache.getTags(); final Transaction transaction = linkRepository.beginTransaction(); try { for (final JSONObject lnk : links) { final String addr = lnk.optString(Link.LINK_ADDR); JSONObject link = linkRepository.getLink(addr); if (null == link) { link = new JSONObject(); link.put(Link.LINK_ADDR, lnk.optString(Link.LINK_ADDR)); link.put(Link.LINK_BAD_CNT, 0); link.put(Link.LINK_BAIDU_REF_CNT, 0); link.put(Link.LINK_CLICK_CNT, 0); link.put(Link.LINK_GOOD_CNT, 0); link.put(Link.LINK_SCORE, 0); link.put(Link.LINK_SUBMIT_CNT, 0); link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE)); link.put(Link.LINK_TYPE, Link.LINK_TYPE_C_FORGE); LOGGER.info(link.optString(Link.LINK_ADDR) + "____" + link.optString(Link.LINK_TITLE)); linkRepository.add(link); final JSONObject linkCntOption = optionRepository.get(Option.ID_C_STATISTIC_LINK_COUNT); final int linkCnt = linkCntOption.optInt(Option.OPTION_VALUE); linkCntOption.put(Option.OPTION_VALUE, linkCnt + 1); optionRepository.update(Option.ID_C_STATISTIC_LINK_COUNT, linkCntOption); } else { link.put(Link.LINK_BAIDU_REF_CNT, lnk.optInt(Link.LINK_BAIDU_REF_CNT)); link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE)); link.put(Link.LINK_SCORE, lnk.optInt(Link.LINK_BAIDU_REF_CNT)); // XXX: Need a score algorithm linkRepository.update(link.optString(Keys.OBJECT_ID), 
link); } final String linkId = link.optString(Keys.OBJECT_ID); final double linkScore = link.optDouble(Link.LINK_SCORE, 0D); String title = link.optString(Link.LINK_TITLE) + " " + link.optString(Link.LINK_T_KEYWORDS); title = Pangu.spacingText(title); String[] titles = title.split(" "); titles = Strings.trimAll(titles); for (final JSONObject cachedTag : cachedTags) { final String tagId = cachedTag.optString(Keys.OBJECT_ID); final String tagTitle = cachedTag.optString(Tag.TAG_TITLE); if (!Strings.containsIgnoreCase(tagTitle, titles)) { continue; } final JSONObject tag = tagRepository.get(tagId); // clean tagUserLinkRepository.removeByTagIdUserIdAndLinkId(tagId, userId, linkId); // re-add final JSONObject tagLinkRel = new JSONObject(); tagLinkRel.put(Tag.TAG_T_ID, tagId); tagLinkRel.put(UserExt.USER_T_ID, userId); tagLinkRel.put(Link.LINK_T_ID, linkId); tagLinkRel.put(Link.LINK_SCORE, linkScore); tagUserLinkRepository.add(tagLinkRel); // refresh link score tagUserLinkRepository.updateTagLinkScore(tagId, linkId, linkScore); // re-calc tag link count final int tagLinkCnt = tagUserLinkRepository.countTagLink(tagId); tag.put(Tag.TAG_LINK_CNT, tagLinkCnt); tagRepository.update(tagId, tag); } } transaction.commit(); LOGGER.info("Forged link [" + url + "]"); } catch (final Exception e) { if (transaction.isActive()) { transaction.rollback(); } LOGGER.log(Level.ERROR, "Saves links failed", e); } }
From source file:org.cellcore.code.engine.page.extractor.starcity.STCPageDataExtractor.java
@Override protected String getName(Document doc) throws UnsupportedCardException { if (!doc.select("h3").select(":contains(Foil)").isEmpty()) { throw new UnsupportedCardException("foil"); }//ww w.j a va 2s. c o m String href = doc.baseUri(); String code = href.substring(href.lastIndexOf("=") + 1, href.length()); jsonProc(code, doc); if (doc.getElementById("custom_card_name_STC") != null) { return doc.getElementById("custom_card_name_STC").text(); } return null; }
From source file:org.openmrs.module.radiology.report.template.DefaultMrrtReportTemplateFileParser.java
/**
 * Fills the template's path, charset and Dublin Core terms from the parsed document.
 *
 * <p>The path is the document's base URI, the charset comes from the {@code charset}
 * attribute of the meta tags, and every meta tag whose {@code name} is one of the
 * {@code DCTERMS_*} constants sets the matching template property to its
 * {@code content}. Any other meta tag is only logged at debug level.
 *
 * @param template the template to populate
 * @param doc      the parsed template document
 */
private final void initializeTemplate(MrrtReportTemplate template, Document doc) {
    template.setPath(doc.baseUri());

    final Elements metaTags = doc.getElementsByTag("meta");
    template.setCharset(metaTags.attr("charset"));

    for (Element meta : metaTags) {
        final String term = meta.attr("name");
        final String value = meta.attr("content");

        switch (term) {
            case DCTERMS_TITLE:
                template.setDcTermsTitle(value);
                break;
            case DCTERMS_DESCRIPTION:
                template.setDcTermsDescription(value);
                break;
            case DCTERMS_IDENTIFIER:
                template.setDcTermsIdentifier(value);
                break;
            case DCTERMS_TYPE:
                template.setDcTermsType(value);
                break;
            case DCTERMS_LANGUAGE:
                template.setDcTermsLanguage(value);
                break;
            case DCTERMS_PUBLISHER:
                template.setDcTermsPublisher(value);
                break;
            case DCTERMS_RIGHTS:
                template.setDcTermsRights(value);
                break;
            case DCTERMS_LICENSE:
                template.setDcTermsLicense(value);
                break;
            case DCTERMS_DATE:
                template.setDcTermsDate(value);
                break;
            case DCTERMS_CREATOR:
                template.setDcTermsCreator(value);
                break;
            default:
                log.debug("Unhandled meta tag " + term);
        }
    }
}