List of usage examples for org.jsoup.nodes Document setBaseUri
public void setBaseUri(final String baseUri)
From source file:me.vertretungsplan.parser.UntisInfoHeadlessParser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); Document doc = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null))); doc.setBaseUri(url); Elements dayElems = doc.select("#vertretung > p > b, #vertretung > b"); Elements frames = doc.select("frame[src*=w00]"); if (dayElems.size() == 0 && frames.size() > 0) { // doc is embedded in frame doc = Jsoup.parse(httpGet(frames.get(0).absUrl("src"), data.optString(PARAM_ENCODING, null))); dayElems = doc.select("#vertretung > p > b, #vertretung > b"); }// w w w. ja v a2s . c o m for (Element dayElem : dayElems) { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); day.setLastChangeString(""); String date = dayElem.text(); day.setDateString(date); day.setDate(ParserUtils.parseDate(date)); Element next; if (dayElem.parent().tagName().equals("p")) { next = dayElem.parent().nextElementSibling().nextElementSibling(); } else { next = dayElem.parent().select("p").first().nextElementSibling(); } parseDay(day, next, v, null); } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
static void parseResList(List<ReservedItem> media, Document doc, JSONObject data) { if (doc.select("a[name=RES]").size() == 0) return;//from w ww . j a v a2 s. c o m Elements copytrs = doc.select("a[name=RES] ~ table:contains(Titel)").first().select("tr"); doc.setBaseUri(data.optString("baseurl")); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); int trs = copytrs.size(); if (trs < 2) { return; } assert (trs > 0); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); ReservedItem item = new ReservedItem(); item.setTitle(tr.child(0).text().trim().replace("\u00a0", "")); item.setAuthor(tr.child(1).text().trim().replace("\u00a0", "")); try { item.setReadyDate(fmt.parseLocalDate(tr.child(4).text().trim().replace("\u00a0", ""))); } catch (IllegalArgumentException e) { item.setStatus(tr.child(4).text().trim().replace("\u00a0", "")); } if (tr.select("a").size() > 0) { item.setCancelData(tr.select("a").last().attr("href")); } media.add(item); } assert (media.size() == trs - 1); }
From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java
public DocumentWrap crawl(String uriroot, File file) throws CrawlException { DocumentWrap document = new DocumentWrap(); try {/*from w w w . ja v a 2 s. c o m*/ // Get the body from the WORD String htmlBody = FileUtils.readFileToString(file); Document doc = Jsoup.parse(htmlBody); if (uriroot != null) doc.setBaseUri(uriroot); setAnchors(doc, uriroot); // Setup the document document.setContent(doc.text(), bStoreBody); document.setSize((int) file.length()); document.setType("text/html"); document.setId(file.getCanonicalPath()); if (uriroot != null) document.setURL(getUrl(uriroot, file)); } catch (FileNotFoundException e) { throw new CrawlException("File not found: " + file, e); } catch (IOException e) { throw new CrawlException("File: " + file, e); } catch (Exception e) { throw new CrawlException("File: " + file, e); } return document; }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
static void parseMediaList(List<LentItem> media, Document doc, JSONObject data) { if (doc.select("a[name=AUS]").size() == 0) return;/* ww w . j a v a2 s . c o m*/ Elements copytrs = doc.select("a[name=AUS] ~ table, a[name=AUS] ~ form table").first().select("tr"); doc.setBaseUri(data.optString("baseurl")); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); int trs = copytrs.size(); if (trs < 2) { return; } assert (trs > 0); JSONObject copymap = new JSONObject(); try { if (data.has("accounttable")) { copymap = data.getJSONObject("accounttable"); } } catch (JSONException e) { } Pattern datePattern = Pattern.compile("\\d{2}\\.\\d{2}\\.\\d{4}"); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); LentItem item = new LentItem(); if (copymap.optInt("title", 0) >= 0) { item.setTitle(tr.child(copymap.optInt("title", 0)).text().trim().replace("\u00a0", "")); } if (copymap.optInt("author", 1) >= 0) { item.setAuthor(tr.child(copymap.optInt("author", 1)).text().trim().replace("\u00a0", "")); } if (copymap.optInt("format", 2) >= 0) { item.setFormat(tr.child(copymap.optInt("format", 2)).text().trim().replace("\u00a0", "")); } int prolongCount = 0; if (copymap.optInt("prolongcount", 3) >= 0) { prolongCount = Integer .parseInt(tr.child(copymap.optInt("prolongcount", 3)).text().trim().replace("\u00a0", "")); item.setStatus(String.valueOf(prolongCount) + "x verl."); } if (data.optInt("maxprolongcount", -1) != -1) { item.setRenewable(prolongCount < data.optInt("maxprolongcount", -1)); } if (copymap.optInt("returndate", 4) >= 0) { String value = tr.child(copymap.optInt("returndate", 4)).text().trim().replace("\u00a0", ""); Matcher matcher = datePattern.matcher(value); if (matcher.find()) { try { item.setDeadline(fmt.parseLocalDate(matcher.group())); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } } } if (copymap.optInt("prolongurl", 5) >= 0) { if (tr.children().size() > copymap.optInt("prolongurl", 5)) { Element cell = 
tr.child(copymap.optInt("prolongurl", 5)); if (cell.select("input[name=MedNrVerlAll]").size() > 0) { // new iOPAC Version 1.45 - checkboxes to prolong multiple items // internal convention: We add "NEW" to the media ID to show that we have // the new iOPAC version Element input = cell.select("input[name=MedNrVerlAll]").first(); String value = input.val(); item.setProlongData("NEW" + value); item.setId(value.split(";")[0]); if (input.hasAttr("disabled")) item.setRenewable(false); } else { // previous versions - link for prolonging on every medium String link = cell.select("a").attr("href"); item.setProlongData(link); // find media number with regex Pattern pattern = Pattern.compile("mednr=([^&]*)&"); Matcher matcher = pattern.matcher(link); if (matcher.find() && matcher.group() != null) item.setId(matcher.group(1)); } } } media.add(item); } assert (media.size() == trs - 1); }
From source file:me.vertretungsplan.parser.UntisMonitorParser.java
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl, int recursionDepth) throws IOException, CredentialInvalidException { String html;//from w w w.j a v a 2 s.c o m if (url.equals(VALUE_URL_LOGIN_RESPONSE)) { html = loginResponse; } else { try { html = httpGet(url, encoding).replace(" ", ""); } catch (HttpResponseException e) { if (docs.size() == 0) { throw e; } else { return; // ignore if first page was loaded and redirect didn't work } } } Document doc = Jsoup.parse(html); doc.setBaseUri(url); if (doc.select(".mon_title").size() == 0) { // We have a problem - there seems to be no substitution schedule. Maybe it is hiding // inside a frame? if (doc.select("frameset frame[name").size() > 0) { for (Element frame : doc.select("frameset frame")) { if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?") && recursionDepth < MAX_RECURSION_DEPTH) { String frameUrl = frame.absUrl("src"); loadUrl(frame.absUrl("src"), encoding, following, docs, frameUrl, recursionDepth + 1); } } } else if (doc.text().contains("registriert")) { throw new CredentialInvalidException(); } else { if (docs.size() == 0) { // ignore if first page was loaded and redirect didn't work throw new IOException( "Could not find .mon-title, seems like there is no Untis " + "schedule here"); } } } else { findSubDocs(docs, html, doc); if (following && doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) { loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1); } } } }
From source file:ac.simons.oembed.Oembed.java
/** * Parses the given html document into a document and processes * all anchor elements. If a valid anchor is found, it tries to * get an oembed response for it's url and than render the result * into the document replacing the given anchor.<br> * It returns the html representation of the new document.<br> * If there's an error or no oembed result for an url, the anchor tag * will be left as it was. // w w w. j av a 2 s . com * @param document The document that should be checked for links to transform * @return the transformed document */ public Document transformDocument(final Document document) { boolean changedBaseUri = false; if (document.baseUri() == null && this.getBaseUri() != null) { document.setBaseUri(this.getBaseUri()); changedBaseUri = true; } for (Element a : document.getElementsByTag("a")) { final String href = a.absUrl("href"); try { String renderedRespose = null; final OembedResponse oembedResponse = this.transformUrl(href); // There was no response or an exception happened if (oembedResponse == null) continue; // There is a handler for this response else if (this.getHandler().containsKey(oembedResponse.getSource())) this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse); // Try to render the response itself and replace the current anchor else if ((renderedRespose = oembedResponse.render()) != null) { a.before(renderedRespose); a.remove(); } } catch (OembedException e) { logger.warn(String.format("Skipping '%s': %s", href, e.getMessage())); } } if (changedBaseUri) document.setBaseUri(null); return document; }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
/**
 * Loads the search page and stores the value of the {@code sess} query parameter of the first
 * link that carries one in {@code sessid}, then delegates to {@code super.start()}.
 * {@code sessid} is {@code null} afterwards when no link carries the parameter.
 *
 * @throws IOException if the search page cannot be loaded
 */
@Override
public void start() throws IOException {
    Document searchPage = Jsoup.parse(httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore));
    searchPage.setBaseUri(opac_url);

    sessid = null;
    for (Element anchor : searchPage.select("a")) {
        String candidate = getQueryParamsFirst(anchor.absUrl("href")).get("sess");
        if (candidate == null) {
            continue;
        }
        // First link with a session parameter wins.
        sessid = candidate;
        break;
    }

    super.start();
}
From source file:de.geeksfactory.opacclient.apis.Open.java
@Override public SearchRequestResult searchGetPage(int page) throws IOException, OpacErrorException, JSONException { /*/*w w w .jav a 2s . c o m*/ When there are many pages of results, there will only be links to the next 4 and previous 4 pages, so we will click links until it gets to the correct page. */ if (searchResultDoc == null) throw new NotReachableException(); Document doc = searchResultDoc; Elements pageLinks = doc.select("span[id$=DataPager1]").first().select("a[id*=LinkButtonPageN"); int from = Integer.valueOf(pageLinks.first().text()); int to = Integer.valueOf(pageLinks.last().text()); Element linkToClick; boolean willBeCorrectPage; if (page < from) { linkToClick = pageLinks.first(); willBeCorrectPage = false; } else if (page > to) { linkToClick = pageLinks.last(); willBeCorrectPage = false; } else { linkToClick = pageLinks.get(page - from); willBeCorrectPage = true; } Pattern pattern = Pattern.compile("javascript:__doPostBack\\('([^,]*)','([^\\)]*)'\\)"); Matcher matcher = pattern.matcher(linkToClick.attr("href")); if (!matcher.find()) throw new OpacErrorException(StringProvider.INTERNAL_ERROR); FormElement form = (FormElement) doc.select("form").first(); HttpEntity data = formData(form, null).addTextBody("__EVENTTARGET", matcher.group(1)) .addTextBody("__EVENTARGUMENT", matcher.group(2)).build(); ByteArrayOutputStream stream = new ByteArrayOutputStream(); data.writeTo(stream); String postUrl = form.attr("abs:action"); String html = httpPost(postUrl, data, "UTF-8"); if (willBeCorrectPage) { // We clicked on the correct link Document doc2 = Jsoup.parse(html); doc2.setBaseUri(postUrl); return parse_search(doc2, page); } else { // There was no correct link, so try to find one again searchResultDoc = Jsoup.parse(html); searchResultDoc.setBaseUri(postUrl); return searchGetPage(page); } }
From source file:de.geeksfactory.opacclient.apis.Zones22.java
private Document login(Account acc) throws IOException, OpacErrorException { String html = httpGet(//w w w . jav a 2 s. c o m opac_url + "/APS_ZONES?fn=MyZone&Style=Portal3&SubStyle=&Lang=GER&ResponseEncoding=utf-8", getDefaultEncoding()); Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url + "/APS_ZONES"); if (doc.select(".AccountSummaryCounterLink").size() > 0) { return doc; } if (doc.select("#LoginForm").size() == 0) { throw new NotReachableException(); } List<NameValuePair> params = new ArrayList<NameValuePair>(); for (Element input : doc.select("#LoginForm input")) { if (!input.attr("name").equals("BRWR") && !input.attr("name").equals("PIN")) params.add(new BasicNameValuePair(input.attr("name"), input.attr("value"))); } params.add(new BasicNameValuePair("BRWR", acc.getName())); params.add(new BasicNameValuePair("PIN", acc.getPassword())); String loginHtml; try { loginHtml = httpPost(doc.select("#LoginForm").get(0).absUrl("action"), new UrlEncodedFormEntity(params), getDefaultEncoding()); } catch (UnsupportedEncodingException e) { e.printStackTrace(); return null; } catch (IOException e) { e.printStackTrace(); return null; } if (!loginHtml.contains("Kontostand")) { throw new OpacErrorException(stringProvider.getString(StringProvider.LOGIN_FAILED)); } Document doc2 = Jsoup.parse(loginHtml); Pattern objid_pat = Pattern.compile("Obj_([0-9]+)\\?.*"); for (Element a : doc2.select("a")) { Matcher objid_matcher = objid_pat.matcher(a.attr("href")); if (objid_matcher.matches()) { accountobj = objid_matcher.group(1); } } return doc2; }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
@Override public List<SearchField> getSearchFields() throws IOException, OpacErrorException, JSONException { String html = httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore); Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url); List<SearchField> fields = new ArrayList<>(); Elements options = doc.select("select[name=kat1] option"); for (Element option : options) { TextSearchField field = new TextSearchField(); field.setDisplayName(option.text()); field.setId(option.attr("value")); field.setHint(""); fields.add(field);//from ww w.ja v a 2 s . com } DropdownSearchField field = new DropdownSearchField(); Elements zst_opts = doc.select("#teilk2 option"); for (int i = 0; i < zst_opts.size(); i++) { Element opt = zst_opts.get(i); if (!opt.val().equals("")) { field.addDropdownValue(opt.val(), opt.text()); } } field.setDisplayName("Einrichtung"); field.setId("f[teil2]"); field.setVisible(true); field.setMeaning(SearchField.Meaning.BRANCH); fields.add(field); try { field = new DropdownSearchField(); Document doc2 = Jsoup .parse(httpGet(opac_url + "/zweigstelle.cgi?sess=" + sessid, ENCODING, false, cookieStore)); Elements home_opts = doc2.select("#zweig option"); for (int i = 0; i < home_opts.size(); i++) { Element opt = home_opts.get(i); if (!opt.val().equals("")) { Map<String, String> option = new HashMap<>(); option.put("key", opt.val()); option.put("value", opt.text()); field.addDropdownValue(opt.val(), opt.text()); } } field.setDisplayName("Leihstelle"); field.setId("_heidi_branch"); field.setVisible(true); field.setMeaning(SearchField.Meaning.HOME_BRANCH); fields.add(field); } catch (IOException e) { e.printStackTrace(); } TextSearchField pagefield = new TextSearchField(); pagefield.setId("_heidi_page"); pagefield.setVisible(false); pagefield.setDisplayName("Seite"); pagefield.setHint(""); fields.add(pagefield); return fields; }