Usage examples for the title() method of org.jsoup.nodes.Document
public String title()
From source file:com.johan.vertretungsplan.parser.SVPlanParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, docs); }/* www .j a v a 2s. c o m*/ LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.select(".svp-tabelle").size() > 0) { VertretungsplanTag tag = new VertretungsplanTag(); String date = "Unbekanntes Datum"; if (doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").size() > 0) date = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").text(); else if (doc.title().startsWith("Vertretungsplan fr ")) date = doc.title().substring("Vertretungsplan fr ".length()); tag.setDatum(date); if (doc.select(".svp-uploaddatum").size() > 0) tag.setStand(doc.select(".svp-uploaddatum").text().replace("Aktualisierung: ", "")); Elements rows = doc.select(".svp-tabelle tr"); String lastLesson = ""; for (Element row : rows) { if (row.hasClass("svp-header")) continue; Vertretung vertretung = new Vertretung(); List<String> affectedClasses = new ArrayList<String>(); for (Element column : row.select("td")) { if (!hasData(column.text())) { continue; } String type = column.className(); if (type.startsWith("svp-stunde")) { vertretung.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse")) affectedClasses = Arrays.asList(column.text().split(", ")); else if (type.startsWith("svp-esfehlt")) vertretung.setPreviousTeacher(column.text()); else if (type.startsWith("svp-esvertritt")) vertretung.setTeacher(column.text()); else if (type.startsWith("svp-fach")) vertretung.setSubject(column.text()); else if 
(type.startsWith("svp-bemerkung")) { vertretung.setDesc(column.text()); vertretung.setType(recognizeType(column.text())); } else if (type.startsWith("svp-raum")) vertretung.setRoom(column.text()); if (vertretung.getLesson() == null) vertretung.setLesson(lastLesson); } if (vertretung.getType() == null) { vertretung.setType("Vertretung"); } for (String klasse : affectedClasses) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(vertretung); tag.getKlassen().put(klasse, kv); } } List<String> nachrichten = new ArrayList<String>(); if (doc.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = doc.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) nachrichten.add(nachricht); } sibling = sibling.nextElementSibling(); } } tag.setNachrichten(nachrichten); tage.put(date, tag); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }
From source file:me.vertretungsplan.parser.UntisMonitorParser.java
public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { loginResponse = new LoginHandler(scheduleData, credential, cookieProvider).handleLoginWithResponse(executor, cookieStore);// w ww. ja v a 2 s. c om SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); JSONArray urls = scheduleData.getData().getJSONArray(PARAM_URLS); String encoding = scheduleData.getData().optString(PARAM_ENCODING, null); List<Document> docs = new ArrayList<>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); final String urlStr = url.getString(SUBPARAM_URL); for (String dateUrl : ParserUtils.handleUrlWithDateFormat(urlStr)) { loadUrl(dateUrl, encoding, url.getBoolean(SUBPARAM_FOLLOWING), docs); } } for (Document doc : docs) { if (scheduleData.getData().has(PARAM_EMBEDDED_CONTENT_SELECTOR)) { for (Element part : doc.select(scheduleData.getData().getString(PARAM_EMBEDDED_CONTENT_SELECTOR))) { SubstitutionScheduleDay day = parseMonitorDay(part, scheduleData.getData()); v.addDay(day); } } else if (doc.title().contains("Untis") || doc.html().contains("<!--<title>Untis")) { SubstitutionScheduleDay day = parseMonitorDay(doc, scheduleData.getData()); v.addDay(day); } // else Error if (scheduleData.getData().has(PARAM_LAST_CHANGE_SELECTOR) && doc.select(scheduleData.getData().getString(PARAM_LAST_CHANGE_SELECTOR)).size() > 0) { String text = doc.select(scheduleData.getData().getString(PARAM_LAST_CHANGE_SELECTOR)).first() .text(); String lastChange; Pattern pattern = Pattern.compile("\\d\\d\\.\\d\\d\\.\\d\\d\\d\\d,? 
\\d\\d:\\d\\d"); Matcher matcher = pattern.matcher(text); if (matcher.find()) { lastChange = matcher.group(); } else { lastChange = text; } v.setLastChangeString(lastChange); v.setLastChange(ParserUtils.parseDateTime(lastChange)); } } if (scheduleData.getData().has(PARAM_WEBSITE)) { v.setWebsite(scheduleData.getData().getString(PARAM_WEBSITE)); } else if (urls.length() == 1) { v.setWebsite(urls.getJSONObject(0).getString("url")); } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Parse the url and get all the content * @param link_html the url to parse//from w w w . j a v a2 s . com * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { 
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java
/** * Method used to perform recursive creation indexing for a given web page * in search database./* ww w . j av a 2s.c o m*/ * * @param webPage webPage.url is entered url * webPage.title is set * @param html Jsoup.Document of entered url * @param recursionNumber used to stop recursion at exceeding * MAX_RECURSION_SEARCH_NUMBER */ private void indexElements(WebPage webPage, Document html, final int recursionNumber) throws IOException, ParseException { String title = html.title(); if (referencedTitles.contains(title.trim())) { return; } referencedTitles.add(title.trim()); webPage.setTitle(title); if (containsPage(webPage)) { System.out.println(webPage.getUrl() + " is already indexed"); return; } Element prevElement = null; Elements elements = html.body().getAllElements(); //.getElementsByTag("a"); addDoc(webPage, html.text()); // for (Element element : elements) { //// System.out.println(element.nodeName() + " element.text() " //// + element.text() + " url " //// + element.absUrl("href")); // if (element.nodeName().equalsIgnoreCase("body")) { // addDoc(webPage, element.text()); // break; //// continue; // } // if (null == prevElement) { // prevElement = element; //// } else if (prevElementContainsElementText(prevElement, element)) { //// continue; // } //// if (null !== webPagesService.findWebPage(element.absUrl("href"))) // if (element.text().trim().isEmpty()) { // continue; // } //// StringTokenizer str = new StringTokenizer(element.text()); //// str. 
// addDoc(webPage, element.text()); // } if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) { // System.out.println(recursionNumber + " " // + referencedSites.contains(webPage.getUrl())); return; } elements.parallelStream() .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF) && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF)) && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF)))) .forEach((Element element) -> { WebPage webPage1 = new WebPage(element.absUrl(HREF)); String url1 = webPage1.getUrl(); // System.out.println(recursionNumber + " recursion for '" // + url1 + "'"); try { Document htmlR = Jsoup.connect(url1).get(); indexElements(webPage1, htmlR, recursionNumber + 1); } catch (IOException | ParseException e) { System.out.println("Exception " + e.getMessage()); } referencedSites.add(url1); }); // for (Element element : elements) { // if (!element.nodeName().equalsIgnoreCase("a")) { // continue; // } // WebPage webPage1 = new WebPage(element.absUrl("href")); // if (null == webPage1.getUrl() // || webPage1.getUrl().isEmpty() // || referencedSites.contains(webPage1.getUrl())) { // continue; // } // System.out.println(recursionNumber + "recursion for " // + element.absUrl("href")); // try { // Document htmlR = Jsoup.connect(webPage1.getUrl()).get(); // webPage1.setTitle(htmlR.title()); // indexElements(webPage1, htmlR, recursionNumber + 1); // } catch (IOException e) { // System.out.println("IOException " + e.getMessage()); // } // referencedSites.add(webPage1.getUrl()); // } }
From source file:com.gmail.jiangyang5157.cardboard.net.DescriptionRequest.java
private Response<Object> doHtmlParse(NetworkResponse response) { String parsed = getParsedString(response); Document doc; try {//from w ww .j a v a2 s.com doc = Jsoup.parse(parsed); } catch (VerifyError | NoClassDefFoundError e) { // http://stackoverflow.com/questions/38059373/java-lang-verifyerror-when-downloading-data-with-jsoup-in-android-n // TODO: 10/1/2016 upgrade Jsoup return Response.error(new ParseError(response)); } // for <title>Hello World</title> String content = doc.title(); // for <meta property="og:description" content="Hello world." /> Element mataPropertyOgDescription = doc.select("meta[property^=og:description]").first(); if (mataPropertyOgDescription != null) { content += "\n\n" + mataPropertyOgDescription.attr("content"); } return Response.success(content, HttpHeaderParser.parseCacheHeaders(response)); }
From source file:autoInsurance.BeiJPingAnImpl.java
/**
 * Logs in to the PingAn web application with the credentials found in the
 * given JSON request.
 *
 * @param in JSON text with the entries {@code ukey}, {@code loginName} and
 *           {@code password}
 * @return a JSON result string when one of the recognised outcomes occurs,
 *         otherwise the empty string
 */
public String login(String in) {
    JSONObject request = JSONObject.fromObject(in);
    String ukey = request.getString("ukey"); // read from the request but not used further here
    String loginName = request.getString("loginName");
    String password = request.getString("password");

    final String applyUrl = "https://icore-pts.pingan.com.cn/ebusiness/auto/newness/toibcswriter.do?transmitId=apply";
    Document doc = Jsoup.parse(httpClientUtil.doPost(applyUrl, new HashMap(), charset));

    if (null == doc.getElementById("script_umlogin")) {
        // No login script on the page: report failure only when the
        // workbench form is present.
        // NOTE(review): the message text "!" looks truncated - confirm the
        // intended wording.
        FormElement existingForm = (FormElement) doc.getElementById("workbenchIBCSAppForm");
        return null != existingForm ? "{\"success\": false, \"msg\": \"!\"}" : "";
    }

    Map<String, String> credentials = new HashMap<String, String>();
    credentials.put("j_username", loginName);
    credentials.put("j_password", password);
    credentials.put("SMAUTHREASON", "0");
    credentials.put("randCode", "");
    doc = Jsoup.parse(httpClientUtil.doPost(
            "https://icore-pts.pingan.com.cn/ebusiness/j_security_check", credentials, charset));

    // NOTE(review): startsWith("") is true for every title, so this guard
    // never fires; the expected title prefix may have been lost - confirm.
    if (!doc.title().startsWith("")) {
        return "";
    }

    // Reload the entry page and resubmit its form fields to the transfer endpoint.
    doc = Jsoup.parse(httpClientUtil.doPost(applyUrl, new HashMap(), charset));
    FormElement form = (FormElement) doc.getElementById("workbenchIBCSAppForm");
    Map<String, String> transferParams = new HashMap<String, String>();
    for (KeyVal field : form.formData()) {
        transferParams.put(field.key(), field.value());
    }
    transferParams.put("dealerCodes", "");
    doc = Jsoup.parse(httpClientUtil.doPost(
            "https://icorepnbs.pingan.com.cn/icore_pnbs/do/usermanage/systemTransfer", transferParams, charset));

    // NOTE(review): success currently requires an empty page title; the
    // expected title text may have been lost - confirm.
    if (!"".equals(doc.title())) {
        return "";
    }
    return "{\"success\": true, \"msg\": \"" + loginName + "," + transferParams.get("userName") + ",\"}";
}
From source file:info.smartkit.hairy_batman.query.SogouSearchQuery.java
public void parseWxOpenId() { Document doc; try {/*from ww w.j a va2 s. c o m*/ // need http protocol // doc = Jsoup.connect(GlobalConsts.SOGOU_SEARCH_URL_BASE+ wxFoo.getSubscribeId()).get(); doc = Jsoup.connect("http://weixin.sogou.com/weixin?type=1&query=" + wxFoo.getSubscribeId() + "&fr=sgsearch&ie=utf8&_ast=1423915648&_asf=null&w=01019900&cid=null&sut=19381").get(); LOG.debug("openID html INFO:" + doc.html()); // get page title String title = doc.title(); LOG.debug("title : " + title); // get all "?:" value of html <span> //Elements openIdLink = doc.select(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_HTML_ELEMENTS).select(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_HTML_ELE_IDENTITY); Elements openIdLink = doc.getElementsByClass("wx-rb"); Element a = null; String openIdLinkHref = ""; if (openIdLink != null && openIdLink.size() > 0) { Iterator<Element> itea = openIdLink.iterator(); while (itea.hasNext()) { a = itea.next(); LOG.debug("openID html INFO:" + a.html()); if (a.getElementsByTag("em").html().indexOf(wxFoo.getSubscribeId()) != -1) { break; } } } if (a != null) { openIdLinkHref = a.attr("href"); } LOG.debug("openIdLinkHref:" + openIdLinkHref); // FIXME:???? if (this.wxFoo.getOpenId() == null && openIdLinkHref.length() > 0) { this.wxFoo.setOpenId(openIdLinkHref.split(GlobalConsts.SOGOU_SEARCH_WX_OPEN_ID_KEYWORDS)[1]); LOG.info("saved wxOpenId value: " + this.wxFoo.getOpenId()); GlobalVariables.wxFooListWithOpenId.add(this.wxFoo); // File reporting new FileReporter(GlobalConsts.REPORT_FILE_OUTPUT_OPENID, GlobalVariables.wxFooListWithOpenId, FileReporter.REPORTER_TYPE.R_T_OPENID, FileReporter.REPORTER_FILE_TYPE.EXCEL).write(); // Then,OpenID JSON site parse if (this.wxFoo.getOpenId() != null) { // Save openId to DB. 
try { GlobalVariables.jdbcTempate.update("insert into " + GlobalConsts.QUERY_TABLE_NAME_BASIC + "(id,store,agency,unit,subscribeId,onSubscribe,code,openId) values(?,?,?,?,?,?,?,?)", new Object[] { this.wxFoo.getId(), this.wxFoo.getStore(), this.wxFoo.getAgency(), this.wxFoo.getUnit(), this.wxFoo.getSubscribeId(), this.wxFoo.getOnSubscribe(), this.wxFoo.getCode(), this.wxFoo.getOpenId() }, new int[] { java.sql.Types.INTEGER, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR }); this.parseSogouJsonSite(this.wxFoo.getOpenId()); } catch (DataAccessException e) { e.printStackTrace(); } } else { LOG.warn("SogouSearchQuery getOpenId Failure! site info:" + wxFoo.getCode()); // TODO write those info to File or DB for collect which // agency not open weixin service // Save openId to DB. try { GlobalVariables.jdbcTempate.update("insert into " + GlobalConsts.QUERY_TABLE_NAME_BASIC + "(id,store,agency,unit,subscribeId,onSubscribe,code,openId) values(?,?,?,?,?,?,?,?)", new Object[] { this.wxFoo.getId(), this.wxFoo.getStore(), this.wxFoo.getAgency(), this.wxFoo.getUnit(), this.wxFoo.getSubscribeId(), this.wxFoo.getOnSubscribe(), this.wxFoo.getCode(), "" }, new int[] { java.sql.Types.INTEGER, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR, java.sql.Types.VARCHAR }); LOG.warn("Can not get subsriber info: " + this.wxFoo.getCode()); this.parseSogouJsonSite(this.wxFoo.getOpenId()); } catch (DataAccessException e) { e.printStackTrace(); } } } } catch (IOException e) { // e.printStackTrace(); LOG.error(e.toString()); } }
From source file:info.mikaelsvensson.devtools.sitesearch.SiteSearchPlugin.java
private IndexEntry createIndexEntry(final File file) { try {/*from w w w.j a va 2 s .c o m*/ Document document = Jsoup.parse(file, "UTF-8", "http://invalid.host"); Element contentEl = document.getElementById("contentBox"); if (contentEl == null) { contentEl = document.body(); } if (contentEl != null) { String text = Jsoup.clean(contentEl.html(), Whitelist.simpleText()); Collection<WordCount> wordCount = getWordCount(text); Collection<WordCount> filteredWordCount = filterWordCount(wordCount); return new IndexEntry(document.title(), getRelativePath(getSiteOutputFolder(), file), filteredWordCount); } } catch (IOException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } return null; }
From source file:faescapeplan.FAEscapePlanUI.java
/**
 * Downloads the given journals, writes each one to its own text file and
 * writes a combined {@code journals.json} next to them.
 *
 * <p>Files are placed in {@code <save location>/<user name>/journals}. Each
 * text file is named {@code <journal id>_<title>.txt}, with characters that
 * are unsafe in file names replaced, and contains the title, date and
 * tag-stripped body on separate lines, encoded as UTF-8. A journal whose page
 * lacks the expected date or body element is reported in the text log and
 * skipped; I/O failures for one journal do not stop the others.
 *
 * @param journalList ids of the journals to download
 */
@SuppressWarnings("unchecked")
private void downloadJournals(ArrayList<String> journalList) {
    JSONArray jsonList = new JSONArray();
    String downloadLoc = this.saveLocText.getText();
    // Paths.get joins with the platform separator instead of a hard-coded "\\".
    Path journalDir = Paths.get(downloadLoc, userData.getName(), "journals");
    Path jsonPath = journalDir.resolve("journals.json");

    try {
        Files.createDirectories(journalDir);
        Files.deleteIfExists(jsonPath);
        Files.createFile(jsonPath);
    } catch (IOException ex) {
        Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex);
        JOptionPane.showMessageDialog(this, "A critical IO exception occurred in method: downloadJournals");
    }

    for (String item : journalList) {
        try {
            Map<String, String> jsonMap = new LinkedHashMap<>();
            Document doc = Jsoup.connect("http://www.furaffinity.net/journal/" + item + "/")
                    .cookies(userData.getCookies()).userAgent(USER_AGENT).get();

            // A page without these elements is not a readable journal; skip it
            // instead of letting an unchecked exception abort the whole export.
            if (doc.getElementsByClass("popup_date").isEmpty()
                    || doc.getElementsByClass("journal-body").isEmpty()) {
                updateTextLog("Could not find journal content for journal: " + item);
                continue;
            }

            // split() can return an empty array (e.g. for a title of " -- ").
            String[] titleParts = doc.title().split(" -- ");
            String title = titleParts.length > 0 ? titleParts[0] : "";
            String date = doc.getElementsByClass("popup_date").get(0).attr("title");
            String body = doc.getElementsByClass("journal-body").get(0).html();
            jsonMap.put("title", title);
            jsonMap.put("date", date);
            jsonMap.put("body", body);
            jsonList.add(jsonMap);

            // The title comes from the web page, so it must not be able to
            // introduce path separators or other illegal file-name characters.
            Path journalPath = journalDir.resolve(item + "_" + sanitizeFileName(title) + ".txt");
            String bodyParsed = removeHtmlTags(body);
            String lineSeparator = System.lineSeparator();

            // UTF-8 keeps the text files consistent with journals.json.
            try (java.io.Writer journalWriter = Files.newBufferedWriter(journalPath,
                    java.nio.charset.StandardCharsets.UTF_8)) {
                journalWriter.append(title).append(lineSeparator);
                journalWriter.append(date).append(lineSeparator);
                journalWriter.append(bodyParsed).append(lineSeparator);
            }
        } catch (IOException ex) {
            Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex);
            updateTextLog("An IO Exception occurred while downloading journal: " + item);
        }
    }

    String jsonString = JSONValue.toJSONString(jsonList);
    try {
        Files.write(jsonPath, Arrays.asList(jsonString), StandardOpenOption.WRITE);
    } catch (IOException ex) {
        Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex);
    }
}

/** Replaces path separators, reserved punctuation and control characters with underscores. */
private static String sanitizeFileName(String name) {
    return name.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
}
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
private Document accountHttpPost(Account account, String func) throws IOException, OpacErrorException { // get media list via http POST List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("FUNC", func)); nameValuePairs.add(new BasicNameValuePair("LANG", "de")); nameValuePairs.add(new BasicNameValuePair("BENUTZER", account.getName())); nameValuePairs.add(new BasicNameValuePair("PASSWORD", account.getPassword())); String html = httpPost(opacUrl + "/" + opacDir + "/user.C", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding());/*from w w w .j a v a 2 s. c om*/ Document doc = Jsoup.parse(html); // Error recognition // <title>OPAC Fehler</title> if (doc.title().contains("Fehler") || (doc.select("h2").size() > 0 && doc.select("h2").text().contains("Fehler"))) { String errText = "unknown error"; Elements elTable = doc.select("table"); if (elTable.size() > 0) { errText = elTable.get(0).text(); } throw new OpacErrorException(errText); } if (doc.select("tr td font[color=red]").size() == 1) { throw new OpacErrorException(doc.select("font[color=red]").text()); } if (doc.text().contains("No html file set") || doc.text().contains("Der BIBDIA Server konnte den Auftrag nicht")) { throw new OpacErrorException(stringProvider.getString(StringProvider.WRONG_LOGIN_DATA)); } return doc; }