List of usage examples for org.jsoup.nodes.Document.html()
public String html()
From source file:com.cognifide.aet.job.common.comparators.source.SourceComparator.java
/**
 * Parses the given markup with Jsoup, walks the resulting document with a
 * {@code MarkupVisitor}, and returns the document serialized back to HTML.
 *
 * @param code markup to parse
 * @return the HTML of the parsed document after the visitor has been applied
 */
private String formatCodeMarkup(String code) {
    final Document parsedCode = Jsoup.parse(code);
    new NodeTraversor(new MarkupVisitor()).traverse(parsedCode);
    return parsedCode.html();
}
From source file:me.vertretungsplan.parser.DSBLightParser.java
private void parseDay(String url, Map<String, String> referer, SubstitutionSchedule schedule, String startUrl) throws IOException, JSONException, CredentialInvalidException { String html = httpGet(url, data.optString(PARAM_ENCODING, null), referer); Document doc = Jsoup.parse(html); if (doc.title().toLowerCase().contains("untis") || doc.html().toLowerCase().contains("untis") || doc.select(".mon_list").size() > 0) { parseMultipleMonitorDays(schedule, doc, data); if (doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!redirectUrl.equals(startUrl)) { parseDay(redirectUrl, referer, schedule, startUrl); }//from w ww.ja v a2 s. c om } } }
From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java
/** * ?page??/*from ww w .ja va2s .c o m*/ */ @Override public void visit(Page page) { try { String url = page.getWebURL().getURL(); page.setContentType("text/html; charset=" + gather.getEncoding()); Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get(); String title = doc.title(); if (gather.getTitleExternal() && gather.getTitleRegex() != null && gather.getTitleRegex().length() > 0) { Elements titleEles = doc.select(gather.getTitleRegex()); if (!titleEles.isEmpty()) { String tempTitle = titleEles.text(); if (tempTitle != null && tempTitle.length() > 0) { title = tempTitle; } } } if (title != null && title.trim().length() > 0) { Elements elements = doc.select(matchRegex); if (filterRegex != null && filterRegex.trim().length() > 0) { elements = elements.not(filterRegex); } if (!elements.isEmpty()) { String subHtml = elements.html(); Document blockDoc = Jsoup.parse(subHtml); String contentText = blockDoc.html(); if (gather.getRemoveHref()) { Document moveDoc = Jsoup.parse(contentText); Elements moveEles = moveDoc.select("*").not("a"); contentText = moveEles.html(); } if (gather.getRemoveHtmlTag()) contentText = doc.text(); if (isLocal) { contentText = doc.text(); Boolean isMatcher = true; for (int i = 0; i < keys.length; i++) { Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find(); if (!result) { isMatcher = false; break; } } if (isMatcher) { Storage storage = new Storage(); storage.setGatherId(gather.getId()); storage.setGatherName(gather.getName()); storage.setTitle(title); storage.setUrl(url); try { gatherService.addStorage(storage); } catch (Exception e) { logger.error("save storage error : {}", e.getLocalizedMessage()); } finally { storage = null; } } } else { Content content = new Content(); content.setDetail(contentText); content.setPage(1); List<Content> contents = new ArrayList<Content>(); contents.add(content); Article article = new Article(); article.setTitle(title); article.setContents(contents); 
articleMainService.addArticleMainByCrawler(article, gather.getChannelId(), CrawlerUtil.USER_NAME); } } } } catch (IOException e) { logger.warn(e.getLocalizedMessage()); } }
From source file:com.amashchenko.struts2.pdfstream.PdfStreamResultTest.java
/**
 * Checks that {@code parseContent} wraps a bare {@code <div>} fragment in a full
 * html/head/body skeleton (compared with all whitespace removed).
 */
@Test
public void testParseContent() throws Exception {
    Assert.assertNotNull(pdfStreamResult);

    final String expectedHtml = "<html><head></head><body><div>text</div></body></html>";
    final Document parsed = pdfStreamResult.parseContent("<div>text</div>");

    Assert.assertNotNull(parsed);
    final String actualHtml = StringUtils.deleteWhitespace(parsed.html());
    Assert.assertEquals(expectedHtml, actualHtml);
}
From source file:se.vgregion.portal.iframe.controller.CSEditController.java
/** * RenderMapping for edit page. The method extracts input elements from a given URL. * * @param prefs PortletPreferences/*from w w w. ja va 2s .c o m*/ * @param model Model * @return view */ @RenderMapping(params = "action=loginExtractor") public String loginExtractor(PortletPreferences prefs, Model model) { PortletConfig portletConfig = PortletConfig.getInstance(prefs); model.addAttribute("portletConfig", portletConfig); String loginFormUrl = portletConfig.getSrc(); model.addAttribute("loginformUrl", loginFormUrl); try { final int timeout = 5000; Document doc = new JSoupHelper().invoke(new URL(loginFormUrl), timeout); model.addAttribute("loginformContent", doc.html()); List<Form> loginforms = loginformService.extract(doc); model.addAttribute("loginforms", loginforms); LoginExtractor loginExtractor = initLoginExtractor(loginforms); model.addAttribute("loginExtractor", loginExtractor); } catch (Exception e) { model.addAttribute("loginformContent", "Failed to lookup page content"); model.addAttribute("error", e); e.printStackTrace(); } return "loginExtractor"; }
From source file:org.brunocvcunha.taskerbox.impl.jobs.MonsterJobSeeker.java
private boolean handleJob(String jobTitle, String jobEmployer, String location, String jobUrl) throws JSONException, ClientProtocolException, IOException, URISyntaxException { if (alreadyPerformedAction(jobUrl)) { return true; }/* ww w . java2 s .c om*/ String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer; if (!considerTitle(jobTitle)) { logInfo(log, "-- Ignored [title] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerEmployer(jobEmployer)) { logInfo(log, "-- Ignored [employer] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerLocation(location)) { logInfo(log, "-- Ignored [location] " + headline); addAlreadyPerformedAction(jobUrl); return true; } try { Thread.sleep(1000L); } catch (InterruptedException e) { e.printStackTrace(); } HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl); String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity); Document jobDocument = Jsoup.parse(jobResult); Elements elDescription = jobDocument.select("div#jobBodyContent"); if (!jobDocument.html().contains("ApplyOnlineUrl: ''") && !jobDocument.html().contains("ApplyOnlineUrl: 'http://my.monster.com") && !this.externalApply) { logInfo(log, "-- Ignored [externalApply] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerVisaDescription(elDescription.html())) { logInfo(log, "-- Ignored [visa] " + headline); addAlreadyPerformedAction(jobUrl); return true; } if (!considerExperienceDescription(elDescription.html())) { logInfo(log, "-- Ignored [exp] " + headline); addAlreadyPerformedAction(jobUrl); return true; } ScorerResult result = LinkedInJobDBComparer.getScore(elDescription.html()); if (result.getScore() < this.requiredScore) { logInfo(log, "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - " + headline); addAlreadyPerformedAction(jobUrl); return true; } headline = headline + " - " + 
result.getMatches(); logInfo(log, "Open --> " + headline); // logInfo(log, elDescription.html()); performUnique(jobUrl); try { Thread.sleep(5000L); } catch (InterruptedException e) { e.printStackTrace(); } return true; }
From source file:me.vertretungsplan.parser.DSBMobileParser.java
private void loadScheduleFromUrl(SubstitutionSchedule v, String url, List<String> usedUrls) throws IOException, JSONException, CredentialInvalidException, IncompatibleScheduleException { usedUrls.add(url);//from w w w .j a va 2 s . c o m String html = httpGet(url, data.has(PARAM_ENCODING) ? data.optString(PARAM_ENCODING, null) : "UTF-8"); Document doc = Jsoup.parse(html); if (doc.title().toLowerCase().contains("untis") || doc.html().toLowerCase().contains("untis") || data.optString(PARAM_TYPE, "").equals("untis")) { parseMultipleMonitorDays(v, doc, data); } else if (doc.html().toLowerCase().contains("created by davinci") || data.optString(PARAM_TYPE, "").equals("davinci")) { Elements titles = doc.select("h2"); Elements tables = doc.select("h2 + p + table"); if (titles.size() != tables.size()) throw new IOException("Anzahl berschriften != Anzahl Tabellen"); for (int i = 0; i < titles.size(); i++) { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); String date = titles.get(i).text(); day.setDateString(date); day.setDate(ParserUtils.parseDate(date)); DaVinciParser.parseDaVinciTable(tables.get(i), v, day, colorProvider); v.addDay(day); } } else if (doc.select(".tdaktionen").size() > 0 || data.optString(PARAM_TYPE, "").equals("indiware")) { new IndiwareParser(scheduleData, cookieProvider).parseIndiwarePage(v, doc.html()); } else if (doc.text().matches(".*Fr diesen Bereich.*wurde kein Inhalt bereitgestellt\\.")) { return; } else { throw new IncompatibleScheduleException(); } if (doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!usedUrls.contains(redirectUrl)) { loadScheduleFromUrl(v, redirectUrl, usedUrls); } } }
From source file:mailbox.CreationViaEmail.java
/**
 * Replaces {@code cid:} references in the {@code src} and {@code href} attributes of an HTML
 * fragment with the URL of the matching attachment.
 *
 * <p>A reference is left untouched when it does not start with {@code cid:} (compared
 * case-insensitively) or when {@code attachments} has no entry for its content id.
 *
 * @param html        HTML to rewrite
 * @param attachments attachments keyed by content id (the part after {@code cid:})
 * @return the inner HTML of the first {@code <body>} element of the rewritten document, or the
 *         whole document's HTML if it has no body element
 */
private static String replaceCidWithAttachments(String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = { "src", "href" };

    for (String attrName : attrNames) {
        for (Element tag : doc.select("*[" + attrName + "]")) {
            String uriString = tag.attr(attrName).trim();
            // Locale.ROOT: with a Turkish default locale "CID:" would lowercase to a dotless-i
            // form and the reference would be missed.
            if (!uriString.toLowerCase(java.util.Locale.ROOT).startsWith(cidScheme)) {
                continue;
            }

            // Single lookup; a missing or null entry means there is nothing to link to.
            Attachment attachment = attachments.get(uriString.substring(cidScheme.length()));
            if (attachment == null) {
                continue;
            }

            Long id = attachment.id;
            tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
        }
    }

    Elements bodies = doc.getElementsByTag("body");
    if (bodies.size() > 0) {
        return bodies.get(0).html();
    } else {
        return doc.html();
    }
}
From source file:com.amashchenko.struts2.pdfstream.PdfStreamResultTest.java
@Test public void testParseContentInput() throws Exception { Assert.assertNotNull(pdfStreamResult); final Document doc = pdfStreamResult.parseContent("<form><input type='text' name='name'></form>"); Assert.assertNotNull(doc);// ww w . j av a 2 s . co m Assert.assertEquals( "<html><head></head><body><form><inputtype=\"text\"name=\"name\"/></form></body></html>", StringUtils.deleteWhitespace(doc.html())); }
From source file:com.amashchenko.struts2.pdfstream.PdfStreamResultTest.java
/**
 * Checks that {@code parseContent} drops {@code <script>} elements from both the head and the
 * body, leaving only the remaining markup (compared with all whitespace removed).
 */
@Test
public void testParseContentScript() throws Exception {
    Assert.assertNotNull(pdfStreamResult);

    final String expectedHtml = "<html><head></head><body><div>text</div></body></html>";
    final String markupWithScripts =
            "<head><script>alert('alert 1');<\\/script></head><script>alert('alert 2');</script><div>text</div>";
    final Document parsed = pdfStreamResult.parseContent(markupWithScripts);

    Assert.assertNotNull(parsed);
    final String actualHtml = StringUtils.deleteWhitespace(parsed.html());
    Assert.assertEquals(expectedHtml, actualHtml);
}