List of usage examples for org.jsoup.nodes.Document#getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:com.osw.enderecamento.util.EnderecamentoParseUtil.java
/** * extract info about a address/*from w w w. j a va 2 s . c o m*/ * * @param html * @return address list */ public List<Enderecamento> parseEnderecamento(String html) { Document docHtml = Jsoup.parse(html); Elements tagTr = docHtml.getElementsByTag("tr"); List<Enderecamento> enderecamentos = new LinkedList<Enderecamento>(); for (Element element : tagTr) { if (element.getElementsByTag("td").size() > 0) { Elements tahTd = element.getElementsByTag("td"); Enderecamento enderecamento = new Enderecamento(); enderecamento.setLogradouro(tahTd.get(0).text()); enderecamento.setBairro(tahTd.get(1).text()); enderecamento.setCidade(tahTd.get(2).text()); enderecamento.setCep(tahTd.get(3).text()); enderecamentos.add(enderecamento); } } return enderecamentos; }
From source file:org.sonatype.nexus.proxy.maven.routing.internal.scrape.NginxIndexScraper.java
/**
 * Builds the anchor element for a "../" link by parsing a fixed HTML fragment
 * with the page URL as base URI.
 *
 * @param page page whose URL is used as the base URI of the parsed fragment
 * @return the first {@code <a>} element of the parsed fragment
 */
@Override
protected Element getParentDirectoryElement(final Page page) {
    final String parentLinkMarkup = "<a href=\"../\">../</a>";
    final Document fragment = Jsoup.parseBodyFragment(parentLinkMarkup, page.getUrl());
    return fragment.getElementsByTag("a").first();
}
From source file:popo.defcon.MsgMeCDC.java
void Parse() { String input = readPage();/* ww w. j a v a2 s .co m*/ if (input == null) { System.out.println("Error connecting to Internet"); return; } String time; Document cdc = Jsoup.parse(input); Elements notices = cdc.getElementsByTag("tbody"); Elements alerts = notices.get(1).getElementsByTag("tr"); alerts.remove(0); System.out.println("Current Old Time = " + oldtime); for (Element node : alerts) { Elements content = node.getElementsByTag("td"); time = content.last().text(); if (convertTime(time).compareTo(convertTime(oldtime)) <= 0) { MsgMeCDC.oldtime = alerts.get(0).getElementsByTag("td").last().text(); return; } System.out.println("Current notice time :" + convertTime(time)); Logger.getLogger(MsgMeCDC.class.getName()).log(Level.INFO, "Current notice time :" + convertTime(time)); //for (Element text : content) { // System.out.println(text.text()); //} String smsTitle = content.get(1).text(); //String smsCompanyName = content.get(2).text(); String smsNoticeTime = content.get(4).text(); String preSMStext = content.get(3).text(); String randomtext = "Placement/ Internship Form Description Files"; int start = preSMStext.indexOf(randomtext) + randomtext.length() + 1; int twilio = "Sent from your Twilio trial account - ".length(); int end = 150 - (smsTitle.length() + smsNoticeTime.length() + twilio + 2); String smsContent = preSMStext.substring(start, start + end); String sms = smsTitle + '\n' + smsNoticeTime + '\n' + smsContent; System.out.println(sms); sendSMS(sms); Logger.getLogger(MsgMeCDC.class.getName()).log(Level.INFO, "SMS sent: " + sms); Logger.getLogger(MsgMeCDC.class.getName()).log(Level.INFO, "Length of SMS is " + (sms.length() + twilio)); System.out.println("\nLength of SMS is " + (sms.length() + twilio)); System.out.println(""); } //System.out.println(notices.toString()); }
From source file:com.crosstreelabs.cognitio.gumshoe.format.HtmlFormatHandler.java
@Override public void processLinks(final Visit visit) { try {//from w w w.j a va 2s .c o m String charset = StringUtils.defaultIfBlank(visit.contentCharset, "UTF-8"); Document doc = Jsoup.parse(visit.contentStream, charset, visit.result.location); Elements anchors = doc.getElementsByTag("a"); for (Element e : anchors) { String url = stripURLFragmentIdentifier(e.attr("abs:href")); String uri = stripURLFragmentIdentifier(e.attr("href").toLowerCase()); if (uri.isEmpty() || url.isEmpty() || uri.contains("javascript:") || uri.contains("mailto:") || uri.contains("@")) { continue; } visit.discoveredLinks.add(URL.parse(url).toString()); // TODO Need to add the link text as the title } visit.contentStream.reset(); } catch (GalimatiasParseException | IOException ex) { throw new RuntimeException(ex); } }
From source file:com.jaspersoft.android.jaspermobile.data.repository.report.page.HtmlPageCreator.java
/**
 * Exports the requested report page as HTML and injects a custom script into it.
 *
 * <p>The page range and format come from {@code pageRequest}. The script is
 * rendered from the {@code rest-report.js} asset as a Mustache template, fed with
 * the execution id, the zero-based page index and the server edition flag, and is
 * appended to the document's {@code <head>}. The resulting document is returned
 * as UTF-8 bytes.
 *
 * @return the page content together with the export's "final" flag
 * @throws Exception if the export, the asset lookup or the parsing fails; a
 *         non-numeric range results in a {@link NumberFormatException}
 */
@Override
@NonNull
public ReportPage create() throws Exception {
    String range = pageRequest.getRange();
    PageRange pageRange = PageRange.parse(range);
    ReportExportOptions options = ReportExportOptions.builder()
            .withFormat(ReportFormat.valueOf(pageRequest.getFormat()))
            .withPageRange(pageRange)
            .build();
    ReportExport export = execution.export(options);
    ReportExportOutput output = export.download();

    InputStream reportExport = output.getStream();
    try {
        // Opened inside the outer try so that a failure to open the asset still
        // closes the export stream obtained above.
        InputStream customScript = context.getAssets().open("rest-report.js");
        try {
            // NOTE(review): the reader uses the platform default charset - confirm
            // this matches the encoding of the asset.
            Reader reader = new InputStreamReader(customScript);
            Template template = Mustache.compiler().compile(reader);

            int pageIndex = Integer.parseInt(range) - 1;

            Map<String, Object> data = new HashMap<>();
            data.put("jasperPrintName", execution.getExecutionId());
            data.put("pageIndex", pageIndex);
            data.put("isPro", server.isProEdition());
            String customJs = template.execute(data);

            DataNode indexHeadScript = DataNode.createFromEncoded("<script>" + customJs + "</script>", "");
            Document document = Jsoup.parse(reportExport, Charset.defaultCharset().name(), "");
            Elements head = document.getElementsByTag("head");
            head.append(indexHeadScript.toString());

            byte[] exportContent = document.toString().getBytes(Charset.forName("UTF-8"));
            return new ReportPage(exportContent, output.isFinal());
        } finally {
            IOUtils.closeQuietly(customScript);
        }
    } finally {
        IOUtils.closeQuietly(reportExport);
    }
}
From source file:web.analyzer.utils.Utils.java
public boolean hasLoginForm(Document doc) { Elements formElements = doc.getElementsByTag("form"); for (Element formElement : formElements) { String frmElementAsString = formElement.toString().toLowerCase().replace("'", "\""); Pattern inputTextTagPattern = Pattern.compile("type=\"text\""); Matcher inputTextTagMatcher = inputTextTagPattern.matcher(frmElementAsString); int inputTextTagCount = 0; while (inputTextTagMatcher.find()) { inputTextTagCount++;//w ww . jav a 2 s.co m } Pattern inputEmailTagPattern = Pattern.compile("type=\"email\""); Matcher inputEmailTagMatcher = inputEmailTagPattern.matcher(frmElementAsString); int inputEmailTagCount = 0; while (inputEmailTagMatcher.find()) { inputEmailTagCount++; } Pattern inputPasswordTagPattern = Pattern.compile("type=\"password\""); Matcher inputPasswordTagMatcher = inputPasswordTagPattern.matcher(frmElementAsString); int inputPasswordTagCount = 0; while (inputPasswordTagMatcher.find()) { inputPasswordTagCount++; } if ((inputTextTagCount == 1 || inputEmailTagCount == 1) && inputPasswordTagCount == 1) { return true; } } return false; }
From source file:mailbox.CreationViaEmail.java
/**
 * Rewrites {@code cid:} references in {@code src} and {@code href} attributes to
 * the URL of the matching attachment.
 *
 * <p>An attribute is rewritten only when its trimmed value starts with
 * {@code cid:} (case-insensitively) and the remainder is a key of
 * {@code attachments}; all other attributes are left untouched.
 *
 * @param html        HTML markup to process
 * @param attachments attachments keyed by content id
 * @return the inner HTML of the first {@code <body>} element, or the whole
 *         document's HTML when there is no body element
 */
private static String replaceCidWithAttachments(String html, Map<String, Attachment> attachments) {
    final Document parsed = Jsoup.parse(html);

    for (String attribute : new String[] { "src", "href" }) {
        for (Element node : parsed.select("*[" + attribute + "]")) {
            final String reference = node.attr(attribute).trim();
            if (reference.toLowerCase().startsWith("cid:")) {
                final String contentId = reference.substring("cid:".length());
                if (attachments.containsKey(contentId)) {
                    Long id = attachments.get(contentId).id;
                    node.attr(attribute, controllers.routes.AttachmentApp.getFile(id).url());
                }
            }
        }
    }

    final Elements bodies = parsed.getElementsByTag("body");
    return bodies.size() > 0 ? bodies.get(0).html() : parsed.html();
}
From source file:com.mycompany.crawlertest.GrabManager.java
public void go(URL start) throws InterruptedException, IOException { // stay within same site urlBase = start.toString().replaceAll("(.*//.*/).*", "$1"); StopWatch stopWatch = new StopWatch(); stopWatch.start();// w ww .jav a 2 s . co m submitNewURL(start, 0); while (checkPageGrabs()) ; stopWatch.stop(); System.out.println("Found " + masterList.size() + " urls"); System.out.println("in " + stopWatch.getTime() / 1000 + " seconds"); for (String url : Uttils.URLS) { System.out.println(url); if (!url.contains("http")) { url = start.getProtocol() + "://" + start.getHost() + url; //System.out.println("com.mycompany.crawlertest.GrabManager.go() : " + ); } try { Document document = Jsoup.parse(new URL(url), TIMEOUT); if (document.getElementsByTag("h1") != null && document.getElementsByTag("h1").size() != 0) { Elements videoTag = document.getElementsByTag("iframe"); if (videoTag != null && videoTag.size() != 0) { if (videoTag.get(0).hasAttr("allowfullscreen")) { String tag = videoTag.get(0).toString(); System.out.println(document.getElementsByTag("h1").get(0).text() + " ___ " + tag); } } } } catch (Exception EX) { System.out.println("ERROR : " + EX.getMessage()); } //processHeaders(document.select("h1")); } }
From source file:io.gravitee.management.service.impl.EmailServiceImpl.java
private String addResourcesInMessage(final MimeMessageHelper mailMessage, final String htmlText) throws Exception { final Document document = Jsoup.parse(htmlText); final List<String> resources = new ArrayList<>(); final Elements imageElements = document.getElementsByTag("img"); resources.addAll(//from w w w. j a v a 2 s . c o m imageElements.stream().filter(imageElement -> imageElement.hasAttr("src")).map(imageElement -> { final String src = imageElement.attr("src"); imageElement.attr("src", "cid:" + src); return src; }).collect(Collectors.toList())); final String html = document.html(); mailMessage.setText(html, true); for (final String res : resources) { final FileSystemResource templateResource = new FileSystemResource(new File(templatesPath, res)); mailMessage.addInline(res, templateResource, MimetypesFileTypeMap.getDefaultFileTypeMap().getContentType(res)); } return html; }
From source file:com.aestasit.markdown.slidery.converters.TextTemplateConverter.java
/**
 * Returns the slide elements of the document.
 *
 * @param slidesDocument parsed slides document
 * @return all {@code <section>} elements of the document
 */
protected Elements getSlideCollection(Document slidesDocument) {
    final Elements sections = slidesDocument.getElementsByTag("section");
    return sections;
}