List of usage examples for org.jsoup.nodes.Document.getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:com.k42b3.aletheia.response.html.Images.java
private void parseImages(String html) { Document doc = Jsoup.parse(html); Elements image = doc.getElementsByTag("img"); for (Element img : image) { String src = img.attr("src"); if (!src.isEmpty()) { try { URL url = new URL(Util.resolveHref(baseUrl, src)); if (!images.contains(url)) { images.add(url);// ww w . j ava2 s. com } } catch (Exception e) { Aletheia.handleException(e); } } } }
From source file:de.siegmar.securetransfer.controller.MvcTest.java
@Test public void messageWithoutFileWithoutPassword() throws Exception { final String messageToSend = "my secret message"; final String boundary = "------TestBoundary" + UUID.randomUUID(); final MultipartEntityBuilder builder = MultipartEntityBuilder.create().setBoundary(boundary) .addTextBody("expirationDays", "1").addTextBody("message", messageToSend); // Create new message and expect redirect with flash message after store final MvcResult createMessageResult = mockMvc .perform(post("/send").content(ByteStreams.toByteArray(builder.build().getContent())) .contentType(MediaType.MULTIPART_FORM_DATA_VALUE + "; boundary=" + boundary)) .andExpect(status().isFound()).andExpect(redirectedUrlPattern("/send/**")) .andExpect(flash().attribute("message", messageToSend)).andReturn(); // receive data after redirect final String messageStatusUrl = createMessageResult.getResponse().getRedirectedUrl(); final MvcResult messageStatusResult = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("send/message_status")).andReturn(); final SenderMessage senderMessage = (SenderMessage) messageStatusResult.getModelAndView().getModel() .get("senderMessage"); assertNotNull(senderMessage);/*from w w w . j a va2s. 
c o m*/ assertNotNull(senderMessage.getId()); assertNotNull(senderMessage.getReceiverId()); assertNotNull(senderMessage.getExpiration()); assertNull(senderMessage.getReceived()); assertFalse(senderMessage.isPasswordEncrypted()); final String receiveUrl = (String) messageStatusResult.getModelAndView().getModel().get("receiveUrl"); assertNotNull(receiveUrl); final String linkSecret = messageStatusUrl.replaceFirst(".*linkSecret=", ""); HashCode.fromString(linkSecret); // call receiver URL final MvcResult confirmPage = mockMvc.perform(get(receiveUrl)).andExpect(status().isOk()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("receive/message_confirm")).andReturn(); final Document confirmPageDoc = Jsoup.parse(confirmPage.getResponse().getContentAsString()); final String confirmUrl = confirmPageDoc.getElementsByTag("form").attr("action"); // Receive message final MvcResult messageResult = mockMvc.perform(get(confirmUrl).param("linkSecret", linkSecret)) .andExpect(status().isOk()).andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("receive/message")).andReturn(); final DecryptedMessage decryptedMessage = (DecryptedMessage) messageResult.getModelAndView().getModel() .get("decryptedMessage"); assertEquals(messageToSend, decryptedMessage.getMessage()); assertEquals(0, decryptedMessage.getFiles().size()); // Check message is burned mockMvc.perform(get(receiveUrl)).andExpect(status().isNotFound()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("message_not_found")); // Check sender status page final MvcResult messageStatusResult2 = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("send/message_status")).andReturn(); final SenderMessage senderMessage2 = (SenderMessage) messageStatusResult2.getModelAndView().getModel() .get("senderMessage"); assertNotNull(senderMessage2); 
assertNotNull(senderMessage2.getId()); assertNotNull(senderMessage2.getReceiverId()); assertNotNull(senderMessage2.getExpiration()); assertNotNull(senderMessage2.getReceived()); assertFalse(senderMessage.isPasswordEncrypted()); }
From source file:de.siegmar.securetransfer.controller.MvcTest.java
@Test public void messageWithFileWithPassword() throws Exception { final String messageToSend = "my secret message"; final String password = "top secret password"; final String fileContent = "test file content"; final String boundary = "------TestBoundary" + UUID.randomUUID(); final MultipartEntityBuilder builder = MultipartEntityBuilder.create().setBoundary(boundary) .addTextBody("expirationDays", "1").addTextBody("message", messageToSend) .addTextBody("password", password).addBinaryBody("files", fileContent.getBytes(StandardCharsets.UTF_8), ContentType.APPLICATION_OCTET_STREAM, "test.txt"); // Create new message and expect redirect with flash message after store final MvcResult createMessageResult = mockMvc .perform(post("/send").content(ByteStreams.toByteArray(builder.build().getContent())) .contentType(MediaType.MULTIPART_FORM_DATA_VALUE + "; boundary=" + boundary)) .andExpect(status().isFound()).andExpect(redirectedUrlPattern("/send/**")) .andExpect(flash().attribute("message", messageToSend)).andReturn(); // receive data after redirect final String messageStatusUrl = createMessageResult.getResponse().getRedirectedUrl(); final String linkSecret = messageStatusUrl.replaceFirst(".*linkSecret=", ""); HashCode.fromString(linkSecret);/*w w w . j a va2 s . 
c o m*/ final MvcResult messageStatusResult = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("send/message_status")).andReturn(); final SenderMessage senderMessage = (SenderMessage) messageStatusResult.getModelAndView().getModel() .get("senderMessage"); assertNotNull(senderMessage); assertNotNull(senderMessage.getId()); assertNotNull(senderMessage.getReceiverId()); assertNotNull(senderMessage.getExpiration()); assertNull(senderMessage.getReceived()); assertTrue(senderMessage.isPasswordEncrypted()); final String receiveUrl = (String) messageStatusResult.getModelAndView().getModel().get("receiveUrl"); assertNotNull(receiveUrl); // call receiver URL final MvcResult confirmPage = mockMvc.perform(get(receiveUrl)).andExpect(status().isOk()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("receive/message_ask_password")).andReturn(); final Document confirmPageDoc = Jsoup.parse(confirmPage.getResponse().getContentAsString()); final String passwordUrl = confirmPageDoc.getElementsByTag("form").attr("action"); // Receive message final MvcResult messageResult = mockMvc .perform(post(passwordUrl).param("linkSecret", linkSecret).param("password", password)) .andExpect(status().isOk()).andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("receive/message")).andReturn(); final DecryptedMessage decryptedMessage = (DecryptedMessage) messageResult.getModelAndView().getModel() .get("decryptedMessage"); assertEquals(messageToSend, decryptedMessage.getMessage()); assertEquals(1, decryptedMessage.getFiles().size()); final DecryptedFile file = decryptedMessage.getFiles().get(0); final String fileId = file.getId(); final String fileKey = file.getKeyHex(); // Download file final MvcResult downloadResult = mockMvc .perform(get("/receive/file/{id}/{key}", fileId, fileKey).sessionAttr("iv_file_" + fileId, 
file.getKeyIv().getIv())) .andExpect(request().asyncStarted()) //.andExpect(request().asyncResult("Deferred result")) .andExpect(status().isOk()).andExpect(content().contentType("application/octet-stream")) .andReturn(); downloadResult.getAsyncResult(); assertEquals(fileContent, downloadResult.getResponse().getContentAsString()); // Check message is burned mockMvc.perform(get(receiveUrl)).andExpect(status().isNotFound()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("message_not_found")); // Check file is burned mockMvc.perform(get("/receive/file/{id}/{key}", fileId, fileKey).sessionAttr("iv_file_" + fileId, file.getKeyIv().getIv())).andExpect(status().isNotFound()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("message_not_found")); // Check sender status page final MvcResult messageStatusResult2 = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk()) .andExpect(content().contentType("text/html;charset=UTF-8")) .andExpect(view().name("send/message_status")).andReturn(); final SenderMessage senderMessage2 = (SenderMessage) messageStatusResult2.getModelAndView().getModel() .get("senderMessage"); assertNotNull(senderMessage2); assertNotNull(senderMessage2.getId()); assertNotNull(senderMessage2.getReceiverId()); assertNotNull(senderMessage2.getExpiration()); assertNotNull(senderMessage2.getReceived()); assertTrue(senderMessage.isPasswordEncrypted()); }
From source file:DownloadDialog.java
/******************************************************************** * Method: storeTerms//from www. ja v a 2s. c o m * Purpose: store available terms to use /*******************************************************************/ public void storeTerms() { try { // Default terms termsName = new ArrayList<String>(); termsValue = new ArrayList<String>(); // Create client for terms DefaultHttpClient client = new DefaultHttpClient(); HttpGet dynamicGet = new HttpGet("http://jweb.kettering.edu/cku1/xhwschedule.P_SelectSubject"); // Execute post call HttpResponse response = client.execute(dynamicGet); Document doc = Jsoup.parse(HTMLParser.parse(response)); Elements options = doc.getElementsByTag("option"); // Store every option for (Element option : options) { // First term option if (!option.text().contains("None")) { this.termsName.add(option.text()); this.termsValue.add(option.val()); } } //client.close(); } // Catch all exceptions catch (Exception e) { // Print track, set false, return false e.printStackTrace(); } }
From source file:MySpaceParser.java
private void parseSingleFile(File file) throws Exception { Document htmlFile = null; try {/*from w w w . ja v a 2s. c o m*/ htmlFile = Jsoup.parse(file, "ISO-8859-1"); } catch (Exception e) { e.printStackTrace(); } // Elements parents =htmlFile.getElementsByClass("cover"); Elements parents = htmlFile.getElementsByTag("section"); String title = "*^*"; String artist = "*^*"; String url = "*^*"; String imageurl = "*^*"; String pageTitle = "*^*"; String description = "*^*"; String songid = "*^*"; String genre = "*^*"; String album = "*^*"; String year = "*^*"; boolean isVideo = false; Elements titles = htmlFile.getElementsByTag("title"); Elements metas = htmlFile.getElementsByTag("meta"); for (Element meta : metas) { String name = meta.attr("name"); String prop = meta.attr("property"); if (prop.equals("og:video")) { System.out.println(); url = meta.attr("content"); String arr[] = url.split("/"); songid = arr[arr.length - 1]; title = arr[arr.length - 2]; artist = arr[arr.length - 4]; isVideo = true; } if (name.equals("description")) { // System.out.println(); description = meta.attr("content"); } } for (Element Pagetitle : titles) { pageTitle = Pagetitle.html(); // System.out.println(pageTitle); break; } if (isVideo) { SongData s = new SongData(title, url, album, artist, year, genre, imageurl); s.setPagetitle(pageTitle); s.setDescrption(description); index.put(songid, s); return; } if (parents.isEmpty() && !isVideo) { return; } else { // boolean isVideo = false; titles = htmlFile.getElementsByTag("title"); metas = htmlFile.getElementsByTag("meta"); for (Element meta : metas) { String name = meta.attr("name"); String prop = meta.attr("property"); if (prop.equals("og:video")) { System.out.println(); url = meta.attr("content"); String arr[] = url.split("/"); songid = arr[arr.length - 1]; isVideo = true; } if (name.equals("description")) { // System.out.println(); description = meta.attr("content"); } } for (Element Pagetitle : titles) { pageTitle = Pagetitle.html(); // 
System.out.println(pageTitle); break; } for (Element e : parents) { if (e.attr("id").equals("song")) { Elements e1 = e.children(); for (Element e2 : e1) { if (e2.attr("id").equals("actions")) { Elements e3 = e2.children(); int count = 0; for (Element e4 : e3) { if (count == 1) { songid = e4.attr("data-song-id"); album = e4.attr("data-album-title"); title = e4.attr("data-title"); artist = e4.attr("data-artist-name"); url = "www.myspace.com" + e4.attr("data-url"); genre = e4.attr("data-genre-name"); imageurl = e4.attr("data-image-url"); SongData s = new SongData(title, url, album, artist, year, genre, imageurl); s.setPagetitle(pageTitle); s.setDescrption(description); index.put(songid, s); } count++; } // System.out.println(); } } // System.out.println(e.attr("id")); } } //System.out.println(); } }
From source file:GIST.IzbirkomExtractor.TableExtractor.java
public void processHTMLfile(File input_html) throws IOException, TableExtractorException, CloneNotSupportedException, SQLException, ResultSinkException { logger.info("Start processing " + input_html); Document doc = Jsoup.parse(input_html, "UTF-8"); Elements tables = doc.getElementsByTag("table"); /* count of parseable tables found */ int tables_found = 0; /* determine raion name */ String raion_name = extractRaionFromFileName(input_html.getName()); //System.err.println(raion_name); // TODO: inflect raion name in case /* searches for a table that has " . -" in its very 1st cell */ for (Element table : tables) { Elements rows = table.getElementsByTag("tr"); boolean firstRow = true; row_loop: for (Element row : rows) { Elements cells = row.getElementsByTag("td"); if (firstRow) { //System.err.println(row.text()); if (isParsableTable(row)) { firstRow = false; logger.info("Processing table #" + ++tables_found + " in " + input_html); } else break row_loop; }//from w w w . j a v a 2 s . c o m if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) continue row_loop; /* skip the row if it looks like a table header */ /* skip rows with all cells empty */ boolean emptyRow = true; for (Element cell : cells) emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty(); if (emptyRow) continue; int i_cell = 0; Element station_id = null; Element address_field = null; Element org_address = null; /* address of the ??? 
*/ Element station_address = null; for (Element cell : cells) { switch (i_cell) { case 0: station_id = cell; break; case 1: address_field = cell; break; case 2: org_address = cell; break; case 3: station_address = cell; default: break; } i_cell++; } if (station_id == null) throw new TableExtractorException("Polling station ID not found", row, input_html); if (address_field == null) throw new TableExtractorException("Address list not found", row, input_html); /* extract int from poll station id */ int psid; try { psid = Integer.valueOf(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", "")); } catch (NumberFormatException e) { Exception te = new TableExtractorException("Failed to parse polling station ID >" + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html); logger.severe(te.getMessage() + "; rest of " + input_html + " ignored."); return; } /* extraction from HTML completely finished, now we work only with the addresses in the text form */ extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field), cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address)); } } if (tables_found == 0) logger.severe("No parsable tables found in " + input_html); resultSink.commit(); logger.info("" + tables_found + " table(s) processed in " + input_html); }
From source file:org.sbs.goodcrawler.fetcher.FetchWorker.java
/** * @param url//w w w . java 2s. c o m * @desc */ public void fetchPage(WebURL url) { PageFetchResult result = null; try { if (null != url && StringUtils.isNotBlank(url.getURL())) { // ?? if (fetchFilter(url.getURL())) { result = fetcher.fetchHeader(url); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { onIgnored(url); return; } if (statusCode != HttpStatus.SC_OK) { onFailed(url); } else { Page page = new Page(url); pendingUrls.processedSuccess(); if (!result.fetchContent(page)) { onFailed(url); return; } if (!parser.parse(page, url.getURL())) { onFailed(url); return; } // ?? if (extractFilter(url.getURL())) { pendingPages.addElement(page); } // depth if (url.getDepth() > conf.getMaxDepthOfCrawling() && conf.getMaxDepthOfCrawling() != -1) { return; } // ???Url?Url Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); // ?? if (fetchFilter(linkHref) && !bloomfilterHelper.exist(linkHref)) { WebURL purl = new WebURL(); purl.setURL(linkHref); purl.setJobName(conf.jobName); purl.setDepth((short) (url.getDepth() + 1)); if (purl.getDepth() > conf.getMaxDepthOfCrawling() && conf.getMaxDepthOfCrawling() != -1) return; try { if (!pendingUrls.addElement(purl, 1000)) { FileUtils.writeStringToFile(new File("status/_urls.good"), url.getURL() + "\n", true); } } catch (QueueException e) { log.error(e.getMessage()); } } } } } } else { onIgnored(url); } } } catch (Exception e) { onFailed(url); } catch (QueueException e) { onFailed(url); } finally { if (null != result) result.discardContentIfNotConsumed(); } }
From source file:tkbautobooking.BookingSystem.java
private void praseBookingToken() throws Exception { Document doc = Jsoup.parse(BookingPageHTML); Element script = doc.getElementsByTag("script").last(); String str = script.toString().substring(script.toString().indexOf("access_token")); str = str.substring(str.indexOf("\"") + 1); str = str.substring(0, str.indexOf("\"")); booking_hidden_token = str;//www. j a va 2 s.co m }
From source file:net.GoTicketing.GoTicketing.java
/** * ??// ww w . j av a 2 s . co m * @throws Exception */ private void praseVoiceCaptchaSrc() throws Exception { Document doc = Jsoup.parse(TicketingPageHTML); Element voc = doc.getElementsByTag("audio").last(); if (voc == null) throw new Exception("Can't get voice captcha source !"); //out.println(host + voc.attr("src").substring(1)); VoiceCaptchaSrc = host + voc.attr("src").substring(1); }