Example usage for org.jsoup.nodes Document getElementsByTag

List of usage examples for org.jsoup.nodes Document getElementsByTag

Introduction

On this page you can find example usages of org.jsoup.nodes Document getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:com.k42b3.aletheia.response.html.Images.java

/**
 * Collects the URLs of all {@code <img>} elements found in the given HTML.
 *
 * Every non-empty {@code src} attribute is resolved against {@code baseUrl}
 * via {@code Util.resolveHref} and added to {@code images} unless an equal
 * URL is already present. Any failure while resolving or creating a URL is
 * passed to {@code Aletheia.handleException} and the remaining images are
 * still processed.
 *
 * @param html the HTML markup to scan
 */
private void parseImages(String html) {
    for (Element imgTag : Jsoup.parse(html).getElementsByTag("img")) {
        String source = imgTag.attr("src");

        // Images without a src attribute carry nothing to resolve.
        if (source.isEmpty()) {
            continue;
        }

        try {
            URL resolved = new URL(Util.resolveHref(baseUrl, source));
            boolean alreadyKnown = images.contains(resolved);

            if (!alreadyKnown) {
                images.add(resolved);
            }
        } catch (Exception e) {
            Aletheia.handleException(e);
        }
    }
}

From source file:de.siegmar.securetransfer.controller.MvcTest.java

/**
 * End-to-end flow for a plain text message without attachment and without
 * password: create the message, inspect the sender status page, open the
 * receiver link, confirm and read the message, then verify that the message
 * can no longer be retrieved and that the sender status shows it as received.
 */
@Test
public void messageWithoutFileWithoutPassword() throws Exception {
    final String messageToSend = "my secret message";

    final String boundary = "------TestBoundary" + UUID.randomUUID();
    final MultipartEntityBuilder builder = MultipartEntityBuilder.create().setBoundary(boundary)
            .addTextBody("expirationDays", "1").addTextBody("message", messageToSend);

    // Create new message and expect redirect with flash message after store
    final MvcResult createMessageResult = mockMvc
            .perform(post("/send").content(ByteStreams.toByteArray(builder.build().getContent()))
                    .contentType(MediaType.MULTIPART_FORM_DATA_VALUE + "; boundary=" + boundary))
            .andExpect(status().isFound()).andExpect(redirectedUrlPattern("/send/**"))
            .andExpect(flash().attribute("message", messageToSend)).andReturn();

    // receive data after redirect
    final String messageStatusUrl = createMessageResult.getResponse().getRedirectedUrl();

    final MvcResult messageStatusResult = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("send/message_status")).andReturn();

    final SenderMessage senderMessage = (SenderMessage) messageStatusResult.getModelAndView().getModel()
            .get("senderMessage");

    // Before the receiver opened the link: message exists, not yet received
    assertNotNull(senderMessage);
    assertNotNull(senderMessage.getId());
    assertNotNull(senderMessage.getReceiverId());
    assertNotNull(senderMessage.getExpiration());
    assertNull(senderMessage.getReceived());
    assertFalse(senderMessage.isPasswordEncrypted());

    final String receiveUrl = (String) messageStatusResult.getModelAndView().getModel().get("receiveUrl");

    assertNotNull(receiveUrl);

    // The link secret is everything after "linkSecret=" in the status URL.
    // The parse result is discarded; the call is presumably here only so the
    // test fails if the secret is not a well-formed hash string.
    final String linkSecret = messageStatusUrl.replaceFirst(".*linkSecret=", "");
    HashCode.fromString(linkSecret);

    // call receiver URL
    final MvcResult confirmPage = mockMvc.perform(get(receiveUrl)).andExpect(status().isOk())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("receive/message_confirm")).andReturn();

    // The confirmation page contains a form whose action is the URL to read the message
    final Document confirmPageDoc = Jsoup.parse(confirmPage.getResponse().getContentAsString());
    final String confirmUrl = confirmPageDoc.getElementsByTag("form").attr("action");

    // Receive message
    final MvcResult messageResult = mockMvc.perform(get(confirmUrl).param("linkSecret", linkSecret))
            .andExpect(status().isOk()).andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("receive/message")).andReturn();

    final DecryptedMessage decryptedMessage = (DecryptedMessage) messageResult.getModelAndView().getModel()
            .get("decryptedMessage");

    assertEquals(messageToSend, decryptedMessage.getMessage());
    assertEquals(0, decryptedMessage.getFiles().size());

    // Check message is burned
    mockMvc.perform(get(receiveUrl)).andExpect(status().isNotFound())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("message_not_found"));

    // Check sender status page
    final MvcResult messageStatusResult2 = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("send/message_status")).andReturn();

    final SenderMessage senderMessage2 = (SenderMessage) messageStatusResult2.getModelAndView().getModel()
            .get("senderMessage");

    // After the receiver read the message: same checks, but now marked as received.
    // All assertions target the freshly loaded senderMessage2 (the original
    // re-checked the stale senderMessage in the last assertion).
    assertNotNull(senderMessage2);
    assertNotNull(senderMessage2.getId());
    assertNotNull(senderMessage2.getReceiverId());
    assertNotNull(senderMessage2.getExpiration());
    assertNotNull(senderMessage2.getReceived());
    assertFalse(senderMessage2.isPasswordEncrypted());
}

From source file:de.siegmar.securetransfer.controller.MvcTest.java

/**
 * End-to-end flow for a password-protected message with one attached file:
 * create the message, inspect the sender status page, open the receiver
 * link, submit the password, read the message, download the file, then
 * verify that message and file can no longer be retrieved and that the
 * sender status shows the message as received.
 */
@Test
public void messageWithFileWithPassword() throws Exception {
    final String messageToSend = "my secret message";
    final String password = "top secret password";
    final String fileContent = "test file content";

    final String boundary = "------TestBoundary" + UUID.randomUUID();
    final MultipartEntityBuilder builder = MultipartEntityBuilder.create().setBoundary(boundary)
            .addTextBody("expirationDays", "1").addTextBody("message", messageToSend)
            .addTextBody("password", password).addBinaryBody("files",
                    fileContent.getBytes(StandardCharsets.UTF_8), ContentType.APPLICATION_OCTET_STREAM,
                    "test.txt");

    // Create new message and expect redirect with flash message after store
    final MvcResult createMessageResult = mockMvc
            .perform(post("/send").content(ByteStreams.toByteArray(builder.build().getContent()))
                    .contentType(MediaType.MULTIPART_FORM_DATA_VALUE + "; boundary=" + boundary))
            .andExpect(status().isFound()).andExpect(redirectedUrlPattern("/send/**"))
            .andExpect(flash().attribute("message", messageToSend)).andReturn();

    // receive data after redirect
    final String messageStatusUrl = createMessageResult.getResponse().getRedirectedUrl();

    // The link secret is everything after "linkSecret=" in the status URL.
    // The parse result is discarded; the call is presumably here only so the
    // test fails if the secret is not a well-formed hash string.
    final String linkSecret = messageStatusUrl.replaceFirst(".*linkSecret=", "");
    HashCode.fromString(linkSecret);

    final MvcResult messageStatusResult = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("send/message_status")).andReturn();

    final SenderMessage senderMessage = (SenderMessage) messageStatusResult.getModelAndView().getModel()
            .get("senderMessage");

    // Before the receiver opened the link: message exists, not yet received
    assertNotNull(senderMessage);
    assertNotNull(senderMessage.getId());
    assertNotNull(senderMessage.getReceiverId());
    assertNotNull(senderMessage.getExpiration());
    assertNull(senderMessage.getReceived());
    assertTrue(senderMessage.isPasswordEncrypted());

    final String receiveUrl = (String) messageStatusResult.getModelAndView().getModel().get("receiveUrl");

    assertNotNull(receiveUrl);

    // call receiver URL
    final MvcResult confirmPage = mockMvc.perform(get(receiveUrl)).andExpect(status().isOk())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("receive/message_ask_password")).andReturn();

    // The password page contains a form whose action is the URL to submit the password to
    final Document confirmPageDoc = Jsoup.parse(confirmPage.getResponse().getContentAsString());
    final String passwordUrl = confirmPageDoc.getElementsByTag("form").attr("action");

    // Receive message
    final MvcResult messageResult = mockMvc
            .perform(post(passwordUrl).param("linkSecret", linkSecret).param("password", password))
            .andExpect(status().isOk()).andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("receive/message")).andReturn();

    final DecryptedMessage decryptedMessage = (DecryptedMessage) messageResult.getModelAndView().getModel()
            .get("decryptedMessage");

    assertEquals(messageToSend, decryptedMessage.getMessage());
    assertEquals(1, decryptedMessage.getFiles().size());

    final DecryptedFile file = decryptedMessage.getFiles().get(0);
    final String fileId = file.getId();
    final String fileKey = file.getKeyHex();

    // Download file
    final MvcResult downloadResult = mockMvc
            .perform(get("/receive/file/{id}/{key}", fileId, fileKey).sessionAttr("iv_file_" + fileId,
                    file.getKeyIv().getIv()))
            .andExpect(request().asyncStarted())
            //.andExpect(request().asyncResult("Deferred result"))
            .andExpect(status().isOk()).andExpect(content().contentType("application/octet-stream"))
            .andReturn();

    // Wait for the asynchronous download to finish before reading the body
    downloadResult.getAsyncResult();
    assertEquals(fileContent, downloadResult.getResponse().getContentAsString());

    // Check message is burned
    mockMvc.perform(get(receiveUrl)).andExpect(status().isNotFound())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("message_not_found"));

    // Check file is burned
    mockMvc.perform(get("/receive/file/{id}/{key}", fileId, fileKey).sessionAttr("iv_file_" + fileId,
            file.getKeyIv().getIv())).andExpect(status().isNotFound())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("message_not_found"));

    // Check sender status page
    final MvcResult messageStatusResult2 = mockMvc.perform(get(messageStatusUrl)).andExpect(status().isOk())
            .andExpect(content().contentType("text/html;charset=UTF-8"))
            .andExpect(view().name("send/message_status")).andReturn();

    final SenderMessage senderMessage2 = (SenderMessage) messageStatusResult2.getModelAndView().getModel()
            .get("senderMessage");

    // After the receiver read the message: same checks, but now marked as received.
    // All assertions target the freshly loaded senderMessage2 (the original
    // re-checked the stale senderMessage in the last assertion).
    assertNotNull(senderMessage2);
    assertNotNull(senderMessage2.getId());
    assertNotNull(senderMessage2.getReceiverId());
    assertNotNull(senderMessage2.getExpiration());
    assertNotNull(senderMessage2.getReceived());
    assertTrue(senderMessage2.isPasswordEncrypted());
}

From source file:DownloadDialog.java

/**
 * Downloads the subject-selection page and stores the available terms.
 *
 * Resets {@code termsName} and {@code termsValue} to empty lists, then adds
 * the text and value of every {@code <option>} element whose text does not
 * contain "None". Any failure is printed to stderr and leaves the lists with
 * whatever had been collected up to that point.
 */
public void storeTerms() {

    DefaultHttpClient client = null;

    try {

        // Default terms
        termsName = new ArrayList<String>();
        termsValue = new ArrayList<String>();

        // Create client for terms
        client = new DefaultHttpClient();
        HttpGet dynamicGet = new HttpGet("http://jweb.kettering.edu/cku1/xhwschedule.P_SelectSubject");

        // Execute GET call and parse the returned page
        HttpResponse response = client.execute(dynamicGet);
        Document doc = Jsoup.parse(HTMLParser.parse(response));
        Elements options = doc.getElementsByTag("option");

        // Store every option except the "None" placeholder
        for (Element option : options) {
            if (!option.text().contains("None")) {
                termsName.add(option.text());
                termsValue.add(option.val());
            }
        }
    }

    // Catch all exceptions
    catch (Exception e) {

        // Best effort: report the problem and keep what was collected so far
        e.printStackTrace();
    }

    // Always release the connections held by the client
    finally {
        if (client != null) {
            client.getConnectionManager().shutdown();
        }
    }
}

From source file:MySpaceParser.java

/**
 * Parses one saved HTML page and adds the song(s) it describes to
 * {@code index}, keyed by song id.
 *
 * Two page layouts are handled:
 * <ul>
 *   <li>Video pages: a {@code <meta property="og:video">} tag is present.
 *       Song id, title and artist are taken from the path segments of its
 *       {@code content} URL and a single entry is stored.</li>
 *   <li>Song pages: for every {@code <section id="song">}, the second child
 *       of each child element with {@code id="actions"} supplies the song
 *       data through its {@code data-*} attributes.</li>
 * </ul>
 * Fields that cannot be determined keep the placeholder {@code "*^*"}.
 *
 * If the file cannot be parsed the stack trace is printed and the file is
 * skipped (previously this path ended in a NullPointerException).
 *
 * @param file the HTML file to parse, read as ISO-8859-1
 * @throws Exception declared for callers; not thrown for parse failures
 */
private void parseSingleFile(File file) throws Exception {

    Document htmlFile;
    try {
        htmlFile = Jsoup.parse(file, "ISO-8859-1");
    } catch (Exception e) {
        // Nothing can be extracted without a document; report and skip.
        e.printStackTrace();
        return;
    }

    String title = "*^*";
    String artist = "*^*";
    String url = "*^*";
    String imageurl = "*^*";
    String pageTitle = "*^*";
    String description = "*^*";
    String songid = "*^*";
    String genre = "*^*";
    String album = "*^*";
    String year = "*^*";
    boolean isVideo = false;

    for (Element meta : htmlFile.getElementsByTag("meta")) {
        String name = meta.attr("name");
        String prop = meta.attr("property");
        if (prop.equals("og:video")) {
            System.out.println();
            url = meta.attr("content");
            // NOTE(review): assumes the URL has at least four '/'-separated
            // segments (.../artist/x/title/id); shorter URLs still throw.
            String[] arr = url.split("/");
            songid = arr[arr.length - 1];
            title = arr[arr.length - 2];
            artist = arr[arr.length - 4];
            isVideo = true;
        }
        if (name.equals("description")) {
            description = meta.attr("content");
        }
    }

    // Only the first <title> element is used.
    Elements titles = htmlFile.getElementsByTag("title");
    if (!titles.isEmpty()) {
        pageTitle = titles.get(0).html();
    }

    if (isVideo) {
        SongData s = new SongData(title, url, album, artist, year, genre, imageurl);
        s.setPagetitle(pageTitle);
        s.setDescrption(description);
        index.put(songid, s);
        return;
    }

    // Not a video page: description and page title gathered above are reused;
    // a second scan of the meta tags would find nothing new because any
    // og:video tag would already have caused the return above.
    for (Element section : htmlFile.getElementsByTag("section")) {
        if (!section.attr("id").equals("song")) {
            continue;
        }
        for (Element child : section.children()) {
            if (!child.attr("id").equals("actions")) {
                continue;
            }
            // The song data lives on the second child of the actions element.
            Elements actions = child.children();
            if (actions.size() < 2) {
                continue;
            }
            Element songNode = actions.get(1);

            songid = songNode.attr("data-song-id");
            album = songNode.attr("data-album-title");
            title = songNode.attr("data-title");
            artist = songNode.attr("data-artist-name");
            url = "www.myspace.com" + songNode.attr("data-url");
            genre = songNode.attr("data-genre-name");
            imageurl = songNode.attr("data-image-url");

            SongData s = new SongData(title, url, album, artist, year, genre, imageurl);
            s.setPagetitle(pageTitle);
            s.setDescrption(description);
            index.put(songid, s);
        }
    }
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
 * Extracts polling-station addresses from every parsable table of an HTML file.
 *
 * A table is processed only if {@code isParsableTable} accepts its first row.
 * Rows without {@code <td>} cells, rows that look like a header and rows
 * whose cells are all empty are skipped. For each remaining row the first
 * four cells are read as station id, address list, organisation address and
 * station address and handed to {@code extractAddressesFromText}.
 *
 * If a station id cannot be parsed as an integer, the error is logged and
 * the rest of the file is ignored (the method returns without committing).
 * Otherwise {@code resultSink.commit()} is called after all tables.
 *
 * @param input_html the HTML file to process, read as UTF-8
 * @throws TableExtractorException if a data row has no address cell
 */
public void processHTMLfile(File input_html) throws IOException, TableExtractorException,
            CloneNotSupportedException, SQLException, ResultSinkException {

        logger.info("Start processing " + input_html);

        Document doc = Jsoup.parse(input_html, "UTF-8");
        Elements tables = doc.getElementsByTag("table");

        /* count of parseable tables found */
        int tables_found = 0;

        /* determine raion name */
        String raion_name = extractRaionFromFileName(input_html.getName());

        // TODO: inflect raion name in  case

        /* searches for a table that has " . -" in its very 1st cell */
        for (Element table : tables) {
            Elements rows = table.getElementsByTag("tr");
            boolean firstRow = true;

            for (Element row : rows) {
                Elements cells = row.getElementsByTag("td");

                if (firstRow) {
                    /* a table whose first row is not parsable is skipped entirely */
                    if (!isParsableTable(row))
                        break;
                    firstRow = false;
                    logger.info("Processing table #" + ++tables_found + " in " + input_html);
                }

                /* a row without <td> cells (e.g. only <th>) has nothing to extract;
                   cells.first() would be null for it */
                if (cells.isEmpty())
                    continue;

                if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                        " . -") < 3)
                    continue; /* skip the row if it looks like a table header */

                /* skip rows with all cells empty */
                boolean emptyRow = true;
                for (Element cell : cells)
                    emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty();
                if (emptyRow)
                    continue;

                /* the first four cells, in order; missing trailing cells stay null */
                int cell_count = cells.size();
                Element station_id = cells.get(0); /* present: cells is not empty here */
                Element address_field = cell_count > 1 ? cells.get(1) : null;
                Element org_address = cell_count > 2 ? cells.get(2) : null;
                Element station_address = cell_count > 3 ? cells.get(3) : null;

                if (address_field == null)
                    throw new TableExtractorException("Address list not found", row, input_html);

                /* extract int from poll station id */
                int psid;
                try {
                    psid = Integer.parseInt(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", ""));
                } catch (NumberFormatException e) {
                    Exception te = new TableExtractorException("Failed to parse polling station ID >"
                            + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html);
                    logger.severe(te.getMessage() + "; rest of " + input_html + " ignored.");
                    return;
                }

                /* extraction from HTML completely finished, now we work only with the addresses in the text form */
                extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field),
                        cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address));
            }
        }

        if (tables_found == 0)
            logger.severe("No parsable tables found in " + input_html);
        resultSink.commit();

        logger.info("" + tables_found + " table(s) processed in " + input_html);
    }

From source file:org.sbs.goodcrawler.fetcher.FetchWorker.java

/**
 * Fetches a single URL, parses it and queues the links found on the page.
 *
 * Nothing happens for a null URL or a blank address. URLs rejected by
 * {@code fetchFilter} and pages reported as too big are passed to
 * {@code onIgnored}; a non-OK status, a failed content fetch, a failed parse
 * or any thrown exception is passed to {@code onFailed}. A successfully
 * parsed page is added to {@code pendingPages} when {@code extractFilter}
 * accepts its URL. Unless the depth limit is exceeded, every {@code <a>}
 * link that passes {@code fetchFilter} and is not known to the bloom filter
 * is offered to {@code pendingUrls} one level deeper.
 *
 * @param url the URL to fetch
 */
public void fetchPage(WebURL url) {
    PageFetchResult result = null;
    try {
        if (null == url || !StringUtils.isNotBlank(url.getURL())) {
            return;
        }

        // URLs that do not pass the fetch filter are ignored
        if (!fetchFilter(url.getURL())) {
            onIgnored(url);
            return;
        }

        result = fetcher.fetchHeader(url);
        int statusCode = result.getStatusCode();
        if (statusCode == CustomFetchStatus.PageTooBig) {
            onIgnored(url);
            return;
        }
        if (statusCode != HttpStatus.SC_OK) {
            onFailed(url);
            return;
        }

        Page page = new Page(url);
        pendingUrls.processedSuccess();
        if (!result.fetchContent(page)) {
            onFailed(url);
            return;
        }
        if (!parser.parse(page, url.getURL())) {
            onFailed(url);
            return;
        }

        // Only pages accepted by the extract filter are queued for extraction
        if (extractFilter(url.getURL())) {
            pendingPages.addElement(page);
        }

        // depth: -1 means no limit
        if (url.getDepth() > conf.getMaxDepthOfCrawling() && conf.getMaxDepthOfCrawling() != -1) {
            return;
        }

        // Collect the outgoing links of the page and queue the new ones
        String pageHtml = new String(page.getContentData(), page.getContentCharset());
        String pageBaseUrl = urlUtils.getBaseUrl(page.getWebURL().getURL());
        Elements anchors = Jsoup.parse(pageHtml, pageBaseUrl).getElementsByTag("a");
        for (Element anchor : anchors) {
            String target = anchor.absUrl("href");

            // Skip links that are filtered out or already known
            if (!fetchFilter(target) || bloomfilterHelper.exist(target)) {
                continue;
            }

            WebURL purl = new WebURL();
            purl.setURL(target);
            purl.setJobName(conf.jobName);
            purl.setDepth((short) (url.getDepth() + 1));
            if (purl.getDepth() > conf.getMaxDepthOfCrawling()
                    && conf.getMaxDepthOfCrawling() != -1) {
                return;
            }

            try {
                boolean queued = pendingUrls.addElement(purl, 1000);
                if (!queued) {
                    // NOTE(review): this records the parent page's URL (url), not the
                    // rejected link (purl) - confirm that this is intended.
                    FileUtils.writeStringToFile(new File("status/_urls.good"),
                            url.getURL() + "\n", true);
                }
            } catch (QueueException e) {
                log.error(e.getMessage());
            }
        }
    } catch (Exception e) {
        onFailed(url);
    } catch (QueueException e) {
        // NOTE(review): if QueueException is a subclass of Exception this handler
        // can never be reached after the one above - confirm the class hierarchy.
        onFailed(url);
    } finally {
        if (null != result) {
            result.discardContentIfNotConsumed();
        }
    }
}

From source file:tkbautobooking.BookingSystem.java

/**
 * Extracts the booking token from {@code BookingPageHTML} and stores it in
 * {@code booking_hidden_token}.
 *
 * The token is searched in the last {@code <script>} element of the page:
 * starting at the first occurrence of {@code access_token}, the text between
 * the next two double quotes is taken.
 *
 * @throws Exception if the page has no script element, or the script does
 *                   not contain {@code access_token} followed by a quoted value
 */
private void praseBookingToken() throws Exception {

    Document doc = Jsoup.parse(BookingPageHTML);
    Element script = doc.getElementsByTag("script").last();
    if (script == null)
        throw new Exception("Can't get booking token script !");

    String scriptText = script.toString();
    int keyPos = scriptText.indexOf("access_token");
    if (keyPos < 0)
        throw new Exception("Can't find access_token in booking page !");

    // The value is delimited by the first two double quotes after the key
    int openQuote = scriptText.indexOf('"', keyPos);
    int closeQuote = openQuote < 0 ? -1 : scriptText.indexOf('"', openQuote + 1);
    if (closeQuote < 0)
        throw new Exception("Can't parse access_token value in booking page !");

    booking_hidden_token = scriptText.substring(openQuote + 1, closeQuote);
}

From source file:net.GoTicketing.GoTicketing.java

/**
 * Determines the URL of the voice captcha and stores it in
 * {@code VoiceCaptchaSrc}.
 *
 * The source is read from the {@code src} attribute of the last
 * {@code <audio>} element of {@code TicketingPageHTML}; its first character
 * is dropped and the remainder is appended to {@code host}.
 *
 * @throws Exception if the page has no audio element or the element has no
 *                   {@code src} attribute
 */
private void praseVoiceCaptchaSrc() throws Exception {
    Document doc = Jsoup.parse(TicketingPageHTML);
    Element voc = doc.getElementsByTag("audio").last();

    // attr() yields an empty string for a missing attribute, which would
    // otherwise make substring(1) fail with an index error
    if (voc == null || voc.attr("src").isEmpty())
        throw new Exception("Can't get voice captcha source !");

    //out.println(host + voc.attr("src").substring(1));
    VoiceCaptchaSrc = host + voc.attr("src").substring(1);
}