Example usage of org.jsoup.nodes.Document.html()

Introduction

This page collects example usages of the html() method of org.jsoup.nodes.Document.

Prototype

public String html()

Source Link

Document

Retrieves the element's inner HTML.

Usage

From source file:com.cognifide.aet.job.common.comparators.source.SourceComparator.java

/**
 * Parses the given markup with Jsoup, traverses the resulting document with a
 * {@code MarkupVisitor} and returns the document serialized back to HTML.
 *
 * @param code markup to parse
 * @return the HTML of the parsed document after the traversal
 */
private String formatCodeMarkup(String code) {
    final Document parsed = Jsoup.parse(code);
    new NodeTraversor(new MarkupVisitor()).traverse(parsed);
    return parsed.html();
}

From source file:me.vertretungsplan.parser.DSBLightParser.java

/**
 * Downloads the page at {@code url} and, if it looks like an Untis monitor page, parses it into
 * {@code schedule} and then follows a {@code <meta http-equiv="refresh">} redirect, if present.
 *
 * <p>The recursion stops when the page is not recognised, when it carries no usable refresh
 * target, or when the redirect target equals {@code startUrl}.
 *
 * @param url      page to download
 * @param referer  passed through to {@code httpGet}
 * @param schedule schedule that receives the parsed days
 * @param startUrl URL at which following redirects stops
 * @throws IOException                declared by the called helpers
 * @throws JSONException              declared by the called helpers
 * @throws CredentialInvalidException declared by the called helpers
 */
private void parseDay(String url, Map<String, String> referer, SubstitutionSchedule schedule, String startUrl)
        throws IOException, JSONException, CredentialInvalidException {
    String html = httpGet(url, data.optString(PARAM_ENCODING, null), referer);
    Document doc = Jsoup.parse(html);

    // Locale.ROOT keeps the case-insensitive "untis" check independent of the JVM default
    // locale (a Turkish default locale lower-cases 'I' to a dotless i, which would not match).
    boolean looksLikeUntis = doc.title().toLowerCase(java.util.Locale.ROOT).contains("untis")
            || doc.html().toLowerCase(java.util.Locale.ROOT).contains("untis")
            || doc.select(".mon_list").size() > 0;
    if (!looksLikeUntis) {
        return;
    }

    parseMultipleMonitorDays(schedule, doc, data);

    // first() returns null when nothing matches, so a single select is enough.
    Element meta = doc.select("meta[http-equiv=refresh]").first();
    if (meta == null) {
        return;
    }

    // NOTE(review): the redirect target is taken from the lower-cased attribute value, as in
    // the original code; confirm that the target file names are never case-sensitive.
    String attr = meta.attr("content").toLowerCase(java.util.Locale.ROOT);
    int targetStart = attr.indexOf("url=");
    if (targetStart < 0) {
        // A refresh without "url=" has no target; previously this produced a garbage URL
        // (or a StringIndexOutOfBoundsException for short values).
        return;
    }

    String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(targetStart + 4);
    if (!redirectUrl.equals(startUrl)) {
        parseDay(redirectUrl, referer, schedule, startUrl);
    }
}

From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java

/**
 * ?page??/*from ww w .ja  va2s  .c o m*/
 */
@Override
public void visit(Page page) {
    try {
        String url = page.getWebURL().getURL();

        page.setContentType("text/html; charset=" + gather.getEncoding());
        Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get();

        String title = doc.title();
        if (gather.getTitleExternal() && gather.getTitleRegex() != null
                && gather.getTitleRegex().length() > 0) {
            Elements titleEles = doc.select(gather.getTitleRegex());
            if (!titleEles.isEmpty()) {
                String tempTitle = titleEles.text();
                if (tempTitle != null && tempTitle.length() > 0) {
                    title = tempTitle;
                }
            }
        }

        if (title != null && title.trim().length() > 0) {
            Elements elements = doc.select(matchRegex);
            if (filterRegex != null && filterRegex.trim().length() > 0) {
                elements = elements.not(filterRegex);
            }
            if (!elements.isEmpty()) {
                String subHtml = elements.html();
                Document blockDoc = Jsoup.parse(subHtml);
                String contentText = blockDoc.html();

                if (gather.getRemoveHref()) {
                    Document moveDoc = Jsoup.parse(contentText);
                    Elements moveEles = moveDoc.select("*").not("a");
                    contentText = moveEles.html();
                }
                if (gather.getRemoveHtmlTag())
                    contentText = doc.text();

                if (isLocal) {
                    contentText = doc.text();

                    Boolean isMatcher = true;
                    for (int i = 0; i < keys.length; i++) {
                        Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find();
                        if (!result) {
                            isMatcher = false;
                            break;
                        }
                    }

                    if (isMatcher) {
                        Storage storage = new Storage();
                        storage.setGatherId(gather.getId());
                        storage.setGatherName(gather.getName());
                        storage.setTitle(title);
                        storage.setUrl(url);
                        try {
                            gatherService.addStorage(storage);
                        } catch (Exception e) {
                            logger.error("save storage error : {}", e.getLocalizedMessage());
                        } finally {
                            storage = null;
                        }
                    }
                } else {
                    Content content = new Content();
                    content.setDetail(contentText);
                    content.setPage(1);
                    List<Content> contents = new ArrayList<Content>();
                    contents.add(content);

                    Article article = new Article();
                    article.setTitle(title);
                    article.setContents(contents);

                    articleMainService.addArticleMainByCrawler(article, gather.getChannelId(),
                            CrawlerUtil.USER_NAME);
                }
            }
        }
    } catch (IOException e) {
        logger.warn(e.getLocalizedMessage());
    }
}

From source file:com.amashchenko.struts2.pdfstream.PdfStreamResultTest.java

/** Checks that a plain {@code div} fragment is parsed into the expected complete HTML document. */
@Test
public void testParseContent() throws Exception {
    Assert.assertNotNull(pdfStreamResult);

    final Document parsed = pdfStreamResult.parseContent("<div>text</div>");
    Assert.assertNotNull(parsed);

    // Whitespace is stripped so pretty-printing does not influence the comparison.
    final String expectedHtml = "<html><head></head><body><div>text</div></body></html>";
    final String actualHtml = StringUtils.deleteWhitespace(parsed.html());
    Assert.assertEquals(expectedHtml, actualHtml);
}

From source file:se.vgregion.portal.iframe.controller.CSEditController.java

/**
 * RenderMapping for edit page. The method extracts input elements from a given URL.
 *
 * @param prefs PortletPreferences/*from  w  w w.  ja va  2s .c o m*/
 * @param model Model
 * @return view
 */
@RenderMapping(params = "action=loginExtractor")
public String loginExtractor(PortletPreferences prefs, Model model) {
    PortletConfig portletConfig = PortletConfig.getInstance(prefs);
    model.addAttribute("portletConfig", portletConfig);

    String loginFormUrl = portletConfig.getSrc();
    model.addAttribute("loginformUrl", loginFormUrl);

    try {
        final int timeout = 5000;
        Document doc = new JSoupHelper().invoke(new URL(loginFormUrl), timeout);
        model.addAttribute("loginformContent", doc.html());

        List<Form> loginforms = loginformService.extract(doc);
        model.addAttribute("loginforms", loginforms);

        LoginExtractor loginExtractor = initLoginExtractor(loginforms);
        model.addAttribute("loginExtractor", loginExtractor);
    } catch (Exception e) {
        model.addAttribute("loginformContent", "Failed to lookup page content");
        model.addAttribute("error", e);
        e.printStackTrace();
    }

    return "loginExtractor";
}

From source file:org.brunocvcunha.taskerbox.impl.jobs.MonsterJobSeeker.java

/**
 * Evaluates a single job posting and opens it (via {@code performUnique}) when it passes all
 * filters: title, employer, location, external-apply policy, visa and experience checks on the
 * description, and the minimum score.
 *
 * <p>Postings that were already handled are skipped; rejected postings are logged and recorded
 * as handled so they are not evaluated again.
 *
 * @param jobTitle    title of the posting
 * @param jobEmployer employer of the posting
 * @param location    location of the posting
 * @param jobUrl      URL of the posting; also used as the "already handled" key
 * @return always {@code true}
 * @throws JSONException           declared by the called helpers
 * @throws ClientProtocolException declared by the called helpers
 * @throws IOException             declared by the called helpers
 * @throws URISyntaxException      declared by the called helpers
 */
private boolean handleJob(String jobTitle, String jobEmployer, String location, String jobUrl)
        throws JSONException, ClientProtocolException, IOException, URISyntaxException {

    if (alreadyPerformedAction(jobUrl)) {
        return true;
    }

    String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer;

    if (!considerTitle(jobTitle)) {
        return markJobIgnored("title", headline, jobUrl);
    }
    if (!considerEmployer(jobEmployer)) {
        return markJobIgnored("employer", headline, jobUrl);
    }
    if (!considerLocation(location)) {
        return markJobIgnored("location", headline, jobUrl);
    }

    sleepKeepingInterrupt(1000L);

    HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl);
    String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity);
    Document jobDocument = Jsoup.parse(jobResult);

    // Serialize the document and the description once instead of on every check.
    String pageHtml = jobDocument.html();
    String descriptionHtml = jobDocument.select("div#jobBodyContent").html();

    if (!this.externalApply && !pageHtml.contains("ApplyOnlineUrl: ''")
            && !pageHtml.contains("ApplyOnlineUrl: 'http://my.monster.com")) {
        return markJobIgnored("externalApply", headline, jobUrl);
    }
    if (!considerVisaDescription(descriptionHtml)) {
        return markJobIgnored("visa", headline, jobUrl);
    }
    if (!considerExperienceDescription(descriptionHtml)) {
        return markJobIgnored("exp", headline, jobUrl);
    }

    ScorerResult result = LinkedInJobDBComparer.getScore(descriptionHtml);

    if (result.getScore() < this.requiredScore) {
        return markJobIgnored("scorer",
                result.getScore() + " - " + result.getMatches() + " - " + headline, jobUrl);
    }

    headline = headline + " - " + result.getMatches();

    logInfo(log, "Open --> " + headline);

    performUnique(jobUrl);

    sleepKeepingInterrupt(5000L);

    return true;
}

/** Logs that a job was ignored for {@code reason}, records it as handled and returns {@code true}. */
private boolean markJobIgnored(String reason, String details, String jobUrl) {
    logInfo(log, "-- Ignored [" + reason + "] " + details);
    addAlreadyPerformedAction(jobUrl);
    return true;
}

/** Sleeps for {@code millis}; if interrupted, restores the interrupt flag instead of dropping it. */
private void sleepKeepingInterrupt(long millis) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        // Re-assert the interrupt so callers further up can still observe the request to stop.
        Thread.currentThread().interrupt();
    }
}

From source file:me.vertretungsplan.parser.DSBMobileParser.java

/**
 * Downloads the page at {@code url}, detects which schedule software produced it (Untis,
 * DaVinci or Indiware, by page content or by the configured {@code PARAM_TYPE}) and parses it
 * into {@code v}. Afterwards a {@code <meta http-equiv="refresh">} redirect is followed unless
 * its target has already been visited.
 *
 * @param v        schedule that receives the parsed days
 * @param url      page to download
 * @param usedUrls URLs visited so far; {@code url} is added to it
 * @throws IOException                   if the number of DaVinci headings and tables differs,
 *                                       or from the called helpers
 * @throws JSONException                 declared by the called helpers
 * @throws CredentialInvalidException    declared by the called helpers
 * @throws IncompatibleScheduleException if the page matches none of the known formats
 */
private void loadScheduleFromUrl(SubstitutionSchedule v, String url, List<String> usedUrls)
        throws IOException, JSONException, CredentialInvalidException, IncompatibleScheduleException {
    usedUrls.add(url);
    String html = httpGet(url, data.has(PARAM_ENCODING) ? data.optString(PARAM_ENCODING, null) : "UTF-8");
    Document doc = Jsoup.parse(html);

    // Serialize and lower-case the document once. Locale.ROOT keeps the case-insensitive
    // checks independent of the JVM default locale (e.g. the Turkish dotless i).
    String docHtml = doc.html();
    String lowerHtml = docHtml.toLowerCase(java.util.Locale.ROOT);
    String type = data.optString(PARAM_TYPE, "");

    if (doc.title().toLowerCase(java.util.Locale.ROOT).contains("untis") || lowerHtml.contains("untis")
            || type.equals("untis")) {
        parseMultipleMonitorDays(v, doc, data);
    } else if (lowerHtml.contains("created by davinci") || type.equals("davinci")) {
        Elements titles = doc.select("h2");
        Elements tables = doc.select("h2 + p + table");
        if (titles.size() != tables.size()) {
            // "\u00dc" is the capital U-umlaut; written as an escape so the message survives
            // any source-file encoding.
            throw new IOException("Anzahl \u00dcberschriften != Anzahl Tabellen");
        }
        for (int i = 0; i < titles.size(); i++) {
            SubstitutionScheduleDay day = new SubstitutionScheduleDay();
            String date = titles.get(i).text();
            day.setDateString(date);
            day.setDate(ParserUtils.parseDate(date));
            DaVinciParser.parseDaVinciTable(tables.get(i), v, day, colorProvider);
            v.addDay(day);
        }
    } else if (doc.select(".tdaktionen").size() > 0 || type.equals("indiware")) {
        new IndiwareParser(scheduleData, cookieProvider).parseIndiwarePage(v, docHtml);
    } else if (doc.text().matches(".*F\u00fcr diesen Bereich.*wurde kein Inhalt bereitgestellt\\.")) {
        // "No content was provided for this area" page: nothing to parse and no redirect to
        // follow. The u-umlaut in the pattern is escaped for the same encoding reason as above.
        return;
    } else {
        throw new IncompatibleScheduleException();
    }

    // first() returns null when nothing matches, so a single select is enough.
    Element meta = doc.select("meta[http-equiv=refresh]").first();
    if (meta == null) {
        return;
    }

    // NOTE(review): the redirect target is taken from the lower-cased attribute value, as in
    // the original code; confirm that the target file names are never case-sensitive.
    String attr = meta.attr("content").toLowerCase(java.util.Locale.ROOT);
    int targetStart = attr.indexOf("url=");
    if (targetStart < 0) {
        // A refresh without "url=" has no target to follow.
        return;
    }

    String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(targetStart + 4);
    if (!usedUrls.contains(redirectUrl)) {
        loadScheduleFromUrl(v, redirectUrl, usedUrls);
    }
}

From source file:mailbox.CreationViaEmail.java

/**
 * Rewrites {@code cid:} references in the {@code src} and {@code href} attributes of the given
 * HTML so that they point to the URL of the matching attachment.
 *
 * <p>References whose content id has no entry in {@code attachments} are left untouched.
 *
 * @param html        HTML whose attributes are rewritten
 * @param attachments attachments keyed by content id (the part after {@code cid:})
 * @return the inner HTML of the first {@code body} element of the rewritten document, or the
 *         whole document's HTML if there is no {@code body} element
 */
private static String replaceCidWithAttachments(String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = { "src", "href" };

    for (String attrName : attrNames) {
        for (Element tag : doc.select("*[" + attrName + "]")) {
            String uriString = tag.attr(attrName).trim();

            // Locale.ROOT: the scheme check must not depend on the JVM default locale
            // (a Turkish default locale would lower-case "CID:" to a string with a dotless i).
            if (!uriString.toLowerCase(java.util.Locale.ROOT).startsWith(cidScheme)) {
                continue;
            }

            // Single lookup; also skips entries that are mapped to null.
            Attachment attachment = attachments.get(uriString.substring(cidScheme.length()));
            if (attachment == null) {
                continue;
            }

            Long id = attachment.id;
            tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
        }
    }

    Elements bodies = doc.getElementsByTag("body");
    return bodies.size() > 0 ? bodies.get(0).html() : doc.html();
}

From source file:com.amashchenko.struts2.pdfstream.PdfStreamResultTest.java

/** Checks that an unclosed {@code input} element is serialized as a self-closed tag. */
@Test
public void testParseContentInput() throws Exception {
    Assert.assertNotNull(pdfStreamResult);

    final Document parsed = pdfStreamResult.parseContent("<form><input type='text' name='name'></form>");
    Assert.assertNotNull(parsed);

    // Whitespace is stripped from the actual HTML, hence the fused attributes in the expectation.
    final String expectedHtml =
            "<html><head></head><body><form><inputtype=\"text\"name=\"name\"/></form></body></html>";
    final String actualHtml = StringUtils.deleteWhitespace(parsed.html());
    Assert.assertEquals(expectedHtml, actualHtml);
}

From source file:com.amashchenko.struts2.pdfstream.PdfStreamResultTest.java

/** Checks that {@code script} elements are absent from the parsed document's HTML. */
@Test
public void testParseContentScript() throws Exception {
    Assert.assertNotNull(pdfStreamResult);

    final String input =
            "<head><script>alert('alert 1');<\\/script></head><script>alert('alert 2');</script><div>text</div>";
    final Document parsed = pdfStreamResult.parseContent(input);
    Assert.assertNotNull(parsed);

    // Whitespace is stripped so pretty-printing does not influence the comparison.
    final String expectedHtml = "<html><head></head><body><div>text</div></body></html>";
    final String actualHtml = StringUtils.deleteWhitespace(parsed.html());
    Assert.assertEquals(expectedHtml, actualHtml);
}