Example usage for org.jsoup.nodes.Document#body()

Introduction

This page collects example usages of the body() method of org.jsoup.nodes.Document.

Prototype

public Element body()

Source Link

Document

Accessor to the document's body element.

Usage

From source file:com.dajodi.scandic.JSoupScraper.java

/**
 * Collects the {@code name}/{@code value} attribute pairs of every {@code <input>}
 * element inside the element with id {@code aspnetForm}.
 *
 * @param inStream HTML document to parse; decoded as UTF-8
 * @return map from input name to input value ({@code ""} when the input has no value)
 * @throws ScandicHtmlException if parsing fails or the form element is missing
 */
@Override
public Map<String, String> scrapeFormInputFields(InputStream inStream) {

    try {
        Document doc = Jsoup.parse(inStream, HTTP.UTF_8, "");

        Element form = doc.body().getElementById("aspnetForm");
        if (form == null) {
            // Fail with a descriptive cause instead of an anonymous NullPointerException;
            // the catch below still wraps it in ScandicHtmlException for callers.
            throw new IllegalStateException("Form 'aspnetForm' not found in document");
        }

        Map<String, String> inputMap = new HashMap<String, String>();

        for (Element input : form.getElementsByTag("input")) {
            // attr() never returns null (it yields "" for a missing attribute), so the
            // presence of a name has to be tested with hasAttr().
            if (input.hasAttr("name")) {
                inputMap.put(input.attr("name"), input.attr("value"));
            } else {
                //TODO: remove me
                Log.d("Something weird");
            }
        }

        doc.empty();
        return inputMap;
    } catch (Exception e) {
        throw new ScandicHtmlException(e);
    }
}

From source file:dev.maisentito.suca.commands.EnitCommandHandler.java

/**
 * Fetches the WordReference page under the {@code enit} path for the space-joined
 * arguments and responds with the text of the selected result row, after removing
 * any {@code .tooltip} elements from it.
 *
 * @param event message event to respond to
 * @param args  words to look up; joined with single spaces to form the URL suffix
 * @throws Throwable if the page cannot be fetched or processed
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    String lookupUrl = "http://www.wordreference.com/enit/" + StringUtils.join(args, ' ');
    Document page = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    Elements resultRow = page.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Strip tooltip markup so its text does not end up in the reply.
    resultRow.select(".tooltip").remove();

    String reply = resultRow.text().trim().replace("\n", "");
    event.respond(reply);
}

From source file:dev.maisentito.suca.commands.ItenCommandHandler.java

/**
 * Fetches the WordReference page under the {@code iten} path for the space-joined
 * arguments and responds with the text of the selected result row, after removing
 * any {@code .tooltip} elements from it.
 *
 * @param event message event to respond to
 * @param args  words to look up; joined with single spaces to form the URL suffix
 * @throws Throwable if the page cannot be fetched or processed
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    String lookupUrl = "http://www.wordreference.com/iten/" + StringUtils.join(args, ' ');
    Document page = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    Elements resultRow = page.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Strip tooltip markup so its text does not end up in the reply.
    resultRow.select(".tooltip").remove();

    String reply = resultRow.text().trim().replace("\n", "");
    event.respond(reply);
}

From source file:alliance.docs.DocumentationTest.java

/**
 * Checks that every anchor link collected from the documentation files has a matching
 * element id in the same file.
 *
 * <p>Links are taken from elements whose markup contains no {@code ':'} and has text
 * between {@code HREF_ANCHOR} and {@code CLOSE}; anchors are the text between {@code ID}
 * and {@code CLOSE}. Both are keyed as {@code <file name>#<value>}.
 *
 * @throws IOException        if the documentation directory or a file cannot be read
 * @throws URISyntaxException declared by the original signature
 */
@Test
public void testBrokenAnchorsPresent() throws IOException, URISyntaxException {
    List<Path> docs;
    // Files.list keeps the directory open until the returned stream is closed.
    try (java.util.stream.Stream<Path> listing = Files.list(getPath())) {
        docs = listing.filter(f -> f.toString().endsWith(HTML_DIRECTORY))
                .collect(Collectors.toList());
    }

    Set<String> links = new HashSet<>();
    Set<String> anchors = new HashSet<>();

    for (Path path : docs) {
        Document doc = Jsoup.parse(path.toFile(), "UTF-8", EMPTY_STRING);

        String thisDoc = StringUtils.substringAfterLast(path.toString(), File.separator);
        for (Element element : doc.body().getAllElements()) {
            // Serialise the element once; it is inspected several times below.
            String markup = element.toString();

            String target = StringUtils.substringBetween(markup, HREF_ANCHOR, CLOSE);
            if (target != null && !markup.contains(":")) {
                links.add(thisDoc + "#" + target);
            }

            // Skip elements without an id so no bogus "<doc>#null" anchor is recorded.
            String id = StringUtils.substringBetween(markup, ID, CLOSE);
            if (id != null) {
                anchors.add(thisDoc + "#" + id);
            }
        }
    }
    links.removeAll(anchors);
    assertThat("Anchors missing section reference: " + links.toString(), links.isEmpty());
}

From source file:jodtemplate.pptx.style.HtmlStylizer.java

/**
 * Parses {@code text} as HTML with Jsoup and hands the parsed body to {@code process}
 * together with the given run/paragraph property elements and slide.
 *
 * @throws JODTemplateException if {@code process} fails with an {@link IOException}
 */
@Override
public List<Element> stylize(final String text, final Element arPr, final Element apPr, final Slide slide)
        throws JODTemplateException {
    try {
        final Document parsedHtml = Jsoup.parse(text);
        return process(parsedHtml.body(), arPr, apPr, slide);
    } catch (IOException ioFailure) {
        throw new JODTemplateException("Stylizer error", ioFailure);
    }
}

From source file:com.crawler.app.run.JellyfishCrawlerSiteMNMN.java

/**
 * This function is called when a page is fetched and ready to be processed
 * by your program.
 *
 * <p>Opens the crawler database connection, echoes the visited URL and, for HTML pages
 * whose lower-cased URL starts with {@code http://monngonmoingay.com/} and ends with
 * {@code /}, parses the HTML and echoes the URL again. No fields are extracted or
 * persisted yet.
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // NOTE(review): connection settings are hard-coded and createConn runs on every
    // visited page; consider moving them to configuration.
    String host = "127.0.0.1";
    String port = "3306";
    String dbName = "crawler";
    String dbUser = "root";
    String dbPwd = "";
    MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd);
    System.out.println("\n URL visit: " + url);

    // Locale.ROOT keeps the prefix/suffix comparison independent of the default locale.
    String href = url.toLowerCase(java.util.Locale.ROOT);
    if (href.startsWith("http://monngonmoingay.com/") && href.endsWith("/")) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String html = htmlParseData.getHtml();

            // The second argument of Jsoup.parse(String, String) is the base URI used to
            // resolve relative links, not a charset name.
            Document doc = Jsoup.parse(html, url);
            Element body = doc.body();

            // TODO: select the site-specific fields from 'body' and store them with
            // MysqlCrawler.getInstance().insertJFHRContents(...); the previous draft of
            // this extraction was entirely commented out.
            System.out.println("\n Url : " + url);
        }
    }

    logger.debug("=============");
}

From source file:com.crawler.app.run.JellyfishCrawlerSiteCB.java

/**
 * This function is called when a page is fetched and ready to be processed
 * by your program.
 *
 * <p>Opens the crawler database connection and echoes the visited URL. For HTML pages
 * whose lower-cased URL starts with {@code http://careerbuilder.vn/vi/tim-viec-lam} and
 * ends with {@code .html}, job and company fields are selected from the page, falling
 * back to two alternative container selectors when no job name is found, and the job
 * title is printed. Storing the extracted fields is currently disabled.
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // NOTE(review): connection settings are hard-coded and createConn runs on every
    // visited page; consider moving them to configuration.
    String host = "127.0.0.1";
    String port = "3306";
    String dbName = "crawler";
    String dbUser = "root";
    String dbPwd = "";
    MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd);
    System.out.println("\n URL visit: " + url);

    // Locale.ROOT keeps the prefix/suffix comparison independent of the default locale.
    String href = url.toLowerCase(java.util.Locale.ROOT);
    if (href.startsWith("http://careerbuilder.vn/vi/tim-viec-lam") && href.endsWith(".html")) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String html = htmlParseData.getHtml();

            // The second argument of Jsoup.parse(String, String) is the base URI used to
            // resolve relative links, not a charset name.
            Document doc = Jsoup.parse(html, url);
            Element body = doc.body();

            // First container selector.
            Elements listDetail = body.select("section div[class=MyJobLeft]");
            String jobUrl = url;
            String jobName = listDetail.select("h1").html();
            String companyName = listDetail.select("div[class=tit_company]").html();
            String jobLocation = listDetail.select(
                    "div[class=box2Detail] ul[class=DetailJobNew] p[class=fl_left] b[itemprop=jobLocation] a")
                    .html();
            String companyAddress = listDetail
                    .select("div[class=box1Detail] p[class=TitleDetailNew] label[itemprop=addressLocality]")
                    .html();
            String companyContact = listDetail
                    .select("div[class=box1Detail] p[class=TitleDetailNew] label strong").html();
            String companyPhone = listDetail
                    .select("div[class=col-lg-6 col-md-6 col-sm-12] p[id=company_contact]").html();
            String companyWebsite = listDetail
                    .select("div[class=col-lg-6 col-md-6 col-sm-12] p a[id=company_website]").html();

            if (listDetail.isEmpty() || jobName.isEmpty()) {
                // Second container selector.
                listDetail = body.select("div[id=main_content] div[id=main_content_right]");
                jobName = listDetail.select("h1 p").html();
                companyName = listDetail.select("div[class=intro_company] div[class=title_into] p").html();
                jobLocation = listDetail
                        .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html();
                if (listDetail.isEmpty() || jobName.isEmpty()) {
                    // Third container selector.
                    listDetail = body.select("div[id=main_content] div[class=content_right]");
                    jobName = listDetail.select("h1").html();
                    companyName = listDetail
                            .select("div[class=intro_company] div[class=title_into] p[class=title_comp]")
                            .html();
                    Elements gCompanyWebList = listDetail
                            .select("div[class=intro_company] div[class=title_into] p");
                    // size() > 1 already implies the list is not empty.
                    if (gCompanyWebList.size() > 1) {
                        companyWebsite = gCompanyWebList.get(1).html();
                    }
                    jobLocation = listDetail
                            .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html();
                }
            }

            // Final job title: prefer the link inside the heading, then a paragraph, then
            // the heading itself, all relative to whichever container was kept above.
            jobName = listDetail.select("h1 a").html();
            if (jobName.isEmpty()) {
                jobName = listDetail.select("h1 p").html();
            }
            if (jobName.isEmpty()) {
                jobName = listDetail.select("h1").html();
            }

            System.out.println("\n Title : " + jobName);

            // Persistence is disabled for now. When re-enabled, guard the call with a
            // try/catch and pass siteID = 3:
            // MysqlCrawler.getInstance().insertJFHRContents(siteID, jobUrl, jobName, jobLocation,
            //         companyName, companyAddress, companyPhone, companyContact, companyWebsite);
        }
    }

    logger.debug("=============");
}

From source file:org.ala.lucene.CreateWordPressIndex.java

/**
 * Index the WP pages by parsing with Jsoup and indexing into SOLR
 *
 * @return/*from   w w w .  j a  v  a2 s.  c  o m*/
 * @throws IOException
 */
protected int indexPages() throws Exception {
    int documentCount = 0;
    // Initialise SOLR
    SolrServer solrServer = solrUtils.getSolrServer();
    logger.info("Deleting all WordPress documents in SOLR index...");
    solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages
    solrServer.commit();

    for (String pageUrl : this.pageUrls) {
        try {
            // Crawl and extract text from WP pages
            Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get();
            String title = document.select("head > title").text();
            String id = document.select("head > meta[name=id]").attr("content");
            String bodyText = document.body().text();
            Elements postCategories = document.select("ul[class=post-categories]");
            List<String> categoriesOut = new ArrayList<String>();
            Boolean excludePost = false;

            if (!postCategories.isEmpty()) {
                // Is a WP post (not page)
                Elements categoriesIn = postCategories.select("li > a"); // get list of li elements

                for (Element cat : categoriesIn) {
                    String thisCat = cat.text();

                    if (thisCat != null && excludedCategories.contains(thisCat)) { // "button".equals(thisCat)
                        // exclude category "button" posts
                        excludePost = true;
                    }
                    if (thisCat != null) {
                        // add category to list
                        categoriesOut.add(thisCat.replaceAll(" ", "_"));
                    }
                }
            }

            if (excludePost) {
                logger.debug("Excluding post (id: " + id + ") with category: "
                        + StringUtils.join(categoriesOut, "|"));
                continue;
            }

            documentCount++;
            // Index with SOLR
            logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: "
                    + StringUtils.substring(bodyText, 0, 100) + "... ");
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("idxtype", IndexedTypes.WORDPRESS);
            doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field
            doc.addField("id", "wp" + id); // probably not needed but safer to leave in
            doc.addField("name", title, 1.2f);
            doc.addField("content", bodyText);
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            doc.addField("categories", categoriesOut);
            // add to index
            solrServer.add(doc);

            if (documentCount % 100 == 0) {
                logger.info("Committing to SOLR (count = " + documentCount + ")...");
                solrServer.commit();
            }
        } catch (IOException ex) {
            // catch it so we don't stop indexing other pages
            logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex);
        }
    }

    logger.info("Final Committing to SOLR...");
    solrServer.commit();
    //logger.info("Optimising SOLR index...");
    //solrServer.optimize(); // throws errors on my machine??
    logger.info("Committed to SOLR. Final document count: " + documentCount);
    return documentCount;
}

From source file:com.crawler.app.run.JellyfishCrawlerSiteVNW.java

/**
 * This function is called when a page is fetched and ready to be processed
 * by your program.
 *
 * <p>Opens the crawler database connection and echoes the visited URL. For HTML pages
 * whose lower-cased URL starts with {@code http://www.vietnamworks.com/} and ends with
 * {@code jd}, {@code jv} or {@code jv/}, job and company fields are selected from the
 * {@code section[id=content]} element and the job title and contact are printed.
 * Storing the extracted fields is currently disabled.
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // NOTE(review): connection settings are hard-coded and createConn runs on every
    // visited page; consider moving them to configuration.
    String host = "127.0.0.1";
    String port = "3306";
    String dbName = "crawler";
    String dbUser = "root";
    String dbPwd = "";
    MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd);
    System.out.println("\n URL visit: " + url);

    // Locale.ROOT keeps the prefix/suffix comparison independent of the default locale.
    String href = url.toLowerCase(java.util.Locale.ROOT);
    if (href.startsWith("http://www.vietnamworks.com/")
            && (href.endsWith("jd") || href.endsWith("jv") || href.endsWith("jv/"))) {

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String html = htmlParseData.getHtml();

            // The second argument of Jsoup.parse(String, String) is the base URI used to
            // resolve relative links, not a charset name.
            Document doc = Jsoup.parse(html, url);
            Element body = doc.body();

            // first() returns null when nothing matches, so guard before dereferencing.
            Element detail = body.select("section[id=content]").first();
            if (detail == null) {
                logger.debug("No section[id=content] element found on " + url);
            } else {
                String jobUrl = url;
                String jobName = detail.select("div[class=job-header-info] h1").html();
                String companyName = detail.select("span[class=company-name text-lg block] strong").html();
                String companyAddress = detail.select("span[class=company-address block]").html();
                String jobLocation = detail.select("p[class=work-location] span[itemprop=address] a").html();
                String companyContact = detail
                        .select("div[class=col-xs-12 col-md-8 col-lg-8 pull-left] p strong").html();

                System.out.println("\n Title : " + jobName);
                System.out.println("\n Contact : " + companyContact);

                // Persistence is disabled for now. When re-enabled, guard the call with a
                // try/catch and pass siteID = 2 with empty companyPhone/companyWebsite:
                // MysqlCrawler.getInstance().insertJFHRContents(siteID, jobUrl, jobName, jobLocation,
                //         companyName, companyAddress, companyPhone, companyContact, companyWebsite);
            }
        }
    }

    logger.debug("=============");
}

From source file:com.lingxiang2014.entity.Article.java

/**
 * Splits the article content into pages.
 *
 * <p>Empty content yields a single empty page. Content containing
 * {@code PAGE_BREAK_SEPARATOR} is split on it. Otherwise the content is parsed as HTML and
 * the body's child nodes are accumulated into a page until the accumulated text length
 * reaches {@code PAGE_CONTENT_LENGTH}; text nodes are additionally broken up at
 * {@code PARAGRAPH_SEPARATOR_PATTERN}, keeping each matched separator with the text before
 * it. Child nodes that are neither elements nor text nodes are ignored.
 *
 * @return the page contents in order; never {@code null}
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    // NOTE(review): contains() treats the separator literally while split() treats it as
    // a regex; confirm PAGE_BREAK_SEPARATOR has no regex metacharacters.
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    List<String> pageContents = new ArrayList<String>();
    // Local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
    StringBuilder page = new StringBuilder();
    int textLength = 0;

    // childNodes() never returns null, so no null check is needed here.
    for (Node node : Jsoup.parse(content).body().childNodes()) {
        if (node instanceof Element) {
            Element element = (Element) node;
            page.append(element.outerHtml());
            textLength += element.text().length();
            if (textLength >= PAGE_CONTENT_LENGTH) {
                pageContents.add(page.toString());
                textLength = 0;
                page.setLength(0);
            }
        } else if (node instanceof TextNode) {
            String text = ((TextNode) node).text();
            String[] paragraphs = PARAGRAPH_SEPARATOR_PATTERN.split(text);
            Matcher separators = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
            // Named 'paragraph' rather than 'content' so the field is not shadowed.
            for (String paragraph : paragraphs) {
                String piece = paragraph;
                // Re-attach the separator that followed this paragraph, if any.
                if (separators.find()) {
                    piece += separators.group();
                }
                page.append(piece);
                textLength += piece.length();
                if (textLength >= PAGE_CONTENT_LENGTH) {
                    pageContents.add(page.toString());
                    textLength = 0;
                    page.setLength(0);
                }
            }
        }
    }

    // Whatever is left over forms the last (short) page.
    if (page.length() > 0) {
        pageContents.add(page.toString());
    }
    return pageContents.toArray(new String[pageContents.size()]);
}