Usage examples for org.jsoup.nodes.Document.body()
public Element body()
From source file:com.dajodi.scandic.JSoupScraper.java
@Override public Map<String, String> scrapeFormInputFields(InputStream inStream) { try {//from w w w. j a v a 2s. c om Document doc = Jsoup.parse(inStream, HTTP.UTF_8, ""); Element form = doc.body().getElementById("aspnetForm"); Elements inputNodes = form.getElementsByTag("input"); Map<String, String> inputMap = new HashMap<String, String>(); for (Element element : inputNodes) { String name = element.attr("name"); String value = element.attr("value"); if (name != null) { inputMap.put(name, value == null ? "" : value); } else { //TODO: remove me Log.d("Something weird"); } } doc.empty(); return inputMap; } catch (Exception e) { throw new ScandicHtmlException(e); } }
From source file:dev.maisentito.suca.commands.EnitCommandHandler.java
@Override public void handleCommand(MessageEvent event, String[] args) throws Throwable { Document doc = Jsoup.connect("http://www.wordreference.com/enit/" + StringUtils.join(args, ' ')) .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, "")).referrer("http://www.google.com/").get(); Elements row = doc.body().select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)"); row.select(".tooltip").remove(); String def = row.text().trim().replace("\n", ""); event.respond(def);// w w w .j a v a 2 s .c om }
From source file:dev.maisentito.suca.commands.ItenCommandHandler.java
/**
 * Fetches the WordReference {@code iten} page for the words in {@code args} and replies with
 * the text of the selected result row.
 *
 * @param event message event that receives the reply
 * @param args  words to look up; joined with single spaces to form the URL suffix
 * @throws Throwable if fetching or parsing the page fails
 */
@Override
public void handleCommand(MessageEvent event, String[] args) throws Throwable {
    final String lookupUrl = "http://www.wordreference.com/iten/" + StringUtils.join(args, ' ');
    final Document page = Jsoup.connect(lookupUrl)
            .userAgent(getStringGlobal(Main.GLOBAL_USERAGENT, ""))
            .referrer("http://www.google.com/")
            .get();

    final Elements resultRow = page.body()
            .select("table.WRD:nth-child(2) > tbody:nth-child(1) > tr:nth-child(2)");
    // Drop tooltip markup so it does not end up in the reply text.
    resultRow.select(".tooltip").remove();

    final String rowText = resultRow.text();
    event.respond(rowText.trim().replace("\n", ""));
}
From source file:alliance.docs.DocumentationTest.java
/**
 * Checks that every same-document anchor link found in the documentation files has a matching
 * element id in the same file.
 *
 * <p>Links and ids are extracted from each element's markup with
 * {@code StringUtils.substringBetween}; elements whose markup contains {@code ":"} are not
 * considered as links.
 *
 * @throws IOException        if the directory cannot be listed or a file cannot be parsed
 * @throws URISyntaxException declared for {@code getPath()}
 */
@Test
public void testBrokenAnchorsPresent() throws IOException, URISyntaxException {
    List<Path> docs;
    // Files.list keeps a directory handle open until the stream is closed.
    try (java.util.stream.Stream<Path> entries = Files.list(getPath())) {
        docs = entries.filter(f -> f.toString().endsWith(HTML_DIRECTORY))
                .collect(Collectors.toList());
    }

    Set<String> links = new HashSet<>();
    Set<String> anchors = new HashSet<>();

    for (Path path : docs) {
        Document doc = Jsoup.parse(path.toFile(), "UTF-8", EMPTY_STRING);
        String thisDoc = StringUtils.substringAfterLast(path.toString(), File.separator);

        for (Element element : doc.body().getAllElements()) {
            String markup = element.toString();

            String target = StringUtils.substringBetween(markup, HREF_ANCHOR, CLOSE);
            if (!markup.contains(":") && target != null) {
                links.add(thisDoc + "#" + target);
            }

            // Elements without an id used to register a bogus "<doc>#null" anchor, which could
            // hide a genuinely broken "#null" link.
            String id = StringUtils.substringBetween(markup, ID, CLOSE);
            if (id != null) {
                anchors.add(thisDoc + "#" + id);
            }
        }
    }

    links.removeAll(anchors);
    assertThat("Anchors missing section reference: " + links.toString(), links.isEmpty());
}
From source file:jodtemplate.pptx.style.HtmlStylizer.java
/**
 * Parses {@code text} as HTML and passes the parsed body, together with the supplied
 * properties and slide, to {@code process}.
 *
 * @param text  HTML text to parse
 * @param arPr  forwarded unchanged to {@code process}
 * @param apPr  forwarded unchanged to {@code process}
 * @param slide forwarded unchanged to {@code process}
 * @return the elements produced by {@code process}
 * @throws JODTemplateException if {@code process} fails with an {@link IOException}
 */
@Override
public List<Element> stylize(final String text, final Element arPr, final Element apPr, final Slide slide)
        throws JODTemplateException {
    final Document parsedHtml = Jsoup.parse(text);
    final List<Element> styled;
    try {
        styled = process(parsedHtml.body(), arPr, apPr, slide);
    } catch (IOException ioFailure) {
        throw new JODTemplateException("Stylizer error", ioFailure);
    }
    return styled;
}
From source file:com.crawler.app.run.JellyfishCrawlerSiteMNMN.java
/** * This function is called when a page is fetched and ready to be processed * by your program.// ww w . jav a 2 s . c om */ @Override public void visit(Page page) { String url = page.getWebURL().getURL(); //logger.info("URL: ", url); String host = "127.0.0.1"; String port = "3306"; String dbName = "crawler"; String dbUser = "root"; String dbPwd = ""; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); System.out.println("\n URL visit: " + url); String href = url.toLowerCase(); if (href.startsWith("http://monngonmoingay.com/") && href.endsWith("/")) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); //doc.outputSettings().escapeMode(EscapeMode.xhtml); Element body = doc.body(); // Elements listDetail = body.select("section div[class=MyJobLeft]"); String jobUrl = url; //String jobName = listDetail.select("h1").html(); /* String companyName = listDetail.select("div[class=tit_company]").html(); String jobLocation = listDetail.select("div[class=box2Detail] ul[class=DetailJobNew] p[class=fl_left] b[itemprop=jobLocation] a").html(); String companyAddress = listDetail.select("div[class=box1Detail] p[class=TitleDetailNew] label[itemprop=addressLocality]").html(); String companyContact = listDetail.select("div[class=box1Detail] p[class=TitleDetailNew] label strong").html(); String companyPhone = listDetail.select("div[class=col-lg-6 col-md-6 col-sm-12] p[id=company_contact]").html(); String companyWebsite = listDetail.select("div[class=col-lg-6 col-md-6 col-sm-12] p a[id=company_website]").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[id=main_content_right]"); jobName = listDetail.select("h1 p").html(); companyName = listDetail.select("div[class=intro_company] 
div[class=title_into] p").html(); jobLocation = listDetail.select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[class=content_right]"); jobName = listDetail.select("h1").html(); companyName = listDetail.select("div[class=intro_company] div[class=title_into] p[class=title_comp]").html(); Elements gCompanyWebList = listDetail.select("div[class=intro_company] div[class=title_into] p"); if (!gCompanyWebList.isEmpty() && gCompanyWebList.size() > 1) companyWebsite = gCompanyWebList.get(1).html(); jobLocation = listDetail.select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); } } jobName = listDetail.select("h1 a").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1 p").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1").html(); */ System.out.println("\n Url : " + jobUrl); //System.out.println("\n Title : " + jobName); try { Integer siteID = 3; //String companyWebsite = ""; /* MysqlCrawler.getInstance().insertJFHRContents( siteID , jobUrl , jobName , jobLocation , companyName , companyAddress , companyPhone , companyContact , companyWebsite); */ //System.exit(1); } catch (Exception ex) { //System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } } } /* Header[] responseHeaders = page.getFetchResponseHeaders(); if (responseHeaders != null) { logger.debug("Response headers:"); for (Header header : responseHeaders) { logger.debug("\t{}: {}", header.getName(), header.getValue()); } } */ logger.debug("============="); }
From source file:com.crawler.app.run.JellyfishCrawlerSiteCB.java
/** * This function is called when a page is fetched and ready to be processed * by your program.//from w w w . j a va2s. com */ @Override public void visit(Page page) { String url = page.getWebURL().getURL(); //logger.info("URL: ", url); String host = "127.0.0.1"; String port = "3306"; String dbName = "crawler"; String dbUser = "root"; String dbPwd = ""; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); System.out.println("\n URL visit: " + url); String href = url.toLowerCase(); if (href.startsWith("http://careerbuilder.vn/vi/tim-viec-lam") && href.endsWith(".html")) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); //doc.outputSettings().escapeMode(EscapeMode.xhtml); Element body = doc.body(); Elements listDetail = body.select("section div[class=MyJobLeft]"); String jobUrl = url; String jobName = listDetail.select("h1").html(); String companyName = listDetail.select("div[class=tit_company]").html(); String jobLocation = listDetail.select( "div[class=box2Detail] ul[class=DetailJobNew] p[class=fl_left] b[itemprop=jobLocation] a") .html(); String companyAddress = listDetail .select("div[class=box1Detail] p[class=TitleDetailNew] label[itemprop=addressLocality]") .html(); String companyContact = listDetail .select("div[class=box1Detail] p[class=TitleDetailNew] label strong").html(); String companyPhone = listDetail .select("div[class=col-lg-6 col-md-6 col-sm-12] p[id=company_contact]").html(); String companyWebsite = listDetail .select("div[class=col-lg-6 col-md-6 col-sm-12] p a[id=company_website]").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[id=main_content_right]"); jobName = listDetail.select("h1 p").html(); companyName = 
listDetail.select("div[class=intro_company] div[class=title_into] p").html(); jobLocation = listDetail .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[class=content_right]"); jobName = listDetail.select("h1").html(); companyName = listDetail .select("div[class=intro_company] div[class=title_into] p[class=title_comp]") .html(); Elements gCompanyWebList = listDetail .select("div[class=intro_company] div[class=title_into] p"); if (!gCompanyWebList.isEmpty() && gCompanyWebList.size() > 1) companyWebsite = gCompanyWebList.get(1).html(); jobLocation = listDetail .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); } } jobName = listDetail.select("h1 a").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1 p").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1").html(); System.out.println("\n Title : " + jobName); try { Integer siteID = 3; //String companyWebsite = ""; /* MysqlCrawler.getInstance().insertJFHRContents( siteID , jobUrl , jobName , jobLocation , companyName , companyAddress , companyPhone , companyContact , companyWebsite); */ //System.exit(1); } catch (Exception ex) { //System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } } } /* Header[] responseHeaders = page.getFetchResponseHeaders(); if (responseHeaders != null) { logger.debug("Response headers:"); for (Header header : responseHeaders) { logger.debug("\t{}: {}", header.getName(), header.getValue()); } } */ logger.debug("============="); }
From source file:org.ala.lucene.CreateWordPressIndex.java
/** * Index the WP pages by parsing with Jsoup and indexing into SOLR * * @return/*from w w w . j a v a2 s. c o m*/ * @throws IOException */ protected int indexPages() throws Exception { int documentCount = 0; // Initialise SOLR SolrServer solrServer = solrUtils.getSolrServer(); logger.info("Deleting all WordPress documents in SOLR index..."); solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages solrServer.commit(); for (String pageUrl : this.pageUrls) { try { // Crawl and extract text from WP pages Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get(); String title = document.select("head > title").text(); String id = document.select("head > meta[name=id]").attr("content"); String bodyText = document.body().text(); Elements postCategories = document.select("ul[class=post-categories]"); List<String> categoriesOut = new ArrayList<String>(); Boolean excludePost = false; if (!postCategories.isEmpty()) { // Is a WP post (not page) Elements categoriesIn = postCategories.select("li > a"); // get list of li elements for (Element cat : categoriesIn) { String thisCat = cat.text(); if (thisCat != null && excludedCategories.contains(thisCat)) { // "button".equals(thisCat) // exclude category "button" posts excludePost = true; } if (thisCat != null) { // add category to list categoriesOut.add(thisCat.replaceAll(" ", "_")); } } } if (excludePost) { logger.debug("Excluding post (id: " + id + ") with category: " + StringUtils.join(categoriesOut, "|")); continue; } documentCount++; // Index with SOLR logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: " + StringUtils.substring(bodyText, 0, 100) + "... 
"); SolrInputDocument doc = new SolrInputDocument(); doc.addField("idxtype", IndexedTypes.WORDPRESS); doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field doc.addField("id", "wp" + id); // probably not needed but safer to leave in doc.addField("name", title, 1.2f); doc.addField("content", bodyText); doc.addField("australian_s", "recorded"); // so they appear in default QF search doc.addField("categories", categoriesOut); // add to index solrServer.add(doc); if (documentCount % 100 == 0) { logger.info("Committing to SOLR (count = " + documentCount + ")..."); solrServer.commit(); } } catch (IOException ex) { // catch it so we don't stop indexing other pages logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex); } } logger.info("Final Committing to SOLR..."); solrServer.commit(); //logger.info("Optimising SOLR index..."); //solrServer.optimize(); // throws errors on my machine?? logger.info("Committed to SOLR. Final document count: " + documentCount); return documentCount; }
From source file:com.crawler.app.run.JellyfishCrawlerSiteVNW.java
/** * This function is called when a page is fetched and ready to be processed * by your program.//w w w.j a v a 2 s .c om */ @Override public void visit(Page page) { String url = page.getWebURL().getURL(); //logger.info("URL: ", url); String host = "127.0.0.1"; String port = "3306"; String dbName = "crawler"; String dbUser = "root"; String dbPwd = ""; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); System.out.println("\n URL visit: " + url); String href = url.toLowerCase(); if (href.startsWith("http://www.vietnamworks.com/") && (href.endsWith("jd") || href.endsWith("jv") || href.endsWith("jv/"))) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); //doc.outputSettings().escapeMode(EscapeMode.xhtml); Element body = doc.body(); //get meta description content //String description = doc.select("meta[name=description]").get(0).attr("content"); //System.out.println("Meta description : " + description); //Element e = doc.getElementById("detail_copyB"); Element detail = body.select("section[id=content]").first(); //String aTitlePost = getTagValues(e.toString(), "<h3>", "</h3>"); String jobUrl = url;//detail.select("h3[class=title] a").first().attr("abs:href"); String jobName = detail.select("div[class=job-header-info] h1").html(); String companyName = detail.select("span[class=company-name text-lg block] strong").html(); String companyAddress = detail.select("span[class=company-address block]").html(); String jobLocation = detail.select("p[class=work-location] span[itemprop=address] a").html(); String companyContact = detail.select("div[class=col-xs-12 col-md-8 col-lg-8 pull-left] p strong") .html();// div[class=company-info] span[class=company-address block] p System.out.println("\n Title : " + jobName); System.out.println("\n 
Contact : " + companyContact); try { /* Integer siteID = 2; String companyPhone = "", companyWebsite = ""; MysqlCrawler.getInstance().insertJFHRContents( siteID , jobUrl , jobName , jobLocation , companyName , companyAddress , companyPhone , companyContact , companyWebsite); */ //System.exit(1); } catch (Exception ex) { //System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } //String eCrawl2 = listTD.get(0); //String eCrawl3 = listTD.get(1); /* System.out.println("\n Cate : " + bCate); System.out.println("\n Title : " + aTitlePost); System.out.println("\n Date : " + hDatePost);*/ //System.out.println("\n E : " + listTD.toString() + " --- " + eCrawl2 + "----" + eCrawl3); //System.out.println("\n Count : " + doc.toString()); //System.out.println("\n Total Div: --" + listDetail.size()); //System.exit(1); //String content = htmlParseData.getBodyText(); //Set<WebURL> links = htmlParseData.getOutgoingUrls(); //logger.debug("Text length: {}", text.length()); //System.out.println("Text length: {}" + text); //System.out.println("\n Title: {}" + title); //logger.debug("Html: {}", html); //System.out.println("Html: {}" + html); //logger.debug("Number of outgoing links: {}", links.size()); //System.out.println("Number of outgoing links: {}" + links.size()); //final String str = "<tag>apple</tag><b>hello</b><tag>orange</tag><tag>pear</tag>"; //System.out.println("\n Matcher: {}" + Arrays.toString(getTagValues(html).toArray())); // Prints [apple, orange, pear] //MysqlCrawler.getInstance().insertURL(url, title, ""); } } /* Header[] responseHeaders = page.getFetchResponseHeaders(); if (responseHeaders != null) { logger.debug("Response headers:"); for (Header header : responseHeaders) { logger.debug("\t{}: {}", header.getName(), header.getValue()); } } */ logger.debug("============="); }
From source file:com.lingxiang2014.entity.Article.java
/**
 * Splits the article content into pages.
 *
 * <p>Empty content yields a single empty page. Content containing
 * {@code PAGE_BREAK_SEPARATOR} is split on that separator. Otherwise the content is parsed as
 * HTML and its top-level nodes are accumulated into a page until the accumulated text length
 * reaches {@code PAGE_CONTENT_LENGTH}; text nodes are additionally cut at matches of
 * {@code PARAGRAPH_SEPARATOR_PATTERN}, with each matched separator kept at the end of its
 * fragment.
 *
 * @return the page contents, in order
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    // Explicit page breaks take precedence over length-based paging.
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    final List<String> pages = new ArrayList<String>();
    final List<Node> topLevelNodes = Jsoup.parse(content).body().childNodes();
    if (topLevelNodes != null) {
        final StringBuilder pending = new StringBuilder();
        int pendingTextLength = 0;

        for (Node node : topLevelNodes) {
            if (node instanceof Element) {
                final Element element = (Element) node;
                pending.append(element.outerHtml());
                pendingTextLength += element.text().length();
                if (pendingTextLength >= PAGE_CONTENT_LENGTH) {
                    pages.add(pending.toString());
                    pendingTextLength = 0;
                    pending.setLength(0);
                }
            } else if (node instanceof TextNode) {
                final String rawText = ((TextNode) node).text();
                final String[] fragments = PARAGRAPH_SEPARATOR_PATTERN.split(rawText);
                final Matcher separators = PARAGRAPH_SEPARATOR_PATTERN.matcher(rawText);

                for (String fragment : fragments) {
                    // Re-attach the separator that split() removed, while any remain.
                    String piece = fragment;
                    if (separators.find()) {
                        piece = fragment + separators.group();
                    }
                    pending.append(piece);
                    pendingTextLength += piece.length();
                    if (pendingTextLength >= PAGE_CONTENT_LENGTH) {
                        pages.add(pending.toString());
                        pendingTextLength = 0;
                        pending.setLength(0);
                    }
                }
            }
        }

        // Whatever is left over forms the last page.
        final String remainder = pending.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}