List of usage examples for org.jsoup.select.Elements.select
public Elements select(String query)
From source file:org.manalith.ircbot.plugin.linuxpkgfinder.GentooPortageProvider.java
@Override public String find(String arg) { String result = ""; String url = "http://gentoo-portage.com/Search?search=" + arg; String pkgname = ""; String description = ""; Connection conn = Jsoup.connect(url); conn.timeout(120000); // timeout : 60s. This is to slow - -; try {/*from ww w. ja v a2 s . co m*/ Elements e = conn.get().select("#search_results").select("a"); int result_size = e.size(); if (result_size == 0) { result = "[Gentoo] "; return result; } pkgname = e.select("div").text().split("\\s")[0]; if (!pkgname.split("\\/")[1].equals(arg)) { result = "[Gentoo] "; return result; } description = e.select("div").get(1).text(); String detail_url = "http://gentoo-portage.com/" + pkgname; Connection conn2 = Jsoup.connect(detail_url); conn2.timeout(120000); // timeout : 60s. Elements ee = conn2.get().select("#ebuild_list>ul>li"); if (ee.size() == 0) { result = "[Gentoo] "; return result; } result = ee.get(0).select("div").get(0).select("b").text(); result += " : " + description; } catch (Exception e) { logger.error(e.getMessage(), e); result = ": " + e.getMessage(); } return result; }
From source file:accountgen.controller.Controller.java
/**
 * Fills in the person's first and last name from the first {@code h3} found under the
 * elements with class "address".
 *
 * <p>The heading's inner HTML is cut at its first space: the part before it becomes the
 * first name, everything after it becomes the last name. Both parts are HTML-unescaped
 * and trimmed. The middle name is always set to the empty string. A heading without any
 * space yields an empty last name.
 *
 * @param doc parsed page containing the "address" element
 * @param p   person whose name fields are set
 */
private void setName(Document doc, Person p) {
    Elements addressBlocks = doc.getElementsByClass("address");
    // NOTE(review): first() returns null when no <h3> is present, which makes the next
    // line throw a NullPointerException, exactly as before this change.
    Element heading = addressBlocks.select("h3").first();
    String rawName = heading.html();

    // Cut at the first space with plain string operations. The previous implementation
    // used the first name as a split() regex, which broke on regex metacharacters, on
    // names whose first name re-occurs later ("John Johnson"), and on single-word names.
    int firstSpace = rawName.indexOf(' ');
    String firstPart = firstSpace < 0 ? rawName : rawName.substring(0, firstSpace);
    String remainder = firstSpace < 0 ? "" : rawName.substring(firstSpace + 1);

    p.setFirstname(StringEscapeUtils.unescapeHtml4(firstPart).trim());
    p.setMiddlename("");
    p.setLastname(StringEscapeUtils.unescapeHtml4(remainder).trim());
}
From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java
/** * Get value pair of label element./* ww w .j a v a2s . c om*/ * * @param elements group of elements * @param label label to look for * @return value */ private String getValuePairOfLabel(Elements elements, String label) { Element labelElement = elements.select("dt:containsOwn(" + label + ")").first(); return labelElement != null && labelElement.nextElementSibling() != null ? labelElement.nextElementSibling().text() : ""; }
From source file:accountgen.controller.Controller.java
private void setAddress(Document doc, Person p) { Elements e = doc.getElementsByClass("address"); Element ad = e.select(".adr").first(); Address address = new Address(); String streetnumber = StringEscapeUtils .unescapeHtml4(// ww w. jav a 2s .c om ad.html().split("<br />")[0].split(" ")[ad.html().split("<br />")[0].split(" ").length - 1]) .trim(); String state = StringEscapeUtils .unescapeHtml4( ad.html().split("<br />")[1].split(" ")[ad.html().split("<br />")[1].split(" ").length - 1]) .trim(); address.setStreetnumber(streetnumber); address.setStreetname(StringEscapeUtils.unescapeHtml4(ad.html().split(streetnumber)[0]).trim()); address.setState(state); address.setPostcode( StringEscapeUtils.unescapeHtml4(ad.html().split("<br />")[1].split(state)[0]).trim().split(" ")[0]); address.setCountry(Consts.COUNTRY); p.setAdress(address); }
From source file:com.dmrr.asistenciasx.Horarios.java
private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton1ActionPerformed try {//from w w w . j a v a 2s .c o m int x = jTableHorarios.getSelectedRow(); if (x == -1) { JOptionPane.showMessageDialog(this, "Seleccione un profesor primero", "Datos incompletos", JOptionPane.WARNING_MESSAGE); return; } Integer idProfesor = Integer .parseInt((String) jTableHorarios.getValueAt(jTableHorarios.getSelectedRow(), 1)); JPasswordField pf = new JPasswordField(); String nip = ""; int okCxl = JOptionPane.showConfirmDialog(null, pf, "Introduzca el NIP del jefe del departamento", JOptionPane.OK_CANCEL_OPTION, JOptionPane.PLAIN_MESSAGE); if (okCxl == JOptionPane.OK_OPTION) { nip = new String(pf.getPassword()); } else { return; } org.jsoup.Connection.Response respuesta = Jsoup .connect("http://siiauescolar.siiau.udg.mx/wus/gupprincipal.valida_inicio") .data("p_codigo_c", "2225255", "p_clave_c", nip).method(org.jsoup.Connection.Method.POST) .timeout(0).execute(); Document login = respuesta.parse(); String sessionId = respuesta.cookie(getFecha() + "SIIAUSESION"); String sessionId2 = respuesta.cookie(getFecha() + "SIIAUUDG"); Document listaHorarios = Jsoup.connect("http://siiauescolar.siiau.udg.mx/wse/sspsecc.consulta_oferta") .data("ciclop", "201510", "cup", "J", "deptop", "", "codprofp", "" + idProfesor, "ordenp", "0", "mostrarp", "1000", "tipop", "T", "secp", "A", "regp", "T") .userAgent("Mozilla").cookie(getFecha() + "SIIAUSESION", sessionId) .cookie(getFecha() + "SIIAUUDG", sessionId2).timeout(0).post(); Elements tabla = listaHorarios.select("body"); tabla.select("style").remove(); Elements font = tabla.select("font"); font.removeAttr("size"); System.out.println(tabla.html()); JEditorPane jEditorPane = new JEditorPane(); jEditorPane.setEditable(false); HTMLEditorKit kit = new HTMLEditorKit(); jEditorPane.setEditorKit(kit); javax.swing.text.Document doc = kit.createDefaultDocument(); jEditorPane.setDocument(doc); 
jEditorPane.setText(tabla.html()); JOptionPane.showMessageDialog(null, jEditorPane); } catch (IOException ex) { Logger.getLogger(Horarios.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:com.crawler.app.run.JellyfishCrawlerSiteCB.java
/** * This function is called when a page is fetched and ready to be processed * by your program./*w w w . j a v a 2s .c o m*/ */ @Override public void visit(Page page) { String url = page.getWebURL().getURL(); //logger.info("URL: ", url); String host = "127.0.0.1"; String port = "3306"; String dbName = "crawler"; String dbUser = "root"; String dbPwd = ""; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); System.out.println("\n URL visit: " + url); String href = url.toLowerCase(); if (href.startsWith("http://careerbuilder.vn/vi/tim-viec-lam") && href.endsWith(".html")) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); //doc.outputSettings().escapeMode(EscapeMode.xhtml); Element body = doc.body(); Elements listDetail = body.select("section div[class=MyJobLeft]"); String jobUrl = url; String jobName = listDetail.select("h1").html(); String companyName = listDetail.select("div[class=tit_company]").html(); String jobLocation = listDetail.select( "div[class=box2Detail] ul[class=DetailJobNew] p[class=fl_left] b[itemprop=jobLocation] a") .html(); String companyAddress = listDetail .select("div[class=box1Detail] p[class=TitleDetailNew] label[itemprop=addressLocality]") .html(); String companyContact = listDetail .select("div[class=box1Detail] p[class=TitleDetailNew] label strong").html(); String companyPhone = listDetail .select("div[class=col-lg-6 col-md-6 col-sm-12] p[id=company_contact]").html(); String companyWebsite = listDetail .select("div[class=col-lg-6 col-md-6 col-sm-12] p a[id=company_website]").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[id=main_content_right]"); jobName = listDetail.select("h1 p").html(); companyName = 
listDetail.select("div[class=intro_company] div[class=title_into] p").html(); jobLocation = listDetail .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[class=content_right]"); jobName = listDetail.select("h1").html(); companyName = listDetail .select("div[class=intro_company] div[class=title_into] p[class=title_comp]") .html(); Elements gCompanyWebList = listDetail .select("div[class=intro_company] div[class=title_into] p"); if (!gCompanyWebList.isEmpty() && gCompanyWebList.size() > 1) companyWebsite = gCompanyWebList.get(1).html(); jobLocation = listDetail .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); } } jobName = listDetail.select("h1 a").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1 p").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1").html(); System.out.println("\n Title : " + jobName); try { Integer siteID = 3; //String companyWebsite = ""; /* MysqlCrawler.getInstance().insertJFHRContents( siteID , jobUrl , jobName , jobLocation , companyName , companyAddress , companyPhone , companyContact , companyWebsite); */ //System.exit(1); } catch (Exception ex) { //System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } } } /* Header[] responseHeaders = page.getFetchResponseHeaders(); if (responseHeaders != null) { logger.debug("Response headers:"); for (Header header : responseHeaders) { logger.debug("\t{}: {}", header.getName(), header.getValue()); } } */ logger.debug("============="); }
From source file:org.ala.lucene.CreateWordPressIndex.java
/** * Index the WP pages by parsing with Jsoup and indexing into SOLR * * @return//from w w w . jav a 2 s . c o m * @throws IOException */ protected int indexPages() throws Exception { int documentCount = 0; // Initialise SOLR SolrServer solrServer = solrUtils.getSolrServer(); logger.info("Deleting all WordPress documents in SOLR index..."); solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages solrServer.commit(); for (String pageUrl : this.pageUrls) { try { // Crawl and extract text from WP pages Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get(); String title = document.select("head > title").text(); String id = document.select("head > meta[name=id]").attr("content"); String bodyText = document.body().text(); Elements postCategories = document.select("ul[class=post-categories]"); List<String> categoriesOut = new ArrayList<String>(); Boolean excludePost = false; if (!postCategories.isEmpty()) { // Is a WP post (not page) Elements categoriesIn = postCategories.select("li > a"); // get list of li elements for (Element cat : categoriesIn) { String thisCat = cat.text(); if (thisCat != null && excludedCategories.contains(thisCat)) { // "button".equals(thisCat) // exclude category "button" posts excludePost = true; } if (thisCat != null) { // add category to list categoriesOut.add(thisCat.replaceAll(" ", "_")); } } } if (excludePost) { logger.debug("Excluding post (id: " + id + ") with category: " + StringUtils.join(categoriesOut, "|")); continue; } documentCount++; // Index with SOLR logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: " + StringUtils.substring(bodyText, 0, 100) + "... 
"); SolrInputDocument doc = new SolrInputDocument(); doc.addField("idxtype", IndexedTypes.WORDPRESS); doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field doc.addField("id", "wp" + id); // probably not needed but safer to leave in doc.addField("name", title, 1.2f); doc.addField("content", bodyText); doc.addField("australian_s", "recorded"); // so they appear in default QF search doc.addField("categories", categoriesOut); // add to index solrServer.add(doc); if (documentCount % 100 == 0) { logger.info("Committing to SOLR (count = " + documentCount + ")..."); solrServer.commit(); } } catch (IOException ex) { // catch it so we don't stop indexing other pages logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex); } } logger.info("Final Committing to SOLR..."); solrServer.commit(); //logger.info("Optimising SOLR index..."); //solrServer.optimize(); // throws errors on my machine?? logger.info("Committed to SOLR. Final document count: " + documentCount); return documentCount; }
From source file:de.ncoder.studipsync.studip.jsoup.JsoupStudipAdapter.java
@Override public List<Seminar> parseSeminars() throws StudipException { ensureLoggedIn();//from www . ja va2 s.c o m navigate(PAGE_SEMINARS); Elements events = document.select("#content>table:first-of-type>tbody>tr"); List<Seminar> seminars = new ArrayList<>(); for (org.jsoup.nodes.Element event : events) { if (event.select(">td").size() > 4) { Elements info = event.select(">td:nth-of-type(4)>a:first-of-type"); Elements font = info.select("font"); if (info.size() >= 1 && font.size() >= 2) { Seminar seminar = Seminar.getSeminar(info.get(0).absUrl("href"), font.get(0).text().trim(), font.get(1).text().trim()); seminars.add(seminar); } } } log.debug("Parsed " + seminars.size() + " seminars."); log.trace(seminars.toString()); return seminars; }
From source file:org.confab.PhpBB3Parser.java
/** * Parses each post for a particular topic. * @param html Html containing the posts to be parsed * @return List of Post objects *//*from w w w . j av a 2 s .c o m*/ public List<Post> parsePosts(Document html, ForumThread parent) { Utilities.debug("Starting parsePosts"); List<Post> ret = new ArrayList<Post>(); // Each post should have it's own table Elements div_posts = html.select("div#posts"); assert !div_posts.isEmpty(); Elements posts_table = div_posts.select("table[id~=(post\\d+)]"); assert !posts_table.isEmpty(); for (Element el_post : posts_table) { Post new_post = new Post(parent); // Get post id (id=post\d+) new_post.id = el_post.attr("id").replace("post", "").trim(); assert new_post.id != null; // Get post message Elements el_message = el_post.select("div[id~=(post_message_\\d+)]"); assert !el_message.isEmpty(); new_post.message = el_message.first().text(); assert new_post.message != null; Utilities.debug("new_post.message: " + new_post.message); // Get post author Elements el_author = el_post.select(".bigusername"); assert !el_author.isEmpty(); new_post.author.username = el_author.first().text(); assert new_post.author != null; Utilities.debug("new_post.author: " + new_post.author); ret.add(new_post); } Utilities.debug("Finished parsePosts"); return ret; }