Usage examples for the select method of org.jsoup.select.Elements
public Elements select(String query)
From source file:org.brunocvcunha.taskerbox.impl.jobs.MonsterJobSeeker.java
@Override protected void execute() throws Exception { try {// w ww . j a v a2 s. c o m for (int x = 1; x < this.maxPages; x++) { int uniqueCount = 0; // DefaultHttpClient client = // TaskerboxHttpBox.getInstance().buildNewHttpClient(); String seekUrl = "http://jobsearch.monster." + this.site + "/search/?q=" + URLEncoder.encode(this.search) + "&sort=dt.rv.di&pg=" + x; logInfo(log, "... Seeking " + seekUrl); HttpEntity entity = TaskerboxHttpBox.getInstance().getEntityForURL(seekUrl); String result = TaskerboxHttpBox.getInstance().readResponseFromEntity(entity); if (result.contains("Sorry, no jobs were found that match your criteria")) { System.err.println("Busca encerrada."); this.bootstrapHttpClient(true); break; // return; } try { Document doc = Jsoup.parse(result); Elements el = doc.select("table.listingsTable").select("tr"); for (val item : el) { Elements jobTitleEl = item.select("div.jobTitleContainer"); Elements companyEl = item.select("div.companyContainer"); Elements locationEl = item.select("div.jobLocationSingleLine"); // aaa String url = jobTitleEl.select("a").attr("href"); if (url.equals("")) { continue; } if (url.contains("?mescoid")) { url = url.substring(0, url.indexOf("?mescoid")); } if (url.contains("?jobPosition")) { url = url.substring(0, url.indexOf("?jobPosition")); } if (url.contains("&jobPosition")) { url = url.substring(0, url.indexOf("&jobPosition")); } String company = ""; if (!companyEl.select("a").isEmpty()) { company = companyEl.select("a").get(0).attr("title"); } handleJob(jobTitleEl.text(), company, locationEl.select("a").text(), url); uniqueCount++; } if (uniqueCount == 0) { logInfo(log, "MONSTER BREAK -- NO UNIQUE COUNT"); break; } try { Thread.sleep(10000L); } catch (InterruptedException e) { e.printStackTrace(); } } catch (Exception e) { e.printStackTrace(); } } } catch (Exception e) { e.printStackTrace(); } }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Copies the counters found under {@code div#video_favorite_edit span} onto {@code video}.
 *
 * <p>The links below {@code #subscribed}, {@code #watched} and {@code #owned} feed
 * {@code setCountWanted}, {@code setCountWatched} and {@code setCountOwned} respectively.
 * A counter that is missing from the page or is not a valid integer is skipped, leaving the
 * corresponding field of {@code video} untouched.
 *
 * @param doc   parsed page to read the counters from
 * @param video target that receives the counters
 */
private void setVideoCount(Document doc, Video video) {
    Elements countElements = doc.select("div#video_favorite_edit span");
    if (!CollectionUtils.isNotEmpty(countElements)) {
        return;
    }

    Integer countWanted = parseFirstCount(countElements.select("#subscribed a"));
    if (countWanted != null) {
        video.setCountWanted(countWanted);
    }

    Integer countWatched = parseFirstCount(countElements.select("#watched a"));
    if (countWatched != null) {
        video.setCountWatched(countWatched);
    }

    Integer countOwned = parseFirstCount(countElements.select("#owned a"));
    if (countOwned != null) {
        video.setCountOwned(countOwned);
    }
}

/**
 * Parses the text of the first matched element as an integer.
 *
 * @param matches selection whose first element holds the number
 * @return the parsed value, or {@code null} when nothing matched or the text is not an integer
 */
private static Integer parseFirstCount(Elements matches) {
    if (!CollectionUtils.isNotEmpty(matches)) {
        return null;
    }
    try {
        return Integer.valueOf(matches.first().text());
    } catch (NumberFormatException ignored) {
        // Counters are optional: text that is not a number is treated as "no value".
        return null;
    }
}
From source file:com.github.binlee1990.transformers.spider.PersonCrawler.java
/**
 * Stores the video described by a crawled detail page together with its actress and
 * category links.
 *
 * <p>Pages whose URL does not start with {@code http://www.javlibrary.com/cn/?v=jav}, pages
 * that are not HTML, pages lacking an identification code or a title, and videos whose
 * identification code is already stored are ignored. Optional fields (release date,
 * duration, counters, score) are filled in when present and parseable and are left unset
 * otherwise.
 *
 * @param page crawled page handed in by the crawler
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info(url);

    if (!url.startsWith("http://www.javlibrary.com/cn/?v=jav")) {
        return;
    }
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    Document doc = Jsoup.parse(htmlParseData.getHtml());

    // first() yields null on an empty selection, so verify both mandatory fields up front
    // instead of failing with a NullPointerException halfway through.
    Elements idElements = doc.select("div#video_id td.text");
    Elements titleElements = doc.select("div#video_title a");
    if (idElements.isEmpty() || titleElements.isEmpty()) {
        logger.info("skip " + url + ": identification code or title not found");
        return;
    }

    String videoIdentificationCode = idElements.first().text();
    Video queryVideo = new Video();
    queryVideo.setIdentificationCode(videoIdentificationCode);
    if (null != videoMapper.queryByVideo(queryVideo)) {
        // Already stored on an earlier visit.
        return;
    }

    Video video = new Video();
    video.setUrl(url);
    Date now = new Date();
    video.setCreateTime(now);
    video.setUpdateTime(now);
    video.setTitle(titleElements.first().text());
    video.setIdentificationCode(videoIdentificationCode);

    Elements rdElements = doc.select("div#video_date td.text");
    if (CollectionUtils.isNotEmpty(rdElements)) {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        try {
            video.setReleaseDate(sdf.parse(rdElements.first().text()));
        } catch (ParseException ignored) {
            // The release date is optional; an unparseable value leaves it unset.
        }
    }

    // Guarded like every other numeric field: a non-numeric duration used to abort the
    // whole visit with a NumberFormatException.
    Integer durationMinutes = parseFirstInteger(doc.select("div#video_length span.text"));
    if (durationMinutes != null) {
        video.setDurationMinutes(durationMinutes);
    }

    Elements dElements = doc.select("div#video_director td.text");
    if (CollectionUtils.isNotEmpty(dElements)) {
        video.setDirector(dElements.first().text());
    }

    Elements pElements = doc.select("div#video_maker td.text");
    if (CollectionUtils.isNotEmpty(pElements)) {
        video.setProducer(pElements.first().text());
    }

    Elements disElements = doc.select("div#video_label td.text");
    if (CollectionUtils.isNotEmpty(disElements)) {
        video.setDistributor(disElements.first().text());
    }

    Elements countElements = doc.select("div#video_favorite_edit span");
    if (CollectionUtils.isNotEmpty(countElements)) {
        Integer countWanted = parseFirstInteger(countElements.select("#subscribed a"));
        if (countWanted != null) {
            video.setCountWanted(countWanted);
        }
        Integer countWatched = parseFirstInteger(countElements.select("#watched a"));
        if (countWatched != null) {
            video.setCountWatched(countWatched);
        }
        Integer countOwned = parseFirstInteger(countElements.select("#owned a"));
        if (countOwned != null) {
            video.setCountOwned(countOwned);
        }
    }

    Elements sElements = doc.select("div#video_review td.text span.score");
    if (CollectionUtils.isNotEmpty(sElements)) {
        // The score is rendered in parentheses; strip them before parsing.
        String score = sElements.first().text();
        score = StringUtils.replace(score, "(", "");
        score = StringUtils.replace(score, ")", "");
        if (StringUtils.isNotBlank(score)) {
            try {
                video.setScore(Float.valueOf(score));
            } catch (NumberFormatException ignored) {
                // The score is optional; an unparseable value leaves it unset.
            }
        }
    }

    Elements actressElements = doc.select("div#video_cast span.star");
    if (CollectionUtils.isNotEmpty(actressElements)) {
        video.setSingleFemaleFlag(actressElements.size() <= 1);
    }

    videoMapper.insertSelective(video);
    int videoId = videoMapper.queryByVideo(video).getId();
    logger.info("handle " + videoId + "\n" + JSON.toJSONString(video));

    actressElements.stream()
            .map(a -> a.text().trim())
            .filter(StringUtils::isNotBlank)
            .forEach(name -> linkActress(name, videoId));

    doc.select("div#video_genres span.genre").stream()
            .map(c -> c.text().trim())
            .filter(StringUtils::isNotBlank)
            .forEach(description -> linkCategory(description, videoId));
}

/**
 * Parses the text of the first matched element as an integer.
 *
 * @param matches selection whose first element holds the number
 * @return the parsed value, or {@code null} when nothing matched or the text is not an integer
 */
private static Integer parseFirstInteger(Elements matches) {
    if (!CollectionUtils.isNotEmpty(matches)) {
        return null;
    }
    try {
        return Integer.valueOf(matches.first().text());
    } catch (NumberFormatException ignored) {
        // Optional field: text that is not a number is treated as "no value".
        return null;
    }
}

/**
 * Links a video to the actress with the given name, inserting the actress first when no
 * matching row exists yet.
 *
 * @param name    actress name, already trimmed and non-blank
 * @param videoId id of the stored video
 */
private void linkActress(String name, int videoId) {
    Actress queryActress = new Actress();
    queryActress.setName(name);
    Actress actress = actressMapper.queryByActress(queryActress);

    if (null == actress) {
        Actress created = new Actress();
        created.setName(name);
        actressMapper.insertSelective(created);
        // Build the link from the row read back after the insert. The previous code read
        // the row back but then took the code from the unsaved instance.
        // NOTE(review): assumes the stored row carries the actress code - confirm against the mapper.
        actress = actressMapper.queryByActress(created);
        if (null == actress) {
            logger.info("actress not found after insert: " + name);
            return;
        }
    }

    VideoActress va = new VideoActress();
    va.setActressCode(actress.getCode());
    va.setVideoId(videoId);
    videoActressMapper.insertSelective(va);
}

/**
 * Links a video to the category with the given description, inserting the category first
 * when no matching row exists yet.
 *
 * @param description category text, already trimmed and non-blank
 * @param videoId     id of the stored video
 */
private void linkCategory(String description, int videoId) {
    Category queryCategory = new Category();
    queryCategory.setSubtype(description);
    Category category = categoryMapper.queryByCategory(queryCategory);

    if (null == category) {
        Category created = new Category();
        created.setSubtype(description);
        categoryMapper.insertSelective(created);
        category = categoryMapper.queryByCategory(created);
        if (null == category) {
            logger.info("category not found after insert: " + description);
            return;
        }
    }

    VideoCategory vc = new VideoCategory();
    vc.setCategoryId(category.getId());
    vc.setCategoryDescription(category.getSubtype());
    vc.setVideoId(videoId);
    videoCategoryMapper.insertSelective(vc);
}
From source file:org.confab.PhpBB3Parser.java
/** * Parses each topic for a particular forum. * @param forum Document of html containing topics * @param parent Forum the threads belong to * @return List of ForumThread objects *///from w w w.ja v a 2 s.c om public List<ForumThread> parseForumThreads(Document forum, Forum parent) { Utilities.debug("parseForumThreads"); List<ForumThread> ret = new ArrayList<ForumThread>(); // Get topic table Elements thread_table_tds = forum.select("tbody[id*=threadbits_forum_] td"); if (thread_table_tds.isEmpty()) { Utilities.debug("It seems " + parent.url + " has no topics."); return ret; } // Get any stickies Elements stickies = thread_table_tds.select("td:contains(Sticky:) a[id*=thread_title_]"); // Get all topics Elements els_a = thread_table_tds.select("a[id*=thread_title_]"); assert !els_a.isEmpty(); // Loop topics and grab info about each for (Element el_a : els_a) { ForumThread new_topic = new ForumThread(parent); // Get topic new_topic.title = el_a.text(); assert new_topic.title != null; Utilities.debug("new_topic.title: " + new_topic.title); // Check if sticky if (stickies.html().contains(new_topic.title)) { new_topic.isSticky = true; Utilities.debug("new_topic.isSticky: " + new_topic.isSticky); } // Get URL new_topic.url = el_a.attr("href"); assert new_topic.url != null; Utilities.debug("new_topic.url:" + new_topic.url); ret.add(new_topic); } Utilities.debug("end printForumThreads"); return ret; }
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
private void addFilmGenreList(Elements filmTitleElements, Film film) { Elements genreElements = filmTitleElements.select(".fm-genre"); if (CollectionUtils.isNotEmpty(genreElements) && genreElements.size() >= 2) { Element genreElement = genreElements.get(1); if (null != genreElement) { String genreStr = genreElement.text().toString(); if (StringUtils.isNotBlank(genreStr)) { List<String> genreList = SLASH_SPLITTER.splitToList(genreStr); if (CollectionUtils.isNotEmpty(genreList)) { genreList.forEach(genre -> { EnumGenre queryGenre = new EnumGenre(); queryGenre.setUrlGenre(genre); EnumGenre enumGenre = enumGenreMapper.queryEnumGenreByEnumGenre(queryGenre); if (null != enumGenre) { FilmGenre filmGenre = new FilmGenre(); filmGenre.setFilmCode(film.getCode()); filmGenre.setGenreId(enumGenre.getId()); Date now = new Date(); filmGenre.setCreateTime(now); filmGenre.setUpdateTime(now); filmGenreMapper.insertSelective(filmGenre); }// ww w. ja va2s .c om }); } } } } }
From source file:org.confab.VBulletinParser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table/* w w w. j a v a 2 s . c o m*/ Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr"); assert !forum_table.isEmpty(); for (Element el_tr : forum_table) { Forum new_forum = new Forum(parent); // Get the table data for this row Elements el_tds = el_tr.select("td"); assert !el_tds.isEmpty() : el_tr.html(); // xbox360achievements has a lot of subforums and puts these in their own table // The <a>'s are picked up as children of the parent <td> so don't parse this sub- // tables row's seperatly if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) { //Utilities.debug("tr doesn't seem to have anything we want, skipping."); continue; } // Get the title URL Elements els_a = el_tds.get(1).select("a"); assert !els_a.isEmpty() : el_tds.html(); new_forum.url = els_a.first().attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text assert els_a.first() != null; new_forum.title = els_a.first().text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get num viewing the current forum Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first(); if (el_viewing != null) { new_forum.numViewing = el_viewing.text(); } else { new_forum.numViewing = "0"; } Utilities.debug("new_forum.numViewing : " + new_forum.numViewing); // Get the description/message of this topic Element el_description = 
el_tds.get(1).select("div.smallfont").first(); if (el_description != null) { new_forum.description = el_description.text(); } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } Utilities.debug("end parseForums"); return ret; }
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
/**
 * Builds the title-to-link map from the table of contents of the current swagger document.
 *
 * <p>The first child of every entry below the first {@code .sectlevel1} element is
 * registered as a {@code CATEGORY} link. The links inside each {@code .sectlevel2} element
 * are registered as {@code INDIVIDUAL} links, numbered from 1 in document order.
 *
 * @return map filled by {@code addLinksByType}
 */
private Map<String, ConfluenceLink> buildTableOfContentsLinkMap() {
    final Document swaggerDocument = SWAGGER_DOCUMENT.get();
    final Elements toc = swaggerDocument.select(".toc");

    // Category level: keep only the leading child of each top-level entry.
    final Elements categoryLinks = new Elements();
    for (final Element categoryEntry : toc.select(".sectlevel1").first().children()) {
        categoryLinks.add(categoryEntry.children().first());
    }

    final Elements individualSections = toc.select(".sectlevel2");

    final Map<String, ConfluenceLink> linksByTitle = new HashMap<>();
    addLinksByType(linksByTitle, categoryLinks, PageType.CATEGORY, null);

    // Individual level: the position of the section (1-based) is its category number.
    for (int index = 0; index < individualSections.size(); index++) {
        final Elements sectionLinks = individualSections.get(index).select("a");
        addLinksByType(linksByTitle, sectionLinks, INDIVIDUAL, index + 1);
    }

    return linksByTitle;
}
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
@Override public void visit(Page page) { int docid = page.getWebURL().getDocid(); String url = page.getWebURL().getURL(); logger.info(url);// www . j a va 2 s . c o m if (!url.startsWith("http://dianying.fm/movie/")) { return; } if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String html = htmlParseData.getHtml(); Document doc = Jsoup.parse(html); if (null != doc) { Elements filmTitleElements = doc.select(".fm-title"); if (CollectionUtils.isNotEmpty(filmTitleElements)) { String filmTitle = StringUtils.trimToEmpty(filmTitleElements.select("a[name]").text()); if (StringUtils.isNotBlank(filmTitle)) { // film Film film = createOrQueryFilm(url, doc, filmTitle); logger.info("Add Film: " + filmTitle); if (null != film) { // film_actor addFilmActorList(doc, film); // film_album addFilmAlbumList(doc, film); // film_review addFilmReview(doc, film); // film_genre addFilmGenreList(filmTitleElements, film); // film_region addFilmRegionList(doc, film); } } } } } }
From source file:de.geeksfactory.opacclient.apis.Pica.java
/**
 * Reads the advanced search form of the catalogue and describes it as search fields.
 *
 * <p>Fields are produced in this order: one text field per option of the {@code IKT0}
 * select, the {@code SRT} sort dropdown, text inputs and dropdowns whose name starts with
 * {@code ADI}, the {@code FUZZY} checkbox, and a media type dropdown built from the
 * {@code ADI_MAT} inputs. Each group is only present when the form contains it.
 *
 * <p>A bracketed or parenthesised two- to three-letter code in an {@code IKT0} option label
 * (optionally prefixed with {@code X} and suffixed with {@code :}) is removed from the
 * display name and stored upper-cased, without the colon, under the {@code meaning} key of
 * the field data.
 *
 * @return the search fields in form order
 * @throws IOException   if the form cannot be fetched
 * @throws JSONException if the field data cannot be built
 */
@Override
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    String html = httpGet(opac_url + "/LNG=" + getLang() + "/DB=" + db + "/ADVANCED_SEARCHFILTER",
            getDefaultEncoding());
    Document doc = Jsoup.parse(html);
    List<SearchField> fields = new ArrayList<>();

    // Compiled once instead of once per option.
    Pattern meaningPattern = Pattern.compile("\\[X?[A-Za-z]{2,3}:?\\]|\\(X?[A-Za-z]{2,3}:?\\)");

    Elements options = doc.select("select[name=IKT0] option");
    for (Element option : options) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(option.text());
        field.setId(option.attr("value"));
        field.setHint("");
        field.setData(new JSONObject("{\"ADI\": false}"));

        Matcher matcher = meaningPattern.matcher(field.getDisplayName());
        if (matcher.find()) {
            // Locale.ROOT keeps the code stable regardless of the device language (the
            // default-locale overload maps "i" to a dotted capital under a Turkish locale).
            field.getData().put("meaning",
                    matcher.group().replace(":", "").toUpperCase(java.util.Locale.ROOT));
            field.setDisplayName(matcher.replaceFirst("").trim());
        }
        fields.add(field);
    }

    Elements sort = doc.select("select[name=SRT]");
    if (sort.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        // A missing label yields an empty name, like the ADI fields below, instead of a
        // NullPointerException from first().
        Elements sortLabel = sort.first().parent().parent().select(".longval");
        field.setDisplayName(sortLabel.isEmpty() ? "" : sortLabel.first().text());
        field.setId("SRT");
        for (Element option : sort.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    for (Element input : doc.select("input[type=text][name^=ADI]")) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(input.parent().parent().select(".longkey").text());
        field.setId(input.attr("name"));
        field.setHint(input.parent().select("span").text());
        field.setData(new JSONObject("{\"ADI\": true}"));
        fields.add(field);
    }

    for (Element dropdown : doc.select("select[name^=ADI]")) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName(dropdown.parent().parent().select(".longkey").text());
        field.setId(dropdown.attr("name"));
        for (Element option : dropdown.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    Elements fuzzy = doc.select("input[name=FUZZY]");
    if (fuzzy.size() > 0) {
        CheckboxSearchField field = new CheckboxSearchField();
        Elements fuzzyLabel = fuzzy.first().parent().parent().select(".longkey");
        field.setDisplayName(fuzzyLabel.isEmpty() ? "" : fuzzyLabel.first().text());
        field.setId("FUZZY");
        fields.add(field);
    }

    Elements mediatypes = doc.select("input[name=ADI_MAT]");
    if (mediatypes.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName("Materialart");
        field.setId("ADI_MAT");
        field.addDropdownValue("", "Alle");
        for (Element mt : mediatypes) {
            // The label is taken from the element following the input's parent, with
            // non-breaking spaces removed.
            field.addDropdownValue(mt.attr("value"),
                    mt.parent().nextElementSibling().text().replace("\u00a0", ""));
        }
        fields.add(field);
    }

    return fields;
}
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
@Override public List<SearchField> getSearchFields() throws IOException, JSONException { if (!initialised) { start();//from w w w . java 2s.c o m } List<SearchField> fields = new ArrayList<>(); // Read branches and media types List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("link_profis.x", "0")); nameValuePairs.add(new BasicNameValuePair("link_profis.y", "1")); String html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); Document doc = Jsoup.parse(html); Elements fieldElems = doc.select(".suchfeldinhalt"); for (Element fieldElem : fieldElems) { String name = fieldElem.select(".suchfeld_inhalt_titel label").text(); String hint = ""; if (fieldElem.select(".suchfeld_inhalt_input").size() > 0) { List<TextNode> textNodes = fieldElem.select(".suchfeld_inhalt_input").first().textNodes(); if (textNodes.size() > 0) { for (TextNode node : textNodes) { String text = node.getWholeText().replace("\n", ""); if (!text.equals("")) { hint = node.getWholeText().replace("\n", ""); break; } } } } Elements inputs = fieldElem .select(".suchfeld_inhalt_input input[type=text], " + ".suchfeld_inhalt_input select"); if (inputs.size() == 1) { fields.add(createSearchField(name, hint, inputs.get(0))); } else if (inputs.size() == 2 && inputs.select("input[type=text]").size() == 2) { // Two text fields, e.g. year from/to or two keywords fields.add(createSearchField(name, hint, inputs.get(0))); TextSearchField secondField = (TextSearchField) createSearchField(name, hint, inputs.get(1)); secondField.setHalfWidth(true); fields.add(secondField); } else if (inputs.size() == 2 && inputs.get(0).tagName().equals("select") && inputs.get(1).tagName().equals("input") && inputs.get(0).attr("name").equals("feld1")) { // A dropdown to select from different search field types. // Break it down into single text fields. 
for (Element option : inputs.get(0).select("option")) { TextSearchField field = new TextSearchField(); field.setHint(hint); field.setDisplayName(option.text()); field.setId(inputs.get(1).attr("name") + "$" + option.attr("value")); JSONObject data = new JSONObject(); JSONObject params = new JSONObject(); params.put(inputs.get(0).attr("name"), option.attr("value")); data.put("additional_params", params); field.setData(data); fields.add(field); } } } DropdownSearchField orderField = new DropdownSearchField("orderselect", stringProvider.getString(StringProvider.ORDER), false, null); orderField.addDropdownValue("1", stringProvider.getString(StringProvider.ORDER_DEFAULT)); orderField.addDropdownValue("2:desc", stringProvider.getString(StringProvider.ORDER_YEAR_DESC)); orderField.addDropdownValue("2:asc", stringProvider.getString(StringProvider.ORDER_YEAR_ASC)); orderField.addDropdownValue("3:desc", stringProvider.getString(StringProvider.ORDER_CATEGORY_DESC)); orderField.addDropdownValue("3:asc", stringProvider.getString(StringProvider.ORDER_CATEGORY_ASC)); orderField.setMeaning(Meaning.ORDER); fields.add(orderField); return fields; }