List of usage examples for the org.jsoup.select.Elements select method
public Elements select(String query)
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Reads the language entry from the parsed book page.
 *
 * <p>Selects the nodes addressed by {@code PathMapping.LANGUAGE}, removes the
 * {@code <b>} elements found within them, and returns what is left of their
 * inner HTML with surrounding whitespace trimmed.
 *
 * @return the trimmed inner HTML of the selected language nodes
 */
private String findLanguage() {
    final Elements languageNodes = doc.select(PathMapping.LANGUAGE);

    // Drop the bold elements so only the remaining markup is returned.
    final Elements boldLabels = languageNodes.select("b");
    boldLabels.remove();

    final String remainingHtml = languageNodes.html();
    return remainingHtml.trim();
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
private Double findShippingWeight() { Elements weight = doc.select(PathMapping.WEIGHT); weight.select("b").remove(); weight.select("a").remove(); String str = weight.html().replace("(", "").replace(")", "").split(" ")[0]; try {//from ww w .java2s .c om if (StringUtils.isNotBlank(str)) { return Double.valueOf(str); } } catch (Exception ex) { } return null; }
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Reads the publisher name from the parsed book page.
 *
 * <p>Selects the nodes addressed by {@code PathMapping.PUBLISHER}, removes
 * the {@code <b>} elements found within them, and keeps the inner HTML that
 * precedes the first {@code "("}. If the trimmed result contains a
 * {@code ";"}, everything from the last semicolon onwards is dropped.
 *
 * @return the extracted publisher text; when the markup contains no
 *         {@code "("} the whole trimmed inner HTML is used as the starting
 *         point instead of failing
 */
private String findPublisher() {
    Elements publisher = doc.select(PathMapping.PUBLISHER);
    publisher.select("b").remove();

    String html = publisher.html();
    int parenIndex = html.indexOf("(");
    // Guard: substring(0, -1) would throw StringIndexOutOfBoundsException
    // when the entry has no parenthesised part.
    String str = (parenIndex >= 0 ? html.substring(0, parenIndex) : html).trim();

    int semicolonIndex = str.lastIndexOf(";");
    if (semicolonIndex != -1) {
        str = str.substring(0, semicolonIndex);
    }
    return str;
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
private Integer findPages() { Elements pages = doc.select(PathMapping.PAGES_HARDCOVER); pages.select("b").remove(); if (StringUtils.isNotBlank(pages.html())) { try {//w ww. j a v a 2s . co m return Integer.valueOf(pages.html().split(" ")[0].replaceAll(",", "")); } catch (Exception ex) { } } else { pages = doc.select(PathMapping.PAGES_PAPERBACK); pages.select("b").remove(); try { return Integer.valueOf(pages.html().split(" ")[0].replaceAll(",", "")); } catch (Exception ex) { } } return null; }
From source file:org.mashupmedia.task.MetaTaskScheduler.java
public void getMashupMediaLatestReleaseInformation() { String url = "http://www.mashupmedia.org/latest-release/final"; try {// w w w .ja v a2 s.c om ProxyTextFile proxyTextFile = (ProxyTextFile) proxyManager.loadProxyFile(url, ProxyType.TEXT_FILE); if (proxyTextFile == null) { logger.info( "Unable to find latest release from page: http://www.mashupmedia.org/latest-release/final"); return; } Document document = Jsoup.parse(proxyTextFile.getText()); Elements elements = document.select("div.view-latest-final-release div.views-row"); String releaseType = elements.select("div.views-field-field-release-type").text(); String version = elements.select("div.views-field-field-version").text(); logger.info("Found latest release information, type = " + releaseType + ", version = " + version); configurationManager.saveConfiguration(MashUpMediaConstants.LATEST_RELEASE_FINAL_VERSION, version); } catch (IOException e) { logger.error("Unable to get latest version information from www.mashupmedia.org", e); return; } }
From source file:org.javiermoreno.torrentscratcher.Runner.java
public Movie enrichMovieWithFilmAffinity(Movie movie) { try {/* w ww. j av a2s .c o m*/ String url = "http://www.filmaffinity.com/es/search.php?stext={title}&stype=all"; String title = URLEncoder.encode(movie.getTitle(), "UTF8"); url = url.replace("{title}", title); Document doc = Jsoup.connect(url).get(); if (doc.select("[property=og:title]").size() == 0) { // several results found, take the first Element firstResult = doc.select(".item-search .mc-title a").first(); if (firstResult == null) { // filmaffinity search engine failed log.warn("FilmAffinity 404: " + movie.getTitle()); return movie; } url = "http://www.filmaffinity.com" + firstResult.attr("href"); doc = Jsoup.connect(url).get(); } movie.setFilmAffinityId(doc.select("div.rate-movie-box").attr("data-movie-id")); Elements movieInfo = doc.select("dl.movie-info"); String originalTitle = movieInfo.select("dd").eq(0).text(); originalTitle = originalTitle.replaceAll("\\([^\\(]*\\)", "").replaceAll("\\[[^\\(]*\\]", "") .replaceAll("aka$", "").trim(); movie.setOriginalTitle(originalTitle); movie.setDescription(movieInfo.select("dd").eq(11).text()); } catch (IOException ex) { log.warn(ex.getMessage()); } return movie; }
From source file:org.hmzb.test.HttpClientTest.java
@Test public final void testPMS() throws IOException { // Map<String, String> data = new HashMap<String, String>(); // data.put("act", "module"); // data.put("name", "sns"); // data.put("do", "post"); // data.put("id", "137"); // data.put("replyid", ""); // data.put("postid", "150"); // data.put("reply_content", "?, ?"); String url = "http://pms.local.17173.com/task_list_department.php?action=search&employment_id=&state=0&time_id=plan&start_date=2014-01-01&end_date=2014-05-16&x=24&y=5"; String cookieValue = "SUV=1381469482625841; NUV=1381507200000; sohutag=8HsmeSc5NCwmcyc5NCwmYjc5NCwmYSc5NCwmZjc5NCwmZyc5Njwmbjc5NCwmaSc5NCwmdyc5NCwmaCc5NCwmYyc5NCwmZSc5NCwmbSc5NH0; __utma=113262040.1666690635.1382600575.1382600575.1382600575.1; vjuids=c639cb6b1.142370b45c7.0.ef4cedbb; Hm_lvt_0245ebe4fb30a09e371e4f011dec1f6a=1388137801; live_17173_unique=e7de7aed49953586fc1da607967cf847; _ga=GA1.2.1666690635.1382600575; pgv_pvi=2611450780; vjlast=1383902955.1399958818.22; ermpdockData=1,2,4,13,17; DIFF=1400117702510; IPLOC=CN3501; ErmpToken=Q1k1MzIw; ErmpTicket=MTAuNS4xNS4xNg; ppinf=2|1401269453|1402479053|bG9naW5pZDowOnx1c2VyaWQ6MTY6cHR6aHVmQDE3MTczLmNvbXxzZXJ2aWNldXNlOjMwOjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMHxjcnQ6MTA6MjAxMi0xMS0yMHxlbXQ6MTowfGFwcGlkOjQ6MTA3N3x0cnVzdDoxOjF8cGFydG5lcmlkOjE6MHxyZWxhdGlvbjowOnx1dWlkOjE2OmRiYmNhNTA3ZjNmMjRjMnR8dWlkOjk6czg3MDM4OTcwfHVuaXFuYW1lOjQ0OiVFNiU5MCU5QyVFNyU4QiU5MCVFNyVCRCU5MSVFNSU4RiU4QjMxNDI4NjcxfA; pprdig=Hs7tIw6klJdNasYa5mYo4aOzZnr2dL96PkIAMo8K4KGp4UM2yhx2LHuNOZ5zX7s4pKShi4GnXYFIIyAW-BWRJCAgmI2qeorvqshYjT5gs4gWKGgJNtoQAbdIt1liIK-Bt1aX_mYueEHUA_yRDVhRxRVLVt3mtlgywukd-stCIOE; lastdomain=1402479053|cHR6aHVmQDE3MTczLmNvbXw|17173.com; PHPSESSID=qcr7raandp6l0k7g9vpg0lgn22; PMS_cypms_username=fuzhu; PMS_cypms_auth=c0b47dad95a0e7ef7505d9ce057b6651"; Document resultDoc = Jsoup.connect(url).header("cookie", cookieValue).timeout(20000).get(); Elements table = resultDoc.select("table.list"); Elements trs = table.select("tr"); // /*from www. 
j av a2s .co m*/ trs.remove(0); // ?? trs.remove(trs.size() - 1); // Double totalTime = 0d; String regex = ".*?.*"; for (Element element : trs) { Elements tds = element.select("td"); // System.out.println(tds); String projectName = tds.get(3).text(); Double realTime = Double.valueOf(tds.get(7).text()); if (projectName.matches(regex)) { totalTime += realTime; } } System.out.println(totalTime); }
From source file:Leitura.Jxr.java
public String leituraJxr() throws IOException { //mtodo para pegar os nomes dos mtodos declarados Elements elements = document.getElementsByTag("pre"); elements.select("a.jxr_linenumber").remove(); // elements.select("strong.jxr_keyword").remove(); // elements.select("span.jxr_string").remove(); // elements.select("em.jxr_comment").remove(); for (Element children : elements) { children.getElementsByClass("jxr_comment").remove(); children.getElementsByClass("jxr_javadoccomment").remove(); }/*from w w w . j a v a 2s .co m*/ return elements.text(); // retorna o cdigo sem lixo }
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", "");//from ww w . j a v a 2s . co m if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); // 
<div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); 
mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; }
From source file:co.dilaver.quoter.fragments.QODFragment.java
private void parseQodResponse(JSONObject response) throws JSONException { JSONObject parse = response.getJSONObject("parse"); JSONObject text = parse.getJSONObject("text"); String content = text.getString("*"); Document doc = Jsoup.parse(content); Elements table = doc.select("table[style=\"text-align:center; width:100%\"]"); Elements rows = table.select("tr"); Elements qod = rows.get(0).select("td"); Elements author = rows.get(1).select("td"); Whitelist whitelist = Whitelist.none(); String newQuote = Html.fromHtml(Jsoup.clean(qod.toString(), whitelist)).toString(); String newAuthor = Html.fromHtml(Jsoup.clean(author.toString(), whitelist).replace("~", "")).toString(); if (!qodString.equals("") && !authorString.equals("")) { if (!qodString.equals(newQuote) || !authorString.equals(newAuthor)) { Snackbar.make(rootLayout, getString(R.string.str_Refreshing), Snackbar.LENGTH_SHORT).show(); }//from ww w .j a v a2s . c o m } qodString = newQuote; authorString = newAuthor; sharedPrefStorage.setQodText(qodString); sharedPrefStorage.setQodAuthor(authorString); Log.e(TAG, "quote: " + qodString); Log.e(TAG, "author: " + authorString); qodText.setText(getString(R.string.str_WithinQuotation, qodString)); qodAuthor.setText(authorString); }