List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * remove paragraphs that have less than x number of words, would indicate that it's some sort of link *//*from w ww.j a v a 2 s . c om*/ private void removeParagraphsWithFewWords() { if (logger.isDebugEnabled()) { logger.debug("removeParagraphsWithFewWords starting..."); } Elements allNodes = this.topNode.getAllElements(); for (Element el : allNodes) { try { // get stop words that appear in each node WordStats stopWords = StopWords.getStopWordCount(el.text()); if (stopWords.getStopWordCount() < 5 && el.getElementsByTag("object").size() == 0 && el.getElementsByTag("embed").size() == 0) { el.remove(); } } catch (IllegalArgumentException e) { logger.error(e.getMessage()); } //} } }
From source file:com.example.android.expandingcells.ExpandingCells.java
public void getHoroscope() { AsyncHttpClient client = new AsyncHttpClient(); client.get(//from ww w . j av a 2 s.c o m "http://pipes.yahoo.com/pipes/pipe.run?_id=_omfgXdL3BGGadhGdrq02Q&_render=json&sign=Virgo&url=http%3A%2F%2Fwww.astrology.com%2Fhoroscopes%2Fdaily-horoscope.rss", new JsonHttpResponseHandler() { @Override public void onSuccess(JSONObject response) { JSONArray horoscopeJsonResults = null; try { horoscopeJsonResults = response.getJSONObject("value").getJSONArray("items"); // Pass the index value based on what sunsign, here, 0->Aries, 1->Taurus ... etc String horoscopeFullString = horoscopeJsonResults.getJSONObject(0) .getString("description").toString(); Document doc = Jsoup.parse(horoscopeFullString); Element p = doc.select("p").first(); horoscopeText = p.text(); Horo_Image = "drawable://" + R.drawable.mb_horoscope; Horo = "Daily Horoscope for " + CommonLib.findZodiacSign("9", "6") + ": \n" + horoscopeText; Log.d("DEBUG", Horo); Log.d("DEBUG", "Horo"); NewsList.get(1).setNews(Horo_Image, "Horoscope", Horo); adapter.notifyDataSetChanged(); Log.d("DEBUG", "Horo1"); adapter.notifyDataSetChanged(); //itemList.add(new Bytes(Horo_Image,"Horoscope", Horo)); // Replace this with birth date //todoAdapter.add("Daily Horoscope for " + CommonLib.findZodiacSign("12", "6") + ": \n" + horoscopeText); } catch (JSONException e) { e.printStackTrace(); Log.d("DEBUG", "pipes"); } } }); }
From source file:accountgen.controller.Controller.java
/**
 * Copies the text of the first element matching {@code .tel} in {@code doc} into the
 * phone field of {@code p}.
 *
 * If the document contains no such element, {@code p} is left unchanged.
 *
 * @param doc the parsed page to read from
 * @param p   the person whose phone is set
 */
private void setPhone(Document doc, Person p) {
    Element tel = doc.select(".tel").first();
    if (tel == null) {
        // first() yields null on an empty selection; there is nothing to copy.
        return;
    }
    p.setPhone(tel.text());
}
From source file:accountgen.controller.Controller.java
/**
 * Sets the email of {@code p} to the first space-separated token of the text of the first
 * element matching {@code .email} in {@code doc}.
 *
 * If the document contains no such element, {@code p} is left unchanged.
 *
 * @param doc the parsed page to read from
 * @param p   the person whose email is set
 */
private void setEmail(Document doc, Person p) {
    Element em = doc.select(".email").first();
    if (em == null) {
        // first() yields null on an empty selection; there is nothing to copy.
        return;
    }
    // Only the token before the first space is kept.
    p.setEmail(em.text().split(" ")[0]);
}
From source file:com.amastigote.xdu.query.module.WaterAndElectricity.java
private List<String> query_metInfo() throws IOException { Document document = getPage("", METINFO_SUFFIX); Elements elements = document.select("td"); List<String> stringArrayList = new ArrayList<>(); for (Element td : elements) { String tmp = td.text(); if (!"".equals(tmp)) { stringArrayList.add(tmp);//from ww w .ja v a2 s. c o m } } for (int i = 0; i < stringArrayList.size(); i++) { stringArrayList.set(i, stringArrayList.get(i).substring(stringArrayList.get(i).indexOf("") + 1)); } /* * (stringArrayList): * - 0, ???? * - ? [ ?? | ? | ? ] * - , (3n), n??? * * - ?: ?null! */ return stringArrayList; }
From source file:net.devietti.ArchConfMapServlet.java
/** * Returns the URL of the external conference website (not the WikiCFP page) for the given * eventid./*from www .j av a 2s . c om*/ */ private void getConfLink(HttpServletRequest req, HttpServletResponse resp) throws IOException { String eids = req.getParameter("eventid"); if (eids == null) { error("missing required URL parameter: eventid"); return; } Integer eid; try { eid = Integer.valueOf(eids); } catch (NumberFormatException e) { error(e.getMessage()); return; } if (eid == null || eid == 0) { error("error parsing eventid"); return; } // pull down the CFP Document cfp = getURL("http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=" + eids); for (Element a : cfp.select("tr td[align=center] a")) { Element td = a.parent(); if (td.text().contains("Link:") && a.hasAttr("href") && a.attr("href").contains("http://")) { // got the link! resp.setContentType("application/json"); resp.getWriter().println(GSON.toJson(a.attr("href"))); return; } } error("no matching link"); }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java
/**
 * Collects the editors listed under the "version 1" headings of a specification page.
 *
 * Selects every {@code dt} containing "version 1" together with the {@code dd} siblings
 * that follow it, remembers the text of each {@code dt} as the current version label, and
 * parses each {@code dd} into one or more {@link Person} entries (several when the
 * {@code dd} is split by {@code <br>} tags). Anchors inside an entry become the person's
 * email (when the href contains "@") or websites.
 *
 * @param url the specification URL, used only in warning messages
 * @param doc the parsed specification page
 * @return the parsed editors, or {@code null} when the selector matches nothing or no
 *         editor could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd");
    if (editors.size() == 0) {
        return null;
    }

    boolean skip = false;
    String version = "";

    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev != null) {
            // A preceding dt that is not a "version 1" heading starts a section to skip.
            if (prev.tagName().equals("dt") && !isVersionOneHeading(prev)) {
                skip = true;
            }
            if (skip) {
                // Skipping ends once the following sibling is a "version 1" heading;
                // the current element is skipped either way.
                Element following = editor.nextElementSibling();
                if (following != null && isVersionOneHeading(following)) {
                    skip = false;
                }
                continue;
            }
        }

        if (editor.tagName().equals("dt")) {
            version = editor.text();
            continue;
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");
        if (splitted.length < 2) {
            // Single entry: work on the element itself.
            // Note that every "continue" here also bypasses the end-of-list check below.
            String text = editor.text();
            if (isNonEditorText(text)) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }

            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }

            result.setVersion(version);
            addLinks(result, editor);
            editorList.add(result);
        } else {
            // Several entries separated by <br>: parse each HTML fragment on its own.
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                if (isNonEditorText(split)) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }

                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                result.setVersion(version);
                addLinks(result, newdoc);
                editorList.add(result);
            }
        }

        // Stop at the next dt unless it opens another "editors (version 1" list.
        Element next = editor.nextElementSibling();
        if (next != null
                && next.tag().getName().equals("dt")
                && !next.text().trim().toLowerCase().startsWith("editors (version 1")) {
            break;
        }
    }

    if (editorList.size() == 0) {
        return null;
    }
    return editorList;
}

/** Tells whether the element's trimmed, lower-cased text starts like a "version 1" heading. */
private static boolean isVersionOneHeading(Element element) {
    String heading = element.text().trim().toLowerCase();
    return heading.startsWith("version 1") || heading.startsWith("editors (version 1");
}

/** Tells whether the text is one of the known remarks that appear in place of an editor name. */
private static boolean isNonEditorText(String text) {
    String lower = text.toLowerCase();
    return lower.startsWith("(in alphabetic")
            || lower.startsWith("see acknowl")
            || lower.startsWith("the w3")
            || lower.startsWith("(see ac")
            || lower.startsWith("see participants")
            || lower.contains("note:");
}

/** Records every non-empty anchor href under {@code scope} on {@code person} as email or website. */
private static void addLinks(Person person, Element scope) {
    // The selector is evaluated once instead of on every loop iteration.
    for (Element anchor : scope.select("a")) {
        String href = anchor.attr("href");
        if (href.isEmpty()) {
            continue;
        }
        if (href.contains("@")) {
            person.setEmail(href.replace("mailto:", ""));
        } else {
            person.addWebsite(href);
        }
    }
}
From source file:com.amastigote.xdu.query.module.WaterAndElectricity.java
/**
 * Posts a payment-record query for the given duration and returns the text of every
 * non-empty {@code td} cell of the result page, each passed through the prefix-stripping
 * step below.
 *
 * NOTE(review): the trailing comment of the original method was garbled by a character
 * encoding problem. What survives suggests it described the returned list as groups of
 * two entries ("2n") -- verify against the upstream source before relying on that.
 *
 * @param duration the period to query; only ONE_MONTH and THREE_MONTH are accepted
 * @return the processed cell texts, in document order
 * @throws IOException              declared by the original signature; presumably
 *                                  propagated from the page helpers -- confirm
 * @throws IllegalArgumentException if {@code duration} is any other value
 */
private List<String> query_payInfo(Duration duration) throws IOException {
    // Called before the form body is built; VIEWSTATE is read only after this call.
    getPageAttributes(PAYINFO_SUFFIX);

    String formBody = "But_Seach3=";
    switch (duration) {
        case ONE_MONTH:
            formBody += ONE_MONTH;
            break;
        case THREE_MONTH:
            formBody += THREE_MONTH;
            break;
        default:
            throw new IllegalArgumentException("Bad parameter, check document for help");
    }
    formBody += "&__VIEWSTATE=" + VIEWSTATE
            + "&HiddenField_webName="
            + "&HiddenField_UserID=" + ID;

    Document page = getPage(formBody, PAYINFO_SUFFIX);

    List<String> cells = new ArrayList<>();
    for (Element cell : page.select("td")) {
        String cellText = cell.text();
        if (cellText.isEmpty()) {
            continue;
        }
        cells.add(cellText);
    }

    // NOTE(review): all three string arguments below are empty. As written, contains("")
    // is always true, so the first branch always runs and drops the first two characters
    // of every entry. The literals look like they lost non-ASCII separator characters to
    // the same encoding damage -- confirm the intended values.
    for (int idx = 0; idx < cells.size(); idx++) {
        String value = cells.get(idx);
        if (value.contains("")) {
            cells.set(idx, value.substring(value.indexOf("") + 2));
        } else {
            cells.set(idx, value.substring(value.indexOf("") + 1));
        }
    }

    return cells;
}
From source file:com.amastigote.xdu.query.module.WaterAndElectricity.java
private List<String> query_useInfo(Duration duration) throws IOException { getPageAttributes(USEINFO_SUFFIX);//from w w w .j a va 2 s . c o m String OUTPUT_DATA = "But_Seach3="; switch (duration) { case ONE_MONTH: OUTPUT_DATA += ONE_MONTH; break; case THREE_MONTH: OUTPUT_DATA += THREE_MONTH; break; default: throw new IllegalArgumentException("Bad parameter, check document for help"); } OUTPUT_DATA += "&__VIEWSTATE="; OUTPUT_DATA += VIEWSTATE; OUTPUT_DATA += "&HiddenField_webName="; OUTPUT_DATA += "&HiddenField_UserID="; OUTPUT_DATA += ID; Document document = getPage(OUTPUT_DATA, USEINFO_SUFFIX); Elements elements = document.select("td"); List<String> stringArrayList = new ArrayList<>(); for (Element td : elements) { String tmp = td.text(); tmp = tmp.replaceAll(" ", ""); if (!"".equals(tmp)) { if (tmp.contains("???")) { stringArrayList.add(tmp.substring(0, tmp.indexOf("???"))); stringArrayList.add(tmp.substring(tmp.indexOf("???"))); continue; } stringArrayList.add(tmp); } } for (int i = 0; i < stringArrayList.size(); i++) { stringArrayList.set(i, stringArrayList.get(i).substring(stringArrayList.get(i).indexOf("") + 1)); } /* * (stringArrayList): * - 0, ???? * - ? [ ? | | ? | ?/? | ??? ] * - , (5n), n??? * * - ?: ?null! */ return stringArrayList; }
From source file:feedzilla.Feed.java
private void parser(Element entry) { boolean source = false; for (Element element : entry.children()) { switch (element.nodeName()) { case "id": this.id = Integer.parseInt(element.text().split(":")[1]); break; case "title": if (source) { this.source_title = element.text(); } else { this.title = element.text(); }//from ww w . jav a2 s . com break; case "summary": this.summary = element.text().split("<br")[0]; break; case "published": this.published = element.text(); break; case "updated": this.updated = element.text(); break; case "author": this.author = element.text(); break; case "link": if (source) { this.source_link = element.attr("href"); } else { this.link = element.attr("href"); } break; case "rights": this.copyright = element.text(); break; case "source": source = true; break; default: Log.debug("Unknow TAG: " + element.nodeName()); break; } } }