List of usage examples for org.jsoup.nodes.Element#ownText()
public String ownText()
From source file:Main.java
/**
 * Parses a small inline HTML document and prints the own text of the first element whose
 * {@code class} attribute equals {@code "artist"}.
 */
public static void main(String[] args) throws Exception {
    final String html =
            "<html><head/><body><a href=\"#\" class=\"artist\">Soulive<span class=\"create-play\">Play</span></a></body></html>";
    final Element artist = Jsoup.parse(html)
            .getElementsByAttributeValue("class", "artist")
            .first();
    // Only the anchor's own text is printed here, via ownText() rather than text().
    System.out.println(artist.ownText());
}
From source file:Main.java
/**
 * Renders {@code root} and its descendants as an indented outline.
 *
 * <p>Each element produces one line of the form {@code <indent>tagName:ownText}, where the
 * indent is {@code indentation} spaces. Children are rendered recursively one level deeper,
 * and every rendered child is followed by an additional newline.
 *
 * @param root        element to render
 * @param indentation number of leading spaces for {@code root}'s own line
 * @return the rendered outline
 */
public static String printNode(Element root, int indentation) {
    StringBuilder out = new StringBuilder();
    int pad = 0;
    while (pad < indentation) {
        out.append(' ');
        pad++;
    }
    out.append(root.tagName()).append(":").append(root.ownText()).append("\n");
    for (Element child : root.children()) {
        out.append(printNode(child, indentation + 1)).append("\n");
    }
    return out.toString();
}
From source file:Main.java
/**
 * Builds an indented outline of {@code root} and all elements below it.
 *
 * <p>The line for an element is {@code indentation} spaces, the tag name, a colon, the
 * element's own text and a newline. Each child is rendered recursively with one more space
 * of indentation and is followed by one extra newline.
 *
 * @param root        element at the top of the outline
 * @param indentation number of spaces placed before {@code root}'s tag name; also used as
 *                    the initial capacity of the buffer
 * @return the outline text
 */
private static String printNode(Element root, int indentation) {
    StringBuilder buffer = new StringBuilder(indentation);
    for (int remaining = indentation; remaining > 0; remaining--) {
        buffer.append(' ');
    }
    buffer.append(root.tagName())
            .append(':')
            .append(root.ownText())
            .append('\n');
    for (Element child : root.children()) {
        String rendered = printNode(child, indentation + 1);
        buffer.append(rendered).append('\n');
    }
    return buffer.toString();
}
From source file:com.nuance.expertassistant.ContentExtractor.java
/**
 * Walks every element of {@code doc} and emits a nested {@code <section>} / {@code <para>}
 * structure through {@code print}.
 *
 * <p>An outer section titled with the document title is opened first. For each element that
 * has non-blank own text:
 * <ul>
 *   <li>elements whose tag name contains {@code "a"} are skipped;</li>
 *   <li>elements whose tag name contains {@code h1}..{@code h4} and whose text is non-empty
 *       close any open section of the same or a deeper heading level and open a new section
 *       titled with the element's text;</li>
 *   <li>everything else is emitted as a {@code <para>} holding the element's own text.</li>
 * </ul>
 * All sections still open at the end are closed, followed by the outer section.
 *
 * @param doc parsed document to convert
 */
public static void extract(Document doc) {
    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">");

    // Heading levels ("h1".."h4") whose <section> has been opened but not yet closed.
    final ArrayList<String> openHeaderList = new ArrayList<String>();

    for (final Element element : doc.select("*")) {
        final String ownText = element.ownText();
        // Compare by content: the previous `trim() == ""` was a reference comparison and did
        // not reliably detect whitespace-only text.
        if (ownText == null || ownText.trim().isEmpty()) {
            continue;
        }

        final String tagName = element.tagName();
        // NOTE(review): contains("a") also matches tags such as "span" or "table", not only
        // anchors - confirm whether equals("a") was intended. Kept as-is to preserve output.
        if (tagName.contains("a")) {
            continue;
        }

        final int level = extractHeadingLevel(tagName);
        final String text = element.text();
        if (level > 0 && text != null && !text.isEmpty()) {
            closeOpenSectionsFrom(openHeaderList, level);
            print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(text) + "\">");
            openHeaderList.add("h" + level);
        } else {
            print("<para>");
            print(stripNonValidXMLCharacters(ownText));
            print("</para>");
        }
    }

    closeOpenSectionsFrom(openHeaderList, 1);
    print("</section>");
}

/** Returns the first level in 1..4 for which {@code tagName} contains {@code "h<level>"}, or 0 if none. */
private static int extractHeadingLevel(String tagName) {
    for (int level = 1; level <= 4; level++) {
        if (tagName.contains("h" + level)) {
            return level;
        }
    }
    return 0;
}

/** Closes, in ascending level order, one open section for each level from {@code fromLevel} to 4. */
private static void closeOpenSectionsFrom(ArrayList<String> openHeaderList, int fromLevel) {
    for (int level = fromLevel; level <= 4; level++) {
        // remove(Object) returns true only when an entry for that level was present.
        if (openHeaderList.remove("h" + level)) {
            print("</section>");
        }
    }
}
From source file:me.vertretungsplan.parser.UntisCommonParser.java
private static Element getContentElement(Element cell) { if (cell.ownText().isEmpty() && cell.select("> span").size() == 1) { cell = cell.select("> span").first(); }//from ww w. j a v a 2 s . c om return cell; }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
static void handleRoom(Substitution subst, Element cell) { cell = getContentElement(cell);//w w w . j a v a2s. com if (cell.select("s").size() > 0) { subst.setPreviousRoom(cell.select("s").text()); if (cell.ownText().length() > 0) { subst.setRoom(cell.ownText().replaceFirst("^\\?", "").replaceFirst("", "")); } } else { subst.setRoom(cell.text()); } }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
static void handleSubject(Substitution subst, Element cell) { cell = getContentElement(cell);//from w ww .ja va 2s.c om if (cell.select("s").size() > 0) { subst.setPreviousSubject(cell.select("s").text()); if (cell.ownText().length() > 0) { subst.setSubject(cell.ownText().replaceFirst("^\\?", "").replaceFirst("", "")); } } else { subst.setSubject(cell.text()); } }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
static void handleTeacher(Substitution subst, Element cell, JSONObject data) { cell = getContentElement(cell);//from ww w . ja v a 2 s .c o m if (cell.select("s").size() > 0) { subst.setPreviousTeachers(splitTeachers(cell.select("s").text(), data)); if (cell.ownText().length() > 0) { subst.setTeachers( splitTeachers(cell.ownText().replaceFirst("^\\?", "").replaceFirst("", ""), data)); } } else { subst.setTeachers(splitTeachers(cell.text(), data)); } }
From source file:hello.Scraper.java
@Transformer(inputChannel = "channel3", outputChannel = "channel4") public DumpEntry convert(Element payload) throws ParseException { String dateStr = payload.ownText().substring(0, 19); DateFormat format = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); format.setTimeZone(TimeZone.getTimeZone("GMT")); Date timestamp = format.parse(dateStr); Elements list = payload.select("a"); String id;//from w ww .j av a2 s . c o m String ref; if (list.size() > 0) { Element a = list.get(0); id = a.ownText(); ref = a.attr("href"); } else { id = "private data"; ref = null; } Element span = payload.select("span").get(0); String status = span.ownText(); return new DumpEntry(timestamp, id, ref, status); }
From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBPostProvider.java
/** * Return a mapping table mapping from RDF properties to XPath Value Mappers. Each entry in the * map is evaluated/*from ww w . ja v a 2 s.c o m*/ * in turn; in case the XPath expression yields a result, the property is added for the * processed resource. * * @return * @param requestUrl */ @Override protected Map<String, JSoupMapper> getMappings(String resource, String requestUrl) { URI uri = null; try { uri = new URI(requestUrl); Map<String, String> params = new HashMap<String, String>(); for (NameValuePair p : URLEncodedUtils.parse(uri, "UTF-8")) { params.put(p.getName(), p.getValue()); } if (params.containsKey("p")) { // mappings for a reply that has directly been addressed using the ?p=... parameter // to viewtopic.php, e.g. http://www.carving-ski.de/phpBB/viewtopic.php?p=119208 Map<String, JSoupMapper> commentMappings = new HashMap<String, JSoupMapper>(); commentMappings.put(Namespaces.NS_DC + "title", new CssTextLiteralMapper( String.format("div#pagecontent table:has(a[name=p%s]) td.gensmall div", params.get("p"))) { @Override public List<Value> map(String resourceUri, Element elem, ValueFactory factory) { final String val = elem.ownText().replaceFirst("^\\s*:", "").replaceAll(" ", " ") .trim(); if (datatype != null) return Collections.singletonList((Value) factory.createLiteral(val, factory.createURI(Namespaces.NS_XSD + datatype))); else return Collections.singletonList((Value) factory.createLiteral(val)); } @Override public Elements select(Element htmlDoc) { final Element first = super.select(htmlDoc).first(); return first != null ? 
new Elements(first) : new Elements(); } }); commentMappings.put(Namespaces.NS_DC + "creator", new CssTextLiteralMapper( String.format("div#pagecontent table:has(a[name=p%s]) .postauthor", params.get("p")))); commentMappings.put(Namespaces.NS_DC + "description", new CssTextLiteralMapper( String.format("div#pagecontent table:has(a[name=p%s]) div.postbody", params.get("p")))); commentMappings.put(Namespaces.NS_DC + "date", new PHPBBDateMapper( String.format("div#pagecontent td.gensmall:has(a[name=p%s]) div", params.get("p"))) { @Override public Elements select(Element htmlDoc) { final Elements sel = super.select(htmlDoc); if (sel.size() > 0) { final Element e = sel.get(1); if (e != null) return new Elements(e); } return new Elements(); } }); return commentMappings; } else throw new RuntimeException( "the requested resource does not seem to identify a PHPBB Post (p=... parameter missing)"); } catch (URISyntaxException e) { throw new RuntimeException( "the requested resource does not seem to identify a PHPBB Post (URI syntax error)"); } }