List of usage examples for the org.jsoup.nodes.Document method text()
public String text()
From source file:ai.susi.server.api.susi.GenericScraper.java
/** * Article API//from w w w.j a v a 2s . com * @param URL * @param JSONObject genericScraperData * @return genericScraperData */ public JSONObject articleAPI(String url, JSONObject genericScraperData) throws MalformedURLException { URL qurl = new URL(url); String data = ""; try { data = null;// ArticleExtractor.INSTANCE.getText(qurl); genericScraperData.put("query", qurl); genericScraperData.put("data", data); genericScraperData.put("NLP", "true"); } catch (Exception e) { if ("".equals(data)) { try { Document htmlPage = Jsoup.connect(url).get(); data = htmlPage.text(); genericScraperData.put("query", qurl); genericScraperData.put("data", data); genericScraperData.put("NLP", "false"); } catch (Exception ex) { } } } return genericScraperData; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule6.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Element editor = null;//from ww w. j av a 2 s. c om try { editor = doc.select(".authlist").get(0).select("p").get(0); } catch (IndexOutOfBoundsException e) { try { editor = doc.select("h4:contains(Editors) ~ p").get(0); } catch (IndexOutOfBoundsException e1) { return null; } } String[] splitted = editor.html().split("<br />"); if (splitted.length < 2) splitted = editor.html().split("<br clear=\"none\" />"); for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java
/**
 * Collects the people listed in the {@code dd} elements that follow a
 * "Principal Author" {@code dt}.
 *
 * <p>Each {@code dd}'s HTML is split on commas and every piece is parsed separately.
 *
 * @param url address of the specification, used only in the warning message
 * @param doc parsed specification page
 * @return the parsed people, or {@code null} when nothing matches or nobody was parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Principal Author) ~ dd");
    if (candidates.size() == 0) {
        return null;
    }

    ArrayList<Person> editorList = new ArrayList<>();
    // Once set, skip stays set until a dd whose next sibling starts a principal-author entry.
    boolean skip = false;
    for (Element editor : candidates) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt")
                && !prev.text().trim().toLowerCase().startsWith("principal author")) {
            skip = true;
        }
        if (skip) {
            Element following = editor.nextElementSibling();
            if (following != null
                    && following.text().trim().toLowerCase().startsWith("principal author")) {
                skip = false;
            }
            continue;
        }

        for (String piece : editor.html().split(",")) {
            if (piece.isEmpty()) {
                continue;
            }
            if (isSectionNote(piece)) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (piece.equals("WHATWG:") || piece.equals("W3C:")) {
                continue;
            }
            Document fragment = Jsoup.parse(piece.replaceAll("\n", ""));
            Person person = NameParser.parse(fragment.text());
            if (person == null) {
                continue;
            }
            attachLinks(person, fragment);
            editorList.add(person);
        }
    }

    if (editorList.size() == 0) {
        return null;
    }
    return editorList;
}

/** Tells whether the text matches one of the known phrases that are not a person's name. */
private boolean isSectionNote(String text) {
    String lower = text.toLowerCase();
    return lower.startsWith("(in alphabetic")
            || lower.startsWith("see acknowl")
            || lower.startsWith("the w3")
            || lower.startsWith("(see ac")
            || lower.startsWith("see participants")
            || lower.contains("note:");
}

/** Stores every non-empty href under {@code source} on the person: '@' means e-mail, else website. */
private void attachLinks(Person person, Element source) {
    for (Element anchor : source.select("a")) {
        String href = anchor.attr("href");
        if (href.isEmpty()) {
            continue;
        }
        if (href.contains("@")) {
            person.setEmail(href.replace("mailto:", ""));
        } else {
            person.addWebsite(href);
        }
    }
}
From source file:org.wallride.web.support.Posts.java
public String summary(Post post, int length) { Document document = Jsoup.parse(post.getBody()); String summary = document.text(); if (!StringUtils.hasText(summary)) { return summary; }// w ww. ja v a 2 s .c o m summary = summary.replaceAll("<.+?>", ""); if (!StringUtils.hasText(summary)) { return summary; } if (summary.length() <= length) { return summary; } return summary.substring(0, length) + "..."; }
From source file:com.switchfly.inputvalidation.sanitizer.StripHtmlSanitizer.java
@Override public String execute(String content) { if (StringUtils.isBlank(content)) { return content; }//from ww w . java 2s . co m Document document = Jsoup.parse(content); document.outputSettings().escapeMode(Entities.EscapeMode.xhtml); for (Element element : document.select("script,link,iframe,style")) { element.remove(); } return document.text(); }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("authors/editors") && !prev.text().trim().toLowerCase().startsWith("author/editor")) { skip = true;// w w w .j a v a 2 s.c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("authors/editors") || next.text().trim().toLowerCase().startsWith("author/editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", url + ": This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if 
(!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java
/**
 * Collects the people listed in the {@code dd} elements that follow an "Editor" or
 * "Edition Editor" {@code dt}.
 *
 * <p>A {@code dd} whose text contains more than two {@code " - "} separators is treated as
 * a dash-separated list, and the whole document is handed to {@code EditorsRule5} instead.
 *
 * @param url address of the specification, used in warnings and in the delegation
 * @param doc parsed specification page
 * @return the parsed people, or {@code null} when nothing matches or nobody was parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd");
    if (candidates.size() == 0) {
        return null;
    }

    ArrayList<Person> editorList = new ArrayList<>();
    // Once set, skip stays set until a dd whose next sibling starts an editor entry.
    boolean skip = false;
    for (Element editor : candidates) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt")) {
            String heading = prev.text().trim().toLowerCase();
            boolean editorHeading = heading.startsWith("editor") || heading.startsWith("edition editor");
            if (!editorHeading || heading.contains("version") || heading.endsWith("draft:")) {
                skip = true;
            }
        }
        if (skip) {
            Element following = editor.nextElementSibling();
            if (following != null) {
                String label = following.text().trim().toLowerCase();
                if (label.startsWith("editor") || label.contains("edition editor")) {
                    skip = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(editor.text(), " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by - ");
            return new EditorsRule5().run(url, doc);
        }

        String[] pieces = editor.html().split("<br />|<br clear=\"none\" />");
        if (pieces.length < 2) {
            // Single entry: parse the dd itself. Each continue here also bypasses the
            // end-of-list check at the bottom of the loop.
            String text = editor.text();
            if (isSectionNote(text)) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(text);
            if (person == null) {
                continue;
            }
            attachLinks(person, editor);
            editorList.add(person);
        } else {
            for (String piece : pieces) {
                if (piece.isEmpty()) {
                    continue;
                }
                if (isSectionNote(piece)) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (piece.equals("WHATWG:") || piece.equals("W3C:")) {
                    continue;
                }
                Document fragment = Jsoup.parse(piece.replaceAll("\n", ""));
                Person person = NameParser.parse(fragment.text());
                if (person == null) {
                    continue;
                }
                attachLinks(person, fragment);
                editorList.add(person);
            }
        }

        // Stop as soon as the next sibling opens a new dt entry.
        Element following = editor.nextElementSibling();
        if (following != null && following.tag().getName().equals("dt")) {
            break;
        }
    }

    if (editorList.size() == 0) {
        return null;
    }
    return editorList;
}

/** Tells whether the text matches one of the known phrases that are not a person's name. */
private boolean isSectionNote(String text) {
    String lower = text.toLowerCase();
    return lower.startsWith("(in alphabetic")
            || lower.startsWith("see acknowl")
            || lower.startsWith("the w3")
            || lower.startsWith("(see ac")
            || lower.startsWith("see participants")
            || lower.contains("note:");
}

/** Stores every non-empty href under {@code source} on the person: '@' means e-mail, else website. */
private void attachLinks(Person person, Element source) {
    for (Element anchor : source.select("a")) {
        String href = anchor.attr("href");
        if (href.isEmpty()) {
            continue;
        }
        if (href.contains("@")) {
            person.setEmail(href.replace("mailto:", ""));
        } else {
            person.addWebsite(href);
        }
    }
}
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("h4:contains(Editor) ~ blockquote"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("h4")) { if ((!prev.text().trim().toLowerCase().startsWith("editor") && !prev.text().trim().toLowerCase().startsWith("edition editor")) || prev.text().trim().toLowerCase().endsWith("version:") || prev.text().trim().toLowerCase().endsWith("draft:")) { skip = true;//from w w w. j a v a 2 s.c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("editor") || next.text().trim().toLowerCase().contains("edition editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", "This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { 
result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("h4")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; String version = ""; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev != null) { if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("version 1") && !prev.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = true;//from w w w . j a va2 s . c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("version 1") || next.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = false; continue; } } continue; } } if (editor.tagName().equals("dt")) { version = editor.text(); continue; } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for 
(String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt") && !next.text().trim().toLowerCase().startsWith("editors (version 1")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java
public DocumentWrap crawl(String uriroot, File file) throws CrawlException { DocumentWrap document = new DocumentWrap(); try {/*ww w .j a v a2 s . com*/ // Get the body from the WORD String htmlBody = FileUtils.readFileToString(file); Document doc = Jsoup.parse(htmlBody); if (uriroot != null) doc.setBaseUri(uriroot); setAnchors(doc, uriroot); // Setup the document document.setContent(doc.text(), bStoreBody); document.setSize((int) file.length()); document.setType("text/html"); document.setId(file.getCanonicalPath()); if (uriroot != null) document.setURL(getUrl(uriroot, file)); } catch (FileNotFoundException e) { throw new CrawlException("File not found: " + file, e); } catch (IOException e) { throw new CrawlException("File: " + file, e); } catch (Exception e) { throw new CrawlException("File: " + file, e); } return document; }