List of usage examples for org.jsoup.nodes.Document.body()
public Element body()
From source file:mml.handler.post.MMLPostHTMLHandler.java
/** * Handle the request by writing everything out to scratch space * @param request/* w w w . ja v a 2s . com*/ * @param response * @param urn * @throws MMLException */ public void handle(HttpServletRequest request, HttpServletResponse response, String urn) throws MMLException { try { parseRequest(request); StringBuilder log = new StringBuilder(); Document doc = Jsoup.parseBodyFragment(html); Element body = doc.body(); parseBody(body); int totalWait = 0; while (Autosave.inProgress && totalWait < 100000) { Thread.sleep(4000); totalWait += 400; } if (totalWait >= 100000) throw new DbException("Save timed out"); if (!Autosave.inProgress) { Autosave.lock = true; saveCortex(log); saveCorcode(log); saveMetadata(log); Autosave.lock = false; } System.out.println(log.toString()); } catch (Exception e) { System.out.println(e.getMessage()); Autosave.lock = false; throw new MMLException(e); } }
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Parse the url and get all the content * @param link_html the url to parse/* ww w . j a v a 2s . c o m*/ * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { 
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:info.mikaelsvensson.devtools.sitesearch.SiteSearchPlugin.java
private IndexEntry createIndexEntry(final File file) { try {//w w w. j a v a 2 s.com Document document = Jsoup.parse(file, "UTF-8", "http://invalid.host"); Element contentEl = document.getElementById("contentBox"); if (contentEl == null) { contentEl = document.body(); } if (contentEl != null) { String text = Jsoup.clean(contentEl.html(), Whitelist.simpleText()); Collection<WordCount> wordCount = getWordCount(text); Collection<WordCount> filteredWordCount = filterWordCount(wordCount); return new IndexEntry(document.title(), getRelativePath(getSiteOutputFolder(), file), filteredWordCount); } } catch (IOException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } return null; }
From source file:gui.InboxPanel.java
/**
 * Returns the text of the body pane with its HTML markup removed.
 *
 * @return the text content of the pane's HTML, parsed as a body fragment
 */
private String getTextBody() {
    return Jsoup.parseBodyFragment(BodyTextPane.getText()).body().text();
}
From source file:com.crawler.app.run.CrawlSite.java
/** * This function is called when a page is fetched and ready to be processed * by your program./*from www. ja v a2s. c o m*/ */ public org.jsoup.nodes.Element convertUrlToDocument(String url) { try { Connection.Response response = Jsoup.connect(url) //enable for error urls .ignoreHttpErrors(true) //MAXIMUN TIME .timeout(50000) //This is to prevent producing garbage by attempting to parse a JPEG binary image .ignoreContentType(true).execute(); int status = response.statusCode(); //after done if (status == 200) { org.jsoup.nodes.Document doc = response.parse(); Element body = doc.body(); return body; } else { return null; } } catch (SocketTimeoutException se) { System.out.println("getContentOnly: SocketTimeoutException"); System.out.println(se.getMessage()); return null; } catch (Exception e) { System.out.println("getContentOnly: Exception"); e.printStackTrace(); return null; } }
From source file:fr.eolya.extraction.tika.TikaWrapper.java
private void processWithPdfToText(InputStream input) { File tempFile = null;/*from ww w . ja v a2s.co m*/ File tempFile2 = null; try { if (input != null && pdfToTextPath != null && !"".equals(pdfToTextPath)) { // Get a local copy of the file tempFile = createTempFile("tmp", ".pdf", tmpPath); if (!writeToFile(tempFile, input)) return; meta2 = new HashMap<String, String>(); meta2.put(META_CONTENTSIZE, String.valueOf(tempFile.length())); tempFile2 = createTempFile("tmp", ".html", tmpPath); Shell sh = new Shell(); // Convert with PDFTOTEXT - pdftotext -enc UTF-8 -raw -q -htmlmeta -eol unix in.pdf out.html sh.exec(pdfToTextPath, "-enc", "UTF-8", "-raw", "-q", "-htmlmeta", "-eol", "unix", tempFile.getAbsolutePath(), tempFile2.getAbsolutePath()).consumeAsString(); tempFile.delete(); // Load in string and add the <meta http-equiv='Content-Type' content='text/html; charset=utf-8'> line InputStreamReader fr1 = new InputStreamReader(new FileInputStream(tempFile2), "UTF-8"); BufferedReader br1 = new BufferedReader(fr1); StringBuilder sb = new StringBuilder(); while (br1.ready()) { String line = br1.readLine(); sb.append(line).append("\n"); if ("</head>".equals(line)) { sb.append("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>") .append("\n"); } } br1.close(); tempFile2.delete(); meta2.put(META_CONTENTTYPE, CONTENT_TYPE_PDF); text = sb.toString(); Document doc = Jsoup.parse(text); if (doc != null) { meta2.put(META_TITLE, doc.select("title").text()); meta2.put(META_AUTHOR, getMetaContent(doc, "Author")); String creationDate = getMetaContent(doc, "CreationDate"); if (creationDate != null) { // 20130322143113Z00'00' -> 2013-03-22T14:31:13Z Pattern p = Pattern.compile("[0-9]{14}Z[0-9]{2}'[0-9]{2}'"); Matcher m = p.matcher(creationDate); if (m.find()) { String value = String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ", creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8), creationDate.substring(8, 10), creationDate.substring(10, 
12), creationDate.substring(12, 14)); meta2.put(META_CREATED, value); } else { // 20130322143113+02'00' -> 2013-03-22T14:31:13Z p = Pattern.compile("[0-9]{14}\\+[0-9]{2}'[0-9]{2}'"); m = p.matcher(creationDate); if (m.find()) { String value = String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ", creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8), creationDate.substring(8, 10), creationDate.substring(10, 12), creationDate.substring(12, 14)); meta2.put(META_CREATED, value); } } } if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) { Document doc2 = new Cleaner(Whitelist.basic()).clean(doc); text = doc2.body().text(); } } } } catch (Exception e) { if (tempFile != null && tempFile.exists()) tempFile.delete(); if (tempFile2 != null && tempFile2.exists()) tempFile2.delete(); e.printStackTrace(); text = null; meta2 = null; } }
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java
private DBpediaOntologyOld() { Document doc = null; try {// ww w . java2 s . c om doc = Jsoup.connect(DBPEDIA_CLASSES_URL).get(); } catch (IOException ex) { Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex); } traverseHierarchy(doc.body().children().get(1).children().get(1), root, categoryMap); for (DBpediaCategory c : categoryMap.values()) { categoriesByUri.put(c.uri, c); } File dir = new File(DBPEDIA_CSV_FOLDER); for (File f : dir.listFiles()) { if (f.isFile() && f.getName().endsWith(".csv.gz")) { BufferedReader in = null; try { in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(f)))); } catch (IOException ex) { Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex); } String label = f.getName().replace(".csv.gz", ""); DBpediaCategory category = categoryMap.get(label); System.out.println("Processing category " + label); if (category == null) { System.out.println("Category " + label + " not found"); continue; } try { processFile(in, category, attributeMap); } catch (Exception ex) { Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex); } System.out.println(category.domainOfAttributes.size() + " attributes found"); for (DBpediaAttribute a : category.domainOfAttributes) { System.out.println(a.getName()); } } } try { typicalityEvaluator = new TypicalityEvaluator(DBPEDIA_CSV_FOLDER + "counts.bin"); } catch (IOException ex) { Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex); } catch (ClassNotFoundException ex) { Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex); } try { BufferedReader in = new BufferedReader(new FileReader(SUPERPAGES_FILE)); String l = in.readLine(); while (l != null) { StringTokenizer st = new StringTokenizer(l, "\t "); String category = st.nextToken(); String superpage = st.nextToken(); Integer count = Integer.parseInt(st.nextToken()); DBpediaCategory c = 
categoryMap.get(category); if (c != null) { c.updateMostPopularEntity(superpage, count); } l = in.readLine(); } in.close(); for (DBpediaCategory c : categoryMap.values()) { if (c.getMostPopularEntity() != null) { c.updateAncestorsPopularity(); } } } catch (Exception e) { e.printStackTrace(); } }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
void parseMultipleMonitorDays(SubstitutionSchedule v, Document doc, JSONObject data) throws JSONException, CredentialInvalidException { if (doc.select(".mon_head").size() > 1) { for (int j = 0; j < doc.select(".mon_head").size(); j++) { Document doc2 = Document.createShell(doc.baseUri()); doc2.body().appendChild(doc.select(".mon_head").get(j).clone()); Element next = doc.select(".mon_head").get(j).nextElementSibling(); if (next != null && next.tagName().equals("center")) { doc2.body().appendChild(next.select(".mon_title").first().clone()); if (next.select("table:has(tr.list)").size() > 0) { doc2.body().appendChild(next.select("table:has(tr.list)").first()); }//from w w w . j a va2 s . c o m if (next.select("table.info").size() > 0) { doc2.body().appendChild(next.select("table.info").first()); } } else if (doc.select(".mon_title").size() - 1 >= j) { doc2.body().appendChild(doc.select(".mon_title").get(j).clone()); doc2.body().appendChild(doc.select("table:has(tr.list)").get(j).clone()); } else { continue; } SubstitutionScheduleDay day = parseMonitorDay(doc2, data); v.addDay(day); } } else if (doc.select(".mon_title").size() > 1) { for (int j = 0; j < doc.select(".mon_title").size(); j++) { Document doc2 = Document.createShell(doc.baseUri()); doc2.body().appendChild(doc.select(".mon_title").get(j).clone()); Element next = doc.select(".mon_title").get(j).nextElementSibling(); while (next != null && !next.tagName().equals("center")) { doc2.body().appendChild(next); next = doc.select(".mon_title").get(j).nextElementSibling(); } SubstitutionScheduleDay day = parseMonitorDay(doc2, data); v.addDay(day); } } else { SubstitutionScheduleDay day = parseMonitorDay(doc, data); v.addDay(day); } }
From source file:org.wallride.web.support.Posts.java
protected String parse(String html) { Document document = Jsoup.parse(html); Elements elements = document.select("img"); for (Element element : elements) { String src = element.attr("src"); if (src.startsWith(wallRideProperties.getMediaUrlPrefix())) { String style = element.attr("style"); Pattern pattern = Pattern.compile("width: ([0-9]+)px;"); Matcher matcher = pattern.matcher(element.attr("style")); if (matcher.find()) { String replaced = src + "?w=" + Integer.parseInt(matcher.group(1)) * 2; element.attr("src", replaced); }//from w w w . j a v a 2s. c o m } } return document.body().html(); }
From source file:app.data.parse.WebPageUtil.java
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException { String original = url;//from w w w . ja va2 s . c o m // hit toutiao.io // fixme http://toutiao.io/shares/640539/url if (original.startsWith("https://toutiao.io/posts/")) { original = original.replace("/posts/", "/k/"); } // check cache WebPageInfo info = urlInfoCache != null ? urlInfoCache.getIfPresent(original) : null; if (info != null) { return info; } else { info = new WebPageInfo(); info.url = original; } // attach url Document doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() // hit gold.xitu.io if (info.url.startsWith("http://gold.xitu.io/entry/")) { Elements origin = doc.select("div[class=ellipsis]"); Elements originLink = origin.select("a[class=share-link]"); info.url = originLink.attr("href"); // reconnect doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() } info.url = smartUri(info.url); // get title Elements metaTitle = doc.select("meta[property=og:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } if (StringUtils.isEmpty(info.title)) { metaTitle = doc.select("meta[property=twitter:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } info.title = StringUtils.isEmpty(info.title) ? 
doc.title() : info.title; } // get desc Elements metaDesc = doc.select("meta[property=og:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[property=twitter:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[name=description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.body().select("p"); if (metaDesc != null) { for (Element element : metaDesc) { info.description = element.text(); if (info.description != null && info.description.length() >= 20) { break; } } } } } } info.description = ellipsis(info.description, 140, "..."); // cache info if (urlInfoCache != null) { urlInfoCache.put(original, info); } return info; }