List of usage examples for org.jsoup.nodes.Document.title()
public String title()
From source file:org.brunocvcunha.taskerbox.impl.crawler.SniptAction.java
/**
 * Inspects every anchor found inside the {@code .grid-block} elements of the
 * given page. For each anchor the id is the {@code href} with the
 * {@code http://snipt.org/} prefix removed; when {@code canAct(id)} accepts it,
 * the id is recorded, the action is spread, the acted-on state is serialized
 * and the loop pauses for {@code FETCH_INTERVAL}.
 *
 * @param entry the parsed page to scan
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (Element anchor : entry.select(".grid-block").select("a")) {
        final String id = anchor.attr("href").replace("http://snipt.org/", "");
        final String title = id + " - " + anchor.text();

        // Nothing to do for ids that canAct rejects.
        if (!canAct(id)) {
            continue;
        }

        addAct(id);
        spreadAction(id, title);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}
From source file:org.brunocvcunha.taskerbox.impl.crawler.PastebinAction.java
/**
 * Inspects every anchor found inside the {@code .maintable} elements of the
 * given page. The id is the {@code href} without its first character; ids that
 * start with {@code "archive"} are skipped. When {@code canAct(id)} accepts the
 * id, it is recorded, the action is spread, the acted-on state is serialized
 * and the loop pauses for {@code FETCH_INTERVAL}.
 *
 * @param entry the parsed page to scan
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (Element anchor : entry.select(".maintable").select("a")) {
        final String href = anchor.attr("href");

        // jsoup returns "" for a missing attribute; "".substring(1) would throw
        // StringIndexOutOfBoundsException and abort the whole scan.
        if (href.isEmpty()) {
            continue;
        }

        final String id = href.substring(1);
        if (id.startsWith("archive")) {
            continue;
        }

        final String title = id + " - " + anchor.text();
        if (canAct(id)) {
            addAct(id);
            spreadAction(id, title);
            serializeAlreadyAct();
            sleep(FETCH_INTERVAL);
        }
    }
}
From source file:com.johan.vertretungsplan.parser.UntisMonitorParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, url.getBoolean("following"), docs); }//from ww w . j a va 2 s . c o m LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.title().contains("Untis")) { VertretungsplanTag tag = parseMonitorVertretungsplanTag(doc, schule.getData()); if (!tage.containsKey(tag.getDatum())) { tage.put(tag.getDatum(), tag); } else { VertretungsplanTag tagToMerge = tage.get(tag.getDatum()); tagToMerge.merge(tag); tage.put(tag.getDatum(), tagToMerge); } } else { //Fehler } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }
From source file:org.brunocvcunha.taskerbox.impl.crawler.CodepadAction.java
/**
 * Inspects every {@code .section} element of the given page. The id is the
 * {@code href} of the section's anchors with the {@code http://codepad.org/}
 * prefix removed; the title is the id followed by at most the first 32
 * characters of the section's {@code pre} text with line breaks replaced by
 * spaces. When {@code canAct(id)} accepts the id, it is recorded, the action is
 * spread, the acted-on state is serialized and the loop pauses for
 * {@code FETCH_INTERVAL}.
 *
 * @param entry the parsed page to scan
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (Element section : entry.select(".section")) {
        final String id = section.select("a").attr("href").replace("http://codepad.org/", "");

        // Single-line preview of the code, capped at 32 characters.
        final String flattened = section.select("pre").text().replaceAll("\r?\n", " ");
        final String preview = flattened.length() > 32 ? flattened.substring(0, 32) : flattened;
        final String title = id + " - " + preview;

        if (!canAct(id)) {
            continue;
        }

        addAct(id);
        spreadAction(id, title);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}
From source file:org.brunocvcunha.taskerbox.impl.crawler.PastieAction.java
/**
 * Inspects every {@code .pastePreview} element of the given page. The id is the
 * {@code href} of the element's anchors with the
 * {@code http://pastie.org/pastes/} prefix removed; the title is the id
 * followed by at most the first 32 characters of the element's {@code pre} text
 * with line breaks replaced by spaces. When {@code canAct(id)} accepts the id,
 * it is recorded, the action is spread, the acted-on state is serialized and
 * the loop pauses for {@code FETCH_INTERVAL}.
 *
 * @param entry the parsed page to scan
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (Element preview : entry.select(".pastePreview")) {
        final String id = preview.select("a").attr("href").replace("http://pastie.org/pastes/", "");

        // Single-line snippet of the paste, capped at 32 characters.
        final String flattened = preview.select("pre").text().replaceAll("\r?\n", " ");
        final String snippet = flattened.length() > 32 ? flattened.substring(0, 32) : flattened;
        final String title = id + " - " + snippet;

        if (!canAct(id)) {
            continue;
        }

        addAct(id);
        spreadAction(id, title);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}
From source file:cn.edu.hfut.dmic.webcollector.example.TutorialCrawler.java
/**
 * Handles one fetched page: prints its URL and title, stores title, URL and
 * HTML in the {@code tb_content} table when a {@code jdbcTemplate} is
 * configured, and returns the links collected from the page's document via
 * {@code regexRule}.
 *
 * @param page the fetched page
 * @return the links gathered from the page's document
 */
@Override
public Links visitAndGetNextLinks(Page page) {
    Document doc = page.getDoc();
    String title = doc.title();
    System.out.println("URL:" + page.getUrl() + " :" + title);

    // Persist the page only when a JDBC template is available.
    if (jdbcTemplate != null) {
        final int insertedRows = jdbcTemplate.update(
                "insert into tb_content (title,url,html) value(?,?,?)",
                title, page.getUrl(), page.getHtml());
        if (insertedRows == 1) {
            System.out.println("mysql??");
        }
    }

    // Gather the follow-up links from the document using regexRule.
    Links collected = new Links();
    collected.addAllFromDocument(doc, regexRule);
    return collected;
}
From source file:com.qubole.rubix.hadoop1.Hadoop1ClusterManager.java
private List<String> extractNodes(String dfsnodelist) { Document doc = Jsoup.parse(dfsnodelist); String title = doc.title(); List<String> workers = new ArrayList<String>(); Elements links = doc.select(".name"); for (int i = 0; i < links.size(); i++) { Elements nodes = links.get(i).select("a[href]"); if (nodes != null && nodes.size() > 0) { String node = nodes.get(0).ownText(); if (node != null && !node.isEmpty()) { workers.add(node);//from w w w .ja v a 2 s .c o m } } } Collections.sort(workers); return workers; }
From source file:fi.helsinki.opintoni.service.usefullink.UsefulLinkService.java
/**
 * Fetches {@code searchPageTitleDto.searchUrl} with an HTML {@code Accept}
 * header and a {@code Mozilla} user agent and, when the response status is
 * {@code 200 OK}, stores the page's title in
 * {@code searchPageTitleDto.searchResult}.
 *
 * @param searchPageTitleDto carries the URL to load; receives the title
 * @return the same DTO instance that was passed in
 * @throws NotFoundException declared in the signature; nothing in this body throws it explicitly
 */
public SearchPageTitleDto searchPageTitle(SearchPageTitleDto searchPageTitleDto) throws NotFoundException {
    try {
        HttpHeaders requestHeaders = new HttpHeaders();
        requestHeaders.setAccept(Lists.newArrayList(MediaType.TEXT_HTML));
        requestHeaders.add("User-Agent", "Mozilla");
        HttpEntity<String> request = new HttpEntity<>("parameters", requestHeaders);

        ResponseEntity<String> response = linkUrlLoaderRestTemplate.exchange(
                searchPageTitleDto.searchUrl, HttpMethod.GET, request, String.class);

        if (response.getStatusCode().equals(HttpStatus.OK)) {
            searchPageTitleDto.searchResult = Jsoup.parse(response.getBody()).title();
        }
    } catch (Exception ignored) {
        // NOTE(review): every failure (request, parsing, null body) is swallowed
        // and the DTO is returned without a title; consider logging here.
    }
    return searchPageTitleDto;
}
From source file:org.manalith.ircbot.plugin.uriinfo.UriInfoPlugin.java
private String getInfo(String uri) { String result = null;/*from w ww. ja va2s . c om*/ Response response; try { // ? ?? User Agent ? response = Jsoup.connect(uri).userAgent(USER_AGENT).execute(); } catch (UnsupportedMimeTypeException e) { return enablePrintContentType ? "[?? ?] " + e.getMimeType() : null; } catch (IOException e) { logger.warn(e.getMessage(), e); return null; } String contentType = response.contentType(); // ? title? ? . try { Document document = response.parse(); String title = document.title(); if (StringUtils.isBlank(title)) throw new IOException(); title = title.trim().replaceAll("(\\s){1,}", " "); // ?? String stitle = getSiteSpecificTitle(uri, document); if (stitle == null) result = "[?? ] " + title; else result = "[?? ] " + stitle + " | " + title; } catch (IOException e) { // parse ? title -- HTML? // ??? // content type if (contentType.startsWith("text/html")) result = "[?? ]"; } if (result == null && enablePrintContentType) { result = "[?? ?] " + contentType; } return result; }
From source file:app.data.parse.WebPageUtil.java
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException { String original = url;/*from w w w . j a va 2s .co m*/ // hit toutiao.io // fixme http://toutiao.io/shares/640539/url if (original.startsWith("https://toutiao.io/posts/")) { original = original.replace("/posts/", "/k/"); } // check cache WebPageInfo info = urlInfoCache != null ? urlInfoCache.getIfPresent(original) : null; if (info != null) { return info; } else { info = new WebPageInfo(); info.url = original; } // attach url Document doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() // hit gold.xitu.io if (info.url.startsWith("http://gold.xitu.io/entry/")) { Elements origin = doc.select("div[class=ellipsis]"); Elements originLink = origin.select("a[class=share-link]"); info.url = originLink.attr("href"); // reconnect doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() } info.url = smartUri(info.url); // get title Elements metaTitle = doc.select("meta[property=og:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } if (StringUtils.isEmpty(info.title)) { metaTitle = doc.select("meta[property=twitter:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } info.title = StringUtils.isEmpty(info.title) ? 
doc.title() : info.title; } // get desc Elements metaDesc = doc.select("meta[property=og:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[property=twitter:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[name=description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.body().select("p"); if (metaDesc != null) { for (Element element : metaDesc) { info.description = element.text(); if (info.description != null && info.description.length() >= 20) { break; } } } } } } info.description = ellipsis(info.description, 140, "..."); // cache info if (urlInfoCache != null) { urlInfoCache.put(original, info); } return info; }