Usage examples for org.jsoup.nodes.Document#title()
Method signature: public String title()
From source file:io.github.carlomicieli.footballdb.starter.parsers.SeasonGamesParser.java
/**
 * Extracts the year prefix from the page title.
 *
 * @param doc the parsed page; its title is assumed to begin with a
 *            four-digit year — TODO confirm against the source site
 * @return the first four characters of the document title
 */
protected static String extractYear(Document doc) {
    final String title = doc.title();
    return title.substring(0, 4);
}
From source file:io.github.carlomicieli.footballdb.starter.documents.WebDocumentDownloader.java
private static Document downloadFromURL(String url) { validateUrl(url);// w w w . j a v a 2 s .co m try { Document doc = Jsoup.connect(url).userAgent(CHROME_USER_AGENT).get(); App.log().info("Downloading '{}'...", doc.title()); return doc; } catch (IOException e) { App.log().error("Error for '{}': {}", url, e); return null; } }
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/*from w w w . j a v a 2s . com*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:com.zack6849.alphabot.api.Utils.java
public static String getTitle(String link) { String response = ""; try {//from ww w .j av a 2 s .co m HttpURLConnection conn = (HttpURLConnection) new URL(link).openConnection(); conn.addRequestProperty("User-Agent", USER_AGENT); String type = conn.getContentType(); int length = conn.getContentLength() / 1024; response = String.format("HTTP %s: %s", conn.getResponseCode(), conn.getResponseMessage()); String info; if (type.contains("text") || type.contains("application")) { Document doc = Jsoup.connect(link).userAgent(USER_AGENT).followRedirects(true).get(); String title = doc.title() == null || doc.title().isEmpty() ? "No title found!" : doc.title(); info = String.format("%s - (Content Type: %s Size: %skb)", title, type, length); return info; } info = String.format("Content Type: %s Size: %skb", type, length); return info; } catch (IOException ex) { if (ex.getMessage().contains("UnknownHostException")) { return Colors.RED + "Unknown hostname!"; } return response.isEmpty() ? Colors.RED + "An error occured" : response; } }
From source file:index.IndexManager.java
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) { final SolrInputDocument index = new SolrInputDocument(); index.setField("id", document.location()); index.setField("time", String.valueOf(System.currentTimeMillis())); index.setField("title", document.title()); final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href")) .collect(Collectors.toSet()); final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src")) .collect(Collectors.toSet()); links.forEach(link -> index.addField("link", link)); media.forEach(link -> index.addField("media", link)); formatText(document.getElementsByTag("h1").stream()).forEach(e -> index.addField("h1", e)); formatText(document.getElementsByTag("h2").stream()).forEach(e -> index.addField("h2", e)); formatText(document.getElementsByTag("h3").stream()).forEach(e -> index.addField("h3", e)); formatText(document.getElementsByTag("strong").stream()).forEach(e -> index.addField("strong", e)); formatText(document.getElementsByTag("em").stream()).forEach(e -> index.addField("em", e)); formatText(document.getElementsByTag("b").stream()).forEach(e -> index.addField("b", e)); formatText(document.getElementsByTag("u").stream()).forEach(e -> index.addField("u", e)); formatText(document.getElementsByTag("i").stream()).forEach(e -> index.addField("i", e)); int i = 0;//from w ww . ja va 2 s .c o m Collection<String> text = chunkToLength(document.text()); for (String chunk : text) index.addField(++i + "_text", chunk); return Triple.of(index, links, media); }
From source file:com.nuance.expertassistant.ContentExtractor.java
/**
 * Walks the document and emits a nested {@code <section>}/{@code <para>}
 * outline via {@code print}, opening a section for each h1–h4 heading and
 * closing open sections whenever a heading of the same or a shallower level
 * appears.
 *
 * <p>Fix over the original: the whitespace-only check used
 * {@code ownText().trim() == ""}, which compares object references and is
 * always false for a freshly trimmed string; whitespace-only elements are
 * now correctly skipped. The four copy-pasted close/open sequences are
 * factored into helpers with identical ordering.
 *
 * @param doc the parsed document to convert
 */
public static void extract(Document doc) {
    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">");
    final Elements elements = doc.select("*");
    final ArrayList<String> openHeaderList = new ArrayList<String>();
    for (final Element element : elements) {
        final String ownText = element.ownText();
        if (ownText == null || ownText.isEmpty() || ownText.trim().isEmpty()) {
            // Nothing to emit for this element.
        } else if (element.tagName().contains("a")) {
            // Anchors are skipped. NOTE(review): contains("a") also matches
            // tags like "table" and "span"; preserved from the original, but
            // equals("a") was probably intended — confirm before changing.
        } else if (isNonEmptyHeading(element, "h1")) {
            closeOpenSections(openHeaderList, 1);
            openHeadingSection(element, openHeaderList, "h1");
        } else if (isNonEmptyHeading(element, "h2")) {
            closeOpenSections(openHeaderList, 2);
            openHeadingSection(element, openHeaderList, "h2");
        } else if (isNonEmptyHeading(element, "h3")) {
            closeOpenSections(openHeaderList, 3);
            openHeadingSection(element, openHeaderList, "h3");
        } else if (isNonEmptyHeading(element, "h4")) {
            closeOpenSections(openHeaderList, 4);
            openHeadingSection(element, openHeaderList, "h4");
        } else {
            print("<para>");
            print(stripNonValidXMLCharacters(element.ownText()));
            print("</para>");
        }
    }
    // Close whatever heading sections remain open, then the root section.
    closeOpenSections(openHeaderList, 1);
    print("</section>");
}

/** True when the element's tag name contains {@code tag} and it has visible text. */
private static boolean isNonEmptyHeading(Element element, String tag) {
    return element.tagName().contains(tag) && element.text() != null && !element.text().isEmpty();
}

/** Emits closing tags for every tracked section at {@code fromLevel} or deeper, shallowest first. */
private static void closeOpenSections(ArrayList<String> openHeaders, int fromLevel) {
    for (int level = fromLevel; level <= 4; level++) {
        String tag = "h" + level;
        if (openHeaders.contains(tag)) {
            openHeaders.remove(tag);
            print("</section>");
        }
    }
}

/** Opens a new section titled with the heading's text and records its level as open. */
private static void openHeadingSection(Element element, ArrayList<String> openHeaders, String tag) {
    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">");
    openHeaders.add(tag);
}
From source file:io.github.carlomicieli.footballdb.starter.parsers.DraftParser.java
/**
 * Parses the leading four characters of the document title as a {@link Year}.
 *
 * @param doc the page whose title is assumed to start with a four-digit
 *            year — TODO confirm against the source site
 * @return the parsed year
 */
protected Year extractYear(Document doc) {
    return Year.parse(doc.title().substring(0, 4));
}
From source file:co.foxdev.foxbot.utils.Utils.java
/**
 * Builds a one-line IRC summary for a URL pasted in chat: an HTTP status line
 * on error responses, content type and size for non-HTML content, and the
 * page title (with extra scraped details for YouTube videos and Reddit
 * threads) for HTML pages.
 *
 * @param stringToParse the URL to fetch and describe
 * @param sender the user who pasted the link; their munged nick prefixes the reply
 * @return a colourised summary line, or {@code null} on a malformed URL or any error
 */
public static String parseChatUrl(String stringToParse, User sender) {
    try {
        Connection conn = Jsoup.connect(stringToParse);
        // Identify the bot, cap the body size, and accept any content type so
        // non-HTML resources can still be summarised by type/size.
        conn.followRedirects(true).userAgent(
                "FoxBot // http://foxbot.foxdev.co // Seeing this? It means your web address was posted on IRC and FoxBot is getting page info (title, size, content type) to send to the channel. Nothing to worry about.")
                .timeout(3000).maxBodySize(100000).ignoreContentType(true);
        Connection.Response response = conn.execute();
        Document doc = response.parse();
        // Content-Length may be absent; report "Unknown" rather than failing.
        String size = response.header("Content-Length") == null ? "Unknown"
                : (Integer.parseInt(response.header("Content-Length")) / 1024) + "kb";
        // Strip any "; charset=..." suffix from the content type.
        String contentType = response.contentType().contains(";") ? response.contentType().split(";")[0]
                : response.contentType();
        // NOTE(review): contentType() and the .first() calls below can return
        // null and NPE; the broad catch (Exception) at the bottom converts
        // that into a null return — confirm this is acceptable behaviour.
        if (response.statusCode() != 200 && response.statusCode() != 302 && response.statusCode() != 301) {
            return colourise(String.format("(%s's URL) &cError: &r%s %s ", munge(sender.getNick()),
                    response.statusCode(), response.statusMessage()));
        }
        if (!contentType.contains("html")) {
            return colourise(String.format("(%s's URL) &2Content Type: &r%s &2Size: &r%s",
                    munge(sender.getNick()), contentType, size));
        }
        String title = doc.title() == null || doc.title().isEmpty() ? "No title found" : doc.title();
        if (stringToParse.matches("^https?://(www\\.)?youtube\\.com/watch.*")) {
            // YouTube watch pages: scrape title, uploader, views and rating
            // from the page markup.
            title = doc.select("span#eow-title").first().text();
            String views = doc.select("span.watch-view-count").first().text();
            String likes = doc.select("span.likes-count").first().text();
            String dislikes = doc.select("span.dislikes-count").first().text();
            String uploader = doc.select("a.g-hovercard.yt-uix-sessionlink.yt-user-name").first().text();
            return colourise(String.format(
                    "(%s's URL) &2Title: &r%s &2Uploader: &r%s &2Views: &r%s &2Rating: &a%s&r/&c%s",
                    munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), uploader, views, likes,
                    dislikes));
        }
        if (stringToParse.matches("^https?://(www\\.)?reddit\\.com/r/.*/comments/.*")) {
            // Reddit comment threads: scrape poster, comment count and votes.
            String poster = doc.select("p.tagline").select("a.author").text().split(" ")[0];
            String comments = doc.select("a.comments").first().text().split(" ")[0];
            String likes = doc.select("span.upvotes").first().text().split(" ")[0];
            String dislikes = doc.select("span.downvotes").first().text().split(" ")[0];
            return colourise(String.format(
                    "(%s's URL) &2Title: &r%s &2Poster: &r%s &2Comments: &r%s &2Rating: &6%s&r/&9%s",
                    munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), poster, comments, likes,
                    dislikes));
        }
        return colourise(String.format("(%s's URL) &2Title: &r%s &2Content Type: &r%s &2Size: &r%s",
                munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), contentType, size));
    } catch (IllegalArgumentException ignored) {
        // Not a fetchable URL (malformed input) — deliberately produce no summary.
    } catch (Exception ex) {
        foxbot.getLogger().error("Error occurred while parsing URL", ex);
    }
    return null;
}
From source file:cn.edu.hfut.dmic.webcollector.example.TutorialCrawler2.java
/**
 * Crawler callback invoked for each fetched page: logs the page URL and
 * title, and — when a JDBC template has been wired in — persists the title,
 * URL and raw HTML into the {@code tb_content} table.
 *
 * @param page the fetched page
 * @param nextLinks the URLs to crawl next; with auto-parse enabled the
 *        framework fills this itself, so it is untouched here
 */
@Override
public void visit(Page page, Links nextLinks) {
    final Document doc = page.getDoc();
    final String title = doc.title();
    System.out.println("URL:" + page.getUrl() + " :" + title);
    // Persistence is optional: only store the page when a datasource exists.
    if (jdbcTemplate != null) {
        final int inserted = jdbcTemplate.update("insert into tb_content (title,url,html) value(?,?,?)",
                title, page.getUrl(), page.getHtml());
        if (inserted == 1) {
            System.out.println("mysql??");
        }
    }
}
From source file:org.brunocvcunha.taskerbox.impl.crawler.SlexyAction.java
/**
 * Processes a crawled Slexy listing page: for every paste link under the
 * {@code .main} container, acts once per unseen paste id, persisting the
 * seen-set and throttling between fetches.
 *
 * @param entry the parsed listing document
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());
    for (final Element link : entry.select(".main").select("a")) {
        // The paste id doubles as its title.
        final String id = link.attr("href").replace("/view/", "");
        if (!canAct(id)) {
            continue; // already handled this paste
        }
        addAct(id);
        spreadAction(id, id);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}