List of usage examples for the org.jsoup.nodes.Document location() method
String location
To view the source code for org.jsoup.nodes.Document location(), click the Source Link.
From source file:index.IndexManager.java
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) { final SolrInputDocument index = new SolrInputDocument(); index.setField("id", document.location()); index.setField("time", String.valueOf(System.currentTimeMillis())); index.setField("title", document.title()); final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href")) .collect(Collectors.toSet()); final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src")) .collect(Collectors.toSet()); links.forEach(link -> index.addField("link", link)); media.forEach(link -> index.addField("media", link)); formatText(document.getElementsByTag("h1").stream()).forEach(e -> index.addField("h1", e)); formatText(document.getElementsByTag("h2").stream()).forEach(e -> index.addField("h2", e)); formatText(document.getElementsByTag("h3").stream()).forEach(e -> index.addField("h3", e)); formatText(document.getElementsByTag("strong").stream()).forEach(e -> index.addField("strong", e)); formatText(document.getElementsByTag("em").stream()).forEach(e -> index.addField("em", e)); formatText(document.getElementsByTag("b").stream()).forEach(e -> index.addField("b", e)); formatText(document.getElementsByTag("u").stream()).forEach(e -> index.addField("u", e)); formatText(document.getElementsByTag("i").stream()).forEach(e -> index.addField("i", e)); int i = 0;//from w w w.jav a 2s . c om Collection<String> text = chunkToLength(document.text()); for (String chunk : text) index.addField(++i + "_text", chunk); return Triple.of(index, links, media); }
From source file:com.ndemyanovskyi.backend.site.Site.java
private static void checkRedirected(Document doc, String url) throws IOException { if (!doc.location().equals(url)) { LOG.log(Level.WARNING, "Connection is redirected: " + "original url = '" + url + "; final url = " + doc.location() + "."); /*throw new ConnectException("Connection is redirected: " + "original url = '" + url + "; final url = " + doc.location() + ".");*/ }//from w w w . j av a 2s . c om }
From source file:com.techcavern.wavetact.eventListeners.FunMsgListener.java
/**
 * Handles a channel message by scheduling the channel's optional "fun message" and
 * automatic URL title features on the shared thread pool.
 *
 * <p>The "funmsg" channel property is read on the calling thread; everything else runs in the
 * scheduled task. The task ignores messages that start with the channel's command character
 * and users whose permission level is not above -2. Each space-separated word is then checked:
 * a word containing "yolo" (after collapsing repeated o's and 0's) gets the sender kicked when
 * "funmsg" is on, and a valid URL is fetched to either kick the sender (for a specific site,
 * when "funmsg" is on) or announce the page title (when "autourl" is on).
 *
 * @param event the received message event
 * @throws Exception if reading the "funmsg" channel property fails
 */
@Override
public void onMessage(MessageEvent event) throws Exception {
    Record funMsgRecord = DatabaseUtils.getChannelProperty(
            IRCUtils.getNetworkNameByNetwork(event.getBot()), event.getChannel().getName(), "funmsg");
    final boolean funMsgEnabled = funMsgRecord != null
            && funMsgRecord.getValue(Channelproperty.CHANNELPROPERTY.VALUE).equalsIgnoreCase("true");

    Runnable task = () -> {
        String commandPrefix = IRCUtils.getCommandChar(event.getBot(), event.getChannel());
        if (commandPrefix == null) {
            return;
        }
        boolean permitted = PermUtils.getPermLevel(event.getBot(), event.getUser().getNick(),
                event.getChannel()) > -2;
        if (!permitted || event.getMessage().startsWith(commandPrefix)) {
            return;
        }

        for (String rawWord : StringUtils.split(event.getMessage(), " ")) {
            try {
                String word = Colors.removeFormattingAndColors(rawWord);

                // Collapse runs of 'o' and of '0' so that stretched spellings are still matched.
                String collapsed = word.toLowerCase().replaceAll("o+", "o").replaceAll("0+", "o");
                if (collapsed.contains("yolo") && funMsgEnabled) {
                    if (IRCUtils.checkIfCanKick(event.getChannel(), event.getBot(), event.getUser())) {
                        IRCUtils.sendKick(event.getBot().getUserBot(), event.getUser(), event.getBot(),
                                event.getChannel(), "YOLO");
                    } else {
                        IRCUtils.sendAction(event.getUser(), event.getBot(), event.getChannel(),
                                "kicks " + IRCUtils.noPing(event.getUser().getNick()) + " (YOLO)", "");
                    }
                    // One kick per message is enough; stop processing the remaining words.
                    return;
                }

                Record autoUrlRecord = DatabaseUtils.getChannelProperty(
                        IRCUtils.getNetworkNameByNetwork(event.getBot()), event.getChannel().getName(),
                        "autourl");
                boolean autoUrlEnabled = autoUrlRecord != null
                        && autoUrlRecord.getValue(CHANNELPROPERTY.VALUE).equalsIgnoreCase("true");

                Record ignoreHttpRecord = DatabaseUtils.getChannelProperty(
                        IRCUtils.getNetworkNameByNetwork(event.getBot()), event.getChannel().getName(),
                        "ignorehttp");
                boolean ignoreHttpEnabled = ignoreHttpRecord != null
                        && ignoreHttpRecord.getValue(CHANNELPROPERTY.VALUE).equalsIgnoreCase("true");

                // With "ignorehttp" on, scheme-less words are also treated as URL candidates.
                if (ignoreHttpEnabled && !word.startsWith("https://") && !word.startsWith("http://")) {
                    word = "http://" + word;
                }

                if (!(funMsgEnabled || autoUrlEnabled) || !Registry.urlValidator.isValid(word)) {
                    continue;
                }

                try {
                    Document page = Jsoup.connect(word).userAgent(Registry.USER_AGENT).get();
                    if (page.location().contains("stop-irc-bullying.eu") && funMsgEnabled) {
                        if (IRCUtils.checkIfCanKick(event.getChannel(), event.getBot(), event.getUser())) {
                            IRCUtils.sendKick(event.getBot().getUserBot(), event.getUser(), event.getBot(),
                                    event.getChannel(), "? \\ ()/ ? [https://goo.gl/Tkb9dh]");
                        } else {
                            IRCUtils.sendAction(event.getUser(), event.getBot(), event.getChannel(),
                                    "kicks " + IRCUtils.noPing(event.getUser().getNick())
                                            + " (? \\ ()/ ?) [https://goo.gl/Tkb9dh]",
                                    "");
                        }
                        /*
                         * My apologies to those using this site responsibly. But in my experience,
                         * this site has been linked numerous times for entertainment purposes. In
                         * fact, I have yet to notice a time when it was linked for its intended
                         * purpose. And if you are using this site for its intended purpose, please
                         * think of a better way of expressing how you feel. Linking a generic site
                         * rarely solves any problems. Instead, explain to the person how and why
                         * they offended you. If they ignore you, then you ignore them.
                         */
                    } else if (autoUrlEnabled) {
                        String pageTitle = page.title();
                        if (!pageTitle.isEmpty()) {
                            IRCUtils.sendMessage(event.getBot(), event.getChannel(),
                                    "[" + IRCUtils.noPing(event.getUser().getNick()) + "] " + pageTitle, "");
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    };

    Registry.threadPool.execute(task);
}
From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java
private String fetchStory(Website website) throws IOException { int roll = 0; String result;// w w w.j a va 2s. c o m int resultLength; int resultLines; //noinspection ConstantConditions do { roll++; final Document doc = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get(); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); logger.info("Fetched a story from {}", doc.location()); final Element story = doc.select(website.getCssQuery()).first(); if (story == null) { return ERROR_COULD_NOT_PARSE; } story.select("div").remove(); story.select("img").forEach(img -> img.replaceWith(new TextNode(img.attr("src"), ""))); story.select("br").after("\\n"); story.select("p").before("\\n\\n"); final String storyHtml = story.html().replaceAll("\\\\n", "\n"); result = Jsoup.clean(storyHtml, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)) .trim(); resultLength = result.length(); resultLines = countLines(result); } while (CONFIG_REROLL_LONG_STORIES && (resultLength > CONFIG_MAX_STORY_LENGTH || resultLines > CONFIG_MAX_STORY_LINES) && roll <= CONFIG_MAX_ROLLS); return result; }
From source file:org.ow2.proactive_grid_cloud_portal.cli.cmd.sched.PackageDownloader.java
/** * This method browses a web directory and all its subdirectories and returns a set containing all the urls of their contents. * * @param dirUrl//from www .j a v a 2 s . c om * @param cummulativeRelativeUrl a string used for keeping track of a directory structure in the recursive context. MUST BE empty ("") in the first call. * @return * @throws IOException * @throws URISyntaxException */ private Set<String> listWebDirectoryContent(URL dirUrl, String cummulativeRelativeUrl) throws IOException, URISyntaxException { Set<String> result = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>()); //Load the directory listing page and extract all the links it contains. Document doc = Jsoup.connect(dirUrl.toString()).get(); logger.info("Listing directories from: " + doc.location()); for (Element file : doc.select("a[href]")) { String relativeURL = file.attr("href"); // skip sort urls in Apache Tomcat if (relativeURL.startsWith("?")) { continue; } // skip parent directory url if (isRelativeParentDirectoryUrl(relativeURL)) { continue; } result.add(cummulativeRelativeUrl + relativeURL); } for (String relativeURL : result) { URL absoluteUrl = new URL(dirUrl, relativeURL); if (!isFileURL(relativeURL)) { result.addAll(listWebDirectoryContent(absoluteUrl, relativeURL)); } } return result; }