List of usage examples for org.jsoup.nodes.Document.html()
public String html()
From source file:Main.java
/**
 * Parses {@code t2.html} as UTF-8, inserts a new {@code <area>} element after
 * whatever {@code area#area1} selects, and prints the resulting document HTML.
 *
 * @param args command line arguments (unused)
 * @throws Exception if reading or parsing the input file fails
 */
public static void main(String[] args) throws Exception {
    final String newAreaMarkup = "<area id=\"newArea\" />";

    final Document doc = Jsoup.parse(new File("t2.html"), "UTF-8");
    doc.select("area#area1").after(newAreaMarkup);

    final String rendered = doc.html();
    System.out.println(rendered);
}
From source file:isc_415_practica_1.ISC_415_Practica_1.java
/**
 * Reads a URL from standard input, downloads the page and prints a few
 * statistics about it (number of lines, {@code p}, {@code img} and
 * {@code form} tags, and the {@code input} tags of every form). Afterwards one
 * parameter per identifiable form input (its {@code id}, or else its
 * {@code name}) is POSTed back to the same URL and the response status line is
 * printed.
 *
 * <p>The literal input {@code servlet} is a shortcut for the local test servlet.
 *
 * @param args the command line arguments (unused)
 */
public static void main(String[] args) {
    String urlString;
    Scanner input = new Scanner(System.in);
    Document doc;
    try {
        urlString = input.next();
        if (urlString.equals("servlet")) {
            urlString = "http://localhost:8084/ISC_415_Practica1_Servlet/client";
        }
        // Only a leading scheme counts: a scheme that merely appears somewhere
        // inside the text (e.g. in a query string) must not suppress the prefix.
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }
        doc = Jsoup.connect(urlString).get();
    } catch (Exception ex) {
        System.out.println("El URL ingresado no es valido.");
        return;
    }

    String[] plainTextDoc = new TextNode(doc.html(), "").getWholeText().split("\n");
    System.out.println(String.format("Nmero de lineas del documento: %d", plainTextDoc.length));
    System.out.println(String.format("Nmero de p tags: %d", doc.select("p").size()));
    System.out.println(String.format("Nmero de img tags: %d", doc.select("img").size()));
    System.out.println(String.format("Nmero de form tags: %d", doc.select("form").size()));

    int index = 1;
    ArrayList<NameValuePair> urlParameters = new ArrayList<>();
    for (Element e : doc.select("form")) {
        System.out.println(String.format("Form %d: Nmero de Input tags %d", index, e.select("input").size()));
        System.out.println(e.select("input"));
        for (Element formInput : e.select("input")) {
            // Compare string contents, not references: "!= \"\"" lets a
            // present-but-empty attribute through as an empty parameter name.
            String id = formInput.attr("id");
            String name = formInput.attr("name");
            if (id != null && !id.isEmpty()) {
                urlParameters.add(new BasicNameValuePair(id, "PRACTICA1"));
            } else if (name != null && !name.isEmpty()) {
                urlParameters.add(new BasicNameValuePair(name, "PRACTICA1"));
            }
        }
        index++;
    }

    if (!urlParameters.isEmpty()) {
        // try-with-resources closes the client (and its connections) even when
        // the request fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            UrlEncodedFormEntity entity = new UrlEncodedFormEntity(urlParameters, Consts.UTF_8);
            HttpPost httpPost = new HttpPost(urlString);
            httpPost.setHeader("User-Agent", USER_AGENT);
            httpPost.setEntity(entity);
            HttpResponse response = httpclient.execute(httpPost);
            System.out.println(response.getStatusLine());
        } catch (IOException ex) {
            Logger.getLogger(ISC_415_Practica_1.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
From source file:Main.java
/**
 * Converts an HTML string to XML-syntax markup.
 *
 * <p>Every {@code <script>} element is first replaced by a placeholder comment,
 * then the remaining markup is parsed with jsoup and serialized using the XML
 * output syntax.
 *
 * @param html the HTML text to convert; must not be {@code null}
 * @return the document serialized with XML syntax
 */
public static String toXHTML(String html) {
    // (?i): tag name in any case; (?s): '.' also spans line breaks.
    // "\\b[^>]*" accepts opening tags that carry attributes (type, src, ...).
    // The placeholder is closed with "-->", the well-formed comment terminator.
    final String withoutScripts =
            html.replaceAll("(?is)<script\\b[^>]*>.*?</script\\s*>", "<!-- removed scripts -->");

    final Document document = Jsoup.parse(withoutScripts);
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    return document.html();
}
From source file:com.facultyshowcase.app.ui.UIUtil.java
public static String scrubHtml(String html, Request request, Response response) { if (!StringFactory.isEmptyString(html)) { // Process the HTML converting links as necessary (adding JSESSIONID(s) /// for URL based session tracking, converting resource links to increase concurrent loading limit, /// CMS link externalization, etc). /// This is *not* sanitation and should always be done before rendering - never before persisting. /// We are doing this before sanitizing the HTML to avoid having to whitelist internal URL protocols, etc. try {/* ww w . j a v a 2 s .com*/ html = XMLRenderer.parseWithRoot(html, request, response); } catch (IOException e) { _logger.error("Unable to accept HTML: " + html, e); } // We don't trust the input, so we sanitize it with a whitelist of allowed HTML. Document dirty = Jsoup.parseBodyFragment(html, ""); Whitelist whitelist = Whitelist.relaxed(); // Don't allow users to use our website as a link farm whitelist.addEnforcedAttribute("a", "rel", "nofollow"); Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); html = clean.html(); return html; } else { return ""; } }
From source file:cognition.common.utils.StringTools.java
/**
 * Parses the given HTML and appends extra markup to the end of its
 * {@code <head>} element.
 *
 * @param htmlText the HTML document text
 * @param metaData the markup to append inside the head
 * @return the HTML of the modified document
 */
public static String addMetaDataToHtml(String htmlText, String metaData) {
    final Document parsed = Jsoup.parse(htmlText);
    parsed.head().append(metaData);

    final String result = parsed.html();
    return result;
}
From source file:models.NotificationMail.java
private static String getHtmlMessage(Lang lang, String message, String urlToView, Resource resource) { String content = getRenderedHTMLWithTemplate(lang, Markdown.render(message), urlToView, resource); Document doc = Jsoup.parse(content); handleLinks(doc);//from w w w.j a va 2 s . com handleImages(doc); return doc.html(); }
From source file:automation.Launcher.java
public static String br2nl(String html) { if (html == null) { return html; }/* w w w . jav a 2s. com*/ Document document = Jsoup.parse(html); document.outputSettings(new Document.OutputSettings().prettyPrint(false));//makes html() preserve linebreaks and spacing document.select("p").prepend("\\n\\n"); document.select("div").prepend("\\n"); // System.out.println(document.html()); document.select("br").append("\\n"); // System.out.println(document.html()); String s = document.html().replaceAll("\\\\n", "\n"); // System.out.println(s); return Jsoup.clean(s, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); }
From source file:ru.xxlabaza.popa.pack.PackingService.java
/**
 * Serializes the document, strips its comments via the comment-removal
 * service and unescapes the entities in the result.
 *
 * @param document the parsed document to process
 * @return the comment-free HTML with entities unescaped
 */
private String processHtml(Document document) {
    final String rawHtml = document.html();
    final String withoutComments = commentRemoveService.removeComments(rawHtml, HTML);
    return Parser.unescapeEntities(withoutComments, false);
}
From source file:de.limod.portals.AutoScout.java
@Override public List<Car> getCars() { Document page = super.getPage("AutoScout", AutoScout.HOST, this.getQuery()); String content = page.html(); // parse json object from request Pattern pattern = Pattern.compile("articlesFromServer\\s+=\\s+\\Q[{\\E.*\\Q}]\\E"); Matcher matcher = pattern.matcher(content); List<Car> cars = new ArrayList<>(); // check occurance if (matcher.find()) { String json = matcher.group().trim(); json = json.substring(json.indexOf("["), json.length()); JSONArray carsArray = new JSONArray(json); // iterate json object for (Object carEntry : carsArray) { JSONObject obj = (JSONObject) carEntry; String title = obj.getString("mk") + " " + obj.getString("vr"); String price = String.valueOf(obj.getDouble("price_raw")); String url = String.format(AutoScout.URL, String.valueOf(obj.getInt("ei"))); String erstzulassung = obj.getString("fr"); String kilomter = obj.getString("ma"); String id = String.valueOf(obj.getInt("ei")); Car c = new Car(title, "--", price, url, "AutoScout", id); c.setFound(new Date()); cars.add(c);// www .j a va 2 s.com if (cars.size() > this.getMaxHits()) { break; } } } return cars; }
From source file:com.betel.flowers.pdf.util.XMLtoHtml.java
/**
 * Normalizes HTML by parsing it with jsoup, serializing it with XML syntax and
 * unescaping HTML 4 entities in the result.
 *
 * @param htmlString the HTML text to check
 * @return the normalized markup with entities unescaped
 * @throws IOException declared for callers that already handle it; nothing in
 *         this method body throws it
 */
public String checkHTML(String htmlString) throws IOException {
    Document docHtml = Jsoup.parse(htmlString);
    docHtml.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    return StringEscapeUtils.unescapeHtml4(docHtml.html());
}