List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:net.sf.texprinter.utils.StringUtils.java
/** * Escapes HTML entities and tags to a TeX format. This method tries to * replace HTML code by the TeX equivalent macros. * * @param text The input text.//w w w. j av a2 s .c o m * @return A new text formatted from HTML to TeX. */ public static String escapeHTMLtoTeX(String text) { // replace bold tags String newText = text.replaceAll("<b>", "\\\\textbf{"); newText = newText.replaceAll("</b>", "}"); // replace bold tags newText = newText.replaceAll("<strong>", "\\\\textbf{"); newText = newText.replaceAll("</strong>", "}"); // replace italic tags newText = newText.replaceAll("<i>", "\\\\textit{"); newText = newText.replaceAll("</i>", "}"); // replace emphasized tags newText = newText.replaceAll("<em>", "\\\\emph{"); newText = newText.replaceAll("</em>", "}"); // replace paragraphs tags newText = newText.replaceAll("<p>", ""); newText = newText.replaceAll("</p>", "\n\n"); // replace ordered lists tags newText = newText.replaceAll("<ol>", "\\\\begin{enumerate}\n"); newText = newText.replaceAll("</ol>", "\\\\end{enumerate}\n"); // replace unordered lists tags newText = newText.replaceAll("<ul>", "\\\\begin{itemize}\n"); newText = newText.replaceAll("</ul>", "\\\\end{itemize}\n"); // replace item tags newText = newText.replaceAll("<li>", "\\\\item "); newText = newText.replaceAll("</li>", "\n"); // replace blockquote tags newText = newText.replaceAll("<blockquote>", "\\\\begin{quotation}\n"); newText = newText.replaceAll("</blockquote>", "\\\\end{quotation}\n"); // replace code tags newText = newText.replaceAll("<pre><code>", "\\\\begin{TeXPrinterListing}\n"); newText = newText.replaceAll("<pre class=.*\"><code>", "\\\\begin{TeXPrinterListing}\n"); newText = newText.replaceAll("</code></pre>", "\\\\end{TeXPrinterListing}\n\n"); // replace inline code tags newText = newText.replaceAll("<code>", "\\\\lstinline|"); newText = newText.replaceAll("</code>", "|"); // replace links tags newText = newText.replaceAll("alt=\".*\" ", ""); // parse the text Document docLinks = 
Jsoup.parse(newText); // get all the links Elements links = docLinks.getElementsByTag("a"); // if there are links if (links.size() > 0) { // for every link for (Element link : links) { // get the outer HTML String temp = link.outerHtml(); // replace it newText = newText.replaceFirst(Pattern.quote(temp), "\\\\href{" + link.attr("href") + "}{" + link.text() + "}"); } } // create a list of images ArrayList<ImageGroup> images = new ArrayList<ImageGroup>(); // parse the current text Document doc = Jsoup.parse(text); // fetch all the media found Elements media = doc.select("[src]"); // for all media found for (Element m : media) { // if it's an image tag if (m.tagName().equals("img")) { // create a new image group with the image link ImageGroup image = new ImageGroup(m.attr("abs:src")); // add to the list of images images.add(image); // set the current image to null image = null; } } // create a new loop saver LoopSaver lps = null; // for every image in the list of images for (ImageGroup img : images) { // create a new object lps = new LoopSaver(); // while there are references for that image in the text while (newText.indexOf(img.getURL()) != -1) { // tick loop lps.tick(); // replace the occurrence of that image newText = newText.replaceFirst("<img src=\"" + img.getURL() + "\" />", "\\\\begin{figure}[h!]\n\\\\centering\n\\\\includegraphics[scale=0.5]{" + img.getName() + "}\n\\\\end{figure}"); } // lets try try { // finally, download the image to the current directory Downloader.download(img.getURL(), img.getName()); } catch (Exception exception) { // log message log.log(Level.WARNING, "An error occurred while getting the current image. Trying to set the replacement image instead. 
MESSAGE: {0}", StringUtils.printStackTrace(exception)); // image could not be downloaded for any reason try { // open a file stream FileOutputStream f = new FileOutputStream(img.getName()); // write a replacement image f.write(Base64.decode( "iVBORw0KGgoAAAANSUhEUgAAALAAAABKCAIAAACU3El2AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAcjSURBVHhe7VzrmeMgDExdKSj1pJptZovZi3lqhAQ4n7HhrPt1STAaRoOELa0ff/bPGCAMPIwNY4AyYIIwPQADJggThAnCNKAzYBHC1GERwjRgEcI00MmApYxOou4yzARxF093rtME0UnUXYaZIO7i6c51miA6ibrLMBPEXTzduc4vBPHzejwez/cvt+C+f7x+GpY7h/2+nz2zdS5z6LCFoLZ5qAli8wj/9xHCzIK4xDeXGG27No7YB68dIfx8JCJoguiHOG7kvsUfhOMSo/3Y98H7XhBvlyFIWAfDPjFIOWT/MLgC9IlW/E/xH0tqeWiR7tx1z7dfkE95dKo00TbF8/2TrMTsiIyLhsh8eJVHG00EJGAizUeScT+8KieSqL4VRGCOHggILySI/L5feNzYP0wThGBF2Q2frwPn0oBImuhgZ8T/ElzjZqK6JHPKhmhM/Xlt15e7wpkISIiFIBaF54AJ4LkP6vhmYPlWEFHT4JR0DPR4hINnWjSB3RxWFQRaaYdHIeGxpMjmyIqAS1EFwvk3jy5NaiYASSkaIpQULhR4sl+acvj7GyOItJuEnKGkDH57ohGupAwI9cKdDgnkhVQl97DDdNYvRHevRm1FwZAgUkSTU2uPILRrfVSob9QLBeFNs71HvgOfNYZpEQKP0dL2CSPoBGqEwD0nha12hBANCavTzuXdgmjCmytC5Exapb9zGGREksfFy0WqHc+eQymZ8S1MEzDdU/QMISdp2RALahsQxUSPILRr1QiBP7SCxKCUEU5HrbuMzmEkA4VTvveudHn6DjZRjrOvl7tXoI/V2jGdnNqer1e8l5EOoYqhjBTuWUhiIodK6fyLQmdpg8ILKxOimXqqA420BdGS1KDfBS8NstQ9rRbouydYYeC0gpiQ/QkhHS+xKQURA2yzLnI8H7UZTRDn8m3WZmBgyggxAzF3xWCCuKvnlXWbIEwQa9x2mp+uYaAeIbRndiOx7nsAsW/0cbilp+2HzX7Ior5EuIogeElH7kU4zCXNib6kuzmvGzCvIPrwHztKZqOsHfj65iHcfbGAmwqC0B3qNq1mHrKTc8GAbW94Vo8tQ6qLIXkRbzBBkOpG0fXHLJGqQ+oLVi5PgknXhIqGWJigdRahGk1KwNt07Ras2JgDvVUfSHWqOcJe0ddTBhdEKAtF3txyiaty/bFUEusbAEe6KYSWD7KIHkEoc4qooDzse7oqkDwQcg0tfArtSbwpKhBGCq6EOr9yuXwqfR/r/EINTEPYq4bPuJ2CaBfigu0MzW8DV110vEiRHhSB8qDzQSsb3YjNOUVUWPVksaZEIRQQs1tTrMjRK0+4/c9VWTecIdSmWny9pQUfl4uJCqnG/kyla60ikIMFgckh96yw/0EU5N24REEZuJx1YFvzc2euvQuoyp4u/XKPAp3B/c7yI673M7XPDLEVIowGb0PMis2IXAFlCAjs5ZgUkXx5yjlSEHSPZeQ0L0sdXn3hDFIGuYTYxM2Uxsio4s+ZNuVypkmBbmkTk95tL4XPF5up0Nsd0mNbEKy5Ja1FXpQWw/oo9qMOFwTJk879JEJSXJqD5bY7TKV0noKZ4k/HeIiOqIpdqkMqQ0R5hpCSaVj80+nBr+H5+ZAgdggCFIFJqOwBo0EBE
O5QxJGCoGGYNCaxWIyHx9wzhE8Wcgj2i+mIEHlYmhT607eD65bI6eHDjcxVdg1qJDT9Do1b+GccoEh0S/gkd2+KKSPnqrAmgT3oAdMQdktieC1DCGOTtTl0c3WLgaMFgWf3VlS+BeVzL3K0IFK05/cSc9NyX3QnCOK+5K64chPEil4biNkEMZDcFac2QazotYGYTRADyV1x6l2CaD7dXZEBwwwMdD+pTM8B+TPEOQlltcs5Qc6IygQxo1cuxFQTRPHKppAyirdLffDTmqYUQ8jv8ck1LRxAETG/7ikUpppvf2J/CA4F1qIlQLLrC0/C+6M6lnah9waY3h8h6m+XgrceJbz08OFfskQfYpMiXXRlEA37qDY1lfNrKUOxGxs06i9ochf/55WY/YIoO3wY+SVt5WFU6iEoezz4G2g0Q8JhVxGEZld720ZzaQP26LVTHiEIVjRmJWWpM1ptBGIOkPxRvv1Jcr4sCNWuJojW0q513gjrhwmicvPB3RALXqwPMTUc5qgsCaI0JMyvtedLEaJ8oVgedb8b7cZzCCQEPpEPrao2eIycIcouo3qE6Ho1k59fe7ESXYLch4Zy1ZbWWvKIzXvKnK0HU+nAnk6CQpdw5LBsf0pryAd/7EpkjUANQeiGKvOzkAK3IM3mJc3ibQVxiirNyDwMtCLEPEgNySkMmCBOoXkdIyaIdXx1ClITxCk0r2PEBLGOr05BaoI4heZ1jJgg1vHVKUhNEKfQvI4RE8Q6vjoFqQniFJrXMWKCWMdXpyA1QZxC8zpGTBDr+OoUpP8Arv92hCPEu+kAAAAASUVORK5CYII=")); // close the file f.close(); } catch (IOException ioexception) { // log message log.log(Level.SEVERE, "An IO exception occured while trying to create the image replacement. MESSAGE: {0}", StringUtils.printStackTrace(ioexception)); } catch (Exception except) { // log message log.log(Level.SEVERE, "An error occured while trying to create the image replacement. MESSAGE: {0}", StringUtils.printStackTrace(except)); } } } // unescape all HTML entities newText = StringEscapeUtils.unescapeHtml(newText); // return new text return newText; }
From source file:net.intelliant.util.UtilCommon.java
/**
 * Rewrites the {@code src} of images whose location is not already a URL so
 * that it points at the image server: the file name (the part after the
 * last {@code /} or {@code \}) is HTML-escaped and appended to
 * {@code campaignBaseURL + imageUploadWebApp}, and every occurrence of the
 * old location in the markup is replaced.
 *
 * <p>Despite the method name, no image is compressed by this code.
 *
 * @param html the markup to process; returned untouched when empty
 * @return the markup with the image locations rewritten
 * @throws IOException declared for callers; not raised by the visible code
 */
public static String parseHtmlAndGenerateCompressedImages(String html) throws IOException {
    if (UtilValidate.isEmpty(html)) {
        return html;
    }

    Elements imageTags = Jsoup.parse(html).select("img[src~=(?i)\\.(jpg|jpeg|png|gif)]");
    if (imageTags == null || imageTags.isEmpty()) {
        if (Debug.infoOn()) {
            Debug.logInfo("[parseHtmlAndGenerateCompressedImages] No jpeg images, doing nothing..", module);
        }
    } else {
        // locations that were already rewritten; the same image may appear several times
        Set<String> rewritten = new HashSet<String>();
        for (Element imageTag : imageTags) {
            String source = imageTag.attr("src");
            if (rewritten.contains(source)) {
                continue;
            }
            if (Debug.infoOn()) {
                Debug.logInfo("[parseHtmlAndGenerateCompressedImages] originalSource >> " + source, module);
            }
            if (UtilValidate.isUrl(source)) {
                Debug.logWarning("[parseHtmlAndGenerateCompressedImages] ignoring encountered HTML URL..", module);
                continue;
            }

            // fall back to a backslash separator in case the source was hand-edited
            int lastSeparator = source.lastIndexOf("/");
            if (lastSeparator == -1) {
                lastSeparator = source.lastIndexOf("\\");
            }
            if (lastSeparator == -1) {
                continue;
            }

            String escapedFileName = StringEscapeUtils.escapeHtml(source.substring(lastSeparator + 1));
            String target = new StringBuilder(campaignBaseURL)
                    .append(imageUploadWebApp)
                    .append(escapedFileName)
                    .toString();
            html = StringUtil.replaceString(html, source, target);
            rewritten.add(source);
        }
    }

    if (Debug.infoOn()) {
        Debug.logInfo("[parseHtmlAndGenerateCompressedImages] returning html >> " + html, module);
    }
    return html;
}
From source file:com.ettoremastrogiacomo.sktradingjava.starters.Temp.java
public static void fetchEuroNext() throws Exception { String u0 = "https://www.euronext.com/en/equities/directory"; com.ettoremastrogiacomo.utils.HttpFetch httpf = new com.ettoremastrogiacomo.utils.HttpFetch(); if (Init.use_http_proxy.equals("true")) { httpf.setProxy(Init.http_proxy_host, Integer.parseInt(Init.http_proxy_port), Init.http_proxy_user, Init.http_proxy_password); }// w w w .j av a2 s .c o m String s = new String(httpf.HttpGetUrl(u0, Optional.empty(), Optional.empty())); int k1 = s.indexOf("\\/en\\/popup\\/data\\/download?"); int k2 = s.indexOf("\"", k1); String u1 = s.substring(k1, k2 - 1); //LOG.debug(u1); u1 = u1.replace("\\u0026", "&"); u1 = "https://www.euronext.com" + u1.replace("/", ""); u1 = u1.replace("\\", "/"); LOG.debug(u1); s = new String(httpf.HttpGetUrl(u1, Optional.empty(), Optional.empty())); Document doc = Jsoup.parse(s); java.util.HashMap<String, String> vmap = new java.util.HashMap<>(); vmap.put("format", "1"); vmap.put("layout", "2"); vmap.put("decimal_separator", "1"); vmap.put("date_format", "1"); vmap.put("op", "Go"); Elements links = doc.select("input[name=\"form_build_id\"]"); links.forEach((x) -> { vmap.put("form_build_id", x.attr("value")); }); links = doc.select("input[name=\"form_id\"]"); links.forEach((x) -> { vmap.put("form_id", x.attr("value")); }); HttpURLConnection post = httpf.sendPostRequest(u1, vmap); StringBuffer response; try (BufferedReader in = new BufferedReader(new InputStreamReader(post.getInputStream()))) { String inputLine; response = new StringBuffer(); while ((inputLine = in.readLine()) != null) { response.append("\n").append(inputLine); } } String res = response.toString(); String[] lines = res.split("\n"); for (String line : lines) { String[] row = line.split("\t"); if (row.length == 13) { LOG.debug(row[0] + "\t" + row[1] + "\t" + row[2] + "\t" + row[3] + "\t" + row[4] + "\t" + row[5]); } } }
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/*from ww w .j av a 2 s.c o m*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:me.vertretungsplan.parser.DaVinciParser.java
@NotNull static List<String> getDayUrls(String url, Document doc) throws IOException { List<String> dayUrls = new ArrayList<>(); if (doc.select("ul.classes").size() > 0) { // List of classes Elements classes = doc.select("ul.classes li a"); for (Element klasse : classes) { dayUrls.add(new URL(new URL(url), klasse.attr("href")).toString()); }//w w w .j a va 2 s . c om } else if (doc.select("ul.month").size() > 0) { // List of days in calendar view Elements days = doc.select("ul.month li input[onclick]"); for (Element day : days) { String urlFromOnclick = urlFromOnclick(day.attr("onclick")); if (urlFromOnclick == null) continue; dayUrls.add(new URL(new URL(url), urlFromOnclick).toString()); } } else if (doc.select("ul.day-index").size() > 0) { // List of days in list view Elements days = doc.select("ul.day-index li a"); for (Element day : days) { dayUrls.add(new URL(new URL(url), day.attr("href")).toString()); } } else if (doc.select("table td[align=left] a").size() > 0) { // Table of classes (DaVinci 5) Elements classes = doc.select("table td[align=left] a"); for (Element klasse : classes) { dayUrls.add(new URL(new URL(url), klasse.attr("href")).toString()); } } else { // Single day dayUrls.add(url); } return dayUrls; }
From source file:io.jari.geenstijl.API.API.java
/** * Get article and comments (note that getArticles doesn't get the comments) * * @param url The direct url to the geenstijl article * @return Artikel The fetched article//from w ww .jav a2 s . co m * @throws IOException * @throws ParseException */ public static Artikel getArticle(String url, Context context) throws IOException, ParseException { ensureCookies(); domain = context.getSharedPreferences("geenstijl", 0).getString("gsdomain", "www.geenstijl.nl"); Artikel artikel; Log.i(TAG, "GETARTICLE STEP 1/2: Getting/parsing article page & images... " + url); Document document = Jsoup.connect(url).get(); Element artikel_el = document.select("#content>article").first(); artikel = parseArtikel(artikel_el, context); Log.i(TAG, "GETARTICLE STEP 2/2: Parsing comments..."); ArrayList<Comment> comments = new ArrayList<Comment>(); int i = 0; Elements comments_el = document.select("#comments article"); for (Element comment_el : comments_el) { i++; Comment comment = new Comment(); comment.id = Integer.parseInt(comment_el.attr("id").substring(1)); Element footer = comment_el.select("footer").first(); StringTokenizer footer_items = new StringTokenizer(footer.text(), "|"); comment.auteur = footer_items.nextToken().trim(); try { SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd-MM-yyHH:mm", Locale.US); comment.datum = simpleDateFormat .parse(footer_items.nextToken().trim() + footer_items.nextToken().trim()); } catch (ParseException parseEx) { //fuck gebruikers met pipe chars in hun naam, pech, gehad. continue; } comment.inhoud = comment_el.select("p").first().html(); Log.d(TAG + ".perf", "CommentParser: Parsed " + comment.id + ": " + i + "/" + comments_el.size()); comments.add(comment); } Comment[] comm = new Comment[comments.size()]; comments.toArray(comm); artikel.comments = comm; Log.i(TAG, "GETARTICLE: DONE"); return artikel; }
From source file:com.normalexception.app.rx8club.html.HtmlFormUtils.java
/** * Report the value inside of an input element * @param pan The panel where all of the input elements reside * @param name The name of the input to get the value for * @return The string value of the input */// ww w . ja v a2 s. c o m public static String getInputElementValueByName(Document pan, String name) { try { return pan.select("input[name=" + name + "]").attr("value"); } catch (NullPointerException npe) { return ""; } }
From source file:com.normalexception.app.rx8club.html.HtmlFormUtils.java
/**
 * Report the value inside of an input element
 *
 * @param pan  The panel where all of the input elements reside; may be null
 * @param name The id of the input to get the value for
 * @return The string value of the input, or an empty string when the panel
 *         is null or no matching input carries a value
 */
public static String getInputElementValueById(Document pan, String name) {
    // explicit guard instead of catching the NullPointerException a null panel would raise
    if (pan == null) {
        return "";
    }
    return pan.select("input[id=" + name + "]").attr("value");
}
From source file:mailbox.CreationViaEmail.java
/**
 * Replaces every {@code cid:} reference found in a {@code src} or
 * {@code href} attribute with the URL of the matching attachment.
 * References without a matching attachment are left untouched.
 *
 * @param html        the HTML of the mail
 * @param attachments the attachments, keyed by content id
 * @return the inner HTML of the body element, or the whole document's HTML
 *         when no body element is found
 */
private static String replaceCidWithAttachments(String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = { "src", "href" };

    for (String attrName : attrNames) {
        for (Element tag : doc.select("*[" + attrName + "]")) {
            String uriString = tag.attr(attrName).trim();

            // Case-insensitive and locale-independent: toLowerCase() under a
            // Turkish default locale maps 'I' to a dotless i, so "CID:" would
            // not be recognised.
            if (!uriString.regionMatches(true, 0, cidScheme, 0, cidScheme.length())) {
                continue;
            }

            Attachment attachment = attachments.get(uriString.substring(cidScheme.length()));
            if (attachment == null) {
                continue;
            }

            Long id = attachment.id;
            tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
        }
    }

    Elements bodies = doc.getElementsByTag("body");
    if (bodies.size() > 0) {
        return bodies.get(0).html();
    }
    return doc.html();
}
From source file:models.NotificationMail.java
/** * Make every link to be absolute and to have 'rel=noreferrer' if * necessary./* w w w . j a v a2s.co m*/ */ public static void handleLinks(Document doc) { String hostname = Config.getHostname(); String[] attrNames = { "src", "href" }; Boolean noreferrer = play.Configuration.root().getBoolean("application.noreferrer", false); for (String attrName : attrNames) { Elements tags = doc.select("*[" + attrName + "]"); for (Element tag : tags) { boolean isNoreferrerRequired = false; String uriString = tag.attr(attrName); if (noreferrer && attrName.equals("href")) { isNoreferrerRequired = true; } try { URI uri = new URI(uriString); if (!uri.isAbsolute()) { tag.attr(attrName, Url.create(uriString)); } if (uri.getHost() == null || uri.getHost().equals(hostname)) { isNoreferrerRequired = false; } } catch (URISyntaxException e) { play.Logger.info("A malformed URI is detected while" + " checking an email to send", e); } if (isNoreferrerRequired) { tag.attr("rel", tag.attr("rel") + " noreferrer"); } } } }