List of usage examples for org.jsoup.nodes Document getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:gpxparser.GpxParser.java
/** * @param args the command line arguments *///from w w w .java2 s .co m public static void main(String[] args) { File input = new File("/home/yonseca/4.gpx"); Track track = new Track(); try { Document doc = Jsoup.parse(input, "UTF-8"); //System.out.println(doc.text()); Elements trackData = doc.getElementsByTag("trk"); Elements trackName = trackData.select("name"); track.setName(trackName.text()); Elements trkPt = trackData.select("trkseg").select("trkpt"); for (Iterator<Element> iterator = trkPt.iterator(); iterator.hasNext();) { Element dataPoint = iterator.next(); Double lat = NumberUtils.toDouble(dataPoint.attr("lat")); Double lon = NumberUtils.toDouble(dataPoint.attr("lon")); Double altitude = NumberUtils.toDouble(dataPoint.select("ele").text()); track.addPoint(lat, lon, altitude); } System.out.println(""); } catch (IOException ex) { Logger.getLogger(GpxParser.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:DataCrawler.OpenAIRE.XMLGenerator.java
public static void main(String[] args) { String text = ""; try {//w ww .j a v a 2s . co m if (args.length < 4) { System.out.println("<command> template_file csv_file output_dir log_file [start_id]"); } // InputStream fis = new FileInputStream("E:/Downloads/result-r-00000"); InputStream fis = new FileInputStream(args[1]); BufferedReader br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8"))); // String content = new String(Files.readAllBytes(Paths.get("publications_template.xml"))); String content = new String(Files.readAllBytes(Paths.get(args[0]))); Document doc = Jsoup.parse(content, "UTF-8", Parser.xmlParser()); // String outputDirectory = "G:/"; String outputDirectory = args[2]; // PrintWriter logWriter = new PrintWriter(new FileOutputStream("publication.log",false)); PrintWriter logWriter = new PrintWriter(new FileOutputStream(args[3], false)); Element objectId = null, title = null, publisher = null, dateofacceptance = null, bestlicense = null, resulttype = null, originalId = null, originalId2 = null; boolean start = true; // String startID = "dedup_wf_001::207a098867b64f3b5af505fa3aeecd24"; String startID = ""; if (args.length >= 5) { start = false; startID = args[4]; } String previousText = ""; while ((text = br.readLine()) != null) { /* For publications: 0. dri:objIdentifier context 9. title context 12. publisher context 18. dateofacceptance 19. bestlicense @classname 21. resulttype @classname 26. originalId context (Notice that the prefix is null and will use space to separate two different "originalId") */ if (!previousText.isEmpty()) { text = previousText + text; start = true; previousText = ""; } String[] items = text.split("!"); for (int i = 0; i < items.length; ++i) { items[i] = StringUtils.strip(items[i], "#"); } if (objectId == null) objectId = doc.getElementsByTag("dri:objIdentifier").first(); objectId.text(items[0]); if (!start && items[0].equals(startID)) { start = true; } if (title == null) title = doc.getElementsByTag("title").first(); title.text(items[9]); if (publisher == null) publisher = doc.getElementsByTag("publisher").first(); if (items.length < 12) { previousText = text; continue; } publisher.text(items[12]); if (dateofacceptance == null) dateofacceptance = doc.getElementsByTag("dateofacceptance").first(); dateofacceptance.text(items[18]); if (bestlicense == null) bestlicense = doc.getElementsByTag("bestlicense").first(); bestlicense.attr("classname", items[19]); if (resulttype == null) resulttype = doc.getElementsByTag("resulttype").first(); resulttype.attr("classname", items[21]); if (originalId == null || originalId2 == null) { Elements elements = doc.getElementsByTag("originalId"); String[] context = items[26].split(" "); if (elements.size() > 0) { if (elements.size() >= 1) { originalId = elements.get(0); if (context.length >= 1) { int indexOfnull = context[0].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[0].trim().length() >= (indexOfnull + 5)) value = context[0].trim().substring(indexOfnull + 5); } else { value = context[0].trim(); } originalId.text(value); } } if (elements.size() >= 2) { originalId2 = elements.get(1); if (context.length >= 2) { int indexOfnull = context[1].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[1].trim().length() >= (indexOfnull + 5)) value = context[1].trim().substring(indexOfnull + 5); } else { value = context[1].trim(); } originalId2.text(value); } } } } else { String[] context = items[26].split(" "); if (context.length >= 1) { int indexOfnull = context[0].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[0].trim().length() >= (indexOfnull + 5)) value = context[0].trim().substring(indexOfnull + 5); } else { value = context[0].trim(); } originalId.text(value); } if (context.length >= 2) { int indexOfnull = context[1].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[1].trim().length() >= (indexOfnull + 5)) value = context[1].trim().substring(indexOfnull + 5); } else { value = context[1].trim(); } originalId2.text(value); } } if (start) { String filePath = outputDirectory + items[0].replace(":", "#") + ".xml"; PrintWriter writer = new PrintWriter(new FileOutputStream(filePath, false)); logWriter.write(filePath + " > Start" + System.lineSeparator()); writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + System.lineSeparator()); writer.write(doc.getElementsByTag("response").first().toString()); writer.close(); logWriter.write(filePath + " > OK" + System.lineSeparator()); logWriter.flush(); } } logWriter.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:com.geecko.QuickLyric.lyrics.MetalArchives.java
@Reflection public static Lyrics fromMetaData(String artist, String title) { String baseURL = "http://www.metal-archives.com/search/ajax-advanced/searching/songs/?bandName=%s&songTitle=%s&releaseType[]=1&exactSongMatch=1&exactBandMatch=1"; String urlArtist = artist.replaceAll("\\s", "+"); String urlTitle = title.replaceAll("\\s", "+"); String url;// w w w .jav a 2 s . c o m String text; try { String response = Net.getUrlAsString(String.format(baseURL, urlArtist, urlTitle)); JSONObject jsonResponse = new JSONObject(response); JSONArray track = jsonResponse.getJSONArray("aaData").getJSONArray(0); StringBuilder builder = new StringBuilder(); for (int i = 0; i < track.length(); i++) builder.append(track.getString(i)); Document trackDocument = Jsoup.parse(builder.toString()); url = trackDocument.getElementsByTag("a").get(1).attr("href"); String id = trackDocument.getElementsByClass("viewLyrics").get(0).id().substring(11); text = Jsoup.connect("http://www.metal-archives.com/release/ajax-view-lyrics/id/" + id).get().body() .html(); } catch (IOException e) { return new Lyrics(ERROR); } catch (JSONException e) { return new Lyrics(NO_RESULT); } Lyrics lyrics = new Lyrics(POSITIVE_RESULT); lyrics.setArtist(artist); lyrics.setTitle(title); lyrics.setText(text); lyrics.setSource(domain); lyrics.setURL(url); return lyrics; }
From source file:com.bibisco.export.HTMLParser.java
public static void parse(String pStrHTML, IExporter pExporter) { mLog.debug("Start parse(String, IExporter)"); // parse html Document lDocument = Jsoup.parse(pStrHTML); // get body element Element lElementBody = lDocument.getElementsByTag("body").get(0); // parse body parseNode(lElementBody, new TextFormatting(), pExporter); mLog.debug("End parse(String, IExporter)"); }
From source file:damo.three.ie.util.HtmlUtilities.java
/** * Parses the My3 account usage page to nicer JSON format. * * @param pageContent Page content as HTML. * @return Usage information stripped out and formatted as JSON. * @throws JSONException/*from w w w. j a v a2 s. c om*/ */ public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException { // The HTML on prepay is pig-ugly, so we will use JSoup to // clean and parse it. Document doc = Jsoup.parse(pageContent); HtmlUtilities.removeComments(doc); Elements elements = doc.getElementsByTag("table"); JSONArray jsonArray = new JSONArray(); // three don't have a sub label for the 3-to-3 calls, which is not consistent with other items. // .. feck them! boolean three2threeCallsBug = false; for (Element element : elements) { for (Element subelement : element.select("tbody > tr")) { if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) { three2threeCallsBug = true; } Elements subsubelements = subelement.select("td"); if (subsubelements.size() == 3) { // skip the "total" entries if (subsubelements.select("td").get(0).text().contains("Total")) { continue; } JSONObject currentItem = new JSONObject(); if (three2threeCallsBug) { currentItem.put("item", "3 to 3 Calls"); } else { // Get rid of that "non-breaking space" character if it exists String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "") .trim(); currentItem.put("item", titleToClean); } /** * Check if date contains "Today", if so, change it to a date. * Otherwise we will never know when usage ends, unless user refreshes, As 'today' * is 'today', tomorrow.. see! */ String value1 = subsubelements.select("td").get(1).text(); if (value1.equals("Today")) { DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK); DateTime dt = new DateTime(); // current datetime value1 = "Expires " + formatter.print(dt); } currentItem.put("value1", value1); currentItem.put("value2", subsubelements.select("td").get(2).text()); // Out of Bundle charges have an extra property if (currentItem.getString("item").startsWith("Internet")) { Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL); Matcher m1 = p1.matcher(pageContent); StringBuilder cleanedDate = new StringBuilder(); if (m1.matches()) { cleanedDate.append(m1.group(1)); cleanedDate.append(' '); cleanedDate.append(m1.group(2)); cleanedDate.append(' '); cleanedDate.append(m1.group(3)); currentItem.put("value3", cleanedDate.toString()); } } jsonArray.put(currentItem); } } // reset the 3-to-3 call bug flag for next Element if (three2threeCallsBug) { three2threeCallsBug = false; } } return jsonArray; }
From source file:org.keycloak.testsuite.util.saml.LoginBuilder.java
public static HttpUriRequest handleLoginPage(UserRepresentation user, String loginPage) { String username = user.getUsername(); String password = getPasswordOf(user); org.jsoup.nodes.Document theLoginPage = Jsoup.parse(loginPage); List<NameValuePair> parameters = new LinkedList<>(); for (Element form : theLoginPage.getElementsByTag("form")) { String method = form.attr("method"); String action = form.attr("action"); boolean isPost = method != null && "post".equalsIgnoreCase(method); for (Element input : form.getElementsByTag("input")) { if (Objects.equals(input.id(), "username")) { parameters.add(new BasicNameValuePair(input.attr("name"), username)); } else if (Objects.equals(input.id(), "password")) { parameters.add(new BasicNameValuePair(input.attr("name"), password)); } else { parameters.add(new BasicNameValuePair(input.attr("name"), input.val())); }/*from w ww .j a va 2 s.c o m*/ } if (isPost) { HttpPost res = new HttpPost(action); UrlEncodedFormEntity formEntity; try { formEntity = new UrlEncodedFormEntity(parameters, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } res.setEntity(formEntity); return res; } else { UriBuilder b = UriBuilder.fromPath(action); for (NameValuePair parameter : parameters) { b.queryParam(parameter.getName(), parameter.getValue()); } return new HttpGet(b.build()); } } throw new IllegalArgumentException("Invalid login form: " + loginPage); }
From source file:com.nuance.expertassistant.ContentExtractor.java
public static void extract(Document doc) { final Elements links = doc.getElementsByTag("a"); final Elements ps = doc.select("p"); final String title = doc.title(); print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">"); final Elements elements = doc.select("*"); final ArrayList<String> openHeaderList = new ArrayList<String>(); for (final Element element : elements) { if (element.ownText() == null || element.ownText().isEmpty() || element.ownText().trim() == "") { } else if (element.tagName().toString().contains("a")) { } else if (element.tagName().contains("h1") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h1")) { openHeaderList.remove("h1"); print("</section>"); }//from w w w . jav a 2 s.co m if (openHeaderList.contains("h2")) { openHeaderList.remove("h2"); print("</section>"); } if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h1"); } else if (element.tagName().contains("h2") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h2")) { openHeaderList.remove("h2"); print("</section>"); } if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h2"); } else if (element.tagName().contains("h3") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h3"); } else if (element.tagName().contains("h4") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h4"); } else { print("<para>"); print(stripNonValidXMLCharacters(element.ownText())); print("</para>"); } /* * if (element.tagName().contains("img")) { print("<img src=\"" + * element.attr("src") + "\"></img>"); } */ } if (openHeaderList.contains("h1")) { openHeaderList.remove("h1"); print("</section>"); } if (openHeaderList.contains("h2")) { openHeaderList.remove("h2"); print("</section>"); } if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("</section>"); }
From source file:net.sf.texprinter.utils.StringUtils.java
/** * Escapes HTML entities and tags to a TeX format. This method tries to * replace HTML code by the TeX equivalent macros. * * @param text The input text./* w w w . j ava 2 s .c o m*/ * @return A new text formatted from HTML to TeX. */ public static String escapeHTMLtoTeX(String text) { // replace bold tags String newText = text.replaceAll("<b>", "\\\\textbf{"); newText = newText.replaceAll("</b>", "}"); // replace bold tags newText = newText.replaceAll("<strong>", "\\\\textbf{"); newText = newText.replaceAll("</strong>", "}"); // replace italic tags newText = newText.replaceAll("<i>", "\\\\textit{"); newText = newText.replaceAll("</i>", "}"); // replace emphasized tags newText = newText.replaceAll("<em>", "\\\\emph{"); newText = newText.replaceAll("</em>", "}"); // replace paragraphs tags newText = newText.replaceAll("<p>", ""); newText = newText.replaceAll("</p>", "\n\n"); // replace ordered lists tags newText = newText.replaceAll("<ol>", "\\\\begin{enumerate}\n"); newText = newText.replaceAll("</ol>", "\\\\end{enumerate}\n"); // replace unordered lists tags newText = newText.replaceAll("<ul>", "\\\\begin{itemize}\n"); newText = newText.replaceAll("</ul>", "\\\\end{itemize}\n"); // replace item tags newText = newText.replaceAll("<li>", "\\\\item "); newText = newText.replaceAll("</li>", "\n"); // replace blockquote tags newText = newText.replaceAll("<blockquote>", "\\\\begin{quotation}\n"); newText = newText.replaceAll("</blockquote>", "\\\\end{quotation}\n"); // replace code tags newText = newText.replaceAll("<pre><code>", "\\\\begin{TeXPrinterListing}\n"); newText = newText.replaceAll("<pre class=.*\"><code>", "\\\\begin{TeXPrinterListing}\n"); newText = newText.replaceAll("</code></pre>", "\\\\end{TeXPrinterListing}\n\n"); // replace inline code tags newText = newText.replaceAll("<code>", "\\\\lstinline|"); newText = newText.replaceAll("</code>", "|"); // replace links tags newText = newText.replaceAll("alt=\".*\" ", ""); // parse the text Document docLinks = Jsoup.parse(newText); // get all the links Elements links = docLinks.getElementsByTag("a"); // if there are links if (links.size() > 0) { // for every link for (Element link : links) { // get the outer HTML String temp = link.outerHtml(); // replace it newText = newText.replaceFirst(Pattern.quote(temp), "\\\\href{" + link.attr("href") + "}{" + link.text() + "}"); } } // create a list of images ArrayList<ImageGroup> images = new ArrayList<ImageGroup>(); // parse the current text Document doc = Jsoup.parse(text); // fetch all the media found Elements media = doc.select("[src]"); // for all media found for (Element m : media) { // if it's an image tag if (m.tagName().equals("img")) { // create a new image group with the image link ImageGroup image = new ImageGroup(m.attr("abs:src")); // add to the list of images images.add(image); // set the current image to null image = null; } } // create a new loop saver LoopSaver lps = null; // for every image in the list of images for (ImageGroup img : images) { // create a new object lps = new LoopSaver(); // while there are references for that image in the text while (newText.indexOf(img.getURL()) != -1) { // tick loop lps.tick(); // replace the occurrence of that image newText = newText.replaceFirst("<img src=\"" + img.getURL() + "\" />", "\\\\begin{figure}[h!]\n\\\\centering\n\\\\includegraphics[scale=0.5]{" + img.getName() + "}\n\\\\end{figure}"); } // lets try try { // finally, download the image to the current directory Downloader.download(img.getURL(), img.getName()); } catch (Exception exception) { // log message log.log(Level.WARNING, "An error occurred while getting the current image. Trying to set the replacement image instead. MESSAGE: {0}", StringUtils.printStackTrace(exception)); // image could not be downloaded for any reason try { // open a file stream FileOutputStream f = new FileOutputStream(img.getName()); // write a replacement image f.write(Base64.decode( "iVBORw0KGgoAAAANSUhEUgAAALAAAABKCAIAAACU3El2AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAcjSURBVHhe7VzrmeMgDExdKSj1pJptZovZi3lqhAQ4n7HhrPt1STAaRoOELa0ff/bPGCAMPIwNY4AyYIIwPQADJggThAnCNKAzYBHC1GERwjRgEcI00MmApYxOou4yzARxF093rtME0UnUXYaZIO7i6c51miA6ibrLMBPEXTzduc4vBPHzejwez/cvt+C+f7x+GpY7h/2+nz2zdS5z6LCFoLZ5qAli8wj/9xHCzIK4xDeXGG27No7YB68dIfx8JCJoguiHOG7kvsUfhOMSo/3Y98H7XhBvlyFIWAfDPjFIOWT/MLgC9IlW/E/xH0tqeWiR7tx1z7dfkE95dKo00TbF8/2TrMTsiIyLhsh8eJVHG00EJGAizUeScT+8KieSqL4VRGCOHggILySI/L5feNzYP0wThGBF2Q2frwPn0oBImuhgZ8T/ElzjZqK6JHPKhmhM/Xlt15e7wpkISIiFIBaF54AJ4LkP6vhmYPlWEFHT4JR0DPR4hINnWjSB3RxWFQRaaYdHIeGxpMjmyIqAS1EFwvk3jy5NaiYASSkaIpQULhR4sl+acvj7GyOItJuEnKGkDH57ohGupAwI9cKdDgnkhVQl97DDdNYvRHevRm1FwZAgUkSTU2uPILRrfVSob9QLBeFNs71HvgOfNYZpEQKP0dL2CSPoBGqEwD0nha12hBANCavTzuXdgmjCmytC5Exapb9zGGREksfFy0WqHc+eQymZ8S1MEzDdU/QMISdp2RALahsQxUSPILRr1QiBP7SCxKCUEU5HrbuMzmEkA4VTvveudHn6DjZRjrOvl7tXoI/V2jGdnNqer1e8l5EOoYqhjBTuWUhiIodK6fyLQmdpg8ILKxOimXqqA420BdGS1KDfBS8NstQ9rRbouydYYeC0gpiQ/QkhHS+xKQURA2yzLnI8H7UZTRDn8m3WZmBgyggxAzF3xWCCuKvnlXWbIEwQa9x2mp+uYaAeIbRndiOx7nsAsW/0cbilp+2HzX7Ior5EuIogeElH7kU4zCXNib6kuzmvGzCvIPrwHztKZqOsHfj65iHcfbGAmwqC0B3qNq1mHrKTc8GAbW94Vo8tQ6qLIXkRbzBBkOpG0fXHLJGqQ+oLVi5PgknXhIqGWJigdRahGk1KwNt07Ras2JgDvVUfSHWqOcJe0ddTBhdEKAtF3txyiaty/bFUEusbAEe6KYSWD7KIHkEoc4qooDzse7oqkDwQcg0tfArtSbwpKhBGCq6EOr9yuXwqfR/r/EINTEPYq4bPuJ2CaBfigu0MzW8DV110vEiRHhSB8qDzQSsb3YjNOUVUWPVksaZEIRQQs1tTrMjRK0+4/c9VWTecIdSmWny9pQUfl4uJCqnG/kyla60ikIMFgckh96yw/0EU5N24REEZuJx1YFvzc2euvQuoyp4u/XKPAp3B/c7yI673M7XPDLEVIowGb0PMis2IXAFlCAjs5ZgUkXx5yjlSEHSPZeQ0L0sdXn3hDFIGuYTYxM2Uxsio4s+ZNuVypkmBbmkTk95tL4XPF5up0Nsd0mNbEKy5Ja1FXpQWw/oo9qMOFwTJk879JEJSXJqD5bY7TKV0noKZ4k/HeIiOqIpdqkMqQ0R5hpCSaVj80+nBr+H5+ZAgdggCFIFJqOwBo0EBEO5QxJGCoGGYNCaxWIyHx9wzhE8Wcgj2i+mIEHlYmhT607eD65bI6eHDjcxVdg1qJDT9Do1b+GccoEh0S/gkd2+KKSPnqrAmgT3oAdMQdktieC1DCGOTtTl0c3WLgaMFgWf3VlS+BeVzL3K0IFK05/cSc9NyX3QnCOK+5K64chPEil4biNkEMZDcFac2QazotYGYTRADyV1x6l2CaD7dXZEBwwwMdD+pTM8B+TPEOQlltcs5Qc6IygQxo1cuxFQTRPHKppAyirdLffDTmqYUQ8jv8ck1LRxAETG/7ikUpppvf2J/CA4F1qIlQLLrC0/C+6M6lnah9waY3h8h6m+XgrceJbz08OFfskQfYpMiXXRlEA37qDY1lfNrKUOxGxs06i9ochf/55WY/YIoO3wY+SVt5WFU6iEoezz4G2g0Q8JhVxGEZld720ZzaQP26LVTHiEIVjRmJWWpM1ptBGIOkPxRvv1Jcr4sCNWuJojW0q513gjrhwmicvPB3RALXqwPMTUc5qgsCaI0JMyvtedLEaJ8oVgedb8b7cZzCCQEPpEPrao2eIycIcouo3qE6Ho1k59fe7ESXYLch4Zy1ZbWWvKIzXvKnK0HU+nAnk6CQpdw5LBsf0pryAd/7EpkjUANQeiGKvOzkAK3IM3mJc3ibQVxiirNyDwMtCLEPEgNySkMmCBOoXkdIyaIdXx1ClITxCk0r2PEBLGOr05BaoI4heZ1jJgg1vHVKUhNEKfQvI4RE8Q6vjoFqQniFJrXMWKCWMdXpyA1QZxC8zpGTBDr+OoUpP8Arv92hCPEu+kAAAAASUVORK5CYII=")); // close the file f.close(); } catch (IOException ioexception) { // log message log.log(Level.SEVERE, "An IO exception occured while trying to create the image replacement. MESSAGE: {0}", StringUtils.printStackTrace(ioexception)); } catch (Exception except) { // log message log.log(Level.SEVERE, "An error occured while trying to create the image replacement. MESSAGE: {0}", StringUtils.printStackTrace(except)); } } } // unescape all HTML entities newText = StringEscapeUtils.unescapeHtml(newText); // return new text return newText; }
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/* w ww .jav a 2 s .com*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:index.IndexManager.java
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) { final SolrInputDocument index = new SolrInputDocument(); index.setField("id", document.location()); index.setField("time", String.valueOf(System.currentTimeMillis())); index.setField("title", document.title()); final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href")) .collect(Collectors.toSet()); final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src")) .collect(Collectors.toSet()); links.forEach(link -> index.addField("link", link)); media.forEach(link -> index.addField("media", link)); formatText(document.getElementsByTag("h1").stream()).forEach(e -> index.addField("h1", e)); formatText(document.getElementsByTag("h2").stream()).forEach(e -> index.addField("h2", e)); formatText(document.getElementsByTag("h3").stream()).forEach(e -> index.addField("h3", e)); formatText(document.getElementsByTag("strong").stream()).forEach(e -> index.addField("strong", e)); formatText(document.getElementsByTag("em").stream()).forEach(e -> index.addField("em", e)); formatText(document.getElementsByTag("b").stream()).forEach(e -> index.addField("b", e)); formatText(document.getElementsByTag("u").stream()).forEach(e -> index.addField("u", e)); formatText(document.getElementsByTag("i").stream()).forEach(e -> index.addField("i", e)); int i = 0;/*from w w w. ja v a2 s .co m*/ Collection<String> text = chunkToLength(document.text()); for (String chunk : text) index.addField(++i + "_text", chunk); return Triple.of(index, links, media); }