List of usage examples for org.jsoup.nodes.Document.text()
public String text()
From source file:FILER.java
/**
 * Builds a short HTML snippet of a stored document around the first occurrence of the query.
 *
 * <p>The file {@code <Doc_id>.html} is parsed with jsoup; its text and the query are lower-cased
 * and split into alphabetic words. The snippet is a window of at most 30 words that starts
 * 10 words before the first occurrence of the first query word. When that word does not occur,
 * the window starts at the beginning of the document. Every snippet word equal to one of the
 * query words is emitted as {@code <b> word</b>}; every other word is emitted preceded by a
 * single space.
 *
 * @param query  the search query; surrounding quotes, punctuation and case are ignored
 * @param Doc_id numeric id used to build the name of the HTML file to read
 * @return the snippet, or an empty string when the query or the document contains no words
 * @throws FileNotFoundException declared for callers; raised by the underlying file access
 * @throws IOException           if the document cannot be read or parsed
 */
public static String getDescription(String query, long Doc_id) throws FileNotFoundException, IOException {
    File f = new File("C:\\Users\\mennna\\Documents\\NetBeansProjects\\Search\\" + Doc_id + ".html");
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");

    String[] queryWords = splitIntoWords(query);
    String[] words = splitIntoWords(doc.text());

    // Anchor the snippet on the first query word; -1 means "not found" (or no query words).
    int index = queryWords.length == 0 ? -1 : ArrayUtils.indexOf(words, queryWords[0]);
    System.out.println("index " + index);

    StringBuilder description = new StringBuilder();
    if (queryWords.length > 0) {
        // Clamp against the number of WORDS, not the number of characters in the text.
        int start = Math.max(0, index - 10);
        int end = Math.min(words.length, index + 20);
        for (int i = start; i < end; i++) {
            // Whole-word comparison: a substring test would also bold "" and single letters.
            if (ArrayUtils.indexOf(queryWords, words[i]) != -1) {
                description.append("<b> ").append(words[i]).append("</b>");
            } else {
                description.append(" ").append(words[i]);
            }
        }
    }

    System.out.println("description " + description);
    return description.toString();
}

/** Lower-cases the text and splits it into alphabetic words, without a leading empty token. */
private static String[] splitIntoWords(String text) {
    if (text == null) {
        return new String[0];
    }
    String[] parts = text.toLowerCase(java.util.Locale.ROOT).split("\\P{Alpha}+");
    // String.split keeps an empty first element when the text starts with a separator.
    if (parts.length > 0 && parts[0].isEmpty()) {
        return java.util.Arrays.copyOfRange(parts, 1, parts.length);
    }
    return parts;
}
From source file:edu.usu.sdl.openstorefront.common.util.StringProcessor.java
/**
 * Reduces HTML markup to its trimmed plain text.
 *
 * @param text the markup to strip
 * @return the trimmed text of the parsed markup; input that {@code StringUtils.isNotBlank}
 *         rejects is returned unchanged
 */
public static String stripHtml(String text) {
    if (!StringUtils.isNotBlank(text)) {
        // Nothing to parse: hand the input back untouched.
        return text;
    }
    final String plain = Jsoup.parse(text).text();
    return plain.trim();
}
From source file:module.entities.NameFinder.DB.java
public static TreeMap<Integer, String> getDemocracitArticles(int consId) throws SQLException { TreeMap<Integer, String> all = new TreeMap<>(); String sql = "SELECT id, body " + "FROM articles " + "WHERE consultation_id = " + consId + " AND id NOT IN (SELECT enhancedentities.article_id FROM enhancedentities);"; Statement stmt = connection.createStatement(); ResultSet rs = stmt.executeQuery(sql); // PreparedStatement preparedStatement = connection.prepareStatement(sql); // preparedStatement.setInt(1, consId); // System.out.println(sql); // ResultSet rs = preparedStatement.executeQuery(); Document doc; while (rs.next()) { int articleID = rs.getInt("id"); String article_text = rs.getString("body"); doc = Jsoup.parseBodyFragment(article_text); all.put(articleID, doc.text()); }/*from w w w. j a va2 s.co m*/ return all; }
From source file:mas.MAS_TOP_PAPERS.java
public static String getData2(String url_org, int start) { try {/*from w ww. j ava2 s.c om*/ String complete_url = url_org + "&$skip=" + start; // String url_str = generateURL(url_org, prop); Document doc = Jsoup.connect(complete_url).timeout(25000).ignoreContentType(true).get(); return doc.text(); } catch (IOException ex) { System.out.println(ex.getMessage() + " Cause: " + ex.getCause()); Logger.getLogger(MAS_TOP_PAPERS.class.getName()).log(Level.SEVERE, null, ex); } return null; }
From source file:mas.MAS_VLDB.java
public static String getData2(String url_org, int start) { try {/* w ww.j ava 2s. c o m*/ String complete_url = url_org + "&$skip=" + start; // String url_str = generateURL(url_org, prop); Document doc = Jsoup.connect(complete_url).timeout(25000).ignoreContentType(true).get(); return doc.text(); } catch (IOException ex) { System.out.println(ex.getMessage() + " Cause: " + ex.getCause()); Logger.getLogger(MAS_VLDB.class.getName()).log(Level.SEVERE, null, ex); } return null; }
From source file:mas.MAS.java
public static String getData2(String url_org, int start) { try {//from www . j a va2 s . c o m String complete_url = url_org + "&$skip=" + start; // String url_str = generateURL(url_org, prop); Document doc = Jsoup.connect(complete_url).timeout(25000).ignoreContentType(true).get(); return doc.text(); } catch (IOException ex) { System.out.println(ex.getMessage() + " Cause: " + ex.getCause()); Logger.getLogger(MAS.class.getName()).log(Level.SEVERE, null, ex); } return null; }
From source file:com.example.app.support.service.AppUtil.java
/** * Check if the provided HTML content has anything visible to present to a user. * * @param markup the markup/* w w w.j a v a2 s . com*/ * * @return true if there is something to show */ public static boolean isEmptyMarkup(String markup) { final Document document = Jsoup.parse(markup); return document.text().trim().isEmpty(); }
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;//w w w.j av a2 s . c o m for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:de.tudarmstadt.lt.ltbot.text.JSoupTextExtractor.java
@Override public String getPlaintext(final String htmltext) { try {/*w ww . ja va 2s. co m*/ // preserve newlines // html = html.replaceAll("(?i)<br[^>]*>", "br2nl"); // <br>s are often just inserted for style String hhtmltext = _end_prgrph_ptrn.matcher(htmltext).replaceAll("</p>br2nl"); hhtmltext = _nwln_ptrn.matcher(hhtmltext).replaceAll("br2nl"); Document soup = Jsoup.parse(hhtmltext); String plaintext = soup.text(); plaintext = _tmp_nwln_ptrn.matcher(plaintext).replaceAll("\n"); plaintext = _emptln_ptrn.matcher(plaintext.trim()).replaceAll(""); return plaintext; } catch (Throwable t) { for (int i = 1; t != null && i < 10; i++) { LOG.log(Level.SEVERE, String.format("Failed to get plaintext from while '%s' (%d %s:%s).", StringUtils.abbreviate(htmltext, 100), i, t.getClass().getName(), t.getMessage()), t); t = t.getCause(); } return "Failed to get plaintext content \n" + htmltext; } }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.CybozuLanguageIdentifier.java
@Override public String identifyLanguage(String html) throws IOException { // extracting plain html text Document doc = Jsoup.parse(html); String text = doc.text(); // we might have removed everything -> no lang if (text.isEmpty()) { return UNKNOWN_LANGUAGE; }/*from w ww .ja v a 2 s. c om*/ try { Detector detector = DetectorFactory.create(); detector.append(text); String detectedLang = detector.detect(); ArrayList<Language> detectedProbabilities = detector.getProbabilities(); if (detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) { return detectedLang; } else { return UNKNOWN_LANGUAGE; } } catch (LangDetectException e) { return UNKNOWN_LANGUAGE; } }