Example usage for org.jsoup.nodes Document text

List of usage examples for org.jsoup.nodes Document text

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:FILER.java

/**
 * Builds a short HTML snippet of a stored document for display next to a search result.
 *
 * <p>The document {@code <Doc_id>.html} is parsed, its text is lower-cased and split into
 * alphabetic words, and a window of words (up to 10 before and 19 after the first occurrence
 * of the first query term) is returned. Words equal to one of the query terms are wrapped as
 * {@code "<b> word</b>"}; every other word is preceded by a single space.
 *
 * <p>Quoted (phrase) queries are treated like unquoted ones: the quotes are simply ignored
 * when the query is split into terms. If no query term occurs in the document (or the query
 * contains no alphabetic characters) the snippet is taken from the start of the document and
 * nothing is highlighted.
 *
 * @param query  the user query, optionally wrapped in double quotes
 * @param Doc_id numeric id used to build the file name of the stored document
 * @return the snippet, or an empty string when the document contains no words
 * @throws FileNotFoundException if the document file does not exist
 * @throws IOException           if the document file cannot be read
 */
public static String getDescription(String query, long Doc_id) throws FileNotFoundException, IOException {
    File f = new File("C:\\Users\\mennna\\Documents\\NetBeansProjects\\Search\\" + Doc_id + ".html");
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");

    // Both sides are lower-cased so that matching is case-insensitive; previously only the
    // document text was lower-cased and a query such as "Java" could never match.
    String[] words = doc.text().toLowerCase().split("\\P{Alpha}+");
    String[] query_words = query.toLowerCase().split("\\P{Alpha}+");

    // A query that starts with a non-letter (e.g. a quote) yields an empty first token,
    // and a query without letters yields no tokens at all, so pick the first real term.
    String anchor = null;
    for (String term : query_words) {
        if (!term.isEmpty()) {
            anchor = term;
            break;
        }
    }

    int index = (anchor == null) ? -1 : ArrayUtils.indexOf(words, anchor);
    System.out.println("index " + index);

    // Clamp the window to the word array (not to the character length of the text).
    int start = Math.max(index - 10, 0);
    int end = Math.min(index + 20, words.length);

    StringBuilder snippet = new StringBuilder();
    for (int i = start; i < end; i++) {
        String word = words[i];
        if (word.isEmpty()) {
            continue; // leading empty token produced by split(); nothing to show
        }
        // Whole-word comparison against the query terms; a substring test on the raw query
        // would also highlight fragments such as "a" or "on".
        if (ArrayUtils.contains(query_words, word)) {
            snippet.append("<b> ").append(word).append("</b>");
        } else {
            snippet.append(' ').append(word);
        }
    }

    String description = snippet.toString();
    System.out.println("description  " + description);
    return description;
}

From source file:edu.usu.sdl.openstorefront.common.util.StringProcessor.java

/**
 * Removes HTML markup from the given string.
 *
 * @param text the (possibly HTML) input
 * @return the trimmed text content of the parsed input, or {@code text} itself when it is
 *         blank according to {@code StringUtils.isNotBlank}
 */
public static String stripHtml(String text) {
    return StringUtils.isNotBlank(text) ? Jsoup.parse(text).text().trim() : text;
}

From source file:module.entities.NameFinder.DB.java

/**
 * Loads the plain text of all articles of a consultation that have no row in
 * {@code enhancedentities} yet.
 *
 * <p>Each article body is parsed as an HTML body fragment and reduced to its text.
 *
 * @param consId the consultation id to select articles for
 * @return article id mapped to article text, ordered by id
 * @throws SQLException if the query fails
 */
public static TreeMap<Integer, String> getDemocracitArticles(int consId) throws SQLException {
    TreeMap<Integer, String> all = new TreeMap<>();
    String sql = "SELECT id, body " + "FROM articles " + "WHERE consultation_id = " + consId
            + " AND id NOT IN (SELECT enhancedentities.article_id FROM enhancedentities);";
    // try-with-resources closes the result set and the statement even when reading fails;
    // the shared connection itself is left open for the caller.
    try (Statement stmt = connection.createStatement(); ResultSet rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            int articleID = rs.getInt("id");
            String article_text = rs.getString("body");
            all.put(articleID, Jsoup.parseBodyFragment(article_text).text());
        }
    }
    return all;
}

From source file:mas.MAS_TOP_PAPERS.java

/**
 * Fetches one page of results and returns its text.
 *
 * @param url_org base URL; {@code "&$skip=" + start} is appended to it
 * @param start   value of the {@code $skip} paging parameter
 * @return the text of the fetched document, or {@code null} if an {@link IOException} occurred
 *         (the error is printed to standard output and logged at SEVERE level)
 */
public static String getData2(String url_org, int start) {
    final String pagedUrl = url_org + "&$skip=" + start;
    String body = null;
    try {
        body = Jsoup.connect(pagedUrl).timeout(25000).ignoreContentType(true).get().text();
    } catch (IOException ex) {
        System.out.println(ex.getMessage() + " Cause: " + ex.getCause());
        Logger.getLogger(MAS_TOP_PAPERS.class.getName()).log(Level.SEVERE, null, ex);
    }
    return body;
}

From source file:mas.MAS_VLDB.java

/**
 * Fetches one page of results and returns its text.
 *
 * @param url_org base URL; {@code "&$skip=" + start} is appended to it
 * @param start   value of the {@code $skip} paging parameter
 * @return the text of the fetched document, or {@code null} if an {@link IOException} occurred
 *         (the error is printed to standard output and logged at SEVERE level)
 */
public static String getData2(String url_org, int start) {
    final String pagedUrl = url_org + "&$skip=" + start;
    String body = null;
    try {
        body = Jsoup.connect(pagedUrl).timeout(25000).ignoreContentType(true).get().text();
    } catch (IOException ex) {
        System.out.println(ex.getMessage() + " Cause: " + ex.getCause());
        Logger.getLogger(MAS_VLDB.class.getName()).log(Level.SEVERE, null, ex);
    }
    return body;
}

From source file:mas.MAS.java

/**
 * Fetches one page of results and returns its text.
 *
 * @param url_org base URL; {@code "&$skip=" + start} is appended to it
 * @param start   value of the {@code $skip} paging parameter
 * @return the text of the fetched document, or {@code null} if an {@link IOException} occurred
 *         (the error is printed to standard output and logged at SEVERE level)
 */
public static String getData2(String url_org, int start) {
    final String pagedUrl = url_org + "&$skip=" + start;
    String body = null;
    try {
        body = Jsoup.connect(pagedUrl).timeout(25000).ignoreContentType(true).get().text();
    } catch (IOException ex) {
        System.out.println(ex.getMessage() + " Cause: " + ex.getCause());
        Logger.getLogger(MAS.class.getName()).log(Level.SEVERE, null, ex);
    }
    return body;
}

From source file:com.example.app.support.service.AppUtil.java

/**
 * Check if the provided HTML content has anything visible to present to a user.
 *
 * @param markup the markup/*  w  w w.j a  v  a2  s  .  com*/
 *
 * @return true if there is something to show
 */
public static boolean isEmptyMarkup(String markup) {
    final Document document = Jsoup.parse(markup);
    return document.text().trim().isEmpty();
}

From source file:FILER.java

/**
 * Extracts the "important" strings of an HTML file and refreshes the shared {@code Text} field.
 *
 * <p>The returned array has four slots: {@code [0]} the document title, {@code [1]} the text of
 * all {@code h1}..{@code h6} headings (each followed by a space, grouped by heading level),
 * {@code [2]} the non-empty {@code alt} texts of all images (each preceded by a space), and
 * {@code [3]} which this method leaves as an empty string.
 *
 * <p>Side effect: {@code Text} is reset and then set to a space, the full document text and
 * every non-empty image {@code alt} text (each preceded by a space).
 *
 * @param f the HTML file to read, decoded as UTF-8
 * @return the four-element array described above
 * @throws IOException if the file cannot be read
 */
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file
{
    Text = "";
    String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is left empty here
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    Importants[0] = doc.title(); //get the title of the file

    // HTML defines heading levels h1..h6. Every level is visited: stopping at the first
    // missing level would drop all headings of a page that has, say, h2 but no h1.
    StringBuilder allHeaders = new StringBuilder();
    for (int level = 1; level <= 6; level++) {
        for (Element header : doc.getElementsByTag("h" + level)) {
            allHeaders.append(header.text()).append(' ');
        }
    }
    Importants[1] = allHeaders.toString();

    StringBuilder text = new StringBuilder(" ").append(doc.text()); //get the text of the document
    StringBuilder alts = new StringBuilder();
    for (Element element : doc.getElementsByTag("img")) {
        // attr() yields an empty string for a missing attribute, so one emptiness check suffices.
        String alt = element.attr("alt");
        if (!alt.isEmpty()) {
            text.append(' ').append(alt);
            alts.append(' ').append(alt);
        }
    }
    Text = text.toString();
    Importants[2] = alts.toString();
    return Importants;
}

From source file:de.tudarmstadt.lt.ltbot.text.JSoupTextExtractor.java

/**
 * Converts HTML to plain text while trying to keep line breaks.
 *
 * <p>Matches of the paragraph-end and newline patterns are first replaced by a {@code br2nl}
 * placeholder, the result is parsed and reduced to its text, and finally the temporary-newline
 * pattern is replaced by {@code "\n"} and matches of the empty-line pattern are removed.
 * {@code <br>} tags are deliberately not converted, as they are often inserted just for style.
 *
 * @param htmltext the HTML input
 * @return the extracted plain text; if anything is thrown, the failure (and up to nine levels
 *         of its cause chain) is logged at SEVERE level and a failure message followed by the
 *         original input is returned instead
 */
@Override
public String getPlaintext(final String htmltext) {
    try {
        // Mark paragraph ends and newlines with a placeholder before parsing.
        final String withParagraphMarks = _end_prgrph_ptrn.matcher(htmltext).replaceAll("</p>br2nl");
        final String marked = _nwln_ptrn.matcher(withParagraphMarks).replaceAll("br2nl");

        final String extracted = Jsoup.parse(marked).text();

        // Turn placeholders back into newlines, then drop what the empty-line pattern matches.
        final String withNewlines = _tmp_nwln_ptrn.matcher(extracted).replaceAll("\n");
        return _emptln_ptrn.matcher(withNewlines.trim()).replaceAll("");
    } catch (Throwable t) {
        // Log the throwable and then each cause in turn, numbering the entries from 1 to 9.
        Throwable cause = t;
        int depth = 1;
        while (cause != null && depth < 10) {
            final String message = String.format("Failed to get plaintext from while '%s' (%d %s:%s).",
                    StringUtils.abbreviate(htmltext, 100), depth, cause.getClass().getName(), cause.getMessage());
            LOG.log(Level.SEVERE, message, cause);
            cause = cause.getCause();
            depth++;
        }
        return "Failed to get plaintext content \n" + htmltext;
    }
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.CybozuLanguageIdentifier.java

/**
 * Detects the language of the text contained in an HTML page.
 *
 * @param html the HTML input
 * @return the detected language when the probability of the top-ranked candidate exceeds
 *         {@code PROBABILITY_THRESHOLD}; {@code UNKNOWN_LANGUAGE} when the page has no text,
 *         the probability is not above the threshold, or detection throws a
 *         {@code LangDetectException}
 * @throws IOException declared by the signature; nothing in this body throws it explicitly
 */
@Override
public String identifyLanguage(String html) throws IOException {
    // Reduce the page to its plain text; markup-only pages leave nothing to analyse.
    final String text = Jsoup.parse(html).text();
    if (text.isEmpty()) {
        return UNKNOWN_LANGUAGE;
    }

    try {
        final Detector detector = DetectorFactory.create();
        detector.append(text);
        final String detectedLang = detector.detect();

        final ArrayList<Language> ranked = detector.getProbabilities();
        final boolean confident = ranked.get(0).prob > PROBABILITY_THRESHOLD;
        return confident ? detectedLang : UNKNOWN_LANGUAGE;
    } catch (LangDetectException e) {
        return UNKNOWN_LANGUAGE;
    }
}