Java HTML Parse Jsoup getFirstSentence(final String html)

Here you can find the source of getFirstSentence(final String html)

Description

Returns the first sentence of the specified HTML text.

License

Open Source License

Parameter

Parameter Description
html The HTML text.

Return

The first sentence.

Declaration

public static String getFirstSentence(final String html) 

Method Source Code


//package com.java2s;
/*
 * From www.java2s.com
 * Copyright (C) 2012 Klaus Reimer <k@ailis.de>
 * See LICENSE.md for licensing information.
 */

import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

public class Main {
    /**
     * Matches the end of a sentence: a period followed either by whitespace
     * or by the end of the text. Compiled once here instead of being
     * recompiled for every text node that is inspected.
     */
    private static final Pattern SENTENCE_END = Pattern.compile("\\.(\\s+|$)");

    /**
     * Returns the first sentence of the specified HTML text.
     * <p>
     * The HTML is parsed as a body fragment and the direct children of the
     * body are copied, in order, into a fresh document until a top-level text
     * node containing a sentence end (see {@link #SENTENCE_END}) is reached.
     * That text node is cut after the period and copying stops. Only
     * top-level text nodes are inspected: a period inside a nested element
     * does not end the sentence, the element is copied as a whole. If no
     * sentence end is found, the complete body content is returned.
     *
     * @param html
     *            The HTML text.
     * @return The first sentence, as HTML, with leading and trailing
     *         whitespace trimmed.
     */
    public static String getFirstSentence(final String html) {
        // NOTE(review): parse() disables pretty-printing on the parsed
        // document only; the output settings of newDoc, which is the document
        // that actually gets serialized below, are left untouched. Confirm
        // that this is intended.
        final Document newDoc = Document.createShell("");
        final Element newBody = newDoc.body();
        final Element body = parse(html).body();
        for (final Node node : body.childNodes()) {
            if (node instanceof TextNode) {
                final String text = ((TextNode) node).text();
                // Limit 2: we only care whether a sentence end exists and
                // what precedes the first one.
                final String[] parts = SENTENCE_END.split(text, 2);
                if (parts.length == 2) {
                    // The split consumed the period, so put it back.
                    newBody.appendText(parts[0] + ".");
                    break;
                }
            }
            // Clone so that the node stays attached to the parsed document
            // while it is being iterated.
            newBody.appendChild(node.clone());
        }
        return newBody.html().trim();
    }

    /**
     * Parses the specified html code as a body fragment and turns off
     * pretty-printing on the resulting document.
     *
     * @param html
     *            The HTML code to parse.
     * @return The parsed document.
     */
    public static Document parse(final String html) {
        final Document doc = Jsoup.parseBodyFragment(html);
        doc.outputSettings().prettyPrint(false);
        return doc;
    }
}

Related

  1. getDoc(String url)
  2. getDoctypeName(InputStream s)
  3. getErrorMessage(String htmlStr)
  4. getExplanation(String html)
  5. getFirstImageSrc(String html)
  6. getFirstTableFromHTML(String result)
  7. getHtml(String url, String ruta_fich)
  8. getHtmlBodyContent(String html)
  9. getHtmlInTag(String html, String tag)