Java HTML to String html2Text(String html)

Here you can find the source of html2Text(String html)

Description

Cuts all the HTML tags, comments, and style blocks from the HTML text and returns only the printable text.

License

Open Source License

Parameter

Parameter Description
html the HTML text to convert to plain text

Return

plain text

Declaration

public static String html2Text(String html) 

Method Source Code

//package com.java2s;
/******************************************************************************* 
 * Copyright (c) 2014 Red Hat, Inc. 
 * Distributed under license by Red Hat, Inc. All rights reserved. 
 * This program is made available under the terms of the 
 * Eclipse Public License v1.0 which accompanies this distribution, 
 * and is available at http://www.eclipse.org/legal/epl-v10.html 
 * 
 * Contributors: 
 * Red Hat, Inc. - initial API and implementation 
 ******************************************************************************/

public class Main {
    /** Opening marker of a style block, lower-case; matched ignoring ASCII case. */
    private static final String STYLE_OPEN = "<style";
    /** Closing marker of a style block, lower-case; matched ignoring ASCII case. */
    private static final String STYLE_CLOSE = "/style>";
    /** Opening marker of an HTML comment. */
    private static final String COMMENT_OPEN = "<!--";
    /** Closing marker of an HTML comment. */
    private static final String COMMENT_CLOSE = "-->";

    /**
     * Cuts all the HTML tags, comments and style blocks from the given HTML text
     * and returns only the printable text.
     *
     * <p>Processing order: style blocks are removed first, then comments, then
     * everything between a {@code '<'} and the next {@code '>'} is dropped. The
     * result is trimmed. Character entities (such as {@code &amp;}) are not decoded.
     * A style block or comment without a closing marker is not removed as a
     * section; only its tag-like parts are stripped by the final pass.
     *
     * @param html the HTML text; {@code null} is treated as empty input
     * @return plain text, never {@code null}
     */
    public static String html2Text(String html) {
        if (html == null) {
            return "";
        }

        // JBIDE-16135: the CSS part contains font names that are OS and setup
        // dependent, so it is excluded from the resulting text.
        String withoutStyles = removeSections(html, STYLE_OPEN, STYLE_CLOSE, STYLE_OPEN.length());

        // JBIDE-16135: pragmas and comments are removed as well. The closing marker
        // may start two characters after the opener so that "<!-->" still counts
        // as an (empty) comment.
        String withoutComments = removeSections(withoutStyles, COMMENT_OPEN, COMMENT_CLOSE, 2);

        return stripTags(withoutComments).trim();
    }

    /**
     * Repeatedly removes the first section that starts with {@code open} and ends
     * with the first {@code close} found after it, until no complete section is left.
     * The search restarts from the beginning after every removal, so a section that
     * only comes into existence by joining the surrounding text is removed too.
     */
    private static String removeSections(String text, String open, String close, int closeSearchOffset) {
        String result = text;
        int start = indexOfIgnoreAsciiCase(result, open, 0);
        while (start != -1) {
            // Look for the closing marker only behind the opener; a stray closing
            // marker earlier in the text must not stop the removal.
            int end = indexOfIgnoreAsciiCase(result, close, start + closeSearchOffset);
            if (end == -1) {
                break;
            }
            result = result.substring(0, start) + result.substring(end + close.length());
            start = indexOfIgnoreAsciiCase(result, open, 0);
        }
        return result;
    }

    /** Drops every character from a {@code '<'} up to and including the next {@code '>'}. */
    private static String stripTags(String text) {
        StringBuilder printable = new StringBuilder(text.length());
        boolean insideTag = false;
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (insideTag) {
                if (ch == '>') {
                    insideTag = false;
                }
            } else if (ch == '<') {
                insideTag = true;
            } else {
                printable.append(ch);
            }
        }
        return printable.toString();
    }

    /**
     * Returns the index of the first occurrence of {@code lowerNeedle} in {@code text}
     * at or after {@code from}, comparing ASCII letters case-insensitively, or -1.
     * Matching is done on the original text, so the returned index is always valid
     * for it (unlike an index taken from a lower-cased copy, whose length may differ).
     */
    private static int indexOfIgnoreAsciiCase(String text, String lowerNeedle, int from) {
        int needleLength = lowerNeedle.length();
        int lastCandidate = text.length() - needleLength;
        for (int i = Math.max(from, 0); i <= lastCandidate; i++) {
            int matched = 0;
            while (matched < needleLength
                    && toAsciiLower(text.charAt(i + matched)) == lowerNeedle.charAt(matched)) {
                matched++;
            }
            if (matched == needleLength) {
                return i;
            }
        }
        return -1;
    }

    /** Lower-cases the ASCII letters {@code A-Z}; every other character is returned unchanged. */
    private static char toAsciiLower(char ch) {
        return (ch >= 'A' && ch <= 'Z') ? (char) (ch + ('a' - 'A')) : ch;
    }
}

Related

  1. html2Plain(String text)
  2. htmlToStr(String htmlStr, int max_count)
  3. htmlToString(String aS_Text)
  4. htmlToString(String s)
  5. htmlToString(String string)