HTML utilities for working with tag names and attributes : Document HTML « Development Class « Java






HTML utilities for working with tag names and attributes.

   

import java.util.Locale;


/**
 * HTML utilities for reading tag names and for reading/adding tag attributes
 * using plain string scanning (no HTML parser is involved).
 * http://jodd.sourceforge.net
 *
 * @author najgor@users.sourceforge.net
 * @author Lingo
 * @since 2007-03-17
 * @version 1.0
 */
public final class HtmlUtil {
    // ---------------------------------------------------------------- tag name

    /**
     * Returns the name of the tag that starts at the beginning of the given string.
     * The string <b>must</b> start with {@code <}.
     *
     * @param tagBody tag's body
     *
     * @return tag's name, or <code>null</code> if tag not found
     */
    public static String getTagName(String tagBody) {
        return getTagName(tagBody, 0);
    }

    /**
     * Returns the name of the tag that starts at the given index. The character
     * at that index <b>must</b> be {@code <}.
     * <p>
     * Whitespace and {@code /} characters between {@code <} and the name are
     * skipped; if any {@code /} was skipped the returned name is prefixed with
     * {@code /} (ending tag). The name itself runs up to the first whitespace
     * or {@code >}, so a self-closing tag written without a space, such as
     * {@code <br/>}, yields {@code br/}.
     *
     * @param body   html body
     * @param i      index of tag's start
     *
     * @return tag's name, or <code>null</code> if the body is <code>null</code>,
     *         the index is out of range, the index does not point at {@code <},
     *         or the name is not terminated by whitespace or {@code >}
     */
    public static String getTagName(String body, int i) {
        if (body == null) {
            return null;
        }

        int len = body.length();

        // Validate the index first: an empty body or a stray index means "no tag",
        // not an exception.
        if ((i < 0) || (i >= len) || (body.charAt(i) != '<')) {
            return null;
        }

        int start = i + 1; // skip '<'
        boolean isEndTag = false;

        // skip whitespace and '/' preceding the name
        while (start < len) {
            char c = body.charAt(start);

            if (c == '>') {
                return null; // tag end found => name not found
            }

            if (c == '/') {
                isEndTag = true; // this is an end tag
            } else if (!Character.isWhitespace(c)) {
                break; // first character of the name
            }

            start++;
        }

        if (start == len) {
            return null; // tag name not found
        }

        // the name extends up to the first whitespace or '>'
        int end = start;

        while (end < len) {
            char c = body.charAt(end);

            if (Character.isWhitespace(c) || (c == '>')) {
                break;
            }

            end++;
        }

        if (end == len) {
            return null; // tag end not found
        }

        String tagName = body.substring(start, end);

        return isEndTag ? ("/" + tagName) : tagName;
    }

    // ---------------------------------------------------------------- tag attribute

    /**
     * Returns the value of the first attribute that matches the given name.
     * It is assumed that the given string represents a tag's body.
     * Note: the attribute name <b>must</b> be directly followed by
     * <code>="</code> or <code>='</code>.
     * Attribute name is not case sensitive.
     *
     * @param tagBody  tag body
     * @param attrName attribute name
     *
     * @return attribute value or <code>null</code> if attribute not found
     */
    public static String getAttribute(String tagBody, String attrName) {
        return getAttribute(tagBody, attrName, 0);
    }

    /**
     * Returns the value of the first attribute that matches the given name
     * inside the tag that begins at <code>start</code>. The tag is considered
     * to end at the first {@code >} found at or after <code>start</code>.
     * <p>
     * Note: the attribute name <b>must</b> be directly followed by
     * <code>="</code> or <code>='</code>. Attribute name is not case sensitive.
     * When the requested name starts with a name character, a match that is
     * merely the tail of a longer attribute name (e.g. <code>id</code> inside
     * <code>data-id</code>) is ignored. A closing quote preceded by a backslash
     * is treated as escaped and does not terminate the value.
     *
     * @param body     html body
     * @param attrName attribute name
     * @param start    index of tag's start (negative values are treated as 0)
     *
     * @return attribute value or <code>null</code> if the body or name is
     *         <code>null</code>, the tag end or the attribute is not found,
     *         or the value's closing quote is missing
     */
    public static String getAttribute(String body, String attrName,
        int start) {
        if ((body == null) || (attrName == null)) {
            return null;
        }

        int from = Math.max(start, 0);

        // The tag end must be searched from the tag's start, not from the
        // beginning of the whole body, otherwise only the first tag works.
        int end = body.indexOf('>', from);

        if (end == -1) {
            return null; // tag's end not found
        }

        String pattern = attrName + "=";
        int nameLen = attrName.length();
        int bodyLen = body.length();

        // Only enforce a name boundary when the requested name itself begins
        // with a name character, so unusual caller-supplied patterns still work.
        boolean checkBoundary = (nameLen > 0) &&
            isAttrNameChar(attrName.charAt(0));

        for (int pos = indexOfIgnoreCase(body, pattern, from);
                (pos != -1) && (pos <= end);
                pos = indexOfIgnoreCase(body, pattern, pos + 1)) {
            int quoteAt = pos + nameLen + 1;

            if (quoteAt >= bodyLen) {
                break; // nothing follows '=', and no later match can fit either
            }

            char quote = body.charAt(quoteAt);

            if ((quote != '"') && (quote != '\'')) {
                continue; // unquoted values are not supported
            }

            if (checkBoundary && (pos > from) &&
                    isAttrNameChar(body.charAt(pos - 1))) {
                continue; // tail of a longer attribute name, e.g. "data-id"
            }

            return readQuotedValue(body, quoteAt + 1, quote);
        }

        return null;
    }

    /**
     * Reads a quoted value that starts at <code>valueStart</code> (just after
     * the opening quote) and ends before the first unescaped closing quote.
     *
     * @return the value, or <code>null</code> if no closing quote exists
     */
    private static String readQuotedValue(String body, int valueStart,
        char quote) {
        int scanFrom = valueStart;

        while (true) {
            int close = body.indexOf(quote, scanFrom);

            if (close == -1) {
                return null; // closing quotation mark not found
            }

            // close >= valueStart >= 1, so close - 1 is always a valid index
            if (body.charAt(close - 1) != '\\') {
                return body.substring(valueStart, close);
            }

            scanFrom = close + 1; // escaped quote, keep looking
        }
    }

    /**
     * Tells whether the character may be part of an attribute name for the
     * purpose of boundary detection: letters, digits, '-', '_', ':' and '.'.
     */
    private static boolean isAttrNameChar(char c) {
        return Character.isLetterOrDigit(c) || (c == '-') || (c == '_') ||
            (c == ':') || (c == '.');
    }

    // ---------------------------------------------------------------- add attribute & value

    /**
     * Adds attribute and its value to a tag. Attribute is added to the end of
     * the tag, just before closing {@code >}. If name is not specified, nothing
     * will be added. If value is not specified, it will be set to an empty string.
     *
     * @param tagBody tag body
     * @param name    attribute name
     * @param value   attribute value
     *
     * @return tag string with added attribute and value
     */
    public static String addAttribute(String tagBody, String name,
        String value) {
        return addAttribute(tagBody, name, value, 0);
    }

    /**
     * Adds attribute and its value to a tag. The text
     * <code> name="value"</code> is inserted just before the first {@code >}
     * found at or after the given offset; the value is HTML-encoded with
     * <code>HtmlEncoder.encode</code>. If name is not specified, nothing will
     * be added. If value is not specified, it will be set to an empty string.
     * <p>
     * The returned string starts at the given offset: text of the body that
     * precedes the offset is not part of the result.
     *
     * @param body   html body
     * @param name   attribute name
     * @param value  attribute value
     * @param i      tag's offset in html body (negative values are treated as 0)
     *
     * @return tag string with added attribute and value; the unchanged body if
     *         name is <code>null</code> or no {@code >} is found;
     *         <code>null</code> if the body is <code>null</code>
     */
    public static String addAttribute(String body, String name,
        String value, int i) {
        if (body == null) {
            return null;
        }

        if (name == null) {
            return body;
        }

        String encoded = HtmlEncoder.encode((value == null) ? "" : value);

        return insertBeforeTagEnd(body, i, name + "=\"" + encoded + '"');
    }

    // ---------------------------------------------------------------- add attribute, no value

    /**
     * Adds single attribute without value to a tag. Attribute is added to the
     * end of the tag, just before closing {@code >}. If name is not specified,
     * nothing will be added.
     *
     * @param tagBody tag body
     * @param name    attribute name
     *
     * @return tag string with added attribute
     */
    public static String addAttribute(String tagBody, String name) {
        return addAttribute(tagBody, name, 0);
    }

    /**
     * Adds single attribute without value to a tag. The name, preceded by a
     * space, is inserted just before the first {@code >} found at or after the
     * given offset. If name is not specified, nothing will be added.
     * <p>
     * The returned string starts at the given offset: text of the body that
     * precedes the offset is not part of the result.
     *
     * @param body   html body
     * @param name   attribute name
     * @param i      tag's offset in html body (negative values are treated as 0)
     *
     * @return tag string with added attribute; the unchanged body if name is
     *         <code>null</code> or no {@code >} is found; <code>null</code>
     *         if the body is <code>null</code>
     */
    public static String addAttribute(String body, String name, int i) {
        if (body == null) {
            return null;
        }

        if (name == null) {
            return body;
        }

        return insertBeforeTagEnd(body, i, name);
    }

    /**
     * Inserts a space and the given attribute text before the first
     * {@code >} located at or after the offset. Returns the body unchanged
     * when there is no such {@code >}.
     */
    private static String insertBeforeTagEnd(String body, int i,
        String attribute) {
        int offset = Math.max(i, 0);
        int end = body.indexOf('>', offset);

        if (end == -1) {
            return body;
        }

        StringBuilder result = new StringBuilder(
                (body.length() - offset) + attribute.length() + 1);
        result.append(body, offset, end).append(' ');
        result.append(attribute);
        result.append(body, end, body.length());

        return result.toString();
    }

    /**
     * Finds the first index of a substring in the given source string,
     * ignoring case. Both strings are compared character by character through
     * {@link Character#toLowerCase(char)}, so the comparison does not depend
     * on any locale and never changes the length of the searched text.
     *
     * @param src        source string for examination
     * @param subS       substring to find
     * @param startIndex starting index from where search begins
     *                   (negative values are treated as 0)
     *
     * @return index of the found substring, or -1 if the substring is not
     *         found or either string is <code>null</code>
     */
    public static int indexOfIgnoreCase(String src, String subS,
        int startIndex) {
        if ((src == null) || (subS == null)) {
            return -1;
        }

        int sublen = subS.length();

        // Lower-case the needle once, with the same per-character rule that is
        // applied to the source below.
        char[] sub = new char[sublen];

        for (int k = 0; k < sublen; k++) {
            sub[k] = Character.toLowerCase(subS.charAt(k));
        }

        int total = src.length() - sublen + 1;

        for (int i = Math.max(startIndex, 0); i < total; i++) {
            int j = 0;

            while ((j < sublen) &&
                    (sub[j] == Character.toLowerCase(src.charAt(i + j)))) {
                j++;
            }

            if (j == sublen) {
                return i;
            }
        }

        return -1;
    }
}
/**
 * Encodes plain text into HTML-safe text. Characters below 256 are translated
 * through lookup tables; every other character is written as a decimal
 * numeric character reference.
 */
class HtmlEncoder {
    /** Growth factor used to presize the output buffer relative to the input length. */
    public static final float NEW_SIZE_FACTOR = 1.3f;

    /** Space handling: every space becomes <code>&amp;nbsp;</code>. */
    private static final int SPACES_NBSP = 0;

    /** Space handling: spaces are copied unchanged. */
    private static final int SPACES_PLAIN = 1;

    /** Space handling: runs of spaces alternate between ' ' and <code>&amp;nbsp;</code>. */
    private static final int SPACES_SMART = 2;

    /**
     * Lookup table for use in encode() method.
     *
     * @see #encode
     */
    private static final String[] TABLE_HTML = new String[256];

    /**
     * Lookup table for use in encodeTextXxx() methods.
     *
     * @see #encodeText
     * @see #encodeTextSmart
     * @see #encodeTextStrict
     */
    private static final String[] TABLE_HTML_STRICT = new String[256];

    static {
        // control characters: zero-padded three-digit decimal references
        for (int i = 0; i < 10; i++) {
            TABLE_HTML[i] = "&#00" + i + ";";
        }

        for (int i = 10; i < 32; i++) {
            TABLE_HTML[i] = "&#0" + i + ";";
        }

        // printable ASCII is kept as is
        for (int i = 32; i < 128; i++) {
            TABLE_HTML[i] = String.valueOf((char) i);
        }

        // 128..255: decimal references
        for (int i = 128; i < 256; i++) {
            TABLE_HTML[i] = "&#" + i + ";";
        }

        // special characters
        TABLE_HTML['\''] = "&#039;"; // apostrophe ('&apos;' doesn't work - it is not by the w3 specs).
        TABLE_HTML['\"'] = "&quot;"; // double quote.
        TABLE_HTML['&'] = "&amp;"; // ampersand.
        TABLE_HTML['<'] = "&lt;"; // lower than.
        TABLE_HTML['>'] = "&gt;"; // greater than.

        // strict table: same as above, plus format-preserving replacements
        System.arraycopy(TABLE_HTML, 0, TABLE_HTML_STRICT, 0, 256);
        TABLE_HTML_STRICT[' '] = "&nbsp;"; // non-breaking space.
        TABLE_HTML_STRICT['\n'] = "<br>"; // ascii 10.
        TABLE_HTML_STRICT['\r'] = "<br>"; // ascii 13.
    }

    // ---------------------------------------------------------------- encoding

    /**
     * Encodes a string to HTML-safe text. Characters outside printable ASCII
     * are encoded as decimal references, and five special characters are
     * replaced with their HTML values:
     * <ul>
     * <li>' with &amp;#039;</li>
     * <li>" with &amp;quot;</li>
     * <li>&amp; with &amp;amp;</li>
     * <li>&lt; with &amp;lt;</li>
     * <li>&gt; with &amp;gt;</li>
     * </ul>
     * A valid surrogate pair is written as a single reference to its code point.
     *
     * @param string input string
     *
     * @return HTML-safe string, or an empty string for <code>null</code>/empty input
     * @see #encodeText
     */
    public static String encode(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));

        for (int i = 0; i < n; i++) {
            i = appendEncoded(buffer, string, i, TABLE_HTML);
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format. In addition to
     * what {@link #encode} does, the following characters are replaced:
     * <ul>
     * <li>' ' with &amp;nbsp;</li>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF sequence produces a single &lt;br&gt;.
     * <p>
     * Common problem with this method is that spaces are not breakable, so they
     * may break the outline of the page.
     *
     * @param string input string
     *
     * @return HTML-safe format, or an empty string for <code>null</code>/empty input
     */
    public static String encodeTextStrict(String string) {
        return encodeFormatted(string, SPACES_NBSP);
    }

    /**
     * Encodes text into HTML-safe text and preserves format except spaces,
     * which are copied unchanged. The following characters are replaced:
     * <ul>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF sequence produces a single &lt;br&gt;.
     *
     * @param string input string
     *
     * @return HTML-safe format, or an empty string for <code>null</code>/empty input
     */
    public static String encodeText(String string) {
        return encodeFormatted(string, SPACES_PLAIN);
    }

    /**
     * Encodes text into HTML-safe text and preserves format using smart spaces.
     * The following characters are replaced:
     * <ul>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF sequence produces a single &lt;br&gt;.
     * <p>
     * Within a run of spaces the output alternates between a common space and
     * &amp;nbsp;, starting with a common space, so the width is preserved while
     * line breaks remain possible.
     *
     * @param string input string
     *
     * @return HTML-safe format, or an empty string for <code>null</code>/empty input
     */
    public static String encodeTextSmart(String string) {
        return encodeFormatted(string, SPACES_SMART);
    }

    // ---------------------------------------------------------------- helpers

    /**
     * Shared implementation of the encodeTextXxx() methods; they differ only in
     * how a space character is written.
     */
    private static String encodeFormatted(String string, int spaceMode) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        char prev = 0;
        boolean nbspNext = false; // smart mode: next space of the run is &nbsp;

        for (int i = 0; i < n; i++) {
            char c = string.charAt(i);

            if ((c == ' ') && (spaceMode != SPACES_NBSP)) {
                if (spaceMode == SPACES_SMART) {
                    if (prev != ' ') {
                        nbspNext = false; // a new run always starts with ' '
                    }

                    buffer.append(nbspNext ? "&nbsp;" : " ");
                    nbspNext = !nbspNext;
                } else {
                    buffer.append(' ');
                }
            } else if ((c == '\n') && (prev == '\r')) {
                // previously '\r' (CR) was encoded, so skip '\n' (LF)
            } else {
                i = appendEncoded(buffer, string, i, TABLE_HTML_STRICT);
            }

            prev = c;
        }

        return buffer.toString();
    }

    /**
     * Appends the encoded form of the character at <code>index</code>.
     * Characters covered by the table use the table entry; anything else is
     * written as a decimal reference to its code point, so a surrogate pair
     * yields one reference instead of two invalid surrogate references.
     *
     * @return index of the last char consumed (<code>index + 1</code> when a
     *         surrogate pair was consumed, otherwise <code>index</code>)
     */
    private static int appendEncoded(StringBuilder buffer, String string,
        int index, String[] table) {
        char c = string.charAt(index);

        if (c < table.length) {
            buffer.append(table[c]);

            return index;
        }

        // For an unpaired surrogate codePointAt returns the surrogate itself.
        int codePoint = string.codePointAt(index);
        buffer.append("&#").append(codePoint).append(';');

        return Character.isSupplementaryCodePoint(codePoint) ? (index + 1) : index;
    }
}

   
    
    
  








Related examples in the same category

1. HTMLDocument: Element Iterator Example
2. HTMLEditorKit Demo
3. SimpleAttributeSet Example
4. Text Tab Sample
5. Styled Document
6. Escape HTML
7. Escape HTML
8. Encode HTML
9. Replace all the occurrences of HTML escape strings with the respective characters.
10. HTML Rewriter
11. HTML Encode
12. XMLWriter is a generic class that provides common behavior to writers of a tagged language such as XML, WordML and HTML.
13. Escape html entities.
14. Html Encoder
15. Remove Comment