Java HTML Unescape htmlUnescape(String s)

Here you can find the source of htmlUnescape(String s)

Description

Turn HTML character references into their plain text UNICODE equivalent.

License

Apache License

Declaration

public static String htmlUnescape(String s) 

Method Source Code


//package com.java2s;
/*
 * Copyright 2002-2004 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.Map;

public class Main {
    /** Character that opens every character reference. */
    private static final char REFERENCE_START = '&';

    /** Character that closes every character reference. */
    private static final char REFERENCE_END = ';';

    /**
     * Maximum number of characters after the '&' (closing ';' included) that
     * are inspected when looking for the end of a reference. Keeps an '&'
     * with no following ';' from triggering a scan of the whole input.
     */
    private static final int MAX_REFERENCE_LENGTH = 12;

    /**
     * Names of the HTML 4.01 ISO 8859-1 entities in code order; the entry at
     * index {@code n} names character {@code 160 + n}.
     */
    private static final String[] LATIN1_ENTITIES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /**
     * Remaining HTML 4.01 entities (markup-significant, internationalization,
     * symbols, mathematical and Greek), each written as {@code name:decimalCode}.
     */
    private static final String[] OTHER_ENTITIES = {
        // markup-significant and internationalization characters
        "quot:34", "amp:38", "lt:60", "gt:62",
        "OElig:338", "oelig:339", "Scaron:352", "scaron:353", "Yuml:376",
        "circ:710", "tilde:732",
        "ensp:8194", "emsp:8195", "thinsp:8201", "zwnj:8204", "zwj:8205",
        "lrm:8206", "rlm:8207", "ndash:8211", "mdash:8212",
        "lsquo:8216", "rsquo:8217", "sbquo:8218",
        "ldquo:8220", "rdquo:8221", "bdquo:8222",
        "dagger:8224", "Dagger:8225", "permil:8240",
        "lsaquo:8249", "rsaquo:8250", "euro:8364",
        // Latin Extended-B
        "fnof:402",
        // Greek
        "Alpha:913", "Beta:914", "Gamma:915", "Delta:916", "Epsilon:917",
        "Zeta:918", "Eta:919", "Theta:920", "Iota:921", "Kappa:922",
        "Lambda:923", "Mu:924", "Nu:925", "Xi:926", "Omicron:927",
        "Pi:928", "Rho:929", "Sigma:931", "Tau:932", "Upsilon:933",
        "Phi:934", "Chi:935", "Psi:936", "Omega:937",
        "alpha:945", "beta:946", "gamma:947", "delta:948", "epsilon:949",
        "zeta:950", "eta:951", "theta:952", "iota:953", "kappa:954",
        "lambda:955", "mu:956", "nu:957", "xi:958", "omicron:959",
        "pi:960", "rho:961", "sigmaf:962", "sigma:963", "tau:964",
        "upsilon:965", "phi:966", "chi:967", "psi:968", "omega:969",
        "thetasym:977", "upsih:978", "piv:982",
        // general punctuation
        "bull:8226", "hellip:8230", "prime:8242", "Prime:8243",
        "oline:8254", "frasl:8260",
        // letterlike symbols
        "weierp:8472", "image:8465", "real:8476", "trade:8482", "alefsym:8501",
        // arrows
        "larr:8592", "uarr:8593", "rarr:8594", "darr:8595", "harr:8596",
        "crarr:8629", "lArr:8656", "uArr:8657", "rArr:8658", "dArr:8659",
        "hArr:8660",
        // mathematical operators
        "forall:8704", "part:8706", "exist:8707", "empty:8709", "nabla:8711",
        "isin:8712", "notin:8713", "ni:8715", "prod:8719", "sum:8721",
        "minus:8722", "lowast:8727", "radic:8730", "prop:8733", "infin:8734",
        "ang:8736", "and:8743", "or:8744", "cap:8745", "cup:8746",
        "int:8747", "there4:8756", "sim:8764", "cong:8773", "asymp:8776",
        "ne:8800", "equiv:8801", "le:8804", "ge:8805", "sub:8834",
        "sup:8835", "nsub:8836", "sube:8838", "supe:8839", "oplus:8853",
        "otimes:8855", "perp:8869", "sdot:8901",
        // miscellaneous technical, geometric shapes, miscellaneous symbols
        "lceil:8968", "rceil:8969", "lfloor:8970", "rfloor:8971",
        "lang:9001", "rang:9002", "loz:9674",
        "spades:9824", "clubs:9827", "hearts:9829", "diams:9830"
    };

    /** Entity name (case sensitive, without '&' and ';') to code point. */
    private static final Map<String, Integer> ENTITIES = new HashMap<String, Integer>();

    static {
        for (int n = 0; n < LATIN1_ENTITIES.length; n++) {
            ENTITIES.put(LATIN1_ENTITIES[n], Integer.valueOf(160 + n));
        }
        for (int n = 0; n < OTHER_ENTITIES.length; n++) {
            String entry = OTHER_ENTITIES[n];
            int separator = entry.indexOf(':');
            ENTITIES.put(entry.substring(0, separator),
                    Integer.valueOf(entry.substring(separator + 1)));
        }
    }

    /**
     * Turn HTML character references into their plain text UNICODE equivalent.
     * <p>Recognizes the entity names defined in the HTML 4.01 recommendation
     * and all reference types (decimal, hex, and entity):
     * <blockquote>
     * {@code &#}<i>Decimal</i>{@code ;} - <i>(Example: {@code &#68;})</i><br>
     * {@code &#x}<i>Hex</i>{@code ;} - <i>(Example: {@code &#xE5;}) case insensitive</i><br>
     * {@code &}<i>Entity</i>{@code ;} - <i>(Example: {@code &amp;}) case sensitive</i>
     * </blockquote>
     * <p>Numeric references may denote any Unicode code point up to U+10FFFF;
     * code points above U+FFFF are emitted as a surrogate pair.
     * <p>Malformed references are copied to the output unchanged: an empty
     * reference ({@code &;}), a numeric reference with no digits, with a sign,
     * with non-digit characters or with a value above U+10FFFF, and an
     * unknown entity name. An '&amp;' that is not followed by a ';' within the
     * next 12 characters is treated as literal text. The input is processed
     * in a single pass, so the result of one substitution is never unescaped
     * again.
     * <p>Reference:
     * <a href="http://www.w3.org/TR/html4/sgml/entities.html">
     * http://www.w3.org/TR/html4/sgml/entities.html
     * </a>
     *
     * @param s the text to unescape, may be {@code null}
     * @return the unescaped text, or {@code null} if {@code s} is {@code null}
     */
    public static String htmlUnescape(String s) {
        if (s == null) {
            return null;
        }

        int length = s.length();
        StringBuilder unescaped = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            char c = s.charAt(i);
            if (c != REFERENCE_START) {
                unescaped.append(c);
                i++;
                continue;
            }

            int start = i + 1;
            int semi = findReferenceEnd(s, start);
            if (semi == -1) {
                // no terminator close enough: the '&' is ordinary text
                unescaped.append(c);
                i++;
                continue;
            }

            String reference = s.substring(start, semi);

            // try entity reference first
            Integer named = ENTITIES.get(reference);
            if (named != null) {
                unescaped.appendCodePoint(named.intValue());
                i = semi + 1;
                continue;
            }

            if (reference.length() == 0 || reference.charAt(0) == '#') {
                int codePoint = (reference.length() == 0) ? -1 : parseNumericReference(reference);
                if (codePoint >= 0) {
                    unescaped.appendCodePoint(codePoint);
                } else {
                    // empty or malformed: copy the original characters as is
                    unescaped.append(s, i, semi + 1);
                }
                i = semi + 1;
                continue;
            }

            // unknown entity name: keep the '&' and rescan what follows it,
            // since the text up to the ';' may contain a genuine reference
            unescaped.append(c);
            i++;
        }
        return unescaped.toString();
    }

    /**
     * Finds the ';' that terminates a reference whose body begins at
     * {@code start}, looking at no more than MAX_REFERENCE_LENGTH characters.
     *
     * @return the index of the ';' in {@code s}, or -1 if none is in range
     */
    private static int findReferenceEnd(String s, int start) {
        int limit = Math.min(s.length(), start + MAX_REFERENCE_LENGTH);
        for (int pos = start; pos < limit; pos++) {
            if (s.charAt(pos) == REFERENCE_END) {
                return pos;
            }
        }
        return -1;
    }

    /**
     * Parses the body of a numeric reference such as "#68" or "#xE5".
     *
     * @param reference text between '&' and ';', starting with '#'
     * @return the code point, or -1 if there are no digits, a character is
     *         not a digit in the selected radix, or the value exceeds U+10FFFF
     */
    private static int parseNumericReference(String reference) {
        int radix = 10;
        int pos = 1;
        if (reference.length() > 1) {
            char marker = reference.charAt(1);
            if (marker == 'x' || marker == 'X') {
                radix = 16;
                pos = 2;
            }
        }
        if (pos >= reference.length()) {
            return -1;
        }

        int value = 0;
        for (; pos < reference.length(); pos++) {
            int digit = Character.digit(reference.charAt(pos), radix);
            if (digit < 0) {
                return -1;
            }
            value = value * radix + digit;
            // checked on every step, so value can never overflow an int
            if (value > Character.MAX_CODE_POINT) {
                return -1;
            }
        }
        return value;
    }
}

Related

  1. htmlUnescape(String source)
  2. unEscapeHTML(final String escapedHTML)
  3. unescapeHtml(final String input)
  4. unescapeHTML(String comment)