 * LibXML : a free Java layouting library
 * Project Info:  http://reporting.pentaho.org/libxml/
 * (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
 * This library is free software; you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Foundation;
 * either version 2.1 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
 * in the United States and other countries.]
 * ------------
 * HtmlCharacterEntities.java
 * ------------

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

 * A collection of all character entities defined in the HTML4 standard. The key
 * is the entity name, the property value is the decoded string.
 * @author Thomas Morgner
public class HtmlCharacterEntities extends Properties
   * The singleton instance for this entity-parser implementation.
  private static CharacterEntityParser entityParser;
  private static final long serialVersionUID = 5118172339379209383L;

   * Gets the character entity parser for HTML content. The CharacterEntity
   * parser translates known characters into predefined entities.
   * @return the character entity parser instance.
  public static CharacterEntityParser getEntityParser()
    if (entityParser == null)
      entityParser = new CharacterEntityParser(new HtmlCharacterEntities());
    return entityParser;

   * Creates an instance.
  public HtmlCharacterEntities()
    setProperty("ang", "\u2220");
    setProperty("spades", "\u2660");
    setProperty("frasl", "\u2044");
    setProperty("copy", "\u00a9");
    setProperty("Upsilon", "\u03a5");
    setProperty("rsquo", "\u2019");
    setProperty("sdot", "\u22c5");
    setProperty("beta", "\u03b2");
    setProperty("egrave", "\u00e8");
    setProperty("Pi", "\u03a0");
    setProperty("micro", "\u00b5");
    setProperty("lArr", "\u21d0");
    setProperty("Beta", "\u0392");
    setProperty("eacute", "\u00e9");
    setProperty("agrave", "\u00e0");
    setProperty("sbquo", "\u201a");
    setProperty("ucirc", "\u00fb");
    setProperty("mdash", "\u2014");
    setProperty("rho", "\u03c1");
    setProperty("Nu", "\u039d");
    setProperty("ne", "\u2260");
    setProperty("nsub", "\u2284");
    setProperty("AElig", "\u00c6");
    setProperty("raquo", "\u00bb");
    setProperty("aacute", "\u00e1");
    setProperty("le", "\u2264");
    setProperty("harr", "\u2194");
    setProperty("frac34", "\u00be");
    setProperty("bdquo", "\u201e");
    setProperty("cup", "\u222a");
    setProperty("frac14", "\u00bc");
    setProperty("exist", "\u2203");
    setProperty("Ccedil", "\u00c7");
    setProperty("phi", "\u03c6");
    setProperty("Lambda", "\u039b");
    setProperty("alpha", "\u03b1");
    setProperty("sigma", "\u03c3");
    setProperty("thetasym", "\u03d1");
    setProperty("Rho", "\u03a1");
    setProperty("hArr", "\u21d4");
    setProperty("Dagger", "\u2021");
    setProperty("otilde", "\u00f5");
    setProperty("Epsilon", "\u0395");
    setProperty("iuml", "\u00ef");
    setProperty("Phi", "\u03a6");
    setProperty("prod", "\u220f");
    setProperty("Aring", "\u00c5");
    setProperty("rlm", "\u200f");
    setProperty("yen", "\u00a5");
    setProperty("emsp", "\u2003");
    setProperty("rang", "\u232a");
    setProperty("Atilde", "\u00c3");
    setProperty("Iuml", "\u00cf");
    setProperty("iota", "\u03b9");
    setProperty("deg", "\u00b0");
    setProperty("prop", "\u221d");
    setProperty("and", "\u2227");
    setProperty("para", "\u00b6");
    setProperty("darr", "\u2193");
    setProperty("curren", "\u00a4");
    setProperty("crarr", "\u21b5");
    setProperty("not", "\u00ac");
    setProperty("Iota", "\u0399");
    setProperty("aelig", "\u00e6");
    setProperty("rdquo", "\u201d");
    setProperty("Ocirc", "\u00d4");
    setProperty("ntilde", "\u00f1");
    setProperty("reg", "\u00ae");
    setProperty("zeta", "\u03b6");
    setProperty("middot", "\u00b7");
    setProperty("cent", "\u00a2");
    setProperty("quot", "\"");
    setProperty("hellip", "\u2026");
    setProperty("Zeta", "\u0396");
    setProperty("rceil", "\u2309");
    setProperty("eta", "\u03b7");
    setProperty("nbsp", "\u00a0");
    setProperty("rarr", "\u2192");
    setProperty("frac12", "\u00bd");
    setProperty("real", "\u211c");
    setProperty("mu", "\u03bc");
    setProperty("dArr", "\u21d3");
    setProperty("divide", "\u00f7");
    setProperty("cap", "\u2229");
    setProperty("chi", "\u03c7");
    setProperty("times", "\u00d7");
    setProperty("euml", "\u00eb");
    setProperty("Gamma", "\u0393");
    setProperty("loz", "\u25ca");
    setProperty("acute", "\u00b4");
    setProperty("Omega", "\u03a9");
    setProperty("ndash", "\u2013");
    setProperty("clubs", "\u2663");
    setProperty("macr", "\u00af");
    setProperty("Yacute", "\u00dd");
    setProperty("Ugrave", "\u00d9");
    setProperty("Euml", "\u00cb");
    setProperty("Eta", "\u0397");
    setProperty("sect", "\u00a7");
    setProperty("asymp", "\u2248");
    setProperty("ordm", "\u00ba");
    setProperty("rArr", "\u21d2");
    setProperty("radic", "\u221a");
    setProperty("Uacute", "\u00da");
    setProperty("omicron", "\u03bf");
    setProperty("Chi", "\u03a7");
    setProperty("aring", "\u00e5");
    setProperty("Theta", "\u0398");
    setProperty("supe", "\u2287");
    setProperty("ensp", "\u2002");
    setProperty("uml", "\u00a8");
    setProperty("ccedil", "\u00e7");
    setProperty("lambda", "\u03bb");
    setProperty("gt", "\u003e");
    setProperty("uarr", "\u2191");
    setProperty("alefsym", "\u2135");
    setProperty("auml", "\u00e4");
    setProperty("sup3", "\u00b3");
    setProperty("circ", "\u02c6");
    setProperty("lsquo", "\u2018");
    setProperty("Auml", "\u00c4");
    setProperty("dagger", "\u2020");
    setProperty("Kappa", "\u039a");
    setProperty("cong", "\u2245");
    setProperty("zwnj", "\u200c");
    setProperty("shy", "\u00ad");
    setProperty("ouml", "\u00f6");
    setProperty("diams", "\u2666");
    setProperty("uArr", "\u21d1");
    setProperty("atilde", "\u00e3");
    setProperty("THORN", "\u00de");
    setProperty("or", "\u2228");
    setProperty("Ograve", "\u00d2");
    setProperty("ocirc", "\u00f4");
    setProperty("plusm", "\u00b1");
    setProperty("Ouml", "\u00d6");
    setProperty("nabla", "\u2207");
    setProperty("psi", "\u03c8");
    setProperty("sigmaf", "\u03c2");
    setProperty("euro", "\u20ac");
    setProperty("sube", "\u2286");
    setProperty("sup2", "\u00b2");
    setProperty("laquo", "\u00ab");
    setProperty("forall", "\u2200");
    setProperty("Oacute", "\u00d3");
    setProperty("iexcl", "\u00a1");

   * Externalized initialization method to make CheckStyle happy.
  private void fillMoreEntities()
    setProperty("piv", "\u03d6");
    setProperty("minus", "\u2212");
    setProperty("zwj", "\u200d");
    setProperty("tau", "\u03c4");
    setProperty("Mu", "\u039c");
    setProperty("gamma", "\u03b3");
    setProperty("sup", "\u2283");
    setProperty("Psi", "\u03a8");
    setProperty("omega", "\u03c9");
    setProperty("Oslash", "\u00d8");
    setProperty("weierp", "\u2118");
    setProperty("Igrave", "\u00cc");
    setProperty("OElig", "\u0152");
    setProperty("sup1", "\u00b9");
    setProperty("cedil", "\u00b8");
    setProperty("upsilon", "\u03c5");
    setProperty("equiv", "\u2261");
    setProperty("isin", "\u2208");
    setProperty("Delta", "\u0394");
    setProperty("yacute", "\u00fd");
    setProperty("ugrave", "\u00f9");
    setProperty("ge", "\u2265");
    setProperty("Iacute", "\u00cd");
    setProperty("brvbar", "\u00a6");
    setProperty("Tau", "\u03a4");
    setProperty("Prime", "\u2033");
    setProperty("rfloor", "\u22a7");
    setProperty("Ecirc", "\u00ca");
    setProperty("ETH", "\u00d0");
    setProperty("int", "\u222b");
    setProperty("xi", "\u03be");
    setProperty("uacute", "\u00fa");
    setProperty("bull", "\u2022");
    setProperty("Scaron", "\u0160");
    setProperty("theta", "\u03b8");
    setProperty("yuml", "\u00ff");
    setProperty("oplus", "\u2295");
    setProperty("part", "\u2202");
    setProperty("ldquo", "\u201c");
    setProperty("Icirc", "\u00ce");
    setProperty("Yuml", "\u0178");
    setProperty("eth", "\u00f0");
    setProperty("Acirc", "\u00c2");
    setProperty("sub", "\u2282");
    setProperty("lceil", "\u2308");
    setProperty("Egrave", "\u00c8");
    setProperty("tilde", "\u02dc");
    setProperty("pi", "\u03c0");
    setProperty("rsaquo", "\u203a");
    setProperty("kappa", "\u03ba");
    setProperty("upsih", "\u03d2");
    setProperty("Omicron", "\u039f");
    setProperty("otimes", "\u2297");
    setProperty("ni", "\u220b");
    setProperty("amp", "\u0026");
    setProperty("Eacute", "\u00c9");
    setProperty("nu", "\u03bd");
    setProperty("Ucirc", "\u00db");
    setProperty("uuml", "\u00fc");
    setProperty("oslash", "\u00f8");
    setProperty("thorn", "\u00fe");
    setProperty("trade", "\u2122");
    setProperty("epsilon", "\u03b5");
    setProperty("ograve", "\u00f2");
    setProperty("hearts", "\u2665");
    setProperty("iquest", "\u00bf");
    setProperty("Uuml", "\u00dc");
    setProperty("empty", "\u2205");
    setProperty("lowast", "\u2217");
    setProperty("sum", "\u2211");
    setProperty("lfloor", "\u22a6");
    setProperty("lrm", "\u200e");
    setProperty("oacute", "\u00f3");
    setProperty("image", "\u2111");
    setProperty("Agrave", "\u00c0");
    setProperty("oline", "\u203e");
    setProperty("oelig", "\u0153");
    setProperty("Sigma", "\u03a3");
    setProperty("permil", "\u2030");
    setProperty("perp", "\u22a5");
    setProperty("lt", "\u003c");
    setProperty("Aacute", "\u00c1");
    setProperty("acirc", "\u00e2");
    setProperty("lang", "\u2329");
    setProperty("delta", "\u03b4");
    setProperty("infin", "\u221e");
    setProperty("igrave", "\u00ec");
    setProperty("ordf", "\u00aa");
    setProperty("lsaquo", "\u2039");
    setProperty("prime", "\u2032");
    setProperty("ecirc", "\u00ea");
    setProperty("there4", "\u2234");
    setProperty("iacute", "\u00ed");
    setProperty("sim", "\u223c");
    setProperty("Alpha", "\u0391");
    setProperty("pound", "\u00a3");
    setProperty("notin", "\u2209");
    setProperty("Ntilde", "\u00d1");
    setProperty("Xi", "\u039e");
    setProperty("thinsp", "\u2009");
    setProperty("Otilde", "\u00d5");
    setProperty("icirc", "\u00ee");
    setProperty("scaron", "\u0161");
    setProperty("szlig", "\u00df");
    setProperty("larr", "\u2190");

 /**
  * Translates between plain text and text that uses character entities in the
  * format <code>&amp;entityname;</code>. The set of known entity names is
  * supplied at construction time; every entity must stand for exactly one
  * character.
  *
  * @author Thomas Morgner
  */
 class CharacterEntityParser
 {
   /** The number of distinct <code>char</code> values; the size of the reverse lookup table. */
   private static final int CHAR_COUNT = 65536;

   /** The reverse lookup table: indexed by character, holds the entity name or <code>null</code>. */
   private final String[] charMap;

   /** The entities, keyed by entity name. */
   private final HashMap entities;

   /**
    * Creates a new CharacterEntityParser and initializes the parser with the
    * given set of entities.
    *
    * @param characterEntities the entities used for the parser
    * @throws NullPointerException  if <code>characterEntities</code> is null
    * @throws IllegalStateException if an entity value is not exactly one character long
    */
   public CharacterEntityParser(final Properties characterEntities)
   {
     if (characterEntities == null)
     {
       throw new NullPointerException("CharacterEntities must not be null");
     }

     entities = new HashMap(characterEntities);
     charMap = buildCharMap(entities);
   }

   /**
    * Creates a new CharacterEntityParser and initializes the parser with the
    * given set of entities. The map is copied; later changes to the given map
    * do not affect this parser.
    *
    * @param characterEntities the entities used for the parser
    * @throws NullPointerException  if <code>characterEntities</code> is null
    * @throws IllegalStateException if an entity value is not exactly one character long
    */
   public CharacterEntityParser(final HashMap characterEntities)
   {
     if (characterEntities == null)
     {
       throw new NullPointerException("CharacterEntities must not be null");
     }

     entities = (HashMap) characterEntities.clone();
     charMap = buildCharMap(entities);
   }

   /**
    * Builds the character-to-entity-name table used for encoding.
    *
    * @param entityMap the entities, keyed by entity name
    * @return an array indexed by character that holds the entity name for
    *         each character that has one
    * @throws IllegalStateException if an entity value is not exactly one character long
    */
   private static String[] buildCharMap(final HashMap entityMap)
   {
     final String[] map = new String[CHAR_COUNT];
     final Iterator entries = entityMap.entrySet().iterator();
     while (entries.hasNext())
     {
       final Map.Entry entry = (Map.Entry) entries.next();
       final String value = (String) entry.getValue();
       final String entityName = (String) entry.getKey();
       if (value.length() != 1)
       {
         throw new IllegalStateException("Entity '" + entityName
             + "' must map to exactly one character, but maps to '" + value + "'");
       }
       map[value.charAt(0)] = entityName;
     }
     return map;
   }

   /**
    * Creates a new character entity parser and initializes the parser with the
    * entities defined in the XML standard (amp, quot, lt, gt and apos).
    *
    * @return the CharacterEntityParser initialized with XML entities.
    */
   public static CharacterEntityParser createXMLEntityParser()
   {
     final HashMap entities = new HashMap();
     entities.put("amp", "&");
     entities.put("quot", "\"");
     entities.put("lt", "<");
     entities.put("gt", ">");
     entities.put("apos", "\u0027");
     return new CharacterEntityParser(entities);
   }

   /**
    * Returns the entities used in the parser.
    *
    * @return the entities, keyed by entity name.
    */
   private HashMap getEntities()
   {
     return entities;
   }

   /**
    * Looks up the character for the entity name specified in <code>key</code>.
    *
    * @param key the entity name
    * @return the character as string with a length of 1, or null if the
    *         entity name is not known
    */
   private String lookupCharacter(final String key)
   {
     return (String) getEntities().get(key);
   }

   /**
    * Encodes the given string: every character that has a known entity is
    * replaced by <code>&amp;entityname;</code>, all other characters are
    * copied unchanged.
    *
    * @param value the original string
    * @return the encoded string.
    * @throws NullPointerException if <code>value</code> is null
    */
   public String encodeEntities(final String value)
   {
     if (value == null)
     {
       throw new NullPointerException();
     }

     final int length = value.length();
     final StringBuffer writer = new StringBuffer(length);
     for (int i = 0; i < length; i++)
     {
       final char character = value.charAt(i);
       final String lookup = charMap[character];
       if (lookup == null)
       {
         writer.append(character);
       }
       else
       {
         writer.append('&');
         writer.append(lookup);
         writer.append(';');
       }
     }
     return writer.toString();
   }

   /**
    * Decodes the string: all known named entities and all numeric character
    * references (decimal <code>&amp;#65;</code> or hexadecimal
    * <code>&amp;#x41;</code>) are replaced by the characters they stand for.
    * Anything that cannot be resolved is copied to the result unchanged,
    * including its terminating semicolon.
    *
    * @param value the string that should be decoded.
    * @return the decoded string.
    * @throws NullPointerException if <code>value</code> is null
    */
   public String decodeEntities(final String value)
   {
     if (value == null)
     {
       throw new NullPointerException();
     }

     int ampersand = value.indexOf('&');
     if (ampersand == -1)
     {
       return value;
     }

     final StringBuffer decoded = new StringBuffer(value.length());
     // Everything before this index has already been written to 'decoded'.
     int copiedUpTo = 0;
     while (ampersand != -1)
     {
       final int semicolon = value.indexOf(';', ampersand);
       if (semicolon == -1)
       {
         // No terminator left, so no further reference can follow.
         break;
       }

       decoded.append(value.substring(copiedUpTo, ampersand));
       final String replacement = resolveReference(value.substring(ampersand + 1, semicolon));
       if (replacement != null)
       {
         decoded.append(replacement);
         copiedUpTo = semicolon + 1;
       }
       else
       {
         // Not a reference we can resolve: keep the ampersand literally and
         // continue right behind it. That way the rest of the text (including
         // the semicolon) is preserved, and a real reference that follows a
         // stray ampersand is still decoded.
         decoded.append('&');
         copiedUpTo = ampersand + 1;
       }
       ampersand = value.indexOf('&', copiedUpTo);
     }
     decoded.append(value.substring(copiedUpTo));
     return decoded.toString();
   }

   /**
    * Resolves the text between an ampersand and a semicolon.
    *
    * @param reference the entity name, or '#' followed by a character code
    * @return the replacement as string with a length of 1, or null if the
    *         reference is unknown or invalid
    */
   private String resolveReference(final String reference)
   {
     if (reference.length() == 0 || reference.charAt(0) != '#')
     {
       return lookupCharacter(reference);
     }

     final int code = parseCharacterCode(reference.substring(1));
     // Only codes that fit into a single char are accepted. The upper bound is
     // CHAR_COUNT - 1: casting 65536 to char would silently yield character 0.
     if (code >= 1 && code < CHAR_COUNT)
     {
       return String.valueOf((char) code);
     }
     return null;
   }

   /**
    * Parses the numeric part of a character reference. A leading 'x' or 'X'
    * selects hexadecimal notation, otherwise the text is read as decimal.
    *
    * @param s the text following the '#'
    * @return the parsed code, or 0 (which is never a valid code) on errors
    */
   private int parseCharacterCode(final String s)
   {
     try
     {
       if (s.length() > 1 && (s.charAt(0) == 'x' || s.charAt(0) == 'X'))
       {
         return Integer.parseInt(s.substring(1), 16);
       }
       return Integer.parseInt(s);
     }
     catch (NumberFormatException ignored)
     {
       // Not a number: reported to the caller as the invalid code 0.
       return 0;
     }
   }
 }

