/*
* Copyright (c) 2001 - 2005 ivata limited.
* All rights reserved.
* -----------------------------------------------------------------------------
* ivata masks may be redistributed under the GNU General Public
* License as published by the Free Software Foundation;
* version 2 of the License.
*
* These programs are free software; you can redistribute them and/or
* modify them under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2 of the License.
*
* These programs are distributed in the hope that they will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU General Public License in the file LICENSE.txt for more
* details.
*
* If you would like a copy of the GNU General Public License write to
*
* Free Software Foundation, Inc.
* 59 Temple Place - Suite 330
* Boston, MA 02111-1307, USA.
*
*
* To arrange commercial support and licensing, contact ivata at
* http://www.ivata.com/contact.jsp
* -----------------------------------------------------------------------------
* $Log: CharacterEntityFormat.java,v $
* Revision 1.5 2005/10/03 10:17:25 colinmacleod
* Fixed some style and javadoc issues.
*
* Revision 1.4 2005/10/02 14:06:33 colinmacleod
* Added/improved log4j logging.
*
* Revision 1.3 2005/04/11 14:45:38 colinmacleod
* Changed HTMLFormat from an abstract class
* into an interface.
*
* Revision 1.2 2005/04/09 18:04:18 colinmacleod
* Changed copyright text to GPL v2 explicitly.
*
* Revision 1.1 2005/01/06 22:41:01 colinmacleod
* Moved up a version number.
* Changed copyright notices to 2005.
* Updated the documentation:
* - started working on multiproject:site docu.
* - changed the logo.
* Added checkstyle and fixed LOADS of style issues.
* Added separate thirdparty subproject.
* Added struts (in web), util and webgui (in webtheme) from ivata op.
*
* Revision 1.4 2004/11/03 17:07:21 colinmacleod
* Fixed bug in entity matching.
*
* Revision 1.3 2004/03/21 21:16:37 colinmacleod
* Shortened name to ivata op.
*
* Revision 1.2 2004/02/01 22:07:32 colinmacleod
* Added full names to author tags
*
* Revision 1.1.1.1 2004/01/27 20:59:47 colinmacleod
* Moved ivata op to SourceForge.
*
* Revision 1.2 2003/10/15 14:13:39 colin
* Fixes for XDoclet.
*
* Revision 1.3 2003/02/26 08:15:03 colin
* Fixed bug in append routine.
*
* Revision 1.2 2003/02/26 08:13:43 colin
* added toString to entity StringBuffer - not supported in JDK 1.3
*
* Revision 1.1 2003/02/24 19:33:32 colin
* Moved to new subproject.
*
* Revision 1.5 2003/02/04 17:43:46 colin
* copyright notice
*
* Revision 1.4 2002/09/06 15:08:34 colin
* split off the character entity map into a new file
*
* Revision 1.3 2002/09/04 08:10:36 colin
* fixed bug when entities are converted by browser
*
* Revision 1.1 2002/06/21 11:58:37 colin
* restructured com.ivata.mask.jsp into
* format, JavaScript, theme and tree.
* -----------------------------------------------------------------------------
*/
package com.ivata.mask.web.format;
import org.apache.log4j.Logger;
/**
 * Convert characters to their HTML character entity equivalents or, when
 * {@link #isReverse reverse} is set, convert character entities back into the
 * characters they stand for.
 *
 * <p>
 * Text enclosed between <code>&lt;KEEP:&gt;</code> and
 * <code>&lt;/KEEP:&gt;</code> is copied to the output unchanged, with the two
 * marker tags themselves removed.
 * </p>
 *
 * <p>
 * The translation tables are shared, immutable and built once when the class
 * is loaded. The <code>reverse</code> flag is a plain, unsynchronized instance
 * field, so an instance should only be shared between threads if the flag is
 * not changed while it is in use.
 * </p>
 *
 * @since ivata masks 0.4 (2002-06-19)
 * @author Colin MacLeod
 * <a href='mailto:colin.macleod@ivata.com'>colin.macleod@ivata.com</a>
 * @version $Revision: 1.5 $
 */
public class CharacterEntityFormat implements HTMLFormat {
    /**
     * Logger for this class.
     */
    private static final Logger logger = Logger
            .getLogger(CharacterEntityFormat.class);
    /**
     * Maintain the mapping via this translation array. Each row is a pair:
     * element 0 is a string holding exactly one character, element 1 is the
     * entity name (or <code>#</code> followed by the decimal code) without
     * the leading ampersand and trailing semicolon.
     *
     * <p>
     * When converting characters to entities the first row for a character
     * wins, so the named form is listed before the numeric form.
     * </p>
     */
    static final String[][] ENTITIES = {
        // quotation mark = APL quote
        // NOTE: written as an escaped quote rather than as the Unicode
        // escape for U+0022: Unicode escapes are translated before the
        // source is tokenized, so the escape would end the string literal.
        {"\"", "quot" }, // U+0022
        {"\"", "#34"}, // U+0022
        // ampersand
        {"\u0026", "amp"}, {"\u0026", "#38"}, {"&", "amp"},
        // apostrophe
        {"\u0027", "#39"},
        // less-than sign
        {"\u003C", "lt"}, {"\u003C", "#60"}, {"<", "lt"},
        // greater-than sign
        {"\u003E", "gt"}, {"\u003E", "#62"}, {">", "gt"},
        // Latin capital ligature OE
        {"\u0152", "OElig"}, {"\u0152", "#338"},
        // Latin small ligature oe
        {"\u0153", "oelig"}, {"\u0153", "#339"},
        // Latin capital letter S with caron
        {"\u0160", "Scaron"}, {"\u0160", "#352"},
        // Latin small letter s with caron
        {"\u0161", "scaron"}, {"\u0161", "#353"},
        // Latin capital letter Y with diaeresis
        {"\u0178", "Yuml"}, {"\u0178", "#376"},
        // Latin small f with hook = function = florin
        {"\u0192", "fnof"}, {"\u0192", "#402"},
        // modifier letter circumflex accent
        {"\u02C6", "circ"}, {"\u02C6", "#710"},
        // small tilde
        {"\u02DC", "tilde"}, {"\u02DC", "#732"},
        // greek capital letter alpha
        {"\u0391", "Alpha"}, {"\u0391", "#913"},
        // greek capital letter beta
        {"\u0392", "Beta"}, {"\u0392", "#914"},
        // greek capital letter gamma
        {"\u0393", "Gamma"}, {"\u0393", "#915"},
        // greek capital letter delta
        {"\u0394", "Delta"}, {"\u0394", "#916"},
        // greek capital letter epsilon
        {"\u0395", "Epsilon"}, {"\u0395", "#917"},
        // greek capital letter zeta
        {"\u0396", "Zeta"}, {"\u0396", "#918"},
        // greek capital letter eta
        {"\u0397", "Eta"}, {"\u0397", "#919"},
        // greek capital letter theta
        {"\u0398", "Theta"}, {"\u0398", "#920"},
        // greek capital letter iota
        {"\u0399", "Iota"}, {"\u0399", "#921"},
        // greek capital letter kappa
        {"\u039A", "Kappa"}, {"\u039A", "#922"},
        // greek capital letter lambda
        {"\u039B", "Lambda"}, {"\u039B", "#923"},
        // greek capital letter mu
        {"\u039C", "Mu"}, {"\u039C", "#924"},
        // greek capital letter nu
        {"\u039D", "Nu"}, {"\u039D", "#925"},
        // greek capital letter xi
        {"\u039E", "Xi"}, {"\u039E", "#926"},
        // greek capital letter omicron
        {"\u039F", "Omicron"}, {"\u039F", "#927"},
        // greek capital letter pi
        {"\u03A0", "Pi"}, {"\u03A0", "#928"},
        // greek capital letter rho
        {"\u03A1", "Rho"}, {"\u03A1", "#929"},
        // greek capital letter sigma
        {"\u03A3", "Sigma"}, {"\u03A3", "#931"},
        // greek capital letter tau
        {"\u03A4", "Tau"}, {"\u03A4", "#932"},
        // greek capital letter upsilon
        {"\u03A5", "Upsilon"}, {"\u03A5", "#933"},
        // greek capital letter phi
        {"\u03A6", "Phi"}, {"\u03A6", "#934"},
        // greek capital letter chi
        {"\u03A7", "Chi"}, {"\u03A7", "#935"},
        // greek capital letter psi
        {"\u03A8", "Psi"}, {"\u03A8", "#936"},
        // greek capital letter omega
        {"\u03A9", "Omega"}, {"\u03A9", "#937"},
        // greek small letter alpha
        {"\u03B1", "alpha"}, {"\u03B1", "#945"},
        // greek small letter beta
        {"\u03B2", "beta"}, {"\u03B2", "#946"},
        // greek small letter gamma
        {"\u03B3", "gamma"}, {"\u03B3", "#947"},
        // greek small letter delta
        {"\u03B4", "delta"}, {"\u03B4", "#948"},
        // greek small letter epsilon
        {"\u03B5", "epsilon"}, {"\u03B5", "#949"},
        // greek small letter zeta
        {"\u03B6", "zeta"}, {"\u03B6", "#950"},
        // greek small letter eta
        {"\u03B7", "eta"}, {"\u03B7", "#951"},
        // greek small letter theta
        {"\u03B8", "theta"}, {"\u03B8", "#952"},
        // greek small letter iota
        {"\u03B9", "iota"}, {"\u03B9", "#953"},
        // greek small letter kappa
        {"\u03BA", "kappa"}, {"\u03BA", "#954"},
        // greek small letter lambda
        {"\u03BB", "lambda"}, {"\u03BB", "#955"},
        // greek small letter mu
        {"\u03BC", "mu"}, {"\u03BC", "#956"},
        // greek small letter nu
        {"\u03BD", "nu"}, {"\u03BD", "#957"},
        // greek small letter xi
        {"\u03BE", "xi"}, {"\u03BE", "#958"},
        // greek small letter omicron
        {"\u03BF", "omicron"}, {"\u03BF", "#959"},
        // greek small letter pi
        {"\u03C0", "pi"}, {"\u03C0", "#960"},
        // greek small letter rho
        {"\u03C1", "rho"}, {"\u03C1", "#961"},
        // greek small letter final sigma
        {"\u03C2", "sigmaf"}, {"\u03C2", "#962"},
        // greek small letter sigma
        {"\u03C3", "sigma"}, {"\u03C3", "#963"},
        // greek small letter tau
        {"\u03C4", "tau"}, {"\u03C4", "#964"},
        // greek small letter upsilon
        {"\u03C5", "upsilon"}, {"\u03C5", "#965"},
        // greek small letter phi
        {"\u03C6", "phi"}, {"\u03C6", "#966"},
        // greek small letter chi
        {"\u03C7", "chi"}, {"\u03C7", "#967"},
        // greek small letter psi
        {"\u03C8", "psi"}, {"\u03C8", "#968"},
        // greek small letter omega
        {"\u03C9", "omega"}, {"\u03C9", "#969"},
        // greek small letter theta symbol
        {"\u03D1", "thetasym"}, {"\u03D1", "#977"},
        // greek upsilon with hook symbol
        {"\u03D2", "upsih"}, {"\u03D2", "#978"},
        // greek pi symbol
        {"\u03D6", "piv"}, {"\u03D6", "#982"},
        // en space
        {"\u2002", "ensp"}, {"\u2002", "#8194"},
        // em space
        {"\u2003", "emsp"}, {"\u2003", "#8195"},
        // thin space
        {"\u2009", "thinsp"}, {"\u2009", "#8201"},
        // zero width non-joiner
        {"\u200C", "zwnj"}, {"\u200C", "#8204"},
        // zero width joiner
        {"\u200D", "zwj"}, {"\u200D", "#8205"},
        // left-to-right mark
        {"\u200E", "lrm"}, {"\u200E", "#8206"},
        // right-to-left mark
        {"\u200F", "rlm"}, {"\u200F", "#8207"},
        // en dash
        {"\u2013", "ndash"}, {"\u2013", "#8211"},
        // em dash
        {"\u2014", "mdash"}, {"\u2014", "#8212"},
        // left single quotation mark
        {"\u2018", "lsquo"}, {"\u2018", "#8216"},
        // right single quotation mark
        {"\u2019", "rsquo"}, {"\u2019", "#8217"},
        // single low-9 quotation mark
        {"\u201A", "sbquo"}, {"\u201A", "#8218"},
        // left double quotation mark
        {"\u201C", "ldquo"}, {"\u201C", "#8220"},
        // right double quotation mark
        {"\u201D", "rdquo"}, {"\u201D", "#8221"},
        // double low-9 quotation mark
        {"\u201E", "bdquo"}, {"\u201E", "#8222"},
        // dagger
        {"\u2020", "dagger"}, {"\u2020", "#8224"},
        // double dagger
        {"\u2021", "Dagger"}, {"\u2021", "#8225"},
        // bullet = black small circle
        {"\u2022", "bull"}, {"\u2022", "#8226"},
        // horizontal ellipsis = three dot leader
        {"\u2026", "hellip"}, {"\u2026", "#8230"},
        // per mille sign
        {"\u2030", "permil"}, {"\u2030", "#8240"},
        // double prime = seconds = inches
        {"\u2033", "Prime"}, {"\u2033", "#8243"},
        // single left-pointing angle quotation mark
        {"\u2039", "lsaquo"}, {"\u2039", "#8249"},
        // single right-pointing angle quotation mark
        {"\u203A", "rsaquo"}, {"\u203A", "#8250"},
        // prime = minutes = feet
        {"\u2032", "prime"}, {"\u2032", "#8242"},
        // overline = spacing overscore
        {"\u203E", "oline"}, {"\u203E", "#8254"},
        // fraction slash
        {"\u2044", "frasl"}, {"\u2044", "#8260"},
        // euro sign
        {"\u20AC", "euro"}, {"\u20AC", "#8364"},
        // script capital P = power set = Weierstrass p
        {"\u2118", "weierp"}, {"\u2118", "#8472"},
        // blackletter capital I = imaginary part
        {"\u2111", "image"}, {"\u2111", "#8465"},
        // blackletter capital R = real part symbol
        {"\u211C", "real"}, {"\u211C", "#8476"},
        // trade mark sign
        {"\u2122", "trade"}, {"\u2122", "#8482"},
        // alef symbol = first transfinite cardinal
        {"\u2135", "alefsym"}, {"\u2135", "#8501"},
        // leftwards arrow
        {"\u2190", "larr"}, {"\u2190", "#8592"},
        // upwards arrow
        {"\u2191", "uarr"}, {"\u2191", "#8593"},
        // rightwards arrow
        {"\u2192", "rarr"}, {"\u2192", "#8594"},
        // downwards arrow
        {"\u2193", "darr"}, {"\u2193", "#8595"},
        // left right arrow
        {"\u2194", "harr"}, {"\u2194", "#8596"},
        // downwards arrow with corner leftwards = carriage return
        {"\u21B5", "crarr"}, {"\u21B5", "#8629"},
        // leftwards double arrow
        {"\u21D0", "lArr"}, {"\u21D0", "#8656"},
        // upwards double arrow
        {"\u21D1", "uArr"}, {"\u21D1", "#8657"},
        // rightwards double arrow
        {"\u21D2", "rArr"}, {"\u21D2", "#8658"},
        // downwards double arrow
        {"\u21D3", "dArr"}, {"\u21D3", "#8659"},
        // left right double arrow
        {"\u21D4", "hArr"}, {"\u21D4", "#8660"},
        // for all
        {"\u2200", "forall"}, {"\u2200", "#8704"},
        // partial differential
        {"\u2202", "part"}, {"\u2202", "#8706"},
        // there exists
        {"\u2203", "exist"}, {"\u2203", "#8707"},
        // empty set = null set = diameter
        {"\u2205", "empty"}, {"\u2205", "#8709"},
        // nabla = backward difference
        {"\u2207", "nabla"}, {"\u2207", "#8711"},
        // element of
        {"\u2208", "isin"}, {"\u2208", "#8712"},
        // not an element of
        {"\u2209", "notin"}, {"\u2209", "#8713"},
        // contains as member
        {"\u220B", "ni"}, {"\u220B", "#8715"},
        // n-ary product = product sign
        {"\u220F", "prod"}, {"\u220F", "#8719"},
        // n-ary summation
        {"\u2211", "sum"}, {"\u2211", "#8721"},
        // minus sign
        {"\u2212", "minus"}, {"\u2212", "#8722"},
        // asterisk operator
        {"\u2217", "lowast"}, {"\u2217", "#8727"},
        // square root = radical sign
        {"\u221A", "radic"}, {"\u221A", "#8730"},
        // proportional to
        {"\u221D", "prop"}, {"\u221D", "#8733"},
        // infinity
        {"\u221E", "infin"}, {"\u221E", "#8734"},
        // angle
        {"\u2220", "ang"}, {"\u2220", "#8736"},
        // logical and = wedge
        {"\u2227", "and"}, {"\u2227", "#8743"},
        // logical or = vee
        {"\u2228", "or"}, {"\u2228", "#8744"},
        // intersection = cap
        {"\u2229", "cap"}, {"\u2229", "#8745"},
        // union = cup
        {"\u222A", "cup"}, {"\u222A", "#8746"},
        // integral
        {"\u222B", "int"}, {"\u222B", "#8747"},
        // therefore
        {"\u2234", "there4"}, {"\u2234", "#8756"},
        // tilde operator = varies with = similar to
        {"\u223C", "sim"}, {"\u223C", "#8764"},
        // approximately equal to
        {"\u2245", "cong"}, {"\u2245", "#8773"},
        // almost equal to = asymptotic to
        {"\u2248", "asymp"}, {"\u2248", "#8776"},
        // not equal to
        {"\u2260", "ne"}, {"\u2260", "#8800"},
        // identical to
        {"\u2261", "equiv"}, {"\u2261", "#8801"},
        // less-than or equal to
        {"\u2264", "le"}, {"\u2264", "#8804"},
        // greater-than or equal to
        {"\u2265", "ge"}, {"\u2265", "#8805"},
        // subset of
        {"\u2282", "sub"}, {"\u2282", "#8834"},
        // superset of
        {"\u2283", "sup"}, {"\u2283", "#8835"},
        // not a subset of
        {"\u2284", "nsub"}, {"\u2284", "#8836"},
        // subset of or equal to
        {"\u2286", "sube"}, {"\u2286", "#8838"},
        // superset of or equal to
        {"\u2287", "supe"}, {"\u2287", "#8839"},
        // circled plus = direct sum
        {"\u2295", "oplus"}, {"\u2295", "#8853"},
        // circled times = vector product
        {"\u2297", "otimes"}, {"\u2297", "#8855"},
        // up tack = orthogonal to = perpendicular
        {"\u22A5", "perp"}, {"\u22A5", "#8869"},
        // dot operator
        {"\u22C5", "sdot"}, {"\u22C5", "#8901"},
        // left ceiling = apl upstile
        {"\u2308", "lceil"}, {"\u2308", "#8968"},
        // right ceiling
        {"\u2309", "rceil"}, {"\u2309", "#8969"},
        // left floor = apl downstile
        {"\u230A", "lfloor"}, {"\u230A", "#8970"},
        // right floor
        {"\u230B", "rfloor"}, {"\u230B", "#8971"},
        // left-pointing angle bracket = bra
        {"\u2329", "lang"}, {"\u2329", "#9001"},
        // right-pointing angle bracket = ket
        {"\u232A", "rang"}, {"\u232A", "#9002"},
        // lozenge
        {"\u25CA", "loz"}, {"\u25CA", "#9674"},
        // black spade suit
        {"\u2660", "spades"}, {"\u2660", "#9824"},
        // black club suit = shamrock
        {"\u2663", "clubs"}, {"\u2663", "#9827"},
        // black heart suit = valentine
        {"\u2665", "hearts"}, {"\u2665", "#9829"},
        // black diamond suit
        {"\u2666", "diams"}, {"\u2666", "#9830"} };
    /**
     * This tag is placed after anything which should not be converted. It is
     * used by other formats.
     */
    private static final String KEEP_END = "</KEEP:>";
    /**
     * This tag is placed before anything which should not be converted. It
     * is used by other formats.
     */
    private static final String KEEP_START = "<KEEP:>";
    /**
     * No character entity, counted from its ampersand up to and including
     * its semicolon, may be longer than this limit. Longer sequences are
     * treated as ordinary text.
     */
    private static final int MAXIMUM_ENTITY_LENGTH = 15;
    /**
     * Entity which is only ever translated in the reverse direction, where
     * it becomes an ordinary space.
     */
    private static final String NBSP_ENTITY = "&nbsp;";
    /**
     * The character at index <code>n</code> of this string is the character
     * represented by <code>ENTITY_REFERENCES[n]</code>.
     */
    private static final String ENTITY_CHARACTERS;
    /**
     * Complete entity references (ampersand, name, semicolon), in the same
     * order as the rows of {@link #ENTITIES}.
     */
    private static final String[] ENTITY_REFERENCES;
    static {
        // Build both lookup tables once, while the class is initialized, so
        // that no instance can ever observe them half filled.
        int count = ENTITIES.length;
        StringBuffer characters = new StringBuffer(count);
        String[] references = new String[count];
        for (int row = 0; row < count; ++row) {
            // exactly one character per row keeps the two tables aligned
            characters.append(ENTITIES[row][0].charAt(0));
            references[row] = "&" + ENTITIES[row][1] + ";";
        }
        ENTITY_CHARACTERS = characters.toString();
        ENTITY_REFERENCES = references;
    }
    /**
     * <copyDoc>Refer to {@link #isReverse}.</copyDoc>
     */
    private boolean reverse = false;

    /**
     * <p>
     * Default constructor. Creates a format which converts characters to
     * character entities (<code>reverse</code> is <code>false</code>).
     * </p>
     */
    public CharacterEntityFormat() {
        // nothing to do: the shared lookup tables are built statically
    }

    /**
     * <p>
     * Convert the character entities in the text provided.
     * </p>
     *
     * <p>
     * In the default direction every character listed in {@link #ENTITIES}
     * is replaced by its entity, while anything which already looks like an
     * entity (ampersand, letters/digits/hash, semicolon) is copied
     * unchanged. In the reverse direction known entities, and
     * <code>&amp;nbsp;</code>, are replaced by their characters and anything
     * else is copied unchanged. Sections between the keep tags are copied
     * verbatim without the tags; a start tag with no matching end tag is
     * treated as ordinary text.
     * </p>
     *
     * @param hTMLText a text to convert all the character entities in; must
     * not be <code>null</code>.
     * @return formatted text where all of the characters are converted to the
     * appropriate character entities (or the other way round, in reverse
     * mode).
     * @throws NullPointerException if <code>hTMLText</code> is
     * <code>null</code>.
     */
    public final String format(final String hTMLText) {
        if (logger.isDebugEnabled()) {
            logger.debug("format(String hTMLText = " + hTMLText + ") - start");
        }
        int length = hTMLText.length();
        StringBuffer returnBuffer = new StringBuffer(length);
        int keepStart = hTMLText.indexOf(KEEP_START);
        int position = 0;
        while (position < length) {
            // keepStart is -1 when there is no further keep section, which
            // can never equal a valid position
            if (position == keepStart) {
                int contentStart = keepStart + KEEP_START.length();
                int keepEnd = hTMLText.indexOf(KEEP_END, contentStart);
                if (keepEnd != -1) {
                    // copy the kept content verbatim, dropping both tags
                    returnBuffer.append(hTMLText.substring(contentStart,
                            keepEnd));
                    position = keepEnd + KEEP_END.length();
                    keepStart = hTMLText.indexOf(KEEP_START, position);
                    continue;
                }
                // no end tag -> fall through and convert the start tag like
                // any other text
                keepStart = -1;
            }
            position = convertAt(hTMLText, position, returnBuffer);
        }
        String returnString = returnBuffer.toString();
        if (logger.isDebugEnabled()) {
            logger.debug("format(String) - end - return value = "
                    + returnString);
        }
        return returnString;
    }

    /**
     * Convert the character or entity which starts at the given position and
     * append the result to the output.
     *
     * @param text text being converted.
     * @param position index of the character to convert.
     * @param output buffer the converted text is appended to.
     * @return index of the first character which has not been consumed yet.
     */
    private int convertAt(final String text, final int position,
            final StringBuffer output) {
        char ch = text.charAt(position);
        String entity = null;
        if (ch == '&') {
            entity = findEntity(text, position);
        }
        if (entity == null) {
            // an ordinary character: only the forward direction changes it
            if (reverse) {
                output.append(ch);
            } else {
                appendEncoded(output, ch);
            }
            return position + 1;
        }
        if (!reverse) {
            // forward direction: never touch something which is already an
            // entity, whether we know it or not
            output.append(entity);
            return position + entity.length();
        }
        int decoded = decode(entity);
        if (decoded == -1) {
            // unknown entity: copy the ampersand and carry on after it
            output.append(ch);
            return position + 1;
        }
        output.append((char) decoded);
        return position + entity.length();
    }

    /**
     * Look for a complete character entity starting at the ampersand at the
     * given position.
     *
     * @param text text to look in.
     * @param start index of an ampersand in <code>text</code>.
     * @return the entity including its ampersand and semicolon, or
     * <code>null</code> if the ampersand does not start a terminated entity
     * of at most {@link #MAXIMUM_ENTITY_LENGTH} characters.
     */
    private static String findEntity(final String text, final int start) {
        int limit = Math.min(text.length(), start + MAXIMUM_ENTITY_LENGTH);
        for (int index = start + 1; index < limit; ++index) {
            char current = text.charAt(index);
            if (current == ';') {
                if (index == start + 1) {
                    // "&;" has no name, so it is not an entity
                    return null;
                }
                return text.substring(start, index + 1);
            }
            // only letters, digits and the hash may appear in an entity
            if ((current != '#') && !Character.isLetterOrDigit(current)) {
                return null;
            }
        }
        // ran out of text, or out of allowed length, before the semicolon
        return null;
    }

    /**
     * Append a character to the output, replacing it by its entity if it has
     * one.
     *
     * @param output buffer to append to.
     * @param ch character to append.
     */
    private static void appendEncoded(final StringBuffer output,
            final char ch) {
        int index = ENTITY_CHARACTERS.indexOf(ch);
        if (index == -1) {
            output.append(ch);
        } else {
            output.append(ENTITY_REFERENCES[index]);
        }
    }

    /**
     * Find the character a complete entity stands for.
     *
     * @param entity entity including its ampersand and semicolon.
     * @return the character represented, or <code>-1</code> if the entity is
     * not known.
     */
    private static int decode(final String entity) {
        // this is a special case - it only translates one way
        if (NBSP_ENTITY.equalsIgnoreCase(entity)) {
            return ' ';
        }
        for (int index = 0; index < ENTITY_REFERENCES.length; ++index) {
            if (ENTITY_REFERENCES[index].equals(entity)) {
                return ENTITY_CHARACTERS.charAt(index);
            }
        }
        return -1;
    }

    /**
     * <p>
     * Gets whether or not character entity conversion goes in the opposite
     * direction. If character entities are converted to characters then this
     * method returns <code>true</code>, otherwise <code>false</code>.
     * </p>
     *
     * @return <code>true</code> if character entities are converted to
     * characters, otherwise <code>false</code>.
     */
    public final boolean isReverse() {
        if (logger.isDebugEnabled()) {
            logger.debug("isReverse() - start");
            logger.debug("isReverse() - end - return value = " + reverse);
        }
        return reverse;
    }

    /**
     * <p>
     * Set whether or not character entity conversion goes in the opposite
     * direction.
     * </p>
     *
     * @param newReverse set to <code>true</code> if character entities should
     * be converted to characters, otherwise <code>false</code>.
     */
    public final void setReverse(final boolean newReverse) {
        if (logger.isDebugEnabled()) {
            logger.debug("setReverse(boolean newReverse = " + newReverse
                    + ") - start");
        }
        reverse = newReverse;
        if (logger.isDebugEnabled()) {
            logger.debug("setReverse(boolean) - end");
        }
    }
}
|