[LANG-102] [lang] Refactor Entities methods.
http://issues.apache.org/jira/browse/LANG-102.
Refactored escape and unescape methods to remove code duplication.
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@491695 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/java/org/apache/commons/lang/Entities.java b/src/java/org/apache/commons/lang/Entities.java
index a45e004..cdd9fe3 100644
--- a/src/java/org/apache/commons/lang/Entities.java
+++ b/src/java/org/apache/commons/lang/Entities.java
@@ -14,23 +14,27 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.commons.lang;
import java.io.IOException;
+import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
/**
- * <p>Provides HTML and XML entity utilities.</p>
- *
+ * <p>
+ * Provides HTML and XML entity utilities.
+ * </p>
+ *
* @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
* @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
* @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
* @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
* @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
- *
+ *
* @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
* @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
* @since 2.0
@@ -38,51 +42,48 @@
*/
class Entities {
- private static final String[][] BASIC_ARRAY = {
- {"quot", "34"}, // " - double-quote
+ private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
{"amp", "38"}, // & - ampersand
{"lt", "60"}, // < - less-than
{"gt", "62"}, // > - greater-than
};
- private static final String[][] APOS_ARRAY = {
- {"apos", "39"}, // XML apostrophe
+ private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
};
// package scoped for testing
- static final String[][] ISO8859_1_ARRAY = {
- {"nbsp", "160"}, // non-breaking space
- {"iexcl", "161"}, //inverted exclamation mark
- {"cent", "162"}, //cent sign
- {"pound", "163"}, //pound sign
- {"curren", "164"}, //currency sign
- {"yen", "165"}, //yen sign = yuan sign
- {"brvbar", "166"}, //broken bar = broken vertical bar
- {"sect", "167"}, //section sign
- {"uml", "168"}, //diaeresis = spacing diaeresis
+ static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
+ {"iexcl", "161"}, // inverted exclamation mark
+ {"cent", "162"}, // cent sign
+ {"pound", "163"}, // pound sign
+ {"curren", "164"}, // currency sign
+ {"yen", "165"}, // yen sign = yuan sign
+ {"brvbar", "166"}, // broken bar = broken vertical bar
+ {"sect", "167"}, // section sign
+ {"uml", "168"}, // diaeresis = spacing diaeresis
{"copy", "169"}, // © - copyright sign
- {"ordf", "170"}, //feminine ordinal indicator
- {"laquo", "171"}, //left-pointing double angle quotation mark = left pointing guillemet
- {"not", "172"}, //not sign
- {"shy", "173"}, //soft hyphen = discretionary hyphen
+ {"ordf", "170"}, // feminine ordinal indicator
+ {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
+ {"not", "172"}, // not sign
+ {"shy", "173"}, // soft hyphen = discretionary hyphen
{"reg", "174"}, // ® - registered trademark sign
- {"macr", "175"}, //macron = spacing macron = overline = APL overbar
- {"deg", "176"}, //degree sign
- {"plusmn", "177"}, //plus-minus sign = plus-or-minus sign
- {"sup2", "178"}, //superscript two = superscript digit two = squared
- {"sup3", "179"}, //superscript three = superscript digit three = cubed
- {"acute", "180"}, //acute accent = spacing acute
- {"micro", "181"}, //micro sign
- {"para", "182"}, //pilcrow sign = paragraph sign
- {"middot", "183"}, //middle dot = Georgian comma = Greek middle dot
- {"cedil", "184"}, //cedilla = spacing cedilla
- {"sup1", "185"}, //superscript one = superscript digit one
- {"ordm", "186"}, //masculine ordinal indicator
- {"raquo", "187"}, //right-pointing double angle quotation mark = right pointing guillemet
- {"frac14", "188"}, //vulgar fraction one quarter = fraction one quarter
- {"frac12", "189"}, //vulgar fraction one half = fraction one half
- {"frac34", "190"}, //vulgar fraction three quarters = fraction three quarters
- {"iquest", "191"}, //inverted question mark = turned question mark
+ {"macr", "175"}, // macron = spacing macron = overline = APL overbar
+ {"deg", "176"}, // degree sign
+ {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
+ {"sup2", "178"}, // superscript two = superscript digit two = squared
+ {"sup3", "179"}, // superscript three = superscript digit three = cubed
+ {"acute", "180"}, // acute accent = spacing acute
+ {"micro", "181"}, // micro sign
+ {"para", "182"}, // pilcrow sign = paragraph sign
+ {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
+ {"cedil", "184"}, // cedilla = spacing cedilla
+ {"sup1", "185"}, // superscript one = superscript digit one
+ {"ordm", "186"}, // masculine ordinal indicator
+ {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
+ {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
+ {"frac12", "189"}, // vulgar fraction one half = fraction one half
+ {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
+ {"iquest", "191"}, // inverted question mark = turned question mark
{"Agrave", "192"}, // À - uppercase A, grave accent
{"Aacute", "193"}, // Á - uppercase A, acute accent
{"Acirc", "194"}, // Â - uppercase A, circumflex accent
@@ -106,7 +107,7 @@
{"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
{"Otilde", "213"}, // Õ - uppercase O, tilde
{"Ouml", "214"}, // Ö - uppercase O, umlaut
- {"times", "215"}, //multiplication sign
+ {"times", "215"}, // multiplication sign
{"Oslash", "216"}, // Ø - uppercase O, slash
{"Ugrave", "217"}, // Ù - uppercase U, grave accent
{"Uacute", "218"}, // Ú - uppercase U, acute accent
@@ -152,213 +153,220 @@
// http://www.w3.org/TR/REC-html40/sgml/entities.html
// package scoped for testing
static final String[][] HTML40_ARRAY = {
-// <!-- Latin Extended-B -->
- {"fnof", "402"}, //latin small f with hook = function= florin, U+0192 ISOtech -->
-// <!-- Greek -->
- {"Alpha", "913"}, //greek capital letter alpha, U+0391 -->
- {"Beta", "914"}, //greek capital letter beta, U+0392 -->
- {"Gamma", "915"}, //greek capital letter gamma,U+0393 ISOgrk3 -->
- {"Delta", "916"}, //greek capital letter delta,U+0394 ISOgrk3 -->
- {"Epsilon", "917"}, //greek capital letter epsilon, U+0395 -->
- {"Zeta", "918"}, //greek capital letter zeta, U+0396 -->
- {"Eta", "919"}, //greek capital letter eta, U+0397 -->
- {"Theta", "920"}, //greek capital letter theta,U+0398 ISOgrk3 -->
- {"Iota", "921"}, //greek capital letter iota, U+0399 -->
- {"Kappa", "922"}, //greek capital letter kappa, U+039A -->
- {"Lambda", "923"}, //greek capital letter lambda,U+039B ISOgrk3 -->
- {"Mu", "924"}, //greek capital letter mu, U+039C -->
- {"Nu", "925"}, //greek capital letter nu, U+039D -->
- {"Xi", "926"}, //greek capital letter xi, U+039E ISOgrk3 -->
- {"Omicron", "927"}, //greek capital letter omicron, U+039F -->
- {"Pi", "928"}, //greek capital letter pi, U+03A0 ISOgrk3 -->
- {"Rho", "929"}, //greek capital letter rho, U+03A1 -->
-// <!-- there is no Sigmaf, and no U+03A2 character either -->
- {"Sigma", "931"}, //greek capital letter sigma,U+03A3 ISOgrk3 -->
- {"Tau", "932"}, //greek capital letter tau, U+03A4 -->
- {"Upsilon", "933"}, //greek capital letter upsilon,U+03A5 ISOgrk3 -->
- {"Phi", "934"}, //greek capital letter phi,U+03A6 ISOgrk3 -->
- {"Chi", "935"}, //greek capital letter chi, U+03A7 -->
- {"Psi", "936"}, //greek capital letter psi,U+03A8 ISOgrk3 -->
- {"Omega", "937"}, //greek capital letter omega,U+03A9 ISOgrk3 -->
- {"alpha", "945"}, //greek small letter alpha,U+03B1 ISOgrk3 -->
- {"beta", "946"}, //greek small letter beta, U+03B2 ISOgrk3 -->
- {"gamma", "947"}, //greek small letter gamma,U+03B3 ISOgrk3 -->
- {"delta", "948"}, //greek small letter delta,U+03B4 ISOgrk3 -->
- {"epsilon", "949"}, //greek small letter epsilon,U+03B5 ISOgrk3 -->
- {"zeta", "950"}, //greek small letter zeta, U+03B6 ISOgrk3 -->
- {"eta", "951"}, //greek small letter eta, U+03B7 ISOgrk3 -->
- {"theta", "952"}, //greek small letter theta,U+03B8 ISOgrk3 -->
- {"iota", "953"}, //greek small letter iota, U+03B9 ISOgrk3 -->
- {"kappa", "954"}, //greek small letter kappa,U+03BA ISOgrk3 -->
- {"lambda", "955"}, //greek small letter lambda,U+03BB ISOgrk3 -->
- {"mu", "956"}, //greek small letter mu, U+03BC ISOgrk3 -->
- {"nu", "957"}, //greek small letter nu, U+03BD ISOgrk3 -->
- {"xi", "958"}, //greek small letter xi, U+03BE ISOgrk3 -->
- {"omicron", "959"}, //greek small letter omicron, U+03BF NEW -->
- {"pi", "960"}, //greek small letter pi, U+03C0 ISOgrk3 -->
- {"rho", "961"}, //greek small letter rho, U+03C1 ISOgrk3 -->
- {"sigmaf", "962"}, //greek small letter final sigma,U+03C2 ISOgrk3 -->
- {"sigma", "963"}, //greek small letter sigma,U+03C3 ISOgrk3 -->
- {"tau", "964"}, //greek small letter tau, U+03C4 ISOgrk3 -->
- {"upsilon", "965"}, //greek small letter upsilon,U+03C5 ISOgrk3 -->
- {"phi", "966"}, //greek small letter phi, U+03C6 ISOgrk3 -->
- {"chi", "967"}, //greek small letter chi, U+03C7 ISOgrk3 -->
- {"psi", "968"}, //greek small letter psi, U+03C8 ISOgrk3 -->
- {"omega", "969"}, //greek small letter omega,U+03C9 ISOgrk3 -->
- {"thetasym", "977"}, //greek small letter theta symbol,U+03D1 NEW -->
- {"upsih", "978"}, //greek upsilon with hook symbol,U+03D2 NEW -->
- {"piv", "982"}, //greek pi symbol, U+03D6 ISOgrk3 -->
-// <!-- General Punctuation -->
- {"bull", "8226"}, //bullet = black small circle,U+2022 ISOpub -->
-// <!-- bullet is NOT the same as bullet operator, U+2219 -->
- {"hellip", "8230"}, //horizontal ellipsis = three dot leader,U+2026 ISOpub -->
- {"prime", "8242"}, //prime = minutes = feet, U+2032 ISOtech -->
- {"Prime", "8243"}, //double prime = seconds = inches,U+2033 ISOtech -->
- {"oline", "8254"}, //overline = spacing overscore,U+203E NEW -->
- {"frasl", "8260"}, //fraction slash, U+2044 NEW -->
-// <!-- Letterlike Symbols -->
- {"weierp", "8472"}, //script capital P = power set= Weierstrass p, U+2118 ISOamso -->
- {"image", "8465"}, //blackletter capital I = imaginary part,U+2111 ISOamso -->
- {"real", "8476"}, //blackletter capital R = real part symbol,U+211C ISOamso -->
- {"trade", "8482"}, //trade mark sign, U+2122 ISOnum -->
- {"alefsym", "8501"}, //alef symbol = first transfinite cardinal,U+2135 NEW -->
-// <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
-// same glyph could be used to depict both characters -->
-// <!-- Arrows -->
- {"larr", "8592"}, //leftwards arrow, U+2190 ISOnum -->
- {"uarr", "8593"}, //upwards arrow, U+2191 ISOnum-->
- {"rarr", "8594"}, //rightwards arrow, U+2192 ISOnum -->
- {"darr", "8595"}, //downwards arrow, U+2193 ISOnum -->
- {"harr", "8596"}, //left right arrow, U+2194 ISOamsa -->
- {"crarr", "8629"}, //downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
- {"lArr", "8656"}, //leftwards double arrow, U+21D0 ISOtech -->
-// <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
-// arrow but also does not have any other character for that function.
-// So ? lArr canbe used for 'is implied by' as ISOtech suggests -->
- {"uArr", "8657"}, //upwards double arrow, U+21D1 ISOamsa -->
- {"rArr", "8658"}, //rightwards double arrow,U+21D2 ISOtech -->
-// <!-- ISO 10646 does not say this is the 'implies' character but does not
-// have another character with this function so ?rArr can be used for
-// 'implies' as ISOtech suggests -->
- {"dArr", "8659"}, //downwards double arrow, U+21D3 ISOamsa -->
- {"hArr", "8660"}, //left right double arrow,U+21D4 ISOamsa -->
-// <!-- Mathematical Operators -->
- {"forall", "8704"}, //for all, U+2200 ISOtech -->
- {"part", "8706"}, //partial differential, U+2202 ISOtech -->
- {"exist", "8707"}, //there exists, U+2203 ISOtech -->
- {"empty", "8709"}, //empty set = null set = diameter,U+2205 ISOamso -->
- {"nabla", "8711"}, //nabla = backward difference,U+2207 ISOtech -->
- {"isin", "8712"}, //element of, U+2208 ISOtech -->
- {"notin", "8713"}, //not an element of, U+2209 ISOtech -->
- {"ni", "8715"}, //contains as member, U+220B ISOtech -->
-// <!-- should there be a more memorable name than 'ni'? -->
- {"prod", "8719"}, //n-ary product = product sign,U+220F ISOamsb -->
-// <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
-// though the same glyph might be used for both -->
- {"sum", "8721"}, //n-ary summation, U+2211 ISOamsb -->
-// <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
-// though the same glyph might be used for both -->
- {"minus", "8722"}, //minus sign, U+2212 ISOtech -->
- {"lowast", "8727"}, //asterisk operator, U+2217 ISOtech -->
- {"radic", "8730"}, //square root = radical sign,U+221A ISOtech -->
- {"prop", "8733"}, //proportional to, U+221D ISOtech -->
- {"infin", "8734"}, //infinity, U+221E ISOtech -->
- {"ang", "8736"}, //angle, U+2220 ISOamso -->
- {"and", "8743"}, //logical and = wedge, U+2227 ISOtech -->
- {"or", "8744"}, //logical or = vee, U+2228 ISOtech -->
- {"cap", "8745"}, //intersection = cap, U+2229 ISOtech -->
- {"cup", "8746"}, //union = cup, U+222A ISOtech -->
- {"int", "8747"}, //integral, U+222B ISOtech -->
- {"there4", "8756"}, //therefore, U+2234 ISOtech -->
- {"sim", "8764"}, //tilde operator = varies with = similar to,U+223C ISOtech -->
-// <!-- tilde operator is NOT the same character as the tilde, U+007E,although
-// the same glyph might be used to represent both -->
- {"cong", "8773"}, //approximately equal to, U+2245 ISOtech -->
- {"asymp", "8776"}, //almost equal to = asymptotic to,U+2248 ISOamsr -->
- {"ne", "8800"}, //not equal to, U+2260 ISOtech -->
- {"equiv", "8801"}, //identical to, U+2261 ISOtech -->
- {"le", "8804"}, //less-than or equal to, U+2264 ISOtech -->
- {"ge", "8805"}, //greater-than or equal to,U+2265 ISOtech -->
- {"sub", "8834"}, //subset of, U+2282 ISOtech -->
- {"sup", "8835"}, //superset of, U+2283 ISOtech -->
-// <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
-// Symbol font encoding and is not included. Should it be, for symmetry?
-// It is in ISOamsn --> <!ENTITY nsub", "8836"},
-// not a subset of, U+2284 ISOamsn -->
- {"sube", "8838"}, //subset of or equal to, U+2286 ISOtech -->
- {"supe", "8839"}, //superset of or equal to,U+2287 ISOtech -->
- {"oplus", "8853"}, //circled plus = direct sum,U+2295 ISOamsb -->
- {"otimes", "8855"}, //circled times = vector product,U+2297 ISOamsb -->
- {"perp", "8869"}, //up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
- {"sdot", "8901"}, //dot operator, U+22C5 ISOamsb -->
-// <!-- dot operator is NOT the same character as U+00B7 middle dot -->
-// <!-- Miscellaneous Technical -->
- {"lceil", "8968"}, //left ceiling = apl upstile,U+2308 ISOamsc -->
- {"rceil", "8969"}, //right ceiling, U+2309 ISOamsc -->
- {"lfloor", "8970"}, //left floor = apl downstile,U+230A ISOamsc -->
- {"rfloor", "8971"}, //right floor, U+230B ISOamsc -->
- {"lang", "9001"}, //left-pointing angle bracket = bra,U+2329 ISOtech -->
-// <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark' -->
- {"rang", "9002"}, //right-pointing angle bracket = ket,U+232A ISOtech -->
-// <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
-// 'single right-pointing angle quotation mark' -->
-// <!-- Geometric Shapes -->
- {"loz", "9674"}, //lozenge, U+25CA ISOpub -->
-// <!-- Miscellaneous Symbols -->
- {"spades", "9824"}, //black spade suit, U+2660 ISOpub -->
-// <!-- black here seems to mean filled as opposed to hollow -->
- {"clubs", "9827"}, //black club suit = shamrock,U+2663 ISOpub -->
- {"hearts", "9829"}, //black heart suit = valentine,U+2665 ISOpub -->
- {"diams", "9830"}, //black diamond suit, U+2666 ISOpub -->
+ // <!-- Latin Extended-B -->
+ {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
+ // <!-- Greek -->
+ {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
+ {"Beta", "914"}, // greek capital letter beta, U+0392 -->
+ {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
+ {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
+ {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
+ {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
+ {"Eta", "919"}, // greek capital letter eta, U+0397 -->
+ {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
+ {"Iota", "921"}, // greek capital letter iota, U+0399 -->
+ {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
+ {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
+ {"Mu", "924"}, // greek capital letter mu, U+039C -->
+ {"Nu", "925"}, // greek capital letter nu, U+039D -->
+ {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
+ {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
+ {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
+ {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
+ // <!-- there is no Sigmaf, and no U+03A2 character either -->
+ {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
+ {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
+ {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
+ {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
+ {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
+ {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
+ {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
+ {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
+ {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
+ {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
+ {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
+ {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
+ {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
+ {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
+ {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
+ {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
+ {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
+ {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
+ {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
+ {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
+ {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
+ {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
+ {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
+ {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
+ {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
+ {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
+ {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
+ {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
+ {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
+ {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
+ {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
+ {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
+ {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
+ {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
+ {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
+ // <!-- General Punctuation -->
+ {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
+ // <!-- bullet is NOT the same as bullet operator, U+2219 -->
+ {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
+ {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
+ {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
+ {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
+ {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
+ // <!-- Letterlike Symbols -->
+ {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
+ {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
+ {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
+ {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
+ {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
+ // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
+ // same glyph could be used to depict both characters -->
+ // <!-- Arrows -->
+ {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
+ {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
+ {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
+ {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
+ {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
+ {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
+ {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
+ // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
+ // arrow but also does not have any other character for that function.
+ // So ? lArr canbe used for 'is implied by' as ISOtech suggests -->
+ {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
+ {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
+ // <!-- ISO 10646 does not say this is the 'implies' character but does not
+ // have another character with this function so ?rArr can be used for
+ // 'implies' as ISOtech suggests -->
+ {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
+ {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
+ // <!-- Mathematical Operators -->
+ {"forall", "8704"}, // for all, U+2200 ISOtech -->
+ {"part", "8706"}, // partial differential, U+2202 ISOtech -->
+ {"exist", "8707"}, // there exists, U+2203 ISOtech -->
+ {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
+ {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
+ {"isin", "8712"}, // element of, U+2208 ISOtech -->
+ {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
+ {"ni", "8715"}, // contains as member, U+220B ISOtech -->
+ // <!-- should there be a more memorable name than 'ni'? -->
+ {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
+ // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
+ // though the same glyph might be used for both -->
+ {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
+ // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
+ // though the same glyph might be used for both -->
+ {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
+ {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
+ {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
+ {"prop", "8733"}, // proportional to, U+221D ISOtech -->
+ {"infin", "8734"}, // infinity, U+221E ISOtech -->
+ {"ang", "8736"}, // angle, U+2220 ISOamso -->
+ {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
+ {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
+ {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
+ {"cup", "8746"}, // union = cup, U+222A ISOtech -->
+ {"int", "8747"}, // integral, U+222B ISOtech -->
+ {"there4", "8756"}, // therefore, U+2234 ISOtech -->
+ {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
+ // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
+ // the same glyph might be used to represent both -->
+ {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
+ {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
+ {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
+ {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
+ {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
+ {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
+ {"sub", "8834"}, // subset of, U+2282 ISOtech -->
+ {"sup", "8835"}, // superset of, U+2283 ISOtech -->
+ // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
+ // Symbol font encoding and is not included. Should it be, for symmetry?
+ // It is in ISOamsn --> <!ENTITY nsub", "8836"},
+ // not a subset of, U+2284 ISOamsn -->
+ {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
+ {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
+ {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
+ {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
+ {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
+ {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
+ // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
+ // <!-- Miscellaneous Technical -->
+ {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
+ {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
+ {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
+ {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
+ {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
+ // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
+ // mark' -->
+ {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
+ // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
+ // 'single right-pointing angle quotation mark' -->
+ // <!-- Geometric Shapes -->
+ {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
+ // <!-- Miscellaneous Symbols -->
+ {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
+ // <!-- black here seems to mean filled as opposed to hollow -->
+ {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
+ {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
+ {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
-// <!-- Latin Extended-A -->
- {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
- {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
-// <!-- ligature is a misnomer, this is a separate character in some languages -->
- {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
- {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
- {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
-// <!-- Spacing Modifier Letters -->
- {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
- {"tilde", "732"}, //small tilde, U+02DC ISOdia -->
-// <!-- General Punctuation -->
- {"ensp", "8194"}, //en space, U+2002 ISOpub -->
- {"emsp", "8195"}, //em space, U+2003 ISOpub -->
- {"thinsp", "8201"}, //thin space, U+2009 ISOpub -->
- {"zwnj", "8204"}, //zero width non-joiner,U+200C NEW RFC 2070 -->
- {"zwj", "8205"}, //zero width joiner, U+200D NEW RFC 2070 -->
- {"lrm", "8206"}, //left-to-right mark, U+200E NEW RFC 2070 -->
- {"rlm", "8207"}, //right-to-left mark, U+200F NEW RFC 2070 -->
- {"ndash", "8211"}, //en dash, U+2013 ISOpub -->
- {"mdash", "8212"}, //em dash, U+2014 ISOpub -->
- {"lsquo", "8216"}, //left single quotation mark,U+2018 ISOnum -->
- {"rsquo", "8217"}, //right single quotation mark,U+2019 ISOnum -->
- {"sbquo", "8218"}, //single low-9 quotation mark, U+201A NEW -->
- {"ldquo", "8220"}, //left double quotation mark,U+201C ISOnum -->
- {"rdquo", "8221"}, //right double quotation mark,U+201D ISOnum -->
- {"bdquo", "8222"}, //double low-9 quotation mark, U+201E NEW -->
- {"dagger", "8224"}, //dagger, U+2020 ISOpub -->
- {"Dagger", "8225"}, //double dagger, U+2021 ISOpub -->
- {"permil", "8240"}, //per mille sign, U+2030 ISOtech -->
- {"lsaquo", "8249"}, //single left-pointing angle quotation mark,U+2039 ISO proposed -->
-// <!-- lsaquo is proposed but not yet ISO standardized -->
- {"rsaquo", "8250"}, //single right-pointing angle quotation mark,U+203A ISO proposed -->
-// <!-- rsaquo is proposed but not yet ISO standardized -->
- {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
+ // <!-- Latin Extended-A -->
+ {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
+ {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
+ // <!-- ligature is a misnomer, this is a separate character in some languages -->
+ {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
+ {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
+ {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
+ // <!-- Spacing Modifier Letters -->
+ {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
+ {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
+ // <!-- General Punctuation -->
+ {"ensp", "8194"}, // en space, U+2002 ISOpub -->
+ {"emsp", "8195"}, // em space, U+2003 ISOpub -->
+ {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
+ {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
+ {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
+ {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
+ {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
+ {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
+ {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
+ {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
+ {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
+ {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
+ {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
+ {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
+ {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
+ {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
+ {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
+ {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
+ {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
+ // <!-- lsaquo is proposed but not yet ISO standardized -->
+ {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
+ // <!-- rsaquo is proposed but not yet ISO standardized -->
+ {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
};
/**
- * <p>The set of entities supported by standard XML.</p>
+ * <p>
+ * The set of entities supported by standard XML.
+ * </p>
*/
public static final Entities XML;
/**
- * <p>The set of entities supported by HTML 3.2.</p>
+ * <p>
+ * The set of entities supported by HTML 3.2.
+ * </p>
*/
public static final Entities HTML32;
/**
- * <p>The set of entities supported by HTML 4.0.</p>
+ * <p>
+ * The set of entities supported by HTML 4.0.
+ * </p>
*/
public static final Entities HTML40;
@@ -380,9 +388,12 @@
}
/**
- * <p>Fills the specified entities instance with HTML 40 entities.</p>
+ * <p>
+ * Fills the specified entities instance with HTML 40 entities.
+ * </p>
*
- * @param entities the instance to be filled.
+ * @param entities
+ * the instance to be filled.
*/
static void fillWithHtml40Entities(Entities entities) {
entities.addEntities(BASIC_ARRAY);
@@ -392,25 +403,35 @@
static interface EntityMap {
/**
- * <p>Add an entry to this entity map.</p>
+ * <p>
+ * Add an entry to this entity map.
+ * </p>
*
- * @param name the entity name
- * @param value the entity value
+ * @param name
+ * the entity name
+ * @param value
+ * the entity value
*/
void add(String name, int value);
/**
- * <p>Returns the name of the entity identified by the specified value.</p>
+ * <p>
+ * Returns the name of the entity identified by the specified value.
+ * </p>
*
- * @param value the value to locate
+ * @param value
+ * the value to locate
* @return entity name associated with the specified value
*/
String name(int value);
/**
- * <p>Returns the value of the entity identified by the specified name.</p>
+ * <p>
+ * Returns the value of the entity identified by the specified name.
+ * </p>
*
- * @param name the name to locate
+ * @param name
+ * the name to locate
* @return entity value associated with the specified name
*/
int value(String name);
@@ -418,6 +439,7 @@
static class PrimitiveEntityMap implements EntityMap {
private Map mapNameToValue = new HashMap();
+
private IntHashMap mapValueToName = new IntHashMap();
/**
@@ -447,9 +469,9 @@
}
}
-
static abstract class MapIntMap implements Entities.EntityMap {
protected Map mapNameToValue;
+
protected Map mapValueToName;
/**
@@ -490,9 +512,9 @@
}
static class TreeEntityMap extends MapIntMap {
- /**
- * Constructs a new instance of <code>TreeEntityMap</code>.
- */
+ /**
+ * Constructs a new instance of <code>TreeEntityMap</code>.
+ */
public TreeEntityMap() {
mapNameToValue = new TreeMap();
mapValueToName = new TreeMap();
@@ -501,6 +523,7 @@
static class LookupEntityMap extends PrimitiveEntityMap {
private String[] lookupTable;
+
private int LOOKUP_TABLE_SIZE = 256;
/**
@@ -514,8 +537,9 @@
}
/**
- * <p>Returns the lookup table for this entity map. The lookup table is created if it has not been
- * previously.</p>
+ * <p>
+ * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
+ * </p>
*
* @return the lookup table
*/
@@ -527,7 +551,9 @@
}
/**
- * <p>Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.</p>
+ * <p>
+ * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
+ * </p>
*/
private void createLookupTable() {
lookupTable = new String[LOOKUP_TABLE_SIZE];
@@ -539,8 +565,11 @@
static class ArrayEntityMap implements EntityMap {
protected int growBy = 100;
+
protected int size = 0;
+
protected String[] names;
+
protected int[] values;
/**
@@ -552,10 +581,11 @@
}
/**
- * Constructs a new instance of <code>ArrayEntityMap</code>
- * specifying the size by which the array should grow.
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
+ * grow.
*
- * @param growBy array will be initialized to and will grow by this amount
+ * @param growBy
+ * array will be initialized to and will grow by this amount
*/
public ArrayEntityMap(int growBy) {
this.growBy = growBy;
@@ -576,7 +606,8 @@
/**
* Verifies the capacity of the entity array, adjusting the size if necessary.
*
- * @param capacity size the array should be
+ * @param capacity
+ * size the array should be
*/
protected void ensureCapacity(int capacity) {
if (capacity > names.length) {
@@ -621,24 +652,26 @@
* Constructs a new instance of <code>BinaryEntityMap</code>.
*/
public BinaryEntityMap() {
- super();
+ super();
}
/**
- * Constructs a new instance of <code>ArrayEntityMap</code>
- * specifying the size by which the underlying array should grow.
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
+ * should grow.
*
- * @param growBy array will be initialized to and will grow by this amount
+ * @param growBy
+ * array will be initialized to and will grow by this amount
*/
public BinaryEntityMap(int growBy) {
super(growBy);
}
/**
- * Performs a binary search of the entity array for the specified key.
- * This method is based on code in {@link java.util.Arrays}.
+ * Performs a binary search of the entity array for the specified key. This method is based on code in
+ * {@link java.util.Arrays}.
*
- * @param key the key to be found
+ * @param key
+ * the key to be found
* @return the index of the entity array matching the specified key
*/
private int binarySearch(int key) {
@@ -657,7 +690,7 @@
return mid; // key found
}
}
- return -(low + 1); // key not found.
+ return -(low + 1); // key not found.
}
/**
@@ -667,9 +700,9 @@
ensureCapacity(size + 1);
int insertAt = binarySearch(value);
if (insertAt > 0) {
- return; // note: this means you can't insert the same value twice
+ return; // note: this means you can't insert the same value twice
}
- insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
+ insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
values[insertAt] = value;
System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
@@ -693,9 +726,12 @@
EntityMap map = new Entities.LookupEntityMap();
/**
- * <p>Adds entities to this entity.</p>
+ * <p>
+ * Adds entities to this entity.
+ * </p>
*
- * @param entityArray array of entities to be added
+ * @param entityArray
+ * array of entities to be added
*/
public void addEntities(String[][] entityArray) {
for (int i = 0; i < entityArray.length; ++i) {
@@ -704,19 +740,26 @@
}
/**
- * <p>Add an entity to this entity.</p>
+ * <p>
+ * Add an entity to this entity.
+ * </p>
*
- * @param name name of the entity
- * @param value vale of the entity
+ * @param name
+ * name of the entity
+ * @param value
+ * vale of the entity
*/
public void addEntity(String name, int value) {
map.add(name, value);
}
/**
- * <p>Returns the name of the entity identified by the specified value.</p>
+ * <p>
+ * Returns the name of the entity identified by the specified value.
+ * </p>
*
- * @param value the value to locate
+ * @param value
+ * the value to locate
* @return entity name associated with the specified value
*/
public String entityName(int value) {
@@ -724,9 +767,12 @@
}
/**
- * <p>Returns the value of the entity identified by the specified name.</p>
+ * <p>
+ * Returns the value of the entity identified by the specified name.
+ * </p>
*
- * @param name the name to locate
+ * @param name
+ * the name to locate
* @return entity value associated with the specified name
*/
public int entityValue(String name) {
@@ -734,49 +780,39 @@
}
/**
- * <p>Escapes the characters in a <code>String</code>.</p>
- *
- * <p>For example, if you have called addEntity("foo", 0xA1),
- * escape("\u00A1") will return "&foo;"</p>
- *
- * @param str The <code>String</code> to escape.
+ * <p>
+ * Escapes the characters in a <code>String</code>.
+ * </p>
+ *
+ * <p>
+ * For example, if you have called addEntity("foo", 0xA1), escape("\u00A1") will return
+ * "&foo;"
+ * </p>
+ *
+ * @param str
+ * The <code>String</code> to escape.
* @return A new escaped <code>String</code>.
*/
public String escape(String str) {
- //todo: rewrite to use a Writer
- StringBuffer buf = new StringBuffer(str.length() * 2);
- int i;
- for (i = 0; i < str.length(); ++i) {
- char ch = str.charAt(i);
- String entityName = this.entityName(ch);
- if (entityName == null) {
- if (ch > 0x7F) {
- int intValue = ch;
- buf.append("&#");
- buf.append(intValue);
- buf.append(';');
- } else {
- buf.append(ch);
- }
- } else {
- buf.append('&');
- buf.append(entityName);
- buf.append(';');
- }
- }
- return buf.toString();
+ StringWriter stringWriter = newStringWriter(str);
+ this.escape(stringWriter, str);
+ return stringWriter.toString();
}
/**
- * <p>Escapes the characters in the <code>String</code> passed and writes the result
- * to the <code>Writer</code> passed. </p>
+ * <p>
+ * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
+ * passed.
+ * </p>
*
- * @param writer The <code>Writer</code> to write the results of the escaping to.
- * Assumed to be a non-null value.
- * @param str The <code>String</code> to escape. Assumed to be a non-null value.
- * @throws IOException when <code>Writer</code> passed throws the exception from
- * calls to the {@link Writer#write(int)} methods.
- *
+ * @param writer
+ * The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
+ * @param str
+ * The <code>String</code> to escape. Assumed to be a non-null value.
+ * @throws IOException
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
+ * methods.
+ *
* @see #escape(String)
* @see Writer
*/
@@ -800,87 +836,98 @@
}
}
}
-
- /**
- * <p>Unescapes the entities in a <code>String</code>.</p>
- *
- * <p>For example, if you have called addEntity("foo", 0xA1),
- * unescape("&foo;") will return "\u00A1"</p>
- *
- * @param str The <code>String</code> to escape.
- * @return A new escaped <code>String</code>.
- */
- public String unescape(String str) {
- int firstAmp = str.indexOf('&');
- if (firstAmp < 0) {
- return str;
- }
- StringBuffer buf = new StringBuffer(str.length());
- buf.append(str.substring(0, firstAmp));
- for (int i = firstAmp; i < str.length(); ++i) {
- char ch = str.charAt(i);
- if (ch == '&') {
- int semi = str.indexOf(';', i + 1);
- if (semi == -1) {
- buf.append(ch);
- continue;
- }
- int amph = str.indexOf('&', i + 1);
- if( amph != -1 && amph < semi ) {
- // Then the text looks like &...&...;
- buf.append(ch);
- continue;
- }
- String entityName = str.substring(i + 1, semi);
- int entityValue;
- if (entityName.length() == 0) {
- entityValue = -1;
- } else if (entityName.charAt(0) == '#') {
- if (entityName.length() == 1) {
- entityValue = -1;
- } else {
- char charAt1 = entityName.charAt(1);
- try {
- if (charAt1 == 'x' || charAt1=='X') {
- entityValue = Integer.valueOf(entityName.substring(2), 16).intValue();
- } else {
- entityValue = Integer.parseInt(entityName.substring(1));
- }
- if (entityValue > 0xFFFF) {
- entityValue = -1;
- }
- } catch (NumberFormatException ex) {
- entityValue = -1;
- }
- }
- } else {
- entityValue = this.entityValue(entityName);
- }
- if (entityValue == -1) {
- buf.append('&');
- buf.append(entityName);
- buf.append(';');
- } else {
- buf.append((char) (entityValue));
- }
- i = semi;
- } else {
- buf.append(ch);
- }
+ /**
+ * <p>
+ * Escapes the characters in the <code>String</code> passed and writes the result to the <code>StringWriter</code>
+ * passed.
+ * </p>
+ *
+ * @param writer
+ * The <code>StringWriter</code> to write the results of the escaping to. Assumed to be a non-null
+ * value.
+ * @param str
+ * The <code>String</code> to escape. Assumed to be a non-null value.
+ *
+ * @see #escape(String)
+ * @see Writer
+ * @since 2.3
+ */
+ public void escape(StringWriter writer, String str) {
+ try {
+ this.escape((Writer) writer, str);
+ } catch (IOException e) {
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
+ // throw IOExceptions.
+ throw new UnhandledException(e);
}
- return buf.toString();
}
/**
- * <p>Unescapes the escaped entities in the <code>String</code> passed and
- * writes the result to the <code>Writer</code> passed.</p>
+ * <p>
+ * Unescapes the entities in a <code>String</code>.
+ * </p>
*
- * @param writer The <code>Writer</code> to write the results to; assumed to be non-null.
- * @param string The <code>String</code> to write the results to; assumed to be non-null.
- * @throws IOException when <code>Writer</code> passed throws the exception from
- * calls to the {@link Writer#write(int)} methods.
- *
+ * <p>
+ * For example, if you have called addEntity("foo", 0xA1), unescape("&foo;") will return
+ * "\u00A1"
+ * </p>
+ *
+ * @param str
+ * The <code>String</code> to escape.
+ * @return A new escaped <code>String</code>.
+ */
+ public String unescape(String str) {
+ // Make the StringWriter larger than the source String to avoid growing the writer.
+ StringWriter stringWriter = newStringWriter(str);
+ this.unescape(stringWriter, str);
+ return stringWriter.toString();
+ }
+
+ private StringWriter newStringWriter(String str) {
+ // Make the StringWriter 10% larger than the source String to avoid growing the writer
+ return new StringWriter((int) (str.length() + (str.length() * 0.1)));
+ }
+
+ /**
+ * <p>
+ * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
+ * <code>StringWriter</code> passed.
+ * </p>
+ *
+ * @param writer
+ * The <code>StringWriter</code> to write the results to; assumed to be non-null.
+ * @param string
+ * The <code>String</code> to write the results to; assumed to be non-null.
+ *
+ * @see #escape(String)
+ * @see Writer
+ * @since 2.3
+ */
+ public void unescape(StringWriter writer, String string) {
+ try {
+ this.unescape((Writer) writer, string);
+ } catch (IOException e) {
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
+ // throw IOExceptions.
+ throw new UnhandledException(e);
+ }
+ }
+
+ /**
+ * <p>
+ * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
+ * <code>Writer</code> passed.
+ * </p>
+ *
+ * @param writer
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
+ * @param string
+ * The <code>String</code> to write the results to; assumed to be non-null.
+ * @throws IOException
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
+ * methods.
+ *
* @see #escape(String)
* @see Writer
*/
@@ -896,14 +943,14 @@
for (int i = firstAmp; i < len; i++) {
char c = string.charAt(i);
if (c == '&') {
- int nextIdx = i+1;
+ int nextIdx = i + 1;
int semiColonIdx = string.indexOf(';', nextIdx);
if (semiColonIdx == -1) {
writer.write(c);
continue;
}
int amphersandIdx = string.indexOf('&', i + 1);
- if( amphersandIdx != -1 && amphersandIdx < semiColonIdx ) {
+ if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
// Then the text looks like &...&...;
writer.write(c);
continue;
@@ -912,8 +959,9 @@
int entityValue = -1;
int entityContentLen = entityContent.length();
if (entityContentLen > 0) {
- if (entityContent.charAt(0) == '#') { //escaped value content is an integer (decimal or hexidecimal)
- if (entityContentLen > 1) {
+ if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
+ // hexidecimal)
+ if (entityContentLen > 1) {
char isHexChar = entityContent.charAt(1);
try {
switch (isHexChar) {
@@ -933,11 +981,11 @@
entityValue = -1;
}
}
- } else { //escaped value content is an entity name
+ } else { // escaped value content is an entity name
entityValue = this.entityValue(entityContent);
}
}
-
+
if (entityValue == -1) {
writer.write('&');
writer.write(entityContent);
@@ -945,11 +993,11 @@
} else {
writer.write(entityValue);
}
- i = semiColonIdx; //move index up to the semi-colon
+ i = semiColonIdx; // move index up to the semi-colon
} else {
writer.write(c);
}
}
}
-
+
}