UNPKG

@qooxdoo/framework

Version:

The JS Framework for Coders

450 lines (403 loc) 21.4 kB
/* ************************************************************************

   qooxdoo - the new era of web development

   http://qooxdoo.org

   Copyright:
     2004-2008 1&1 Internet AG, Germany, http://www.1und1.de

   License:
     MIT: https://opensource.org/licenses/MIT
     See the LICENSE file in the project's top-level directory for details.

   Authors:
     * Fabian Jakobs (fjakobs)

************************************************************************ */

/**
 * A Collection of utility functions to escape and unescape strings.
 */
qx.Bootstrap.define("qx.bom.String",
{
  /*
  *****************************************************************************
     STATICS
  *****************************************************************************
  */

  statics :
  {
    /**
     * Mapping of HTML entity names to the corresponding char code.
     *
     * The entries follow the HTML 4 entity sets, see
     * http://www.w3.org/TR/REC-html40/sgml/entities.html
     */
    TO_CHARCODE :
    {
      "quot"     : 34,   // " - double-quote
      "amp"      : 38,   // &
      "lt"       : 60,   // <
      "gt"       : 62,   // >

      // ISO 8859-1 characters
      "nbsp"     : 160,  // no-break space
      "iexcl"    : 161,  // inverted exclamation mark
      "cent"     : 162,  // cent sign
      "pound"    : 163,  // pound sterling sign
      "curren"   : 164,  // general currency sign
      "yen"      : 165,  // yen sign
      "brvbar"   : 166,  // broken (vertical) bar
      "sect"     : 167,  // section sign
      "uml"      : 168,  // umlaut (dieresis)
      "copy"     : 169,  // copyright sign
      "ordf"     : 170,  // ordinal indicator, feminine
      "laquo"    : 171,  // angle quotation mark, left
      "not"      : 172,  // not sign
      "shy"      : 173,  // soft hyphen
      "reg"      : 174,  // registered sign
      "macr"     : 175,  // macron
      "deg"      : 176,  // degree sign
      "plusmn"   : 177,  // plus-or-minus sign
      "sup2"     : 178,  // superscript two
      "sup3"     : 179,  // superscript three
      "acute"    : 180,  // acute accent
      "micro"    : 181,  // micro sign
      "para"     : 182,  // pilcrow (paragraph sign)
      "middot"   : 183,  // middle dot
      "cedil"    : 184,  // cedilla
      "sup1"     : 185,  // superscript one
      "ordm"     : 186,  // ordinal indicator, masculine
      "raquo"    : 187,  // angle quotation mark, right
      "frac14"   : 188,  // fraction one-quarter
      "frac12"   : 189,  // fraction one-half
      "frac34"   : 190,  // fraction three-quarters
      "iquest"   : 191,  // inverted question mark
      "Agrave"   : 192,  // capital A, grave accent
      "Aacute"   : 193,  // capital A, acute accent
      "Acirc"    : 194,  // capital A, circumflex accent
      "Atilde"   : 195,  // capital A, tilde
      "Auml"     : 196,  // capital A, dieresis or umlaut mark
      "Aring"    : 197,  // capital A, ring
      "AElig"    : 198,  // capital AE diphthong (ligature)
      "Ccedil"   : 199,  // capital C, cedilla
      "Egrave"   : 200,  // capital E, grave accent
      "Eacute"   : 201,  // capital E, acute accent
      "Ecirc"    : 202,  // capital E, circumflex accent
      "Euml"     : 203,  // capital E, dieresis or umlaut mark
      "Igrave"   : 204,  // capital I, grave accent
      "Iacute"   : 205,  // capital I, acute accent
      "Icirc"    : 206,  // capital I, circumflex accent
      "Iuml"     : 207,  // capital I, dieresis or umlaut mark
      "ETH"      : 208,  // capital Eth, Icelandic
      "Ntilde"   : 209,  // capital N, tilde
      "Ograve"   : 210,  // capital O, grave accent
      "Oacute"   : 211,  // capital O, acute accent
      "Ocirc"    : 212,  // capital O, circumflex accent
      "Otilde"   : 213,  // capital O, tilde
      "Ouml"     : 214,  // capital O, dieresis or umlaut mark
      "times"    : 215,  // multiply sign
      "Oslash"   : 216,  // capital O, slash
      "Ugrave"   : 217,  // capital U, grave accent
      "Uacute"   : 218,  // capital U, acute accent
      "Ucirc"    : 219,  // capital U, circumflex accent
      "Uuml"     : 220,  // capital U, dieresis or umlaut mark
      "Yacute"   : 221,  // capital Y, acute accent
      "THORN"    : 222,  // capital THORN, Icelandic
      "szlig"    : 223,  // small sharp s, German (sz ligature)
      "agrave"   : 224,  // small a, grave accent
      "aacute"   : 225,  // small a, acute accent
      "acirc"    : 226,  // small a, circumflex accent
      "atilde"   : 227,  // small a, tilde
      "auml"     : 228,  // small a, dieresis or umlaut mark
      "aring"    : 229,  // small a, ring
      "aelig"    : 230,  // small ae diphthong (ligature)
      "ccedil"   : 231,  // small c, cedilla
      "egrave"   : 232,  // small e, grave accent
      "eacute"   : 233,  // small e, acute accent
      "ecirc"    : 234,  // small e, circumflex accent
      "euml"     : 235,  // small e, dieresis or umlaut mark
      "igrave"   : 236,  // small i, grave accent
      "iacute"   : 237,  // small i, acute accent
      "icirc"    : 238,  // small i, circumflex accent
      "iuml"     : 239,  // small i, dieresis or umlaut mark
      "eth"      : 240,  // small eth, Icelandic
      "ntilde"   : 241,  // small n, tilde
      "ograve"   : 242,  // small o, grave accent
      "oacute"   : 243,  // small o, acute accent
      "ocirc"    : 244,  // small o, circumflex accent
      "otilde"   : 245,  // small o, tilde
      "ouml"     : 246,  // small o, dieresis or umlaut mark
      "divide"   : 247,  // divide sign
      "oslash"   : 248,  // small o, slash
      "ugrave"   : 249,  // small u, grave accent
      "uacute"   : 250,  // small u, acute accent
      "ucirc"    : 251,  // small u, circumflex accent
      "uuml"     : 252,  // small u, dieresis or umlaut mark
      "yacute"   : 253,  // small y, acute accent
      "thorn"    : 254,  // small thorn, Icelandic
      "yuml"     : 255,  // small y, dieresis or umlaut mark

      // Latin Extended-B
      "fnof"     : 402,  // latin small f with hook = function = florin, U+0192 ISOtech

      // Greek
      "Alpha"    : 913,  // greek capital letter alpha, U+0391
      "Beta"     : 914,  // greek capital letter beta, U+0392
      "Gamma"    : 915,  // greek capital letter gamma, U+0393 ISOgrk3
      "Delta"    : 916,  // greek capital letter delta, U+0394 ISOgrk3
      "Epsilon"  : 917,  // greek capital letter epsilon, U+0395
      "Zeta"     : 918,  // greek capital letter zeta, U+0396
      "Eta"      : 919,  // greek capital letter eta, U+0397
      "Theta"    : 920,  // greek capital letter theta, U+0398 ISOgrk3
      "Iota"     : 921,  // greek capital letter iota, U+0399
      "Kappa"    : 922,  // greek capital letter kappa, U+039A
      "Lambda"   : 923,  // greek capital letter lambda, U+039B ISOgrk3
      "Mu"       : 924,  // greek capital letter mu, U+039C
      "Nu"       : 925,  // greek capital letter nu, U+039D
      "Xi"       : 926,  // greek capital letter xi, U+039E ISOgrk3
      "Omicron"  : 927,  // greek capital letter omicron, U+039F
      "Pi"       : 928,  // greek capital letter pi, U+03A0 ISOgrk3
      "Rho"      : 929,  // greek capital letter rho, U+03A1

      // there is no Sigmaf, and no U+03A2 character either
      "Sigma"    : 931,  // greek capital letter sigma, U+03A3 ISOgrk3
      "Tau"      : 932,  // greek capital letter tau, U+03A4
      "Upsilon"  : 933,  // greek capital letter upsilon, U+03A5 ISOgrk3
      "Phi"      : 934,  // greek capital letter phi, U+03A6 ISOgrk3
      "Chi"      : 935,  // greek capital letter chi, U+03A7
      "Psi"      : 936,  // greek capital letter psi, U+03A8 ISOgrk3
      "Omega"    : 937,  // greek capital letter omega, U+03A9 ISOgrk3
      "alpha"    : 945,  // greek small letter alpha, U+03B1 ISOgrk3
      "beta"     : 946,  // greek small letter beta, U+03B2 ISOgrk3
      "gamma"    : 947,  // greek small letter gamma, U+03B3 ISOgrk3
      "delta"    : 948,  // greek small letter delta, U+03B4 ISOgrk3
      "epsilon"  : 949,  // greek small letter epsilon, U+03B5 ISOgrk3
      "zeta"     : 950,  // greek small letter zeta, U+03B6 ISOgrk3
      "eta"      : 951,  // greek small letter eta, U+03B7 ISOgrk3
      "theta"    : 952,  // greek small letter theta, U+03B8 ISOgrk3
      "iota"     : 953,  // greek small letter iota, U+03B9 ISOgrk3
      "kappa"    : 954,  // greek small letter kappa, U+03BA ISOgrk3
      "lambda"   : 955,  // greek small letter lambda, U+03BB ISOgrk3
      "mu"       : 956,  // greek small letter mu, U+03BC ISOgrk3
      "nu"       : 957,  // greek small letter nu, U+03BD ISOgrk3
      "xi"       : 958,  // greek small letter xi, U+03BE ISOgrk3
      "omicron"  : 959,  // greek small letter omicron, U+03BF NEW
      "pi"       : 960,  // greek small letter pi, U+03C0 ISOgrk3
      "rho"      : 961,  // greek small letter rho, U+03C1 ISOgrk3
      "sigmaf"   : 962,  // greek small letter final sigma, U+03C2 ISOgrk3
      "sigma"    : 963,  // greek small letter sigma, U+03C3 ISOgrk3
      "tau"      : 964,  // greek small letter tau, U+03C4 ISOgrk3
      "upsilon"  : 965,  // greek small letter upsilon, U+03C5 ISOgrk3
      "phi"      : 966,  // greek small letter phi, U+03C6 ISOgrk3
      "chi"      : 967,  // greek small letter chi, U+03C7 ISOgrk3
      "psi"      : 968,  // greek small letter psi, U+03C8 ISOgrk3
      "omega"    : 969,  // greek small letter omega, U+03C9 ISOgrk3
      "thetasym" : 977,  // greek small letter theta symbol, U+03D1 NEW
      "upsih"    : 978,  // greek upsilon with hook symbol, U+03D2 NEW
      "piv"      : 982,  // greek pi symbol, U+03D6 ISOgrk3

      // General Punctuation
      "bull"     : 8226, // bullet = black small circle, U+2022 ISOpub

      // bullet is NOT the same as bullet operator, U+2219
      "hellip"   : 8230, // horizontal ellipsis = three dot leader, U+2026 ISOpub
      "prime"    : 8242, // prime = minutes = feet, U+2032 ISOtech
      "Prime"    : 8243, // double prime = seconds = inches, U+2033 ISOtech
      "oline"    : 8254, // overline = spacing overscore, U+203E NEW
      "frasl"    : 8260, // fraction slash, U+2044 NEW

      // Letterlike Symbols
      "weierp"   : 8472, // script capital P = power set = Weierstrass p, U+2118 ISOamso
      "image"    : 8465, // blackletter capital I = imaginary part, U+2111 ISOamso
      "real"     : 8476, // blackletter capital R = real part symbol, U+211C ISOamso
      "trade"    : 8482, // trade mark sign, U+2122 ISOnum
      "alefsym"  : 8501, // alef symbol = first transfinite cardinal, U+2135 NEW

      // alef symbol is NOT the same as hebrew letter alef, U+05D0 although
      // the same glyph could be used to depict both characters

      // Arrows
      "larr"     : 8592, // leftwards arrow, U+2190 ISOnum
      "uarr"     : 8593, // upwards arrow, U+2191 ISOnum
      "rarr"     : 8594, // rightwards arrow, U+2192 ISOnum
      "darr"     : 8595, // downwards arrow, U+2193 ISOnum
      "harr"     : 8596, // left right arrow, U+2194 ISOamsa
      "crarr"    : 8629, // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
      "lArr"     : 8656, // leftwards double arrow, U+21D0 ISOtech

      // ISO 10646 does not say that lArr is the same as the 'is implied by'
      // arrow but also does not have any other character for that function.
      // So lArr can be used for 'is implied by' as ISOtech suggests
      "uArr"     : 8657, // upwards double arrow, U+21D1 ISOamsa
      "rArr"     : 8658, // rightwards double arrow, U+21D2 ISOtech

      // ISO 10646 does not say this is the 'implies' character but does not
      // have another character with this function so rArr can be used for
      // 'implies' as ISOtech suggests
      "dArr"     : 8659, // downwards double arrow, U+21D3 ISOamsa
      "hArr"     : 8660, // left right double arrow, U+21D4 ISOamsa

      // Mathematical Operators
      "forall"   : 8704, // for all, U+2200 ISOtech
      "part"     : 8706, // partial differential, U+2202 ISOtech
      "exist"    : 8707, // there exists, U+2203 ISOtech
      "empty"    : 8709, // empty set = null set = diameter, U+2205 ISOamso
      "nabla"    : 8711, // nabla = backward difference, U+2207 ISOtech
      "isin"     : 8712, // element of, U+2208 ISOtech
      "notin"    : 8713, // not an element of, U+2209 ISOtech
      "ni"       : 8715, // contains as member, U+220B ISOtech

      // should there be a more memorable name than 'ni'?
      "prod"     : 8719, // n-ary product = product sign, U+220F ISOamsb

      // prod is NOT the same character as U+03A0 'greek capital letter pi'
      // though the same glyph might be used for both
      "sum"      : 8721, // n-ary summation, U+2211 ISOamsb

      // sum is NOT the same character as U+03A3 'greek capital letter sigma'
      // though the same glyph might be used for both
      "minus"    : 8722, // minus sign, U+2212 ISOtech
      "lowast"   : 8727, // asterisk operator, U+2217 ISOtech
      "radic"    : 8730, // square root = radical sign, U+221A ISOtech
      "prop"     : 8733, // proportional to, U+221D ISOtech
      "infin"    : 8734, // infinity, U+221E ISOtech
      "ang"      : 8736, // angle, U+2220 ISOamso
      "and"      : 8743, // logical and = wedge, U+2227 ISOtech
      "or"       : 8744, // logical or = vee, U+2228 ISOtech
      "cap"      : 8745, // intersection = cap, U+2229 ISOtech
      "cup"      : 8746, // union = cup, U+222A ISOtech
      "int"      : 8747, // integral, U+222B ISOtech
      "there4"   : 8756, // therefore, U+2234 ISOtech
      "sim"      : 8764, // tilde operator = varies with = similar to, U+223C ISOtech

      // tilde operator is NOT the same character as the tilde, U+007E,
      // although the same glyph might be used to represent both
      "cong"     : 8773, // approximately equal to, U+2245 ISOtech
      "asymp"    : 8776, // almost equal to = asymptotic to, U+2248 ISOamsr
      "ne"       : 8800, // not equal to, U+2260 ISOtech
      "equiv"    : 8801, // identical to, U+2261 ISOtech
      "le"       : 8804, // less-than or equal to, U+2264 ISOtech
      "ge"       : 8805, // greater-than or equal to, U+2265 ISOtech
      "sub"      : 8834, // subset of, U+2282 ISOtech
      "sup"      : 8835, // superset of, U+2283 ISOtech

      // note that nsup, 'not a superset of, U+2283' is not covered by the
      // Symbol font encoding and is not included. Should it be, for
      // symmetry? It is in ISOamsn
      "nsub"     : 8836, // not a subset of, U+2284 ISOamsn
      "sube"     : 8838, // subset of or equal to, U+2286 ISOtech
      "supe"     : 8839, // superset of or equal to, U+2287 ISOtech
      "oplus"    : 8853, // circled plus = direct sum, U+2295 ISOamsb
      "otimes"   : 8855, // circled times = vector product, U+2297 ISOamsb
      "perp"     : 8869, // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
      "sdot"     : 8901, // dot operator, U+22C5 ISOamsb

      // dot operator is NOT the same character as U+00B7 middle dot

      // Miscellaneous Technical
      "lceil"    : 8968, // left ceiling = apl upstile, U+2308 ISOamsc
      "rceil"    : 8969, // right ceiling, U+2309 ISOamsc
      "lfloor"   : 8970, // left floor = apl downstile, U+230A ISOamsc
      "rfloor"   : 8971, // right floor, U+230B ISOamsc
      "lang"     : 9001, // left-pointing angle bracket = bra, U+2329 ISOtech

      // lang is NOT the same character as U+003C 'less than' or U+2039
      // 'single left-pointing angle quotation mark'
      "rang"     : 9002, // right-pointing angle bracket = ket, U+232A ISOtech

      // rang is NOT the same character as U+003E 'greater than' or U+203A
      // 'single right-pointing angle quotation mark'

      // Geometric Shapes
      "loz"      : 9674, // lozenge, U+25CA ISOpub

      // Miscellaneous Symbols
      "spades"   : 9824, // black spade suit, U+2660 ISOpub

      // black here seems to mean filled as opposed to hollow
      "clubs"    : 9827, // black club suit = shamrock, U+2663 ISOpub
      "hearts"   : 9829, // black heart suit = valentine, U+2665 ISOpub
      "diams"    : 9830, // black diamond suit, U+2666 ISOpub

      // Latin Extended-A
      "OElig"    : 338,  // latin capital ligature OE, U+0152 ISOlat2
      "oelig"    : 339,  // latin small ligature oe, U+0153 ISOlat2

      // ligature is a misnomer, this is a separate character in some languages
      "Scaron"   : 352,  // latin capital letter S with caron, U+0160 ISOlat2
      "scaron"   : 353,  // latin small letter s with caron, U+0161 ISOlat2
      "Yuml"     : 376,  // latin capital letter Y with diaeresis, U+0178 ISOlat2

      // Spacing Modifier Letters
      "circ"     : 710,  // modifier letter circumflex accent, U+02C6 ISOpub
      "tilde"    : 732,  // small tilde, U+02DC ISOdia

      // General Punctuation
      "ensp"     : 8194, // en space, U+2002 ISOpub
      "emsp"     : 8195, // em space, U+2003 ISOpub
      "thinsp"   : 8201, // thin space, U+2009 ISOpub
      "zwnj"     : 8204, // zero width non-joiner, U+200C NEW RFC 2070
      "zwj"      : 8205, // zero width joiner, U+200D NEW RFC 2070
      "lrm"      : 8206, // left-to-right mark, U+200E NEW RFC 2070
      "rlm"      : 8207, // right-to-left mark, U+200F NEW RFC 2070
      "ndash"    : 8211, // en dash, U+2013 ISOpub
      "mdash"    : 8212, // em dash, U+2014 ISOpub
      "lsquo"    : 8216, // left single quotation mark, U+2018 ISOnum
      "rsquo"    : 8217, // right single quotation mark, U+2019 ISOnum
      "sbquo"    : 8218, // single low-9 quotation mark, U+201A NEW
      "ldquo"    : 8220, // left double quotation mark, U+201C ISOnum
      "rdquo"    : 8221, // right double quotation mark, U+201D ISOnum
      "bdquo"    : 8222, // double low-9 quotation mark, U+201E NEW
      "dagger"   : 8224, // dagger, U+2020 ISOpub
      "Dagger"   : 8225, // double dagger, U+2021 ISOpub
      "permil"   : 8240, // per mille sign, U+2030 ISOtech
      "lsaquo"   : 8249, // single left-pointing angle quotation mark, U+2039 ISO proposed

      // lsaquo is proposed but not yet ISO standardized
      "rsaquo"   : 8250, // single right-pointing angle quotation mark, U+203A ISO proposed

      // rsaquo is proposed but not yet ISO standardized
      "euro"     : 8364  // euro sign, U+20AC NEW
    },


    /**
     * Escapes the characters in a <code>String</code> using HTML entities.
     *
     * For example: <tt>"bread" & "butter"</tt> =>
     * <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
     * Supports all known HTML 4.0 entities, including funky accents.
     *
     * The work is delegated to {@link qx.util.StringEscape#escape} using the
     * <code>FROM_CHARCODE</code> map that is built in the defer section.
     *
     * * <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
     * * <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
     * * <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
     * * <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
     *
     * @param str {String} the String to escape
     * @return {String} a new escaped String
     * @see #unescape
     */
    escape : function(str) {
      return qx.util.StringEscape.escape(str, qx.bom.String.FROM_CHARCODE);
    },


    /**
     * Unescapes a string containing entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes. Supports HTML 4.0 entities.
     *
     * For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
     * will become "&lt;Fran&ccedil;ais&gt;"
     *
     * If an entity is unrecognized, it is left alone, and inserted
     * verbatim into the result string. e.g. "&amp;gt;&amp;zzzz;x" will
     * become "&gt;&amp;zzzz;x".
     *
     * The work is delegated to {@link qx.util.StringEscape#unescape} using
     * the <code>TO_CHARCODE</code> map.
     *
     * @param str {String} the String to unescape, may be null
     * @return {String} a new unescaped String
     * @see #escape
     */
    unescape : function(str) {
      return qx.util.StringEscape.unescape(str, qx.bom.String.TO_CHARCODE);
    },


    /**
     * Converts a plain text string into HTML.
     * This is similar to {@link #escape} but converts new lines to
     * <tt>&lt;br&gt;</tt> and preserves whitespaces.
     *
     * Whitespace is preserved by rewriting each pair of consecutive spaces
     * as a normal space followed by <tt>&amp;nbsp;</tt>, so a run of spaces
     * keeps its width while single word gaps stay untouched and can still
     * wrap.
     *
     * @param str {String} the String to convert
     * @return {String} a new converted String
     * @see #escape
     */
    fromText : function(str)
    {
      // Only two kinds of match are possible: a pair of spaces or a newline.
      return qx.bom.String.escape(str).replace(/(  |\n)/g, function(chr) {
        return chr === "\n" ? "<br>" : " &nbsp;";
      });
    },


    /**
     * Converts HTML to plain text.
     *
     * * Strips all HTML tags
     * * converts <tt>&lt;br&gt;</tt> (in any letter case) to new line
     * * collapses every run of whitespace to a single space
     * * unescapes HTML entities
     *
     * @param str {String} HTML string to convert
     * @return {String} plain text representation of the HTML string
     */
    toText : function(str)
    {
      return qx.bom.String.unescape(str.replace(/\s+|<([^>])+>/gi, function(chr)
      {
        // Line break tags: <br>, <br/>, <BR />, ... but not e.g. <brand>.
        if (/^<br\b/i.test(chr)) {
          return "\n";
        }

        // A whitespace run (including source line breaks) becomes one space.
        if (/^\s+$/.test(chr)) {
          return " ";
        }

        // Every other tag is dropped.
        return "";
      }));
    }
  },



  /*
  *****************************************************************************
     DEFER
  *****************************************************************************
  */

  defer : function(statics)
  {
    /** Mapping of char codes to HTML entity names */
    statics.FROM_CHARCODE = qx.lang.Object.invert(statics.TO_CHARCODE);
  }
});