special-entities
Version:
A small JavaScript library for normalizing the encoding of HTML entities.
156 lines (127 loc) • 3.49 kB
JavaScript
// the raw mapping of special names to unicode values
var by_name =
exports.by_name = require("../data/entities.json");
// index the reverse mappings immediately
var by_code = exports.by_code = {};
Object.keys(by_name).forEach(function(n) {
by_name[n].forEach(function(c) {
if (by_code[c] == null) by_code[c] = [];
by_code[c].push(n);
});
});
// Regular expressions for recognising entities. All are case-insensitive and
// the trailing semicolon is optional in every entity pattern.

// bare hex value such as "x26" or "0x26"; group 1 holds the hex digits
var hex_value_regex = /^0?x([a-f0-9]+)$/i;

// any entity (hex, decimal or named) anywhere inside a larger string;
// global so that String#replace visits every occurrence
var entity_regex = /&(?:#x[a-f0-9]+|#[0-9]+|[a-z0-9]+);?/ig;

// a whole string that is one decimal entity; group 1 holds the digits
var dec_entity_regex = /^&#([0-9]+);?$/i;

// a whole string that is one hex entity; group 1 holds the hex digits
var hex_entity_regex = /^&#x([a-f0-9]+);?$/i;

// a whole string that is one named entity; group 1 holds the name
var spec_entity_regex = /^&([a-z0-9]+);?$/i;
// converts all entities found in string to specific format
var normalizeEntities =
exports.normalizeEntities = function(str, format) {
return str.replace(entity_regex, function(entity) {
return convert(entity, format) || entity;
});
}
// converts all entities and higher order utf-8 chars to format
exports.normalizeXML = function(str, format) {
var i, code, res;
// convert entities first
str = normalizeEntities(str, format);
// find all characters above ASCII and replace, backwards
for (i = str.length - 1; i >= 0; i--) {
code = str.charCodeAt(i);
if (code > 127) {
res = convert(str[i], format);
str = str.substr(0, i) + (res != null ? res : str[i]) + str.substr(i + 1);
}
}
return str;
}
// converts single value into utf-8 character or entity equivalent
var convert =
exports.convert = function(s, format) {
var code, name;
if (format == null) format = "html";
code = toCharCode(s);
if (code === false) return null;
switch(format) {
// only decimal entities
case "xml":
case "xhtml":
case "num":
case "numeric":
return toEntity(code);
// only hex entities
case "hex":
return toEntity(code, true);
// first special, then decimal
case "html":
return toEntity((name = cton(code)) != null ? name : code);
// only special entities
case "name":
case "special":
return (name = cton(code)) != null ? toEntity(name) : null;
// utf-8 character
case "char":
case "character":
case "utf-8":
case "UTF-8":
return String.fromCharCode(code);
// regular number
case "code":
return code;
}
return null;
}
// code to name
var cton =
exports.codeToName = function(c) {
var n = by_code[c.toString()];
return n != null ? n[0] : null;
}
// name to code
var ntoc =
exports.nameToCode = function(n) {
var c = by_name[n]
return c != null ? c[0] : null;
}
// parse an array of inputs and returns the equivalent
// unicode decimal or false if invalid
var toCharCode =
exports.toCharCode = function(s) {
var m, code;
if (typeof s === "string") {
if (s === "") return false;
// regular char
if (s.length === 1) return s.charCodeAt(0);
// special entity
if (m = spec_entity_regex.exec(s)) {
return ntoc(m[1]) || false;
}
// decimal entity
if (m = dec_entity_regex.exec(s)) {
return parseInt(m[1], 10);
}
// hex entity
if (m = hex_entity_regex.exec(s)) {
return parseInt(m[1], 16);
}
// hex value
if (m = hex_value_regex.exec(s)) {
return parseInt(m[1], 16);
}
// otherwise look it up as a special entity name
return ntoc(s) || false;
}
if (typeof s === "number") return s;
return false;
}
// converts string or number to valid entity
var toEntity =
exports.toEntity = function(n, hex) {
return "&" + (
typeof n === "number" ?
"#" + (hex ? "x" : "") +
n.toString(hex ? 16 : 10).toUpperCase() : n
) + ";";
}