UNPKG

speedy-entities

Version:
183 lines (182 loc) 7.52 kB
/**
 * Creates an entity decoder that rewrites numeric and named entities found in input to their respective values.
 *
 * Numeric character references (`&#65;`, `&#x41;`) are always recognized. Named references are recognized only
 * when `options.entities` is provided.
 *
 * @param options The decoder options.
 * @param options.entities An object that maps an entity name to its replacement. A name may start with an optional
 * `&` and may otherwise contain only ASCII letters, ASCII digits and `;` (for example `amp`, `amp;` or `frac12;`).
 * Names that contain any other character are ignored.
 * @param options.isNumericReferenceSemicolonRequired If `true` then numeric references that are not terminated with
 * a semicolon are left as is. Defaults to `false`.
 * @returns A function that decodes entities in the string.
 */
export function createEntityDecoder(options = {}) {
  const { entities, isNumericReferenceSemicolonRequired = false } = options;

  let entityDereferenceMap;

  // The maximum number of chars that are scanned after "&" when looking for a named reference
  let maximumEntityReferenceLength = 32;

  if (entities !== undefined) {
    entityDereferenceMap = new Map();

    for (const entity in entities) {
      const hashCode = getEntityReferenceHashCode(entity);

      // Zero hash code denotes an empty or malformed entity name
      if (hashCode !== 0) {
        maximumEntityReferenceLength = Math.max(maximumEntityReferenceLength, entity.length);
        entityDereferenceMap.set(hashCode, entities[entity]);
      }
    }
  }

  return input => {
    let output = '';

    // The index in the input up to which the text was already appended to the output
    let textIndex = 0;

    // The index from which the next "&" is searched
    let charIndex = 0;

    const inputLength = input.length;

    while (charIndex < inputLength - 1) {
      let startIndex = input.indexOf('&', charIndex);

      if (startIndex === -1) {
        break;
      }

      // charIndex points to "&", startIndex points to the char that follows it
      charIndex = startIndex++;

      let value;
      let endIndex = startIndex;

      if (startIndex < inputLength - 1 && input.charCodeAt(startIndex) === /* # */ 35) {
        // Numeric character reference

        let codePoint = 0;

        if ((input.charCodeAt(++startIndex) | 32) === /* x */ 120) {
          endIndex = ++startIndex;

          // parseInt of a hexadecimal number
          while (endIndex < inputLength) {
            const digit = getHexDigitValue(input.charCodeAt(endIndex));

            if (digit === -1) {
              break;
            }
            // Stop accumulating after the range is exceeded, so an arbitrary long digit run cannot overflow
            if (codePoint <= MAX_CODE_POINT) {
              codePoint = codePoint * 16 + digit;
            }
            ++endIndex;
          }
        } else {
          endIndex = startIndex;

          // parseInt of a decimal number
          while (endIndex < inputLength) {
            const digit = input.charCodeAt(endIndex) - /* 0 */ 48;

            if (digit < 0 || digit > 9) {
              break;
            }
            if (codePoint <= MAX_CODE_POINT) {
              codePoint = codePoint * 10 + digit;
            }
            ++endIndex;
          }
        }

        if (endIndex !== startIndex) {
          // At least one digit must present
          const isTerminated = endIndex < inputLength && input.charCodeAt(endIndex) === /* ; */ 59;

          if (isTerminated || !isNumericReferenceSemicolonRequired) {
            value = decodeCodePoint(codePoint);
          }
          if (isTerminated) {
            ++endIndex;
          }
        }
      } else if (entityDereferenceMap !== undefined) {
        // Character entity reference

        let index = startIndex;
        let hashCode = 0;
        let charCode = 0;

        // Every prefix of the name is looked up, so the longest known name wins. The scan stops after ";"
        while (
          charCode !== /* ; */ 59 &&
          index < inputLength &&
          index - startIndex < maximumEntityReferenceLength
        ) {
          charCode = input.charCodeAt(index);

          if (!isEntityReferenceChar(charCode)) {
            break;
          }

          ++index;
          hashCode = (hashCode << 5) - hashCode + charCode;

          const referencedValue = entityDereferenceMap.get(hashCode);

          if (referencedValue !== undefined) {
            value = referencedValue;
            endIndex = index;
          }
        }
      }

      // Concat decoded entity and preceding substring
      if (value !== undefined) {
        output += textIndex === charIndex ? value : input.slice(textIndex, charIndex) + value;
        textIndex = endIndex;
      }

      charIndex = endIndex;
    }

    if (textIndex === 0) {
      // Nothing was decoded
      return input;
    }
    return textIndex === inputLength ? output : output + input.slice(textIndex);
  };
}

// The greatest Unicode code point
const MAX_CODE_POINT = 0x10ffff;

// Standing on the shoulders of giants
// https://github.com/mathiasbynens/he/blob/master/data/decode-map-overrides.json
const entityOverrides = {
  128: '\u20AC',
  130: '\u201A',
  131: '\u0192',
  132: '\u201E',
  133: '\u2026',
  134: '\u2020',
  135: '\u2021',
  136: '\u02C6',
  137: '\u2030',
  138: '\u0160',
  139: '\u2039',
  140: '\u0152',
  142: '\u017D',
  145: '\u2018',
  146: '\u2019',
  147: '\u201C',
  148: '\u201D',
  149: '\u2022',
  150: '\u2013',
  151: '\u2014',
  152: '\u02DC',
  153: '\u2122',
  154: '\u0161',
  155: '\u203A',
  156: '\u0153',
  158: '\u017E',
  159: '\u0178',
};

/**
 * Converts a code point of a numeric character reference to a string.
 *
 * https://github.com/mathiasbynens/he/blob/master/src/he.js#L106-L134
 *
 * @param codePoint The non-negative integer parsed from the reference.
 * @returns The decoded string, or U+FFFD if the code point is zero, a surrogate, or greater than U+10FFFF.
 */
function decodeCodePoint(codePoint) {
  if (codePoint === 0 || (codePoint >= 0xd800 && codePoint <= 0xdfff) || codePoint > MAX_CODE_POINT) {
    // Null char code, or character reference is outside the permissible Unicode range
    return '\uFFFD';
  }

  if (codePoint >= 128 && codePoint <= 159) {
    // Overridden char code, code points that are absent in the table are decoded as is
    const override = entityOverrides[codePoint];

    if (override !== undefined) {
      return override;
    }
  }

  if (codePoint > 0xffff) {
    // Surrogate pair
    codePoint -= 0x10000;

    return String.fromCharCode(((codePoint >>> 10) & 0x3ff) | 0xd800, 0xdc00 | (codePoint & 0x3ff));
  }

  // Char code
  return String.fromCharCode(codePoint);
}

/**
 * Returns the value of a hexadecimal digit.
 *
 * @param charCode The char code to convert.
 * @returns An integer in range [0, 15], or -1 if the char code is not one of 0-9, a-f or A-F.
 */
function getHexDigitValue(charCode) {
  if (charCode >= /* 0 */ 48 && charCode <= /* 9 */ 57) {
    return charCode - 48;
  }

  // Convert alpha to lower case
  const lowerCharCode = charCode | 32;

  if (lowerCharCode >= /* a */ 97 && lowerCharCode <= /* f */ 102) {
    return lowerCharCode - 87;
  }
  return -1;
}

/**
 * Computes the hash code of an entity name. The leading "&" is ignored if it is present.
 *
 * @param entity The entity name.
 * @returns The hash code, or 0 if the name is empty or contains a char that cannot be a part of an entity reference.
 */
function getEntityReferenceHashCode(entity) {
  if (entity.length === 0) {
    return 0;
  }

  let hashCode = 0;

  for (let index = entity.charCodeAt(0) === /* & */ 38 ? 1 : 0; index < entity.length; ++index) {
    const charCode = entity.charCodeAt(index);

    if (!isEntityReferenceChar(charCode)) {
      return 0;
    }
    // Must be the same formula that the decoder uses when it scans the input
    hashCode = (hashCode << 5) - hashCode + charCode;
  }
  return hashCode;
}

/**
 * Checks that a char can be a part of a named entity reference: an ASCII letter, an ASCII digit or a semicolon.
 *
 * @param charCode The char code to check.
 * @returns `true` if the char can be a part of a named entity reference.
 */
function isEntityReferenceChar(charCode) {
  return (
    (charCode >= /* a */ 97 && charCode <= /* z */ 122) ||
    (charCode >= /* A */ 65 && charCode <= /* Z */ 90) ||
    (charCode >= /* 0 */ 48 && charCode <= /* 9 */ 57) ||
    charCode === /* ; */ 59
  );
}