UNPKG

fortissimo-html

Version:

Fortissimo HTML - Flexible, Forgiving, Formatting HTML Parser

327 lines 14.3 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers (left exactly as generated).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.isValidEntityCodepoint = exports.columnWidth = exports.resolveEntity = exports.isValidEntity = exports.isKnownNamedEntity = exports.separateEntities = exports.reencodeEntities = exports.unescapeEntities = exports.escapeToEntities = exports.minimalEscape = exports.isAttributeNameChar = exports.isAllPCENChar = exports.isPCENChar = exports.isMarkupStart = exports.replaceIsolatedSurrogates = exports.isInvalidCharacter = exports.compactNewlines = exports.compactWhitespace = exports.trimRight = exports.trimLeft = exports.trim = exports.isEol = exports.isOtherWhitespace = exports.isWhitespace = exports.TargetEncoding = exports.ReencodeOptions = exports.EntityStyle = void 0;
const entitiesAsJson = __importStar(require("./entities.json"));

/**
 * How escapeToEntities() writes a character that is emitted as an entity.
 *
 * DECIMAL / HEX            - always numeric, `&#n;` or `&#xN;`.
 * NUMERIC_SHORTEST         - decimal for code points <= 9999, hex above that.
 * NAMED_OR_DECIMAL / _HEX  - a named entity when one is known, otherwise numeric.
 * NAMED_OR_SHORTEST        - a named entity when one is known, otherwise the shorter numeric form.
 * SHORTEST                 - whichever of named/numeric is shorter (named wins ties).
 */
var EntityStyle;
(function (EntityStyle) {
    EntityStyle[EntityStyle["DECIMAL"] = 0] = "DECIMAL";
    EntityStyle[EntityStyle["HEX"] = 1] = "HEX";
    EntityStyle[EntityStyle["NUMERIC_SHORTEST"] = 2] = "NUMERIC_SHORTEST";
    EntityStyle[EntityStyle["NAMED_OR_DECIMAL"] = 3] = "NAMED_OR_DECIMAL";
    EntityStyle[EntityStyle["NAMED_OR_HEX"] = 4] = "NAMED_OR_HEX";
    EntityStyle[EntityStyle["NAMED_OR_SHORTEST"] = 5] = "NAMED_OR_SHORTEST";
    EntityStyle[EntityStyle["SHORTEST"] = 6] = "SHORTEST";
})(EntityStyle = exports.EntityStyle || (exports.EntityStyle = {}));
const ES = EntityStyle;

/**
 * How aggressively text is (re-)escaped. As used in this module:
 *
 * REPAIR_ONLY     - reencodeEntities() only adds a missing `;` to valid entities.
 * LOOSE_MINIMAL   - `<` is escaped only before something that could start markup (or at end of
 *                   input); `&` only before a letter, digit or `#`.
 * MINIMAL         - `<`, `>` and `&` are always escaped.
 * NAMED_ENTITIES  - additionally, any character with a known named entity is written as that entity.
 *
 * DONT_CHANGE is not given special treatment by the functions in this module.
 */
var ReencodeOptions;
(function (ReencodeOptions) {
    ReencodeOptions[ReencodeOptions["DONT_CHANGE"] = 0] = "DONT_CHANGE";
    ReencodeOptions[ReencodeOptions["REPAIR_ONLY"] = 1] = "REPAIR_ONLY";
    ReencodeOptions[ReencodeOptions["LOOSE_MINIMAL"] = 2] = "LOOSE_MINIMAL";
    ReencodeOptions[ReencodeOptions["MINIMAL"] = 3] = "MINIMAL";
    ReencodeOptions[ReencodeOptions["NAMED_ENTITIES"] = 4] = "NAMED_ENTITIES";
})(ReencodeOptions = exports.ReencodeOptions || (exports.ReencodeOptions = {}));
const RO = ReencodeOptions;

/**
 * Highest code point that may be written literally: 0x7E for SEVEN_BIT, 0xFF for EIGHT_BIT,
 * 0x10FFFF for UNICODE. Anything above the limit is always written as an entity.
 */
var TargetEncoding;
(function (TargetEncoding) {
    TargetEncoding[TargetEncoding["SEVEN_BIT"] = 0] = "SEVEN_BIT";
    TargetEncoding[TargetEncoding["EIGHT_BIT"] = 1] = "EIGHT_BIT";
    TargetEncoding[TargetEncoding["UNICODE"] = 2] = "UNICODE";
})(TargetEncoding = exports.TargetEncoding || (exports.TargetEncoding = {}));
const TE = TargetEncoding;

// Defaults merged underneath caller-supplied options in escapeToEntities().
const DEFAULT_ESCAPE_OPTIONS = {
    entityStyle: ES.SHORTEST,
    reencode: RO.MINIMAL,
    target: TE.UNICODE
};

// Entity table: keys are entity names (no `&` or `;`), values are the replacement text.
let entities = entitiesAsJson;
if (entities.default)
    entities = entities.default;

// Reverse lookups: single code point -> preferred `&name;`, and two-character sequence -> `&name;`.
const codePointToEntity = {};
const pairsToEntity = {};
Object.keys(entities).forEach(entity => {
    const value = entities[entity];
    const cp = value.codePointAt(0);
    if (cp < 0x10000 && value.length === 1 || cp >= 0x10000 && value.length === 2) {
        const oldValue = codePointToEntity[cp];
        const newValue = '&' + entity + ';';
        // Prefer the shorter name; otherwise prefer a name starting with a lowercase letter.
        if (!oldValue || newValue.length < oldValue.length || oldValue.charAt(1) < 'a' && newValue.charAt(1) >= 'a')
            codePointToEntity[cp] = newValue;
    }
    else if (value.length === 2)
        pairsToEntity[value] = '&' + entity + ';';
});

/**
 * True when `name` is a key of the entity table itself. Inherited Object.prototype members
 * ('constructor', 'toString', ...) must never be mistaken for entity names.
 */
function hasNamedEntity(name) {
    return Object.prototype.hasOwnProperty.call(entities, name);
}

/** True for the five whitespace characters tested here: tab, LF, FF, CR and space. */
function isWhitespace(ch) {
    return ch === '\t' || ch === '\n' || ch === '\f' || ch === '\r' || ch === ' ';
}
exports.isWhitespace = isWhitespace;

/** True when `ch` contains U+00A0, U+2000-U+200A, U+202F, U+205F or U+3000. */
function isOtherWhitespace(ch) {
    return /\xA0|[\u2000-\u200A]|\u202F|\u205F|\u3000/.test(ch);
}
exports.isOtherWhitespace = isOtherWhitespace;

/** True when `ch` is exactly '\n', '\r' or '\r\n'. */
function isEol(ch) {
    return ch === '\n' || ch === '\r' || ch === '\r\n';
}
exports.isEol = isEol;

// The following trim functions differ from the standard string functions in that they only operate on HTML whitespace

/**
 * Strips leading and trailing HTML whitespace. A falsy `s` yields ''.
 * @param {string} s
 * @param {boolean} [skipNewlines=false] - when true, LF and CR are not stripped.
 * @returns {string}
 */
function trim(s, skipNewlines = false) {
    if (skipNewlines)
        return (s || '').replace(/(?:^[ \t\f]+)|(?:[ \t\f]+$)/g, '');
    else
        return (s || '').replace(/(?:^[ \t\n\f\r]+)|(?:[ \t\n\f\r]+$)/g, '');
}
exports.trim = trim;

/** Like trim(), but strips only leading whitespace. */
function trimLeft(s, skipNewlines = false) {
    if (skipNewlines)
        return (s || '').replace(/^[ \t\f]+/, '');
    else
        return (s || '').replace(/^[ \t\n\f\r]+/, '');
}
exports.trimLeft = trimLeft;

/** Like trim(), but strips only trailing whitespace. */
function trimRight(s, skipNewlines = false) {
    if (skipNewlines)
        return (s || '').replace(/[ \t\f]+$/, '');
    else
        return (s || '').replace(/[ \t\n\f\r]+$/, '');
}
exports.trimRight = trimRight;

/**
 * Replaces every run of HTML whitespace with a single space. A falsy `s` yields ''.
 * @param {string} s
 * @param {boolean} [skipNewlines=false] - when true, LF and CR are left untouched.
 * @returns {string}
 */
function compactWhitespace(s, skipNewlines = false) {
    if (skipNewlines)
        return (s || '').replace(/[ \t\f]+/g, ' ');
    else
        return (s || '').replace(/[ \t\n\f\r]+/g, ' ');
}
exports.compactWhitespace = compactWhitespace;

/**
 * Collapses any run of more than `maxInARow` line breaks down to exactly `maxInARow` line breaks.
 * The replacement line break is '\r\n' if `s` contains one, else '\r' if `s` contains one,
 * else '\n'.
 * @param {string} s
 * @param {number} [maxInARow=1]
 * @returns {string}
 */
function compactNewlines(s, maxInARow = 1) {
    s = s || '';
    // Pick the line-break style first, THEN repeat it, so CRLF input honors maxInARow too.
    const eol = s.includes('\r\n') ? '\r\n' : (s.includes('\r') ? '\r' : '\n');
    const replacement = eol.repeat(maxInARow);
    const regex = new RegExp(`(\r\n|\r|\n){${maxInARow + 1},}`, 'g');
    return s.replace(regex, replacement);
}
exports.compactNewlines = compactNewlines;

/** True when `ch` contains U+0000-U+0008, U+000B, U+000E-U+001F or U+007F-U+009F. */
function isInvalidCharacter(ch) {
    return /[\x00-\x08\x0B\x0E-\x1F\x7F-\x9F]/.test(ch);
}
exports.isInvalidCharacter = isInvalidCharacter;

/**
 * Replaces unpaired surrogates: a lone high surrogate becomes '\x02', a lone low surrogate
 * becomes '\x03'. Properly paired surrogates are left alone. A falsy `s` is returned as is.
 * @param {string} s
 * @returns {string}
 */
function replaceIsolatedSurrogates(s) {
    // Valid pairs are matched (and consumed) first so that only genuinely isolated surrogates
    // reach the second alternative - including one at the very start of the string or one
    // directly following another isolated surrogate.
    return s && s.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDFFF]/g,
        match => match.length === 2 ? match : (match <= '\uDBFF' ? '\x02' : '\x03'));
}
exports.replaceIsolatedSurrogates = replaceIsolatedSurrogates;

// This combines two tests, whether a character is a valid first character of a standard HTML element
// or custom HTML element, or if it's anything else that starts markup (/ ! ?) when it follows <.
function isMarkupStart(ch) {
    return ch !== undefined && /[a-z:/!?]/i.test(ch);
}
exports.isMarkupStart = isMarkupStart;

const PCENCharRanges = new RegExp('[\xB7\xC0-\xD6\xD8-\xF6\xF8-\u037D\u037F-\u1FFF\u200C-\u200D\u203F-\u2040\u2070-\u218F' +
    '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]' // U+10000 - U+EFFFF tested separately
);

// PCEN: Potential Custom Element Name
/**
 * Tests whether `ch` (one character, or one surrogate pair) may appear in an element name.
 * @param {string} ch
 * @param {boolean} [loose=false] - when true, anything except HTML whitespace, `/` and `>` passes.
 * @returns {boolean}
 */
function isPCENChar(ch, loose = false) {
    if (loose)
        return /[^ \n\r\t\f\/>]/.test(ch);
    else if (ch <= 'z')
        return /[-._0-9a-z]/i.test(ch);
    else if (ch.length === 1)
        return PCENCharRanges.test(ch);
    // Longer than one code unit: treat as a surrogate pair and range-check its code point.
    const cp = ch.codePointAt(0);
    return 0x10000 <= cp && cp <= 0xEFFFF;
}
exports.isPCENChar = isPCENChar;

/** True when every character of `s` (surrogate pairs taken as one) passes isPCENChar(). */
function isAllPCENChar(s, loose = false) {
    for (let i = 0; i < s.length; ++i) {
        let ch = s.charAt(i);
        if (s.codePointAt(i) > 0xFFFF)
            ch += s.charAt(++i);
        if (!isPCENChar(ch, loose))
            return false;
    }
    return true;
}
exports.isAllPCENChar = isAllPCENChar;

/**
 * Tests whether `ch` may appear in an attribute name.
 * @param {string} ch
 * @param {boolean} [loose=false] - when true, anything except HTML whitespace, `>`, `/` and `=` passes.
 * @returns {boolean} In strict mode: above space, not one of `"`, backtick, `>`, `/`, `=`, and
 *   not in the control range U+007F-U+009F.
 */
function isAttributeNameChar(ch, loose = false) {
    if (loose)
        return /[^ \n\r\t\f>/=]/.test(ch);
    else
        // Compare against the actual characters U+007F / U+00A0, not the text "0x7F" / "0xA0".
        return ch > ' ' && !/["`>/=]/.test(ch) && (ch < '\x7F' || ch >= '\xA0');
}
exports.isAttributeNameChar = isAttributeNameChar;

const basicEntities = { '<': '&lt;', '>': '&gt;', '&': '&amp;' };

/** Escapes only `<`, `>` and `&` as `&lt;`, `&gt;` and `&amp;`. */
function minimalEscape(s) {
    return s.replace(/[<>&]/g, match => basicEntities[match]);
}
exports.minimalEscape = minimalEscape;

/**
 * Escapes characters of `s` as HTML entities.
 *
 * A character always becomes an entity when it is a control character below U+0020 other than
 * HTML whitespace, lies in U+007F-U+009F, or exceeds the limit of `options.target`. Whether
 * `<`, `>` and `&` are escaped depends on `options.reencode` (see ReencodeOptions), and the
 * form of each entity on `options.entityStyle` (see EntityStyle).
 *
 * @param {string} s
 * @param {{entityStyle?: number, reencode?: number, target?: number}} [options] - merged over
 *   the defaults { entityStyle: SHORTEST, reencode: MINIMAL, target: UNICODE }.
 * @returns {string}
 */
function escapeToEntities(s, options) {
    options = Object.assign(Object.assign({}, DEFAULT_ESCAPE_OPTIONS), options || {});
    const sb = [];
    const style = options.entityStyle;
    const highest = (options.target === TE.SEVEN_BIT ? 0x7E : options.target === TE.EIGHT_BIT ? 0xFF : 0x10FFFF);
    for (let i = 0; i < s.length; ++i) {
        let ch = s.charAt(i);
        const cp = s.codePointAt(i);
        let pairMatch;
        let named;
        let numeric;
        if (cp > 0xFFFF) {
            // Astral character: take both code units and step past the low surrogate.
            ch = s.substr(i, 2);
            ++i;
        }
        const nextCh = s.charAt(i + 1) || '';
        const entityNeeded = (cp < 32 && !isWhitespace(ch) || 0x7F <= cp && cp <= 0x9F || cp > highest ||
            options.reencode >= RO.MINIMAL && /[<>&]/.test(ch) ||
            options.reencode === RO.LOOSE_MINIMAL && (ch === '<' && (!nextCh || isMarkupStart(nextCh)) ||
                ch === '&' && nextCh && /[a-z0-9#]/i.test(nextCh)));
        // First try entities that stand for a two-character sequence starting here...
        if ((entityNeeded || options.reencode === RO.NAMED_ENTITIES) && cp <= 0xFFFF && nextCh && style >= ES.NAMED_OR_DECIMAL)
            named = pairMatch = pairsToEntity[s.substr(i, 2)];
        // ...then entities for the single code point.
        if (!named && style >= ES.NAMED_OR_DECIMAL && (entityNeeded || options.reencode === RO.NAMED_ENTITIES))
            named = codePointToEntity[cp];
        if (!entityNeeded && named) {
            // Only reachable in NAMED_ENTITIES mode: optional entity, always use the name.
            sb.push(named);
            if (pairMatch)
                ++i; // the entity also covered the following character
            continue;
        }
        if ((entityNeeded || (options.reencode === RO.NAMED_ENTITIES && cp >= highest)) && !named && style >= ES.NAMED_OR_DECIMAL)
            named = codePointToEntity[cp];
        if (entityNeeded && (!named || style >= ES.NAMED_OR_SHORTEST)) {
            if (style === ES.DECIMAL || style === ES.NAMED_OR_DECIMAL ||
                (style === ES.NUMERIC_SHORTEST || (!named && style === ES.NAMED_OR_SHORTEST) || style === ES.SHORTEST) && cp <= 9999)
                numeric = '&#' + cp + ';';
            else if (style === ES.HEX || style === ES.NAMED_OR_HEX ||
                (style === ES.NUMERIC_SHORTEST || (!named && style === ES.NAMED_OR_SHORTEST) || style === ES.SHORTEST) && cp > 9999)
                numeric = '&#x' + cp.toString(16).toUpperCase() + ';';
        }
        // Named form wins when it is the only candidate or no longer than the numeric form.
        if (!numeric && named || numeric && named && named.length <= numeric.length) {
            sb.push(named);
            if (pairMatch)
                ++i;
        }
        else if (numeric)
            sb.push(numeric);
        else
            sb.push(ch);
    }
    return sb.join('');
}
exports.escapeToEntities = escapeToEntities;

/**
 * Replaces the entities in `s` with the characters they stand for.
 * @param {string} s
 * @param {boolean} [forAttributeValue=false] - when true, entities lacking the closing `;`
 *   are left as literal text.
 * @returns {string}
 */
function unescapeEntities(s, forAttributeValue = false) {
    const sb = [];
    // separateEntities() alternates plain text (even indexes) with entities (odd indexes).
    separateEntities(s).forEach((value, index) => {
        if (index % 2 === 0 || forAttributeValue && !value.endsWith(';'))
            sb.push(value);
        else
            sb.push(resolveEntity(value));
    });
    return sb.join('');
}
exports.unescapeEntities = unescapeEntities;

/**
 * Re-escapes `s`: plain text runs go through escapeToEntities(); valid entities get a missing
 * `;` added and, unless `options.reencode` is REPAIR_ONLY, are resolved and written again
 * according to `options`. Invalid entities are passed through unchanged.
 *
 * @param {string} s
 * @param {object} [options] - escapeToEntities() options, plus `undoUnneededEntities`: when
 *   truthy, an entity other than amp/lt/gt/quot/apos whose resolved text is above space, is not
 *   "other" whitespace, and fits `options.target` is replaced by that text itself.
 * @param {boolean} [forAttributeValue=false] - when true, entities lacking the closing `;`
 *   are treated as plain text.
 * @returns {string}
 */
function reencodeEntities(s, options, forAttributeValue = false) {
    // Tolerate omitted options, as escapeToEntities() does.
    options = options || {};
    const sb = [];
    separateEntities(s).forEach((value, index) => {
        if (index % 2 === 0 || (forAttributeValue && !value.endsWith(';')))
            sb.push(escapeToEntities(value, options));
        else {
            const valid = isValidEntity(value);
            if (valid && !value.endsWith(';'))
                value += ';';
            if (options.reencode !== RO.REPAIR_ONLY && valid) {
                const chars = resolveEntity(value);
                // The encoding-range checks must look at the resolved characters; the entity
                // text itself is always plain ASCII and would pass any of them.
                if (options.undoUnneededEntities && !/&(amp|lt|gt|quot|apos);/.test(value) && chars > ' ' && !isOtherWhitespace(chars) &&
                    (options.target === TargetEncoding.UNICODE ||
                        (options.target === TargetEncoding.EIGHT_BIT && /^[\x00-\xFF]+$/.test(chars)) ||
                        (options.target === TargetEncoding.SEVEN_BIT && /^[\x00-\x7E]+$/.test(chars))))
                    value = chars;
                else
                    value = escapeToEntities(chars, options);
            }
            sb.push(value);
        }
    });
    return sb.join('');
}
exports.reencodeEntities = reencodeEntities;

/**
 * Splits `s` into an array that alternates plain text (even indexes) with entity-like tokens
 * (odd indexes; the closing `;` may be absent). A falsy `s` yields `[s]`.
 * @param {string} s
 * @returns {string[]}
 */
function separateEntities(s) {
    return s ? s.split(/(&(?:amp(?:;?)|#\d+(?:;|\b|(?=\D))|#x[0-9a-f]+(?:;|\b|(?=[^0-9a-f]))|[0-9a-z]+(?:;|\b|(?=[^0-9a-z]))))/i) : [s];
}
exports.separateEntities = separateEntities;

/** True when `entity` (with or without the surrounding `&` and `;`) is a name in the entity table. */
function isKnownNamedEntity(entity) {
    if (entity.startsWith('&'))
        entity = entity.substr(1);
    if (entity.endsWith(';'))
        entity = entity.substr(0, entity.length - 1);
    return hasNamedEntity(entity);
}
exports.isKnownNamedEntity = isKnownNamedEntity;

/**
 * True when `entity` (with or without the surrounding `&` and `;`) is a known named entity, or
 * a decimal/hex numeric reference whose code point passes isValidEntityCodepoint().
 */
function isValidEntity(entity) {
    if (entity.startsWith('&'))
        entity = entity.substr(1);
    if (entity.endsWith(';'))
        entity = entity.substr(0, entity.length - 1);
    let cp;
    if (entity.toLowerCase().startsWith('#x'))
        return !isNaN(cp = parseInt(entity.substr(2), 16)) && isValidEntityCodepoint(cp);
    if (entity.toLowerCase().startsWith('#'))
        return !isNaN(cp = parseInt(entity.substr(1), 10)) && isValidEntityCodepoint(cp);
    return hasNamedEntity(entity);
}
exports.isValidEntity = isValidEntity;

/**
 * Resolves one entity to the text it stands for.
 *
 * Numeric references that cannot be parsed, are negative, exceed U+10FFFF or name a surrogate
 * resolve to '�'. An unknown named entity resolves to '�', except that an entity written with
 * a leading `&` but no closing `;` is returned unchanged.
 *
 * @param {string} entity - with or without the surrounding `&` and `;`.
 * @returns {string}
 */
function resolveEntity(entity) {
    const original = entity;
    let ambiguous = false;
    if (entity.endsWith(';'))
        entity = entity.substr(0, entity.length - 1);
    else
        ambiguous = true;
    if (entity.startsWith('&'))
        entity = entity.substr(1);
    else
        ambiguous = false;
    if (entity.startsWith('#')) {
        let cp;
        entity = entity.substr(1);
        if (entity.startsWith('x') || entity.startsWith('X'))
            cp = parseInt(entity.substr(1), 16);
        else
            cp = parseInt(entity, 10);
        // cp < 0 is rejected here because String.fromCodePoint() would throw a RangeError.
        if (isNaN(cp) || cp < 0 || cp > 0x10FFFF || (0xD800 <= cp && cp <= 0xDFFF))
            return '�';
        else
            return String.fromCodePoint(cp);
    }
    // Own-property lookup only, so that e.g. "&constructor;" cannot resolve to a function.
    const named = hasNamedEntity(entity) ? entities[entity] : undefined;
    return named || (ambiguous ? original : '�');
}
exports.resolveEntity = resolveEntity;

/**
 * Length of `s` in UTF-16 code units, minus one for every combining mark in the tested ranges
 * and one for every surrogate pair. A falsy `s` yields 0.
 */
function columnWidth(s) {
    return s ? s.length - (s.match(/[\u0300-\u036F\u1AB0-\u1AFF\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]|[\uD800-\uDBFF][\uDC00-\uDFFF]/g) || []).length : 0;
}
exports.columnWidth = columnWidth;

/**
 * True when `cp` is acceptable in a numeric character reference: 1..0x10FFFF, excluding
 * U+000D, U+0080-U+009F and the surrogate range U+D800-U+DFFF.
 */
function isValidEntityCodepoint(cp) {
    return cp > 0 && cp <= 0x10FFFF && cp !== 0x0D && (cp < 0x80 || cp > 0x9F) && (cp < 0xD800 || cp > 0xDFFF);
}
exports.isValidEntityCodepoint = isValidEntityCodepoint;
//# sourceMappingURL=characters.js.map