// fortissimo-html
// Version: (not specified)
// Fortissimo HTML - Flexible, Forgiving, Formatting HTML Parser
// 327 lines • 14.3 kB
// JavaScript
;
// Module-interop helpers. Each helper reuses an implementation already present on `this`
// (if any) and otherwise falls back to the definition given here.

// __createBinding(o, m, k, k2): exposes property `k` of `m` on `o` under the name `k2`
// (`k2` defaults to `k`). When Object.create is available the property is an enumerable
// getter that reads `m[k]` on each access; otherwise the current value is copied once.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// __setModuleDefault(o, v): stores `v` as the "default" property of `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// __importStar(mod): returns `mod` unchanged when it is flagged `__esModule`; otherwise builds a
// fresh object that re-exposes every own property of `mod` except "default", and sets `mod`
// itself as that object's "default".
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` skips both null and undefined; only own, non-"default" keys are bound.
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.isValidEntityCodepoint = exports.columnWidth = exports.resolveEntity = exports.isValidEntity = exports.isKnownNamedEntity = exports.separateEntities = exports.reencodeEntities = exports.unescapeEntities = exports.escapeToEntities = exports.minimalEscape = exports.isAttributeNameChar = exports.isAllPCENChar = exports.isPCENChar = exports.isMarkupStart = exports.replaceIsolatedSurrogates = exports.isInvalidCharacter = exports.compactNewlines = exports.compactWhitespace = exports.trimRight = exports.trimLeft = exports.trim = exports.isEol = exports.isOtherWhitespace = exports.isWhitespace = exports.TargetEncoding = exports.ReencodeOptions = exports.EntityStyle = void 0;
const entitiesAsJson = __importStar(require("./entities.json"));
/**
 * Two-way enum of entity output styles (name -> ordinal and ordinal -> name).
 * The ordinals matter: escapeToEntities compares them numerically
 * (e.g. `style >= ES.NAMED_OR_DECIMAL`), so the order of the list must not change.
 */
var EntityStyle;
(function (table) {
  const names = [
    'DECIMAL',
    'HEX',
    'NUMERIC_SHORTEST',
    'NAMED_OR_DECIMAL',
    'NAMED_OR_HEX',
    'NAMED_OR_SHORTEST',
    'SHORTEST'
  ];
  names.forEach((name, ordinal) => {
    table[name] = ordinal;
    table[ordinal] = name;
  });
})(EntityStyle = exports.EntityStyle || (exports.EntityStyle = {}));
// Short alias used throughout this module.
const ES = EntityStyle;
/**
 * Two-way enum of re-encoding levels (name -> ordinal and ordinal -> name).
 * The ordinals matter: escapeToEntities tests `options.reencode >= RO.MINIMAL`,
 * so the order of the list must not change.
 */
var ReencodeOptions;
(function (table) {
  const names = [
    'DONT_CHANGE',
    'REPAIR_ONLY',
    'LOOSE_MINIMAL',
    'MINIMAL',
    'NAMED_ENTITIES'
  ];
  names.forEach((name, ordinal) => {
    table[name] = ordinal;
    table[ordinal] = name;
  });
})(ReencodeOptions = exports.ReencodeOptions || (exports.ReencodeOptions = {}));
// Short alias used throughout this module.
const RO = ReencodeOptions;
/**
 * Two-way enum of output encodings (name -> ordinal and ordinal -> name).
 * escapeToEntities leaves code points unescaped up to 0x7E for SEVEN_BIT,
 * up to 0xFF for EIGHT_BIT, and up to 0x10FFFF otherwise.
 */
var TargetEncoding;
(function (table) {
  const names = ['SEVEN_BIT', 'EIGHT_BIT', 'UNICODE'];
  names.forEach((name, ordinal) => {
    table[name] = ordinal;
    table[ordinal] = name;
  });
})(TargetEncoding = exports.TargetEncoding || (exports.TargetEncoding = {}));
// Short alias used throughout this module.
const TE = TargetEncoding;
// Defaults that escapeToEntities merges underneath caller-supplied options:
// shortest entity form, minimal re-encoding, Unicode target.
const DEFAULT_ESCAPE_OPTIONS = {
entityStyle: ES.SHORTEST,
reencode: RO.MINIMAL,
target: TE.UNICODE
};
// Entity name -> replacement text. Unwrap the payload when the loader exposes it as `default`.
let entities = entitiesAsJson;
if (entities.default) {
  entities = entities.default;
}
// Reverse lookups: a single code point -> '&name;', and a two-code-unit sequence -> '&name;'.
const codePointToEntity = {};
const pairsToEntity = {};
for (const name of Object.keys(entities)) {
  const text = entities[name];
  const firstCp = text.codePointAt(0);
  const reference = '&' + name + ';';
  // A lone code point occupies one UTF-16 unit below U+10000 and two units from there up.
  const isSingleCodePoint = (firstCp < 0x10000 && text.length === 1) ||
    (firstCp >= 0x10000 && text.length === 2);
  if (isSingleCodePoint) {
    const current = codePointToEntity[firstCp];
    // Keep the shorter reference; also let a name whose first character is >= 'a'
    // replace one whose first character sorts before 'a'.
    const isBetter = !current ||
      reference.length < current.length ||
      (current.charAt(1) < 'a' && reference.charAt(1) >= 'a');
    if (isBetter) {
      codePointToEntity[firstCp] = reference;
    }
  }
  else if (text.length === 2) {
    pairsToEntity[text] = reference;
  }
}
/**
 * Tells whether `ch` is exactly one of tab, line feed, form feed, carriage return or space.
 *
 * @param {string} ch - Value to test.
 * @returns {boolean} True only for those five one-character strings.
 */
function isWhitespace(ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\f':
    case '\r':
    case ' ':
      return true;
    default:
      return false;
  }
}
exports.isWhitespace = isWhitespace;
/**
 * Tells whether `ch` contains U+00A0, any of U+2000..U+200A, U+202F, U+205F or U+3000.
 *
 * @param {string} ch - Text to test (the pattern is unanchored, so any occurrence counts).
 * @returns {boolean} True when one of those characters is present.
 */
function isOtherWhitespace(ch) {
  const otherSpaces = /\xA0|[\u2000-\u200A]|\u202F|\u205F|\u3000/;
  return otherSpaces.test(ch);
}
exports.isOtherWhitespace = isOtherWhitespace;
/**
 * Tells whether `ch` is a line terminator: '\n', '\r' or the two-character '\r\n'.
 *
 * @param {string} ch - Value to test.
 * @returns {boolean} True for exactly those three strings.
 */
function isEol(ch) {
  const terminators = ['\n', '\r', '\r\n'];
  return terminators.indexOf(ch) !== -1;
}
exports.isEol = isEol;
// Unlike the built-in string trimming methods, the trim helpers below strip HTML whitespace only
// (space, tab, form feed and - unless told otherwise - line feed and carriage return).

/**
 * Strips HTML whitespace from both ends of `s`.
 *
 * @param {string} s - Text to trim; a falsy value is treated as ''.
 * @param {boolean} [skipNewlines=false] - When true, '\n' and '\r' are left in place.
 * @returns {string} The trimmed text.
 */
function trim(s, skipNewlines = false) {
  const edges = skipNewlines
    ? /(?:^[ \t\f]+)|(?:[ \t\f]+$)/g
    : /(?:^[ \t\n\f\r]+)|(?:[ \t\n\f\r]+$)/g;
  return (s || '').replace(edges, '');
}
exports.trim = trim;
/**
 * Strips HTML whitespace from the start of `s`.
 *
 * @param {string} s - Text to trim; a falsy value is treated as ''.
 * @param {boolean} [skipNewlines=false] - When true, '\n' and '\r' are left in place.
 * @returns {string} The text without its leading whitespace.
 */
function trimLeft(s, skipNewlines = false) {
  const leading = skipNewlines ? /^[ \t\f]+/ : /^[ \t\n\f\r]+/;
  return (s || '').replace(leading, '');
}
exports.trimLeft = trimLeft;
/**
 * Strips HTML whitespace from the end of `s`.
 *
 * @param {string} s - Text to trim; a falsy value is treated as ''.
 * @param {boolean} [skipNewlines=false] - When true, '\n' and '\r' are left in place.
 * @returns {string} The text without its trailing whitespace.
 */
function trimRight(s, skipNewlines = false) {
  const trailing = skipNewlines ? /[ \t\f]+$/ : /[ \t\n\f\r]+$/;
  return (s || '').replace(trailing, '');
}
exports.trimRight = trimRight;
/**
 * Replaces every run of HTML whitespace in `s` with a single space.
 *
 * @param {string} s - Text to compact; a falsy value is treated as ''.
 * @param {boolean} [skipNewlines=false] - When true, '\n' and '\r' are neither replaced nor merged.
 * @returns {string} The compacted text.
 */
function compactWhitespace(s, skipNewlines = false) {
  const runs = skipNewlines ? /[ \t\f]+/g : /[ \t\n\f\r]+/g;
  return (s || '').replace(runs, ' ');
}
exports.compactWhitespace = compactWhitespace;
/**
 * Collapses runs of more than `maxInARow` line breaks down to exactly `maxInARow` breaks.
 *
 * The replacement break style is taken from the text: '\r\n' if the text contains it,
 * otherwise '\r' if present, otherwise '\n'. A run may mix break styles.
 *
 * @param {string} s - Text to process; a falsy value is treated as ''.
 * @param {number} [maxInARow=1] - Largest number of consecutive line breaks to keep.
 * @returns {string} The text with over-long runs of line breaks shortened.
 */
function compactNewlines(s, maxInARow = 1) {
  const text = s || '';
  const eol = text.includes('\r\n') ? '\r\n' : (text.includes('\r') ? '\r' : '\n');
  // Repeat whichever break style was picked; previously the repeat was applied only to the
  // '\r' / '\n' branch, so CRLF text always collapsed to a single break.
  const replacement = eol.repeat(maxInARow);
  const regex = new RegExp(`(\r\n|\r|\n){${maxInARow + 1},}`, 'g');
  return text.replace(regex, replacement);
}
exports.compactNewlines = compactNewlines;
/**
 * Tells whether `ch` contains a control character in U+0000..U+0008, U+000B,
 * U+000E..U+001F or U+007F..U+009F.
 *
 * @param {string} ch - Text to test (the pattern is unanchored, so any occurrence counts).
 * @returns {boolean} True when such a character is present.
 */
function isInvalidCharacter(ch) {
  const disallowedControls = /[\x00-\x08\x0B\x0E-\x1F\x7F-\x9F]/;
  return disallowedControls.test(ch);
}
exports.isInvalidCharacter = isInvalidCharacter;
/**
 * Replaces unpaired UTF-16 surrogates in `s` with marker characters: an isolated high
 * surrogate becomes '\x02' and an isolated low surrogate becomes '\x03'. Properly paired
 * surrogates are left untouched.
 *
 * @param {string} s - Text to scan; a falsy value is returned unchanged.
 * @returns {string} The text with every isolated surrogate replaced.
 */
function replaceIsolatedSurrogates(s) {
  // The first alternative consumes valid pairs so they are skipped; anything the second
  // alternative then matches is isolated. This also catches a low surrogate at the very
  // start of the string or directly after another isolated low surrogate, which a pattern
  // requiring a preceding non-high-surrogate character cannot match.
  return s && s.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDFFF]/g, match => {
    if (match.length === 2)
      return match;
    return match < '\uDC00' ? '\x02' : '\x03';
  });
}
exports.replaceIsolatedSurrogates = replaceIsolatedSurrogates;
/**
 * Combined check for what may follow '<' to begin markup: a character that can start an
 * element name (an ASCII letter of either case, or ':'), or one of '/', '!' and '?'.
 *
 * @param {string} [ch] - Character following '<'; `undefined` yields false.
 * @returns {boolean} True when `ch` contains one of the accepted characters.
 */
function isMarkupStart(ch) {
  if (ch === undefined) {
    return false;
  }
  return /[a-z:/!?]/i.test(ch);
}
exports.isMarkupStart = isMarkupStart;
// Accepted ranges above 'z' within the Basic Multilingual Plane. Code points
// U+10000..U+EFFFF are checked numerically in isPCENChar instead.
const PCENCharRanges = new RegExp('[\xB7\xC0-\xD6\xD8-\xF6\xF8-\u037D\u037F-\u1FFF\u200C-\u200D\u203F-\u2040\u2070-\u218F' +
  '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]'
);

/**
 * Tells whether `ch` may appear in a PCEN (Potential Custom Element Name).
 *
 * @param {string} ch - One character, given as two UTF-16 units when above U+FFFF.
 * @param {boolean} [loose=false] - When true, anything except space, line feed, carriage
 *   return, tab, form feed, '/' and '>' is accepted.
 * @returns {boolean} True when the character is acceptable.
 */
function isPCENChar(ch, loose = false) {
  if (loose) {
    return /[^ \n\r\t\f\/>]/.test(ch);
  }
  // Everything sorting at or below 'z': only '-', '.', '_', digits and ASCII letters pass.
  if (ch <= 'z') {
    return /[-._0-9a-z]/i.test(ch);
  }
  if (ch.length !== 1) {
    const codePoint = ch.codePointAt(0);
    return 0x10000 <= codePoint && codePoint <= 0xEFFFF;
  }
  return PCENCharRanges.test(ch);
}
exports.isPCENChar = isPCENChar;
/**
 * Tells whether every character of `s` passes isPCENChar. Characters above U+FFFF are
 * handed to isPCENChar as a complete two-unit string.
 *
 * @param {string} s - Text to check; the empty string yields true.
 * @param {boolean} [loose=false] - Forwarded to isPCENChar.
 * @returns {boolean} False as soon as one character is rejected, true otherwise.
 */
function isAllPCENChar(s, loose = false) {
  let pos = 0;
  while (pos < s.length) {
    const width = s.codePointAt(pos) > 0xFFFF ? 2 : 1;
    const ch = s.slice(pos, pos + width);
    if (!isPCENChar(ch, loose)) {
      return false;
    }
    pos += width;
  }
  return true;
}
exports.isAllPCENChar = isAllPCENChar;
/**
 * Tells whether `ch` may appear in an attribute name.
 *
 * Strict mode accepts characters above space other than '"', '`', '>', '/' and '=',
 * and rejects the control range U+007F..U+009F.
 *
 * @param {string} ch - Character to test.
 * @param {boolean} [loose=false] - When true, anything except space, line feed, carriage
 *   return, tab, form feed, '>', '/' and '=' is accepted.
 * @returns {boolean} True when the character is acceptable.
 */
function isAttributeNameChar(ch, loose = false) {
  if (loose)
    return /[^ \n\r\t\f>/=]/.test(ch);
  // Compare against the characters U+007F and U+00A0. The earlier comparison used the
  // four-character strings '0x7F' and '0xA0', which made the range check always true.
  return ch > ' ' && !/["`>/=]/.test(ch) && (ch < '\x7F' || ch >= '\xA0');
}
exports.isAttributeNameChar = isAttributeNameChar;
// Replacement text for the three characters minimalEscape rewrites. The values must be the
// entity references themselves; mapping each character to itself makes the escape a no-op.
const basicEntities = { '<': '&lt;', '>': '&gt;', '&': '&amp;' };

/**
 * Escapes only '<', '>' and '&' in `s` as '&lt;', '&gt;' and '&amp;'.
 *
 * @param {string} s - Text to escape.
 * @returns {string} The escaped text.
 */
function minimalEscape(s) {
  return s.replace(/[<>&]/g, match => basicEntities[match]);
}
exports.minimalEscape = minimalEscape;
/**
 * Escapes characters of `s` as HTML character references according to `options`,
 * which are merged over DEFAULT_ESCAPE_OPTIONS.
 *
 * @param {string} s - Text to escape.
 * @param {object} [options] - May set `entityStyle`, `reencode` and `target`; missing keys
 *   take their defaults.
 * @returns {string} The text with the selected characters replaced by named or numeric references.
 */
function escapeToEntities(s, options) {
options = Object.assign(Object.assign({}, DEFAULT_ESCAPE_OPTIONS), options || {});
const sb = [];
const style = options.entityStyle;
// Highest code point that may stay unescaped for the chosen target encoding.
const highest = (options.target === TE.SEVEN_BIT ? 0x7E : options.target === TE.EIGHT_BIT ? 0xFF : 0x10FFFF);
for (let i = 0; i < s.length; ++i) {
let ch = s.charAt(i);
const cp = s.codePointAt(i);
let pairMatch;
let named;
let numeric;
// A code point above U+FFFF spans two UTF-16 units: take both and step past the second.
if (cp > 0xFFFF) {
ch = s.substr(i, 2);
++i;
}
const nextCh = s.charAt(i + 1) || '';
// Escaping is required for: control characters below 32 other than HTML whitespace,
// U+007F..U+009F, anything above `highest`, '<' '>' '&' at MINIMAL level or stricter, and -
// at LOOSE_MINIMAL - a '<' at the end of the text or before a markup-start character, or an
// '&' followed by a letter, digit or '#'.
const entityNeeded = (cp < 32 && !isWhitespace(ch) ||
0x7F <= cp && cp <= 0x9F ||
cp > highest ||
options.reencode >= RO.MINIMAL && /[<>&]/.test(ch) ||
options.reencode === RO.LOOSE_MINIMAL && (ch === '<' && (!nextCh || isMarkupStart(nextCh)) ||
ch === '&' && nextCh && /[a-z0-9#]/i.test(nextCh)));
// For named styles, first try an entity that stands for this character together with the next one.
if ((entityNeeded || options.reencode === RO.NAMED_ENTITIES) &&
cp <= 0xFFFF && nextCh && style >= ES.NAMED_OR_DECIMAL)
named = pairMatch = pairsToEntity[s.substr(i, 2)];
// Otherwise look for an entity naming the single code point.
if (!named && style >= ES.NAMED_OR_DECIMAL && (entityNeeded || options.reencode === RO.NAMED_ENTITIES))
named = codePointToEntity[cp];
// A name without a need to escape can only arise under NAMED_ENTITIES (the lookups above are
// skipped otherwise): emit the name, consuming the second character of a matched pair.
if (!entityNeeded && named) {
sb.push(named);
if (pairMatch)
++i;
continue;
}
// NOTE(review): this appears to repeat the single-code-point lookup made above under
// conditions that lookup already covered - confirm whether it can ever find a new value.
if ((entityNeeded || (options.reencode === RO.NAMED_ENTITIES && cp >= highest)) && !named && style >= ES.NAMED_OR_DECIMAL)
named = codePointToEntity[cp];
// Build a numeric reference when escaping is required and either no name exists or the
// style (NAMED_OR_SHORTEST, SHORTEST) wants the two forms compared.
if (entityNeeded && (!named || style >= ES.NAMED_OR_SHORTEST)) {
// Decimal for the decimal styles, and for the "shortest" styles when cp <= 9999.
if (style === ES.DECIMAL || style === ES.NAMED_OR_DECIMAL ||
(style === ES.NUMERIC_SHORTEST || (!named && style === ES.NAMED_OR_SHORTEST) || style === ES.SHORTEST) && cp <= 9999)
numeric = '&#' + cp + ';';
// Upper-case hexadecimal for the hex styles, and for the "shortest" styles when cp > 9999.
else if (style === ES.HEX || style === ES.NAMED_OR_HEX ||
(style === ES.NUMERIC_SHORTEST || (!named && style === ES.NAMED_OR_SHORTEST) || style === ES.SHORTEST) && cp > 9999)
numeric = '&#x' + cp.toString(16).toUpperCase() + ';';
}
// Prefer the name when there is no numeric form or the name is not longer than it.
if (!numeric && named || numeric && named && named.length <= numeric.length) {
sb.push(named);
if (pairMatch)
++i;
}
else if (numeric)
sb.push(numeric);
else
sb.push(ch);
}
return sb.join('');
}
exports.escapeToEntities = escapeToEntities;
/**
 * Replaces the character references in `s` with the text they stand for.
 *
 * separateEntities yields plain text at even positions and references at odd positions;
 * only the odd ones are passed to resolveEntity.
 *
 * @param {string} s - Text possibly containing character references.
 * @param {boolean} [forAttributeValue=false] - When true, references lacking a closing ';'
 *   are copied through unchanged.
 * @returns {string} The unescaped text.
 */
function unescapeEntities(s, forAttributeValue = false) {
  return separateEntities(s)
    .map((piece, position) => {
      const isPlainText = position % 2 === 0;
      // Keep the short-circuit: plain-text pieces must not be inspected any further.
      if (isPlainText || (forAttributeValue && !piece.endsWith(';'))) {
        return piece;
      }
      return resolveEntity(piece);
    })
    .join('');
}
exports.unescapeEntities = unescapeEntities;
/**
 * Re-encodes `s`: plain text is run through escapeToEntities, and existing character
 * references are repaired (missing ';' added to valid ones) and, unless `options.reencode`
 * is REPAIR_ONLY, resolved and encoded again.
 *
 * @param {string} s - Text possibly containing character references.
 * @param {object} [options] - Passed on to escapeToEntities; this function also reads
 *   `reencode`, `target` and `undoUnneededEntities`. A missing value is treated as `{}`.
 * @param {boolean} [forAttributeValue=false] - When true, references lacking a closing ';'
 *   are treated as plain text.
 * @returns {string} The re-encoded text.
 */
function reencodeEntities(s, options, forAttributeValue = false) {
  // Tolerate missing options instead of throwing on the first reference encountered.
  const opts = options || {};
  const sb = [];
  separateEntities(s).forEach((value, index) => {
    // Even positions hold plain text, odd positions hold references.
    if (index % 2 === 0 || (forAttributeValue && !value.endsWith(';')))
      sb.push(escapeToEntities(value, opts));
    else {
      const valid = isValidEntity(value);
      if (valid && !value.endsWith(';'))
        value += ';';
      if (opts.reencode !== RO.REPAIR_ONLY && valid) {
        const chars = resolveEntity(value);
        // The resolved characters, not the always-ASCII reference text, decide whether the
        // raw form fits the target encoding.
        const fitsTarget = opts.target === TargetEncoding.UNICODE ||
          (opts.target === TargetEncoding.EIGHT_BIT && /^[\x00-\xFF]+$/.test(chars)) ||
          (opts.target === TargetEncoding.SEVEN_BIT && /^[\x00-\x7E]+$/.test(chars));
        // Never undo the five basic references, nor ones resolving to whitespace or controls.
        if (opts.undoUnneededEntities && !/&(amp|lt|gt|quot|apos);/.test(value) &&
            chars > ' ' && !isOtherWhitespace(chars) && fitsTarget)
          value = chars;
        else
          value = escapeToEntities(chars, opts);
      }
      sb.push(value);
    }
  });
  return sb.join('');
}
exports.reencodeEntities = reencodeEntities;
/**
 * Splits `s` into alternating pieces: plain text at even indexes and character references
 * (named, decimal or hexadecimal, with or without the closing ';') at odd indexes.
 *
 * @param {string} s - Text to split.
 * @returns {Array} The pieces; a falsy `s` yields a one-element array holding `s` itself.
 */
function separateEntities(s) {
  if (!s) {
    return [s];
  }
  // The capture group makes split() keep the matched references in the result.
  const references = /(&(?:amp(?:;?)|#\d+(?:;|\b|(?=\D))|#x[0-9a-f]+(?:;|\b|(?=[^0-9a-f]))|[0-9a-z]+(?:;|\b|(?=[^0-9a-z]))))/i;
  return s.split(references);
}
exports.separateEntities = separateEntities;
/**
 * Tells whether `entity` names an entry of the entity table.
 *
 * @param {string} entity - Entity name, with or without the leading '&' and trailing ';'.
 * @returns {boolean} True when the bare name is an own key of the table.
 */
function isKnownNamedEntity(entity) {
  if (entity.startsWith('&'))
    entity = entity.slice(1);
  if (entity.endsWith(';'))
    entity = entity.slice(0, -1);
  // Own-property check: `in` would also accept inherited keys such as "constructor"
  // or "toString".
  return Object.prototype.hasOwnProperty.call(entities, entity);
}
exports.isKnownNamedEntity = isKnownNamedEntity;
/**
 * Tells whether `entity` is a usable character reference: a numeric reference
 * ('#' decimal or '#x' / '#X' hexadecimal) whose value passes isValidEntityCodepoint,
 * or a name found in the entity table.
 *
 * @param {string} entity - Reference text, with or without the leading '&' and trailing ';'.
 * @returns {boolean} True when the reference is valid.
 */
function isValidEntity(entity) {
  if (entity.startsWith('&'))
    entity = entity.slice(1);
  if (entity.endsWith(';'))
    entity = entity.slice(0, -1);
  if (entity.startsWith('#')) {
    const isHex = entity.charAt(1).toLowerCase() === 'x';
    const cp = isHex ? parseInt(entity.slice(2), 16) : parseInt(entity.slice(1), 10);
    return !Number.isNaN(cp) && isValidEntityCodepoint(cp);
  }
  // Own-property check: `in` would also accept inherited keys such as "constructor"
  // or "toString".
  return Object.prototype.hasOwnProperty.call(entities, entity);
}
exports.isValidEntity = isValidEntity;
/**
 * Resolves one character reference to the text it stands for.
 *
 * Numeric references ('#' decimal, '#x' / '#X' hexadecimal) yield the code point, or
 * U+FFFD when the number is missing, negative, above U+10FFFF or a surrogate. Named
 * references are looked up in the entity table. An unknown name is returned unchanged
 * when it starts with '&' and lacks the closing ';'; otherwise it yields U+FFFD.
 *
 * @param {string} entity - Reference text such as '&amp;', '&#65;' or '&eacute'.
 * @returns {string} The resolved text, the original text, or U+FFFD as described above.
 */
function resolveEntity(entity) {
  const original = entity;
  const hasSemicolon = entity.endsWith(';');
  const hasAmpersand = entity.startsWith('&');
  // Only an '&name' form without ';' may be ordinary text rather than a reference.
  const ambiguous = hasAmpersand && !hasSemicolon;
  if (hasSemicolon)
    entity = entity.slice(0, -1);
  if (hasAmpersand)
    entity = entity.slice(1);
  if (entity.startsWith('#')) {
    const digits = entity.slice(1);
    const isHex = digits.startsWith('x') || digits.startsWith('X');
    const cp = isHex ? parseInt(digits.slice(1), 16) : parseInt(digits, 10);
    // String.fromCodePoint throws RangeError for negative values, so reject them here too.
    if (Number.isNaN(cp) || cp < 0 || cp > 0x10FFFF || (0xD800 <= cp && cp <= 0xDFFF))
      return '\uFFFD';
    return String.fromCodePoint(cp);
  }
  // Own-property lookup, so inherited members ("constructor", "toString", ...) are never returned.
  const known = Object.prototype.hasOwnProperty.call(entities, entity) ? entities[entity] : undefined;
  return known || (ambiguous ? original : '\uFFFD');
}
exports.resolveEntity = resolveEntity;
/**
 * Estimates how many columns `s` occupies: its UTF-16 length minus one for every combining
 * mark in the listed ranges and minus one for every surrogate pair.
 *
 * @param {string} s - Text to measure; a falsy value yields 0.
 * @returns {number} The adjusted length.
 */
function columnWidth(s) {
  if (!s) {
    return 0;
  }
  const uncounted = s.match(/[\u0300-\u036F\u1AB0-\u1AFF\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]|[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  return s.length - (uncounted ? uncounted.length : 0);
}
exports.columnWidth = columnWidth;
/**
 * Tells whether `cp` is acceptable as the value of a numeric character reference:
 * greater than 0, at most 0x10FFFF, not 0x0D, outside 0x80..0x9F and outside the
 * surrogate range 0xD800..0xDFFF.
 *
 * @param {number} cp - Code point to check.
 * @returns {boolean} True when the code point is acceptable.
 */
function isValidEntityCodepoint(cp) {
  if (!(cp > 0 && cp <= 0x10FFFF)) {
    return false;
  }
  if (cp === 0x0D) {
    return false;
  }
  if (cp >= 0x80 && cp <= 0x9F) {
    return false;
  }
  if (cp >= 0xD800 && cp <= 0xDFFF) {
    return false;
  }
  return true;
}
exports.isValidEntityCodepoint = isValidEntityCodepoint;
//# sourceMappingURL=characters.js.map