UNPKG

html-minifier-next

Version:

Super-configurable and well-tested web page minifier (enhanced successor of HTML Minifier)

771 lines (686 loc) 32.2 kB
/*
 * HTML Parser By John Resig (ejohn.org)
 * Modified by Juriy “kangax” Zaytsev
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 */

import { isThenable } from './lib/utils.js';

/*
 * Use like so:
 *
 * await new HTMLParser(htmlString, {
 *   start: function(tag, attrs, unary, unarySlash, autoGenerated) {},
 *   end: function(tag, attrs, autoGenerated) {},
 *   chars: function(text, prevTag, nextTag, prevAttrs, nextAttrs) {},
 *   comment: function(text, nonStandard) {},
 *   doctype: function(doctype) {}
 * }).parse();
 *
 * All callbacks are optional and may return a thenable; the parser waits for it
 * before continuing. The same handler object also carries the options read by
 * the parser: `continueOnParseError`, `partialMarkup`, `wantsNextTag`,
 * `customAttrSurround`, and `customAttrAssign`.
 */

// Set of lowercase names whose membership test ignores the case of the queried string
class CaseInsensitiveSet extends Set {
  has(str) {
    return super.has(str.toLowerCase());
  }
}

// Regular expressions for parsing tags and attributes
const singleAttrIdentifier = /([^\s"'<>/=]+)/;
const singleAttrAssigns = [/=/];
const singleAttrValues = [
  // Attr value double quotes
  /"([^"]*)"+/.source,
  // Attr value, single quotes
  /'([^']*)'+/.source,
  // Attr value, no quotes
  /([^ \t\n\f\r"'`=<>]+)/.source
];

// Regex source (one capturing group) for an optionally prefixed tag name
// https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
const qnameCapture = (function () {
  // https://www.npmjs.com/package/ncname
  const combiningChar = '\u0300-\u0345\u0360\u0361\u0483-\u0486\u0591-\u05A1\u05A3-\u05B9\u05BB-\u05BD\u05BF\u05C1\u05C2\u05C4\u064B-\u0652\u0670\u06D6-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A02\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A70\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0B01-\u0B03\u0B3C\u0B3E-\u0B43\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B82\u0B83\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C82\u0C83\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0D02\u0D03\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86-\u0F8B\u0F90-\u0F95\u0F97\u0F99-\u0FAD\u0FB1-\u0FB7\u0FB9\u20D0-\u20DC\u20E1\u302A-\u302F\u3099\u309A';
  const digit = '0-9\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE7-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29';
  const extender = '\xB7\u02D0\u02D1\u0387\u0640\u0E46\u0EC6\u3005\u3031-\u3035\u309D\u309E\u30FC-\u30FE';
  const letter = 'A-Za-z\xC0-\xD6\xD8-\xF6\xF8-\u0131\u0134-\u013E\u0141-\u0148\u014A-\u017E\u0180-\u01C3\u01CD-\u01F0\u01F4\u01F5\u01FA-\u0217\u0250-\u02A8\u02BB-\u02C1\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D6\u03DA\u03DC\u03DE\u03E0\u03E2-\u03F3\u0401-\u040C\u040E-\u044F\u0451-\u045C\u045E-\u0481\u0490-\u04C4\u04C7\u04C8\u04CB\u04CC\u04D0-\u04EB\u04EE-\u04F5\u04F8\u04F9\u0531-\u0556\u0559\u0561-\u0586\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A\u0641-\u064A\u0671-\u06B7\u06BA-\u06BE\u06C0-\u06CE\u06D0-\u06D3\u06D5\u06E5\u06E6\u0905-\u0939\u093D\u0958-\u0961\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8B\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B36-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CDE\u0CE0\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60\u0D61\u0E01-\u0E2E\u0E30\u0E32\u0E33\u0E40-\u0E45\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD\u0EAE\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0F40-\u0F47\u0F49-\u0F69\u10A0-\u10C5\u10D0-\u10F6\u1100\u1102\u1103\u1105-\u1107\u1109\u110B\u110C\u110E-\u1112\u113C\u113E\u1140\u114C\u114E\u1150\u1154\u1155\u1159\u115F-\u1161\u1163\u1165\u1167\u1169\u116D\u116E\u1172\u1173\u1175\u119E\u11A8\u11AB\u11AE\u11AF\u11B7\u11B8\u11BA\u11BC-\u11C2\u11EB\u11F0\u11F9\u1E00-\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u212A\u212B\u212E\u2180-\u2182\u3007\u3021-\u3029\u3041-\u3094\u30A1-\u30FA\u3105-\u312C\u4E00-\u9FA5\uAC00-\uD7A3';
  const ncname = '[' + letter + '_][' + letter + digit + '\\.\\-_' + combiningChar + extender + ']*';
  return '((?:' + ncname + '\\:)?' + ncname + ')';
})();
const startTagOpen = new RegExp('^<' + qnameCapture);
export const endTag = new RegExp('^</' + qnameCapture + '[^>]*>');

// Feature probe: set to `true` when an unmatched optional group is reported as `''` instead of `undefined`
let IS_REGEX_CAPTURING_BROKEN = false;
'x'.replace(/x(.)?/g, function (m, g) {
  IS_REGEX_CAPTURING_BROKEN = g === '';
});

// Empty elements
const empty = new CaseInsensitiveSet(['area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']);

// Elements that you can, intentionally, leave open (and which close themselves)
const closeSelf = new CaseInsensitiveSet(['colgroup', 'dd', 'dt', 'li', 'option', 'p', 'td', 'tfoot', 'th', 'thead', 'tr', 'source']);

// Attributes that have their values filled in `disabled='disabled'`
const fillAttrs = new CaseInsensitiveSet(['checked', 'compact', 'declare', 'defer', 'disabled', 'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap', 'readonly', 'selected']);

// Special elements (can contain anything)
const special = new CaseInsensitiveSet(['script', 'style']);

// HTML elements, https://html.spec.whatwg.org/multipage/indices.html#elements-3
// Phrasing content, https://html.spec.whatwg.org/multipage/dom.html#phrasing-content
const nonPhrasing = new CaseInsensitiveSet(['address', 'article', 'aside', 'base', 'blockquote', 'body', 'caption', 'col', 'colgroup', 'dd', 'details', 'dialog', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'legend', 'li', 'menuitem', 'meta', 'ol', 'optgroup', 'option', 'param', 'rp', 'rt', 'source', 'style', 'summary', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track', 'ul']);

// Lazily built “content up to closing tag” regexes, keyed by lowercase tag name
const reCache = {};

// Pre-compiled regexes for common special elements (`script`, `style`, `noscript`)
// These are used frequently, and pre-compiling them avoids regex creation overhead
const preCompiledStackedTags = {
  'script': /([\s\S]*?)<\/script[^>]*>/i,
  'style': /([\s\S]*?)<\/style[^>]*>/i,
  'noscript': /([\s\S]*?)<\/noscript[^>]*>/i
};

// Cache for compiled attribute regexes per handler configuration
const attrRegexCache = new WeakMap();

// O(n) helper: Strip all occurrences of `open…close` delimiters, keeping inner content
// Used instead of a regex replace to avoid O(n²) behavior on adversarial inputs
// An `open` without a matching `close` is left in place together with everything after it
function stripDelimited(str, open, close) {
  let result = '';
  let i = 0;
  while (i < str.length) {
    const start = str.indexOf(open, i);
    if (start === -1) {
      result += str.slice(i);
      break;
    }
    result += str.slice(i, start);
    const end = str.indexOf(close, start + open.length);
    if (end === -1) {
      result += str.slice(start);
      break;
    }
    result += str.slice(start + open.length, end);
    i = end + close.length;
  }
  return result;
}

// Build the `^`-anchored regex matching one attribute (name, optional assignment and value)
// With `handler.customAttrSurround`, one alternative per surround pair is placed before the
// plain attribute alternative; each surround alternative contributes `NCP` capturing groups
function buildAttrRegex(handler) {
  let pattern = singleAttrIdentifier.source +
    '(?:\\s*(' + joinSingleAttrAssigns(handler) + ')' +
    '[ \\t\\n\\f\\r]*(?:' + singleAttrValues.join('|') + '))?';
  if (handler.customAttrSurround) {
    const attrClauses = [];
    for (let i = handler.customAttrSurround.length - 1; i >= 0; i--) {
      attrClauses[i] = '(?:' +
        '(' + handler.customAttrSurround[i][0].source + ')\\s*' +
        pattern +
        '\\s*(' + handler.customAttrSurround[i][1].source + ')' +
        ')';
    }
    attrClauses.push('(?:' + pattern + ')');
    pattern = '(?:' + attrClauses.join('|') + ')';
  }
  return new RegExp('^\\s*' + pattern);
}

// Return the anchored attribute regex for `handler`, compiling it on first use
function getAttrRegexForHandler(handler) {
  const cached = attrRegexCache.get(handler);
  if (cached) return cached;
  const compiled = buildAttrRegex(handler);
  attrRegexCache.set(handler, compiled);
  return compiled;
}

// Cache for sticky attribute regexes (`y` flag for position-based matching on full string)
const attrRegexStickyCache = new WeakMap();

// Return the sticky variant of the attribute regex for `handler`, compiling it on first use
function getAttrRegexStickyForHandler(handler) {
  const cached = attrRegexStickyCache.get(handler);
  if (cached) return cached;
  const nonSticky = getAttrRegexForHandler(handler);
  // Derive sticky version: Remove `^` anchor, add `y` flag
  const compiled = new RegExp(nonSticky.source.slice(1), 'y');
  attrRegexStickyCache.set(handler, compiled);
  return compiled;
}

// Alternation source of all accepted assignment operators (`=` plus `handler.customAttrAssign`)
function joinSingleAttrAssigns(handler) {
  return singleAttrAssigns.concat(
    handler.customAttrAssign || []
  ).map(function (assign) {
    return '(?:' + assign.source + ')';
  }).join('|');
}

// Number of captured parts per `customAttrSurround` pattern
const NCP = 7;

/**
 * Event-driven HTML parser: walks the markup once and reports start tags, end tags,
 * text, comments, and doctypes to the callbacks of `handler`.
 */
export class HTMLParser {
  /**
   * @param {string} html - Markup to parse
   * @param {object} handler - Callbacks and options (see the usage comment at the top of this file)
   */
  constructor(html, handler) {
    this.html = html;
    this.handler = handler;
  }

  /**
   * Parse `this.html`, invoking the handler callbacks in document order.
   *
   * @returns {Promise<void>} Settles once all input has been consumed
   * @throws {Error} “Parse error at line …” when no progress can be made at a position
   *   and `handler.continueOnParseError` is not set
   */
  async parse() {
    const handler = this.handler;
    const fullHtml = this.html;
    const fullLength = fullHtml.length;

    // Open elements, innermost last; `lastTag` is the tag name of the innermost one
    const stack = [];
    let lastTag;
    // Use cached attribute regex for this handler configuration
    const attribute = getAttrRegexForHandler(handler);
    const attributeY = getAttrRegexStickyForHandler(handler);
    // Context passed to `handler.chars` (tag names and attributes surrounding a text run)
    let prevTag = undefined;
    let nextTag = undefined;
    let prevAttrs = [];
    let nextAttrs = [];

    // Sticky regex versions for position-based matching (avoids string slicing)
    const startTagOpenY = new RegExp(startTagOpen.source.slice(1), 'y');
    // `\s*` with sticky flag is O(n) at worst—no retry from different positions possible
    const startTagCloseY = /\s*(\/?)>/y;
    const endTagY = new RegExp(endTag.source.slice(1), 'y');
    const doctypeY = /<!DOCTYPE[^<>]+>/iy;
    const commentTestY = /<!--/y;
    const conditionalTestY = /<!\[/y;
    // `name = "` or `name = '` at an exact position (start of a quoted attribute)
    const quotedAttrStartY = /\s*([^\s"'<>/=]+)\s*=\s*(["'])/y;
    // An assignment directly followed by an opening quote at an exact position
    const assignThenQuoteY = /\s*=\s*["']/y;

    // Cached next-tag from lookahead (avoids re-parsing the same tag)
    let cachedNextStartTag = null;
    let cachedNextEndTag = null;

    // Index-based parsing
    let pos = 0;
    let lastPos;

    // Helper to advance position
    const advance = (n) => { pos += n; };

    // Lazy line/column calculation—only compute on actual errors
    const getLineColumn = (position) => {
      let line = 1;
      let column = 1;
      for (let i = 0; i < position; i++) {
        if (fullHtml[i] === '\n') {
          line++;
          column = 1;
        } else {
          column++;
        }
      }
      return { line, column };
    };

    // Helper to safely extract substring when needed for stacked tag content
    const sliceFromPos = (startPos) => {
      return fullHtml.slice(startPos);
    };

    while (pos < fullLength) {
      lastPos = pos;
      // Make sure we’re not in a `script` or `style` element
      if (!lastTag || !special.has(lastTag)) {
        const textEnd = fullHtml.indexOf('<', pos);
        if (textEnd === pos) {
          // We found a tag at current position
          // Check cache from previous lookahead (avoids re-parsing the same tag)
          if (cachedNextStartTag && cachedNextStartTag.pos === pos) {
            const startTagMatch = cachedNextStartTag.match;
            cachedNextStartTag = null;
            cachedNextEndTag = null;
            advance(startTagMatch.advance);
            await handleStartTag(startTagMatch);
            prevTag = startTagMatch.tagName.toLowerCase();
            continue;
          }
          if (cachedNextEndTag && cachedNextEndTag.pos === pos) {
            const endTagMatch = cachedNextEndTag.match;
            cachedNextStartTag = null;
            cachedNextEndTag = null;
            advance(endTagMatch[0].length);
            await parseEndTag(endTagMatch[0], endTagMatch[1]);
            prevTag = '/' + endTagMatch[1].toLowerCase();
            prevAttrs = [];
            continue;
          }
          cachedNextStartTag = null;
          cachedNextEndTag = null;

          // Comment
          commentTestY.lastIndex = pos;
          if (commentTestY.test(fullHtml)) {
            const commentEnd = fullHtml.indexOf('-->', pos + 4);
            if (commentEnd >= 0) {
              if (handler.comment) {
                const result = handler.comment(fullHtml.substring(pos + 4, commentEnd));
                if (isThenable(result)) await result;
              }
              advance(commentEnd + 3 - pos);
              prevTag = '';
              prevAttrs = [];
              continue;
            }
          }

          // https://web.archive.org/web/20241201212701/https://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
          conditionalTestY.lastIndex = pos;
          if (conditionalTestY.test(fullHtml)) {
            const conditionalEnd = fullHtml.indexOf(']>', pos + 3);
            if (conditionalEnd >= 0) {
              if (handler.comment) {
                const result = handler.comment(fullHtml.substring(pos + 2, conditionalEnd + 1), true /* Non-standard */);
                if (isThenable(result)) await result;
              }
              advance(conditionalEnd + 2 - pos);
              prevTag = '';
              prevAttrs = [];
              continue;
            }
          }

          // Doctype
          doctypeY.lastIndex = pos;
          const doctypeMatch = doctypeY.exec(fullHtml);
          if (doctypeMatch) {
            if (handler.doctype) {
              // Wait for asynchronous handlers, like for every other callback
              const result = handler.doctype(doctypeMatch[0]);
              if (isThenable(result)) await result;
            }
            advance(doctypeMatch[0].length);
            prevTag = '';
            prevAttrs = [];
            continue;
          }

          // End tag
          endTagY.lastIndex = pos;
          const endTagMatch = endTagY.exec(fullHtml);
          if (endTagMatch) {
            advance(endTagMatch[0].length);
            await parseEndTag(endTagMatch[0], endTagMatch[1]);
            prevTag = '/' + endTagMatch[1].toLowerCase();
            prevAttrs = [];
            continue;
          }

          // Start tag
          const startTagMatch = parseStartTag(pos);
          if (startTagMatch) {
            advance(startTagMatch.advance);
            await handleStartTag(startTagMatch);
            prevTag = startTagMatch.tagName.toLowerCase();
            continue;
          }

          // Nothing matched: Fall through with an empty text run; the no-progress check
          // at the end of the loop then either emits the `<` as text or reports the error
        }

        let text;
        if (textEnd >= 0) {
          text = fullHtml.substring(pos, textEnd);
          advance(textEnd - pos);
        } else {
          text = fullHtml.substring(pos);
          advance(fullLength - pos);
        }

        // Next tag for whitespace processing context
        if (handler.wantsNextTag) {
          const nextStartTagMatch = parseStartTag(pos);
          if (nextStartTagMatch) {
            nextTag = nextStartTagMatch.tagName;
            // Extract minimal attribute info for whitespace logic (just name/value pairs)
            nextAttrs = extractAttrInfo(nextStartTagMatch.attrs);
            cachedNextStartTag = { match: nextStartTagMatch, pos };
          } else {
            endTagY.lastIndex = pos;
            const nextEndTagMatch = endTagY.exec(fullHtml);
            if (nextEndTagMatch) {
              nextTag = '/' + nextEndTagMatch[1];
              nextAttrs = [];
              cachedNextEndTag = { match: nextEndTagMatch, pos };
            } else {
              nextTag = '';
              nextAttrs = [];
            }
          }
        }

        if (handler.chars) {
          const result = handler.chars(text, prevTag, nextTag, prevAttrs, nextAttrs);
          if (isThenable(result)) await result;
        }
        prevTag = '';
        prevAttrs = [];
      } else {
        const stackedTag = lastTag.toLowerCase();
        // Use pre-compiled regex for common tags (`script`, `style`, `noscript`) to avoid regex creation overhead
        const reStackedTag = preCompiledStackedTags[stackedTag] || reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)\\x3c/' + stackedTag + '[^>]*>', 'i'));

        const remaining = sliceFromPos(pos);
        const m = reStackedTag.exec(remaining);
        if (m && m.index === 0) {
          let text = m[1];
          if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
            text = stripDelimited(stripDelimited(text, '<!--', '-->'), '<![CDATA[', ']]>');
          }
          if (handler.chars) {
            const result = handler.chars(text);
            if (isThenable(result)) await result;
          }
          // Advance HTML past the matched special tag content and its closing tag
          advance(m[0].length);
          await parseEndTag('</' + stackedTag + '>', stackedTag);
        } else {
          // No closing tag found; to avoid infinite loop, break similarly to previous behavior
          if (handler.continueOnParseError && handler.chars && pos < fullLength) {
            const result = handler.chars(fullHtml[pos], prevTag, '', prevAttrs, []);
            if (isThenable(result)) await result;
            advance(1);
          } else {
            break;
          }
        }
      }

      if (pos === lastPos) {
        if (handler.continueOnParseError) {
          // Skip the problematic character and continue
          if (handler.chars) {
            const result = handler.chars(fullHtml[pos], prevTag, '', prevAttrs, []);
            if (isThenable(result)) await result;
          }
          advance(1);
          prevTag = '';
          prevAttrs = [];
          continue;
        }
        const loc = getLineColumn(pos);
        // Include some context before the error position so the snippet contains the offending markup plus preceding characters (e.g., `invalid<tag`)
        const CONTEXT_BEFORE = 50;
        const startPos = Math.max(0, pos - CONTEXT_BEFORE);
        const snippet = fullHtml.slice(startPos, startPos + 200).replace(/\n/g, ' ');
        throw new Error(
          `Parse error at line ${loc.line}, column ${loc.column}:\n${snippet}${fullHtml.length > startPos + 200 ? '…' : ''}`
        );
      }
    }

    if (!handler.partialMarkup) {
      // Clean up any remaining tags
      await parseEndTag();
    }

    // Helper to extract minimal attribute info (name/value pairs) from raw attribute matches
    // Used for whitespace collapsing logic—doesn’t need full processing
    function extractAttrInfo(rawAttrs) {
      if (!rawAttrs || !rawAttrs.length) return [];

      const numCustomParts = handler.customAttrSurround ? handler.customAttrSurround.length * NCP : 0;
      const baseIndex = 1 + numCustomParts;

      return rawAttrs.map(args => {
        // Extract attribute name (always at `baseIndex`)
        const name = args[baseIndex];
        // Extract value from double-quoted (`baseIndex + 2`), single-quoted (`baseIndex + 3`), or unquoted (`baseIndex + 4`)
        const value = args[baseIndex + 2] ?? args[baseIndex + 3] ?? args[baseIndex + 4];
        return { name: name?.toLowerCase(), value };
      }).filter(attr => attr.name); // Filter out invalid entries
    }

    // Manually read one `name="value"` or `name='value'` attribute starting at `fromPos`,
    // looking for the closing quote in the full input rather than in a length-limited window
    // Returns an array laid out like a match of the plain attribute alternative (full text at
    // index 0, then name, assignment, and value groups), or `null` when the text at `fromPos`
    // is not a quoted attribute or its closing quote is missing
    function extractQuotedAttrAt(fromPos) {
      quotedAttrStartY.lastIndex = fromPos;
      const head = quotedAttrStartY.exec(fullHtml);
      if (!head) return null;

      const quoteChar = head[2];
      const valueStart = fromPos + head[0].length;
      const closeQuote = fullHtml.indexOf(quoteChar, valueStart);
      if (closeQuote === -1) return null;

      const numCustomParts = handler.customAttrSurround ? handler.customAttrSurround.length * NCP : 0;
      const baseIndex = 1 + numCustomParts;
      const attr = [];
      attr[0] = fullHtml.substring(fromPos, closeQuote + 1);
      attr[baseIndex] = head[1]; // Attribute name
      attr[baseIndex + 1] = '='; // `customAssign` (only a plain `=` is recognized here)
      // Place value at correct index based on quote type (double-quoted, then single-quoted)
      attr[baseIndex + (quoteChar === '"' ? 2 : 3)] = fullHtml.substring(valueStart, closeQuote);
      return attr;
    }

    // Try to read a complete start tag at `startPos` without consuming it
    // Returns `{ tagName, attrs, unarySlash, advance }`, where `attrs` holds raw attribute
    // matches and `advance` is the length of the whole tag, or `undefined` when there is no
    // well-formed start tag at that position
    function parseStartTag(startPos) {
      startTagOpenY.lastIndex = startPos;
      const start = startTagOpenY.exec(fullHtml);
      if (!start) return undefined;

      const match = {
        tagName: start[1],
        attrs: [],
        advance: 0
      };
      let consumed = start[0].length;
      let currentPos = startPos + consumed;

      // Safety limit: Max length of input to check for attributes
      // Protects against catastrophic backtracking on massive attribute values
      const MAX_ATTR_PARSE_LENGTH = 20000; // 20 KB should be enough for any reasonable tag

      while (true) {
        // Check for closing tag first (sticky regex—no slicing)
        startTagCloseY.lastIndex = currentPos;
        if (startTagCloseY.test(fullHtml)) {
          break;
        }

        let attr;
        if (fullLength - currentPos <= MAX_ATTR_PARSE_LENGTH) {
          // Common case: Use sticky regex directly on full string (no slicing)
          attributeY.lastIndex = currentPos;
          attr = attributeY.exec(fullHtml);
        } else {
          // Create a temporary substring only for attribute parsing (limited for safety)
          const searchStr = fullHtml.substring(currentPos, currentPos + MAX_ATTR_PARSE_LENGTH);
          attr = searchStr.match(attribute);

          // The window can cut a quoted value short: Its closing quote is then out of sight,
          // and because the value part of the pattern is optional, the regex still succeeds
          // with the bare name, leaving `="…` behind; detect this, as well as matches that
          // end close to the window boundary or did not happen at all
          let valueMayBeCut = !attr || attr[0].length > MAX_ATTR_PARSE_LENGTH - 100;
          if (!valueMayBeCut) {
            assignThenQuoteY.lastIndex = currentPos + attr[0].length;
            valueMayBeCut = assignThenQuoteY.test(fullHtml);
          }
          if (valueMayBeCut) {
            // Note: Unquoted attribute values are intentionally not handled here.
            // Per HTML spec, unquoted values cannot contain spaces or special chars,
            // making a 20 KB+ unquoted value practically impossible. If encountered,
            // it’s malformed HTML and using the truncated regex match is acceptable.
            attr = extractQuotedAttrAt(currentPos) || attr;
          }
        }

        if (!attr) {
          break;
        }

        const attrLen = attr[0].length;
        currentPos += attrLen;
        consumed += attrLen;
        match.attrs.push(attr);
      }

      // Check for closing tag (sticky regex—no slicing)
      startTagCloseY.lastIndex = currentPos;
      const end = startTagCloseY.exec(fullHtml);
      if (end) {
        match.unarySlash = end[1];
        consumed += end[0].length;
        match.advance = consumed;
        return match;
      }
      return undefined;
    }

    // Index of the innermost open `tagName` inside the current table, or `-1`
    function findTagInCurrentTable(tagName) {
      let pos;
      const needle = tagName.toLowerCase();
      for (pos = stack.length - 1; pos >= 0; pos--) {
        const currentTag = stack[pos].lowerTag;
        if (currentTag === needle) {
          return pos;
        }
        // Stop searching if we hit a table boundary
        if (currentTag === 'table') {
          break;
        }
      }
      return -1;
    }

    async function parseEndTagAt(pos) {
      // Close all open elements up to `pos` (mirrors `parseEndTag`’s core branch)
      for (let i = stack.length - 1; i >= pos; i--) {
        if (handler.end) {
          await handler.end(stack[i].tag, stack[i].attrs, true);
        }
      }
      stack.length = pos;
      lastTag = pos && stack[pos - 1].tag;
    }

    // Close `tagName` (and everything nested in it) if it is open in the current table
    // Resolves to whether anything was closed
    async function closeIfFoundInCurrentTable(tagName) {
      const pos = findTagInCurrentTable(tagName);
      if (pos >= 0) {
        // Close at the specific index to avoid re-searching
        await parseEndTagAt(pos);
        return true;
      }
      return false;
    }

    // Apply implicit-closing rules for the new tag, convert its raw attribute matches into
    // attribute objects, update the stack, and report the tag through `handler.start`
    async function handleStartTag(match) {
      const tagName = match.tagName;
      let unarySlash = match.unarySlash;

      if (lastTag === 'p' && nonPhrasing.has(tagName)) {
        await parseEndTag('', lastTag);
      } else if (tagName === 'tbody') {
        if (!await closeIfFoundInCurrentTable('tfoot')) {
          await closeIfFoundInCurrentTable('thead');
        }
      } else if (tagName === 'tfoot') {
        if (!await closeIfFoundInCurrentTable('tbody')) {
          await closeIfFoundInCurrentTable('thead');
        }
      } else if (tagName === 'thead') {
        // If a `tbody` or `tfoot` is open in the current table, close it
        if (!await closeIfFoundInCurrentTable('tbody')) {
          await closeIfFoundInCurrentTable('tfoot');
        }
      }
      if (tagName === 'col' && findTagInCurrentTable('colgroup') < 0) {
        // Open a synthetic `<colgroup>` around a `col` that has none
        lastTag = 'colgroup';
        stack.push({ tag: lastTag, lowerTag: 'colgroup', attrs: [] });
        if (handler.start) {
          await handler.start(lastTag, [], false, '', true);
        }
      } else if (tagName !== 'col' && lastTag === 'colgroup') {
        // Auto-close synthetic `<colgroup>` when a non-`col` element starts
        await parseEndTag('', 'colgroup');
      }

      if (closeSelf.has(tagName) && lastTag === tagName) {
        await parseEndTag('', tagName);
      }

      // Handle `dt`/`dd` cross-closing: `dt` followed by `dd`, or `dd` followed by `dt`
      if ((tagName === 'dt' || tagName === 'dd') && (lastTag === 'dt' || lastTag === 'dd')) {
        await parseEndTag('', lastTag);
      }

      const unary = empty.has(tagName) || (tagName === 'html' && lastTag === 'head') || !!unarySlash;

      const attrs = match.attrs.map(function (args) {
        let name, value, customOpen, customClose, customAssign, quote;

        // Hackish workaround for Firefox bug, https://bugzilla.mozilla.org/show_bug.cgi?id=369778
        if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
          if (args[3] === '') { delete args[3]; }
          if (args[4] === '') { delete args[4]; }
          if (args[5] === '') { delete args[5]; }
        }

        // Read assignment and value starting at group `index`; returns the quote character
        // used (`"`, `'`, or `''` for unquoted/missing values)
        function populate(index) {
          customAssign = args[index];
          value = args[index + 1];
          if (typeof value !== 'undefined') {
            return '"';
          }
          value = args[index + 2];
          if (typeof value !== 'undefined') {
            return '\'';
          }
          value = args[index + 3];
          if (typeof value === 'undefined' && fillAttrs.has(name)) {
            value = name;
          }
          return '';
        }

        // `j` walks the group offsets of the surround alternatives and ends at the plain one
        let j = 1;
        if (handler.customAttrSurround) {
          for (let i = 0, l = handler.customAttrSurround.length; i < l; i++, j += NCP) {
            name = args[j + 1];
            if (name) {
              quote = populate(j + 2);
              customOpen = args[j];
              customClose = args[j + 6];
              break;
            }
          }
        }

        if (!name && (name = args[j])) {
          quote = populate(j + 1);
        }

        return {
          name,
          value,
          customAssign: customAssign || '=',
          customOpen: customOpen || '',
          customClose: customClose || '',
          quote: quote || ''
        };
      });

      if (!unary) {
        stack.push({ tag: tagName, lowerTag: tagName.toLowerCase(), attrs });
        lastTag = tagName;
        unarySlash = '';
      }

      // Store attributes for `prevAttrs` tracking (used in whitespace collapsing)
      prevAttrs = attrs;

      if (handler.start) {
        await handler.start(tagName, attrs, unary, unarySlash);
      }
    }

    // Index of the innermost open `tagName` on the stack, or `-1`
    function findTag(tagName) {
      let pos;
      const needle = tagName.toLowerCase();
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].lowerTag === needle) {
          break;
        }
      }
      return pos;
    }

    // Handle an end tag: `tag` is its source text (empty for implicit closes), `tagName` the
    // element to close; without `tagName`, every open element is closed
    // Results of `handler.end` are awaited when thenable, so asynchronous handlers run in
    // document order like the other callbacks
    async function parseEndTag(tag, tagName) {
      let pos;

      // Find the closest opened tag of the same type
      if (tagName) {
        pos = findTag(tagName);
      } else {
        // If no tag name is provided, clean shop
        pos = 0;
      }

      if (pos >= 0) {
        // Close all the open elements, up the stack
        for (let i = stack.length - 1; i >= pos; i--) {
          if (handler.end) {
            const result = handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag);
            if (isThenable(result)) await result;
          }
        }

        // Remove the open elements from the stack
        stack.length = pos;
        lastTag = pos && stack[pos - 1].tag;
      } else if (handler.partialMarkup && tagName) {
        // In partial markup mode, preserve stray end tags
        if (handler.end) {
          const result = handler.end(tagName, [], false);
          if (isThenable(result)) await result;
        }
      } else if (tagName && tagName.toLowerCase() === 'br') {
        // A stray `</br>` is reported as a `br` start tag
        if (handler.start) {
          await handler.start(tagName, [], true, '');
        }
      } else if (tagName && tagName.toLowerCase() === 'p') {
        // A stray `</p>` is reported as an empty `p` element
        if (handler.start) {
          await handler.start(tagName, [], false, '', true);
        }
        if (handler.end) {
          const result = handler.end(tagName, []);
          if (isThenable(result)) await result;
        }
      }
    }
  }
}