UNPKG

html-minifier-next

Version:

Super-configurable and well-tested web page minifier (enhanced successor of HTML Minifier)

j9t.github.io/html-minifier-next/

j9t/html-minifier-next

1,278 lines (1,098 loc) • 176 kB

JavaScript

'use strict';

Object.defineProperty(exports, '__esModule', { value: true });

/**
 * Stringify for options signatures (sorted keys, shallow, nested objects)
 *
 * Object keys are emitted in sorted order, so two objects with the same
 * entries produce the same string regardless of insertion order.
 * Non-objects (and `null`) are delegated to `JSON.stringify`.
 *
 * @param {*} obj - Value to serialize
 * @returns {string|undefined} Serialized form (`undefined` whenever `JSON.stringify` returns `undefined`)
 */
function stableStringify(obj) {
  if (obj == null || typeof obj !== 'object') return JSON.stringify(obj);
  if (Array.isArray(obj)) return '[' + obj.map(stableStringify).join(',') + ']';
  const keys = Object.keys(obj).sort();
  let out = '{';
  for (let i = 0; i < keys.length; i++) {
    const k = keys[i];
    out += JSON.stringify(k) + ':' + stableStringify(obj[k]) + (i < keys.length - 1 ? ',' : '');
  }
  return out + '}';
}

/**
 * LRU cache for strings and promises
 *
 * Backed by a `Map`, whose iteration order is insertion order: the first key
 * is the least recently used one, the last key the most recently used one.
 */
class LRU {
  /**
   * @param {number} [limit=200] - Maximum number of entries kept
   */
  constructor(limit = 200) {
    this.limit = limit;
    this.map = new Map();
  }

  /**
   * Look up `key`; a hit is re-inserted so it becomes the most recent entry.
   * @param {*} key
   * @returns {*} Stored value, or `undefined` on a miss
   */
  get(key) {
    if (!this.map.has(key)) return undefined;
    const v = this.map.get(key);
    // Delete + set moves the entry to the end of the insertion order
    this.map.delete(key);
    this.map.set(key, v);
    return v;
  }

  /**
   * Store `value` under `key` as the most recent entry; once the size exceeds
   * `limit`, the oldest entry is dropped.
   * @param {*} key
   * @param {*} value
   */
  set(key, value) {
    if (this.map.has(key)) this.map.delete(key);
    this.map.set(key, value);
    if (this.map.size > this.limit) {
      const first = this.map.keys().next().value;
      this.map.delete(first);
    }
  }

  /**
   * Remove `key` (no-op when absent).
   * @param {*} key
   */
  delete(key) {
    this.map.delete(key);
  }
}

/**
 * Unique ID generator
 *
 * Produces `u` followed by a dash-less UUID and retries until the ID does not
 * occur inside `value`.
 *
 * @param {string} value - Text the ID must not appear in
 * @returns {string} Generated ID
 */
function uniqueId(value) {
  let id;
  do {
    id = 'u' + crypto.randomUUID().replace(/-/g, '');
  } while (value.includes(id));
  return id;
}

// Identity and transform functions

/**
 * @param {*} value
 * @returns {*} `value`, unchanged
 */
function identity(value) {
  return value;
}

/**
 * @param {*} value
 * @returns {boolean} Whether `value` is a non-null object with a `then` method
 */
function isThenable(value) {
  return value != null && typeof value === 'object' && typeof value.then === 'function';
}

/**
 * @param {string} value
 * @returns {string} `value` in lower case
 */
function lowercase(value) {
  return value.toLowerCase();
}

// Replace async helper

/**
 * Asynchronously replace matches in a string
 * @param {string} str - Input string
 * @param {RegExp} regex - Regular expression with global flag
 * @param {Function} asyncFn - Async function to process each match
 * @returns {Promise<string>} Processed string
 */
async function replaceAsync(str, regex, asyncFn) {
  // First pass: only collect one promise per match (the replace result is discarded)
  const promises = [];
  str.replace(regex, (match, ...args) => {
    promises.push(asyncFn(match, ...args));
  });
  const data = await Promise.all(promises);
  // Second pass: hand out the resolved values in match order; an index cursor
  // avoids the O(n) cost of `Array#shift()` per match
  let next = 0;
  return str.replace(regex, () => data[next++]);
}

/**
 * String patterns to RegExp conversion (for JSON config support)
 *
 * - Non-strings are returned as is
 * - Empty string = not configured (`undefined`)
 * - `/source/flags` strings become `new RegExp(source, flags)`
 * - Any other string is used as the pattern source, without flags
 *
 * @param {*} value
 * @returns {*} RegExp for string input, `undefined` for `''`, else `value`
 */
function parseRegExp(value) {
  if (typeof value !== 'string') return value;
  if (!value) return undefined; // Empty string = not configured
  const match = value.match(/^\/(.+)\/([dgimsuvy]*)$/);
  if (match) {
    return new RegExp(match[1], match[2]);
  }
  return new RegExp(value);
}

/*
 * HTML Parser By John Resig (ejohn.org)
 * Modified by Juriy “kangax” Zaytsev
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 */

/*
 * Use like so:
 *
 * HTMLParser(htmlString, {
 *   start: function(tag, attrs, unary) {},
 *   end: function(tag) {},
 *   chars: function(text) {},
 *   comment: function(text) {}
 * });
 */

/**
 * Set whose `has` lower-cases the queried string before the lookup.
 * Stored values are not normalized, so entries must be added in lower case.
 */
class CaseInsensitiveSet extends Set {
  has(str) {
    return super.has(str.toLowerCase());
  }
}

// Regular expressions for parsing tags and attributes
const singleAttrIdentifier = /([^\s"'<>/=]+)/;
const singleAttrAssigns = [/=/];
const singleAttrValues = [
  // Attr value double quotes
  /"([^"]*)"+/.source,
  // Attr value, single quotes
  /'([^']*)'+/.source,
  // Attr value, no quotes
  /([^ \t\n\f\r"'`=<>]+)/.source
];

// https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
const qnameCapture = (function () {
  // https://www.npmjs.com/package/ncname
  const combiningChar = '\u0300-\u0345\u0360\u0361\u0483-\u0486\u0591-\u05A1\u05A3-\u05B9\u05BB-\u05BD\u05BF\u05C1\u05C2\u05C4\u064B-\u0652\u0670\u06D6-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A02\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A70\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0B01-\u0B03\u0B3C\u0B3E-\u0B43\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B82\u0B83\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C82\u0C83\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0D02\u0D03\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86-\u0F8B\u0F90-\u0F95\u0F97\u0F99-\u0FAD\u0FB1-\u0FB7\u0FB9\u20D0-\u20DC\u20E1\u302A-\u302F\u3099\u309A';
  const digit = '0-9\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE7-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29';
  const extender = '\xB7\u02D0\u02D1\u0387\u0640\u0E46\u0EC6\u3005\u3031-\u3035\u309D\u309E\u30FC-\u30FE';
  const letter = 'A-Za-z\xC0-\xD6\xD8-\xF6\xF8-\u0131\u0134-\u013E\u0141-\u0148\u014A-\u017E\u0180-\u01C3\u01CD-\u01F0\u01F4\u01F5\u01FA-\u0217\u0250-\u02A8\u02BB-\u02C1\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D6\u03DA\u03DC\u03DE\u03E0\u03E2-\u03F3\u0401-\u040C\u040E-\u044F\u0451-\u045C\u045E-\u0481\u0490-\u04C4\u04C7\u04C8\u04CB\u04CC\u04D0-\u04EB\u04EE-\u04F5\u04F8\u04F9\u0531-\u0556\u0559\u0561-\u0586\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A\u0641-\u064A\u0671-\u06B7\u06BA-\u06BE\u06C0-\u06CE\u06D0-\u06D3\u06D5\u06E5\u06E6\u0905-\u0939\u093D\u0958-\u0961\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8B\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B36-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CDE\u0CE0\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60\u0D61\u0E01-\u0E2E\u0E30\u0E32\u0E33\u0E40-\u0E45\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD\u0EAE\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0F40-\u0F47\u0F49-\u0F69\u10A0-\u10C5\u10D0-\u10F6\u1100\u1102\u1103\u1105-\u1107\u1109\u110B\u110C\u110E-\u1112\u113C\u113E\u1140\u114C\u114E\u1150\u1154\u1155\u1159\u115F-\u1161\u1163\u1165\u1167\u1169\u116D\u116E\u1172\u1173\u1175\u119E\u11A8\u11AB\u11AE\u11AF\u11B7\u11B8\u11BA\u11BC-\u11C2\u11EB\u11F0\u11F9\u1E00-\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u212A\u212B\u212E\u2180-\u2182\u3007\u3021-\u3029\u3041-\u3094\u30A1-\u30FA\u3105-\u312C\u4E00-\u9FA5\uAC00-\uD7A3';
  // NCName: a letter or `_`, followed by any number of name characters
  const ncname = '[' + letter + '_][' + letter + digit + '\\.\\-_' + combiningChar + extender + ']*';
  // QName: optional `prefix:` followed by the local name, captured as one group
  return '((?:' + ncname + '\\:)?' + ncname + ')';
})();
const startTagOpen = new RegExp('^<' + qnameCapture);
const endTag = new RegExp('^</' + qnameCapture + '[^>]*>');

// Set to `true` when the engine reports an unmatched optional group as `''` instead of `undefined`
let IS_REGEX_CAPTURING_BROKEN = false;
'x'.replace(/x(.)?/g, function (m, g) {
  IS_REGEX_CAPTURING_BROKEN = g === '';
});

// Empty elements
const empty = new CaseInsensitiveSet(['area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']);

// Elements that you can, intentionally, leave open (and which close themselves)
const closeSelf = new CaseInsensitiveSet(['colgroup', 'dd', 'dt', 'li', 'option', 'p', 'td', 'tfoot', 'th', 'thead', 'tr', 'source']);

// Attributes that have their values filled in `disabled='disabled'`
const fillAttrs = new CaseInsensitiveSet(['checked', 'compact', 'declare', 'defer', 'disabled', 'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap', 'readonly', 'selected']);

// Special elements (can contain anything)
const special = new CaseInsensitiveSet(['script', 'style']);

// HTML elements, https://html.spec.whatwg.org/multipage/indices.html#elements-3
// Phrasing content, https://html.spec.whatwg.org/multipage/dom.html#phrasing-content
const nonPhrasing = new CaseInsensitiveSet(['address', 'article', 'aside', 'base', 'blockquote', 'body', 'caption', 'col', 'colgroup', 'dd', 'details', 'dialog', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'legend', 'li', 'menuitem', 'meta', 'ol', 'optgroup', 'option', 'param', 'rp', 'rt', 'source', 'style', 'summary', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track', 'ul']);

const reCache = {};

// Pre-compiled regexes for common special elements (`script`, `style`, `noscript`)
// These are used frequently, and pre-compiling them avoids regex creation overhead
const preCompiledStackedTags = {
  'script': /([\s\S]*?)<\/script[^>]*>/i,
  'style': /([\s\S]*?)<\/style[^>]*>/i,
  'noscript': /([\s\S]*?)<\/noscript[^>]*>/i
};

// Cache for compiled attribute regexes per handler configuration
const attrRegexCache = new WeakMap();

/**
 * O(n) helper: Strip all occurrences of `open…close` delimiters, keeping inner content
 * Used instead of a regex replace to avoid O(n²) behavior on adversarial inputs
 *
 * An `open` without a following `close` is left in place together with the
 * rest of the string.
 *
 * @param {string} str - Input string
 * @param {string} open - Opening delimiter
 * @param {string} close - Closing delimiter
 * @returns {string} `str` with every complete delimiter pair removed
 */
function stripDelimited(str, open, close) {
  // Two empty delimiters match at every position without consuming anything,
  // which would keep the scan below from ever advancing
  if (open === '' && close === '') return str;
  let result = '';
  let i = 0;
  while (i < str.length) {
    const start = str.indexOf(open, i);
    if (start === -1) {
      result += str.slice(i);
      break;
    }
    result += str.slice(i, start);
    const end = str.indexOf(close, start + open.length);
    if (end === -1) {
      // Unterminated delimiter: keep the remainder verbatim
      result += str.slice(start);
      break;
    }
    result += str.slice(start + open.length, end);
    i = end + close.length;
  }
  return result;
}

/**
 * Compile the `^`-anchored attribute regex for a handler configuration.
 *
 * The base pattern is an attribute name optionally followed by an assignment
 * and a double-quoted, single-quoted, or unquoted value. When
 * `handler.customAttrSurround` is set, one alternative per surround pair
 * (open pattern, attribute, close pattern) is tried before the plain pattern.
 *
 * @param {object} handler - Reads `customAttrSurround` (list of `[open, close]` RegExp pairs); passed on to `joinSingleAttrAssigns`
 * @returns {RegExp} Regex matching one attribute after optional leading whitespace
 */
function buildAttrRegex(handler) {
  let pattern = singleAttrIdentifier.source +
    '(?:\\s*(' + joinSingleAttrAssigns(handler) + ')' +
    '[ \\t\\n\\f\\r]*(?:' +
    singleAttrValues.join('|') + '))?';
  if (handler.customAttrSurround) {
    const attrClauses = [];
    for (let i = handler.customAttrSurround.length - 1; i >= 0; i--) {
      attrClauses[i] = '(?:' +
        '(' + handler.customAttrSurround[i][0].source + ')\\s*' +
        pattern +
        '\\s*(' + handler.customAttrSurround[i][1].source + ')' +
        ')';
    }
    // Plain (unsurrounded) attribute is the last alternative
    attrClauses.push('(?:' + pattern + ')');
    pattern = '(?:' + attrClauses.join('|') + ')';
  }
  return new RegExp('^\\s*' + pattern);
}

/**
 * Return the attribute regex for `handler`, compiling it on first use and
 * caching it in `attrRegexCache` keyed by the handler object.
 *
 * @param {object} handler
 * @returns {RegExp}
 */
function getAttrRegexForHandler(handler) {
  const cached = attrRegexCache.get(handler);
  if (cached) return cached;
  const compiled = buildAttrRegex(handler);
  attrRegexCache.set(handler, compiled);
  return compiled;
}

// Cache for sticky attribute regexes (`y` flag for position-based matching on full
string) const attrRegexStickyCache = new WeakMap(); function getAttrRegexStickyForHandler(handler) { let cached = attrRegexStickyCache.get(handler); if (cached) return cached; const nonSticky = getAttrRegexForHandler(handler); // Derive sticky version: Remove `^` anchor, add `y` flag const compiled = new RegExp(nonSticky.source.slice(1), 'y'); attrRegexStickyCache.set(handler, compiled); return compiled; } function joinSingleAttrAssigns(handler) { return singleAttrAssigns.concat( handler.customAttrAssign || [] ).map(function (assign) { return '(?:' + assign.source + ')'; }).join('|'); } // Number of captured parts per `customAttrSurround` pattern const NCP = 7; class HTMLParser { constructor(html, handler) { this.html = html; this.handler = handler; } async parse() { const handler = this.handler; const fullHtml = this.html; const fullLength = fullHtml.length; const stack = []; let lastTag; // Use cached attribute regex for this handler configuration const attribute = getAttrRegexForHandler(handler); const attributeY = getAttrRegexStickyForHandler(handler); let prevTag = undefined, nextTag = undefined; let prevAttrs = [], nextAttrs = []; // Sticky regex versions for position-based matching (avoids string slicing) const startTagOpenY = new RegExp(startTagOpen.source.slice(1), 'y'); // `\s*` with sticky flag is O(n) at worst—no retry from different positions possible const startTagCloseY = /\s*(\/?)>/y; const endTagY = new RegExp(endTag.source.slice(1), 'y'); const doctypeY = /<!DOCTYPE[^<>]+>/iy; const commentTestY = /', pos + 4); if (commentEnd >= 0) { if (handler.comment) { const result = handler.comment(fullHtml.substring(pos + 4, commentEnd)); if (isThenable(result)) await result; } advance(commentEnd + 3 - pos); prevTag = ''; prevAttrs = []; continue; } } // https://web.archive.org/web/20241201212701/https://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment conditionalTestY.lastIndex = pos; if (conditionalTestY.test(fullHtml)) { 
const conditionalEnd = fullHtml.indexOf(']>', pos + 3); if (conditionalEnd >= 0) { if (handler.comment) { const result = handler.comment(fullHtml.substring(pos + 2, conditionalEnd + 1), true /* Non-standard */); if (isThenable(result)) await result; } advance(conditionalEnd + 2 - pos); prevTag = ''; prevAttrs = []; continue; } } // Doctype doctypeY.lastIndex = pos; const doctypeMatch = doctypeY.exec(fullHtml); if (doctypeMatch) { if (handler.doctype) { handler.doctype(doctypeMatch[0]); } advance(doctypeMatch[0].length); prevTag = ''; prevAttrs = []; continue; } // End tag endTagY.lastIndex = pos; const endTagMatch = endTagY.exec(fullHtml); if (endTagMatch) { advance(endTagMatch[0].length); await parseEndTag(endTagMatch[0], endTagMatch[1]); prevTag = '/' + endTagMatch[1].toLowerCase(); prevAttrs = []; continue; } // Start tag const startTagMatch = parseStartTag(pos); if (startTagMatch) { advance(startTagMatch.advance); await handleStartTag(startTagMatch); prevTag = startTagMatch.tagName.toLowerCase(); continue; } // Treat `<` as text if (handler.continueOnParseError) ; } let text; if (textEnd >= 0) { text = fullHtml.substring(pos, textEnd); advance(textEnd - pos); } else { text = fullHtml.substring(pos); advance(fullLength - pos); } // Next tag for whitespace processing context if (handler.wantsNextTag) { const nextStartTagMatch = parseStartTag(pos); if (nextStartTagMatch) { nextTag = nextStartTagMatch.tagName; // Extract minimal attribute info for whitespace logic (just name/value pairs) nextAttrs = extractAttrInfo(nextStartTagMatch.attrs); cachedNextStartTag = { match: nextStartTagMatch, pos }; } else { endTagY.lastIndex = pos; const nextEndTagMatch = endTagY.exec(fullHtml); if (nextEndTagMatch) { nextTag = '/' + nextEndTagMatch[1]; nextAttrs = []; cachedNextEndTag = { match: nextEndTagMatch, pos }; } else { nextTag = ''; nextAttrs = []; } } } if (handler.chars) { const result = handler.chars(text, prevTag, nextTag, prevAttrs, nextAttrs); if (isThenable(result)) 
await result; } prevTag = ''; prevAttrs = []; } else { const stackedTag = lastTag.toLowerCase(); // Use pre-compiled regex for common tags (`script`, `style`, `noscript`) to avoid regex creation overhead const reStackedTag = preCompiledStackedTags[stackedTag] || reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)\\x3c/' + stackedTag + '[^>]*>', 'i')); const remaining = sliceFromPos(pos); const m = reStackedTag.exec(remaining); if (m && m.index === 0) { let text = m[1]; if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') { text = stripDelimited(stripDelimited(text, ''), '<![CDATA[', ']]>'); } if (handler.chars) { const result = handler.chars(text); if (isThenable(result)) await result; } // Advance HTML past the matched special tag content and its closing tag advance(m[0].length); await parseEndTag('</' + stackedTag + '>', stackedTag); } else { // No closing tag found; to avoid infinite loop, break similarly to previous behavior if (handler.continueOnParseError && handler.chars && pos < fullLength) { const result = handler.chars(fullHtml[pos], prevTag, '', prevAttrs, []); if (isThenable(result)) await result; advance(1); } else { break; } } } if (pos === lastPos) { if (handler.continueOnParseError) { // Skip the problematic character and continue if (handler.chars) { const result = handler.chars(fullHtml[pos], prevTag, '', prevAttrs, []); if (isThenable(result)) await result; } advance(1); prevTag = ''; prevAttrs = []; continue; } const loc = getLineColumn(pos); // Include some context before the error position so the snippet contains the offending markup plus preceding characters (e.g., `invalid<tag`) const CONTEXT_BEFORE = 50; const startPos = Math.max(0, pos - CONTEXT_BEFORE); const snippet = fullHtml.slice(startPos, startPos + 200).replace(/\n/g, ' '); throw new Error( `Parse error at line ${loc.line}, column ${loc.column}:\n${snippet}${fullHtml.length > startPos + 200 ? 
'…' : ''}` ); } } if (!handler.partialMarkup) { // Clean up any remaining tags await parseEndTag(); } // Helper to extract minimal attribute info (name/value pairs) from raw attribute matches // Used for whitespace collapsing logic—doesn’t need full processing function extractAttrInfo(rawAttrs) { if (!rawAttrs || !rawAttrs.length) return []; const numCustomParts = handler.customAttrSurround ? handler.customAttrSurround.length * NCP : 0; const baseIndex = 1 + numCustomParts; return rawAttrs.map(args => { // Extract attribute name (always at `baseIndex`) const name = args[baseIndex]; // Extract value from double-quoted (`baseIndex + 2`), single-quoted (`baseIndex + 3`), or unquoted (`baseIndex + 4`) const value = args[baseIndex + 2] ?? args[baseIndex + 3] ?? args[baseIndex + 4]; return { name: name?.toLowerCase(), value }; }).filter(attr => attr.name); // Filter out invalid entries } function parseStartTag(startPos) { startTagOpenY.lastIndex = startPos; const start = startTagOpenY.exec(fullHtml); if (start) { const match = { tagName: start[1], attrs: [], advance: 0 }; let consumed = start[0].length; let currentPos = startPos + consumed; let end, attr; // Safety limit: Max length of input to check for attributes // Protects against catastrophic backtracking on massive attribute values const MAX_ATTR_PARSE_LENGTH = 20000; // 20 KB should be enough for any reasonable tag while (true) { // Check for closing tag first (sticky regex—no slicing) startTagCloseY.lastIndex = currentPos; end = startTagCloseY.exec(fullHtml); if (end) { break; } // Limit the input length we pass to the regex to prevent catastrophic backtracking const remainingLen = fullLength - currentPos; const isLimited = remainingLen > MAX_ATTR_PARSE_LENGTH; if (!isLimited) { // Common case: Use sticky regex directly on full string (no slicing) attributeY.lastIndex = currentPos; attr = attributeY.exec(fullHtml); } else { const extractEndPos = currentPos + MAX_ATTR_PARSE_LENGTH; // Create a temporary substring 
only for attribute parsing (limited for safety) const searchStr = fullHtml.substring(currentPos, extractEndPos); attr = searchStr.match(attribute); // If we limited the input and got a match, check if the value might be truncated if (attr) { // Check if the attribute value extends beyond our search window const attrEnd = attr[0].length; // If the match ends near the limit, the value might be truncated if (attrEnd > MAX_ATTR_PARSE_LENGTH - 100) { // Manually extract this attribute to handle potentially huge value const manualMatch = searchStr.match(/^\s*([^\s"'<>/=]+)\s*=\s*/); if (manualMatch) { const quoteChar = searchStr[manualMatch[0].length]; if (quoteChar === '"' || quoteChar === "'") { const closeQuote = searchStr.indexOf(quoteChar, manualMatch[0].length + 1); if (closeQuote !== -1) { const fullAttrLen = closeQuote + 1; const numCustomParts = handler.customAttrSurround ? handler.customAttrSurround.length * NCP : 0; const baseIndex = 1 + numCustomParts; attr = []; attr[0] = searchStr.substring(0, fullAttrLen); attr[baseIndex] = manualMatch[1]; // Attribute name attr[baseIndex + 1] = '='; // `customAssign` (falls back to "=" for huge attributes) const value = searchStr.substring(manualMatch[0].length + 1, closeQuote); // Place value at correct index based on quote type if (quoteChar === '"') { attr[baseIndex + 2] = value; // Double-quoted value } else { attr[baseIndex + 3] = value; // Single-quoted value } currentPos += fullAttrLen; consumed += fullAttrLen; match.attrs.push(attr); continue; } } // Note: Unquoted attribute values are intentionally not handled here. // Per HTML spec, unquoted values cannot contain spaces or special chars, // making a 20 KB+ unquoted value practically impossible. If encountered, // it's malformed HTML and using the truncated regex match is acceptable. 
} } } if (!attr) { // If we limited the input and got no match, try manual extraction // This handles cases where quoted attributes exceed `MAX_ATTR_PARSE_LENGTH` const manualMatch = searchStr.match(/^\s*([^\s"'<>/=]+)\s*=\s*/); if (manualMatch) { const quoteChar = searchStr[manualMatch[0].length]; if (quoteChar === '"' || quoteChar === "'") { // Search in the full HTML (not limited substring) for closing quote const closeQuote = fullHtml.indexOf(quoteChar, currentPos + manualMatch[0].length + 1); if (closeQuote !== -1) { const fullAttrLen = closeQuote - currentPos + 1; const numCustomParts = handler.customAttrSurround ? handler.customAttrSurround.length * NCP : 0; const baseIndex = 1 + numCustomParts; attr = []; attr[0] = fullHtml.substring(currentPos, closeQuote + 1); attr[baseIndex] = manualMatch[1]; // Attribute name attr[baseIndex + 1] = '='; // customAssign const value = fullHtml.substring(currentPos + manualMatch[0].length + 1, closeQuote); // Place value at correct index based on quote type if (quoteChar === '"') { attr[baseIndex + 2] = value; // Double-quoted value } else { attr[baseIndex + 3] = value; // Single-quoted value } currentPos += fullAttrLen; consumed += fullAttrLen; match.attrs.push(attr); continue; } } } } } if (!attr) { break; } const attrLen = attr[0].length; currentPos += attrLen; consumed += attrLen; match.attrs.push(attr); } // Check for closing tag (sticky regex—no slicing) startTagCloseY.lastIndex = currentPos; end = startTagCloseY.exec(fullHtml); if (end) { match.unarySlash = end[1]; consumed += end[0].length; match.advance = consumed; return match; } } } function findTagInCurrentTable(tagName) { let pos; const needle = tagName.toLowerCase(); for (pos = stack.length - 1; pos >= 0; pos--) { const currentTag = stack[pos].lowerTag; if (currentTag === needle) { return pos; } // Stop searching if we hit a table boundary if (currentTag === 'table') { break; } } return -1; } async function parseEndTagAt(pos) { // Close all open elements up to 
`pos` (mirrors `parseEndTag`’s core branch) for (let i = stack.length - 1; i >= pos; i--) { if (handler.end) { await handler.end(stack[i].tag, stack[i].attrs, true); } } stack.length = pos; lastTag = pos && stack[pos - 1].tag; } async function closeIfFoundInCurrentTable(tagName) { const pos = findTagInCurrentTable(tagName); if (pos >= 0) { // Close at the specific index to avoid re-searching await parseEndTagAt(pos); return true; } return false; } async function handleStartTag(match) { const tagName = match.tagName; let unarySlash = match.unarySlash; if (lastTag === 'p' && nonPhrasing.has(tagName)) { await parseEndTag('', lastTag); } else if (tagName === 'tbody') { if (!await closeIfFoundInCurrentTable('tfoot')) { await closeIfFoundInCurrentTable('thead'); } } else if (tagName === 'tfoot') { if (!await closeIfFoundInCurrentTable('tbody')) { await closeIfFoundInCurrentTable('thead'); } } else if (tagName === 'thead') { // If a `tbody` or `tfoot` is open in the current table, close it if (!await closeIfFoundInCurrentTable('tbody')) { await closeIfFoundInCurrentTable('tfoot'); } } if (tagName === 'col' && findTagInCurrentTable('colgroup') < 0) { lastTag = 'colgroup'; stack.push({ tag: lastTag, lowerTag: 'colgroup', attrs: [] }); if (handler.start) { await handler.start(lastTag, [], false, '', true); } } else if (tagName !== 'col' && lastTag === 'colgroup') { // Auto-close synthetic `<colgroup>` when a non-`col` element starts await parseEndTag('', 'colgroup'); } if (closeSelf.has(tagName) && lastTag === tagName) { await parseEndTag('', tagName); } // Handle `dt`/`dd` cross-closing: `dt` followed by `dd`, or `dd` followed by `dt` if ((tagName === 'dt' || tagName === 'dd') && (lastTag === 'dt' || lastTag === 'dd')) { await parseEndTag('', lastTag); } const unary = empty.has(tagName) || (tagName === 'html' && lastTag === 'head') || !!unarySlash; const attrs = match.attrs.map(function (args) { let name, value, customOpen, customClose, customAssign, quote; // Hackish 
workaround for Firefox bug, https://bugzilla.mozilla.org/show_bug.cgi?id=369778 if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) { if (args[3] === '') { delete args[3]; } if (args[4] === '') { delete args[4]; } if (args[5] === '') { delete args[5]; } } function populate(index) { customAssign = args[index]; value = args[index + 1]; if (typeof value !== 'undefined') { return '"'; } value = args[index + 2]; if (typeof value !== 'undefined') { return '\''; } value = args[index + 3]; if (typeof value === 'undefined' && fillAttrs.has(name)) { value = name; } return ''; } let j = 1; if (handler.customAttrSurround) { for (let i = 0, l = handler.customAttrSurround.length; i < l; i++, j += NCP) { name = args[j + 1]; if (name) { quote = populate(j + 2); customOpen = args[j]; customClose = args[j + 6]; break; } } } if (!name && (name = args[j])) { quote = populate(j + 1); } return { name, value, customAssign: customAssign || '=', customOpen: customOpen || '', customClose: customClose || '', quote: quote || '' }; }); if (!unary) { stack.push({ tag: tagName, lowerTag: tagName.toLowerCase(), attrs }); lastTag = tagName; unarySlash = ''; } // Store attributes for `prevAttrs` tracking (used in whitespace collapsing) prevAttrs = attrs; if (handler.start) { await handler.start(tagName, attrs, unary, unarySlash); } } function findTag(tagName) { let pos; const needle = tagName.toLowerCase(); for (pos = stack.length - 1; pos >= 0; pos--) { if (stack[pos].lowerTag === needle) { break; } } return pos; } async function parseEndTag(tag, tagName) { let pos; // Find the closest opened tag of the same type if (tagName) { pos = findTag(tagName); } else { // If no tag name is provided, clean shop pos = 0; } if (pos >= 0) { // Close all the open elements, up the stack for (let i = stack.length - 1; i >= pos; i--) { if (handler.end) { handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag); } } // Remove the open elements from the stack stack.length = pos; lastTag = pos && 
stack[pos - 1].tag; } else if (handler.partialMarkup && tagName) { // In partial markup mode, preserve stray end tags if (handler.end) { handler.end(tagName, [], false); } } else if (tagName && tagName.toLowerCase() === 'br') { if (handler.start) { await handler.start(tagName, [], true, ''); } } else if (tagName && tagName.toLowerCase() === 'p') { if (handler.start) { await handler.start(tagName, [], false, '', true); } if (handler.end) { handler.end(tagName, []); } } } } } class Sorter { sort(tokens, fromIndex = 0) { for (let i = 0, len = this.keys.length; i < len; i++) { const token = this.keys[i]; // Single pass: Count matches and collect non-matches let matchCount = 0; const others = []; for (let j = fromIndex; j < tokens.length; j++) { if (tokens[j] === token) { matchCount++; } else { others.push(tokens[j]); } } if (matchCount > 0) { // Rebuild: `matchCount` instances of token first, then others let writeIdx = fromIndex; for (let j = 0; j < matchCount; j++) { tokens[writeIdx++] = token; } for (let j = 0; j < others.length; j++) { tokens[writeIdx++] = others[j]; } const newFromIndex = fromIndex + matchCount; return this.sorterMap.get(token).sort(tokens, newFromIndex); } } return tokens; } } class TokenChain { constructor() { // Use map instead of object properties for better performance this.map = new Map(); } add(tokens) { tokens.forEach((token) => { if (!this.map.has(token)) { this.map.set(token, { arrays: [], processed: 0 }); } this.map.get(token).arrays.push(tokens); }); } createSorter() { const sorter = new Sorter(); sorter.sorterMap = new Map(); // Convert map entries to array and sort by frequency (descending), then alphabetically const entries = Array.from(this.map.entries()).sort((a, b) => { const m = a[1].arrays.length; const n = b[1].arrays.length; // Sort by length descending (larger first) const lengthDiff = n - m; if (lengthDiff !== 0) return lengthDiff; // If lengths equal, sort by key ascending return a[0].localeCompare(b[0]); }); sorter.keys = 
/**
 * Preset configurations
 *
 * Presets provide curated option sets for common use cases:
 * - `conservative`: Safe minification suitable for most projects
 * - `comprehensive`: Aggressive minification for maximum file size reduction
 */
const presets = {
  conservative: {
    caseSensitive: true,
    collapseBooleanAttributes: true,
    collapseWhitespace: true,
    conservativeCollapse: true,
    preserveLineBreaks: true,
    processConditionalComments: true,
    removeComments: true,
    removeScriptTypeAttributes: true,
    removeStyleLinkTypeAttributes: true,
    useShortDoctype: true
  },
  comprehensive: {
    collapseAttributeWhitespace: true,
    collapseBooleanAttributes: true,
    collapseWhitespace: true,
    continueOnParseError: true,
    decodeEntities: true,
    mergeScripts: true,
    minifyCSS: true,
    minifyJS: true,
    minifySVG: true,
    minifyURLs: true,
    processConditionalComments: true,
    removeAttributeQuotes: true,
    removeComments: true,
    removeEmptyAttributes: true,
    removeOptionalTags: true,
    removeRedundantAttributes: true,
    removeScriptTypeAttributes: true,
    removeStyleLinkTypeAttributes: true,
    useShortDoctype: true
  }
};

/**
 * Get preset configuration by name
 *
 * The lookup is case-insensitive and only considers the presets defined above:
 * inherited `Object.prototype` keys such as `constructor` or `__proto__` are
 * reported as “not found” instead of leaking built-ins to the caller.
 *
 * @param {string} name - Preset name (“conservative” or “comprehensive”)
 * @returns {object|null} Preset options object or null if not found
 */
function getPreset(name) {
  if (!name) return null;

  // `String()` keeps truthy non-string input (e.g., a number) from throwing on `.toLowerCase()`
  const normalizedName = String(name).toLowerCase();

  // Own-property check: a bare `presets[key]` would also resolve prototype members
  return Object.hasOwn(presets, normalizedName) ? presets[normalizedName] : null;
}

/**
 * Get list of available preset names
 * @returns {string[]} Array of preset names
 */
function getPresetNames() {
  return Object.keys(presets);
}

// Regex patterns (to avoid repeated allocations in hot paths)
// Note: The `g`-flagged patterns keep `lastIndex` state when used with `test()`/`exec()`

// Leading/trailing runs of space, line feed, carriage return, tab, or form feed
const RE_WS_START = /^[ \n\r\t\f]+/;
const RE_WS_END = /[ \n\r\t\f]+$/;

// Runs of the same whitespace characters plus no-break space
const RE_ALL_WS_NBSP = /[ \n\r\t\f\xA0]+/g;

// No-break space grouping
const RE_NBSP_LEADING_GROUP = /(^|\xA0+)[^\xA0]+/g;
const RE_NBSP_LEAD_GROUP = /(\xA0+)[^\xA0]+/g;
const RE_NBSP_TRAILING_GROUP = /[^\xA0]+(\xA0+)/g;
const RE_NBSP_TRAILING_STRIP = /[^\xA0]+$/;

// Text starting with `[if …]` or ending with `[endif]`
const RE_CONDITIONAL_COMMENT = /^\[if\s[^\]]+]|\[endif]$/;

// `on` followed by at least three lowercase ASCII letters
const RE_EVENT_ATTR_DEFAULT = /^on[a-z]{3,}$/;

// Non-empty value without whitespace, quotes, backtick, `=`, `<`, or `>`
const RE_CAN_REMOVE_ATTR_QUOTES = /^[^ \t\n\f\r"'`=<>]+$/;

const RE_TRAILING_SEMICOLON = /;$/;

// `&` followed by a semicolon-terminated entity body (captured without the `&`)
const RE_AMP_ENTITY = /&(#?[0-9a-zA-Z]+;)/g;

// `&` followed by either a listed entity name that is not terminated by `;`, or a semicolon-terminated entity body
const RE_LEGACY_ENTITIES = /&((?:Iacute|aacute|uacute|plusmn|Otilde|otilde|agrave|Agrave|Yacute|yacute|Oslash|oslash|atilde|Atilde|brvbar|ccedil|Ccedil|Ograve|curren|divide|eacute|Eacute|ograve|Oacute|egrave|Egrave|Ugrave|frac12|frac14|frac34|ugrave|oacute|iacute|Ntilde|ntilde|Uacute|middot|igrave|Igrave|iquest|Aacute|cedil|laquo|micro|iexcl|Icirc|icirc|acirc|Ucirc|Ecirc|ocirc|Ocirc|ecirc|ucirc|Aring|aring|AElig|aelig|acute|pound|raquo|Acirc|times|THORN|szlig|thorn|COPY|auml|ordf|ordm|Uuml|macr|uuml|Auml|ouml|Ouml|para|nbsp|euml|quot|QUOT|Euml|yuml|cent|sect|copy|sup1|sup2|sup3|iuml|Iuml|ETH|shy|reg|not|yen|amp|AMP|REG|uml|eth|deg|gt|GT|LT|lt)(?!;)|(?:#?[0-9a-zA-Z]+;))/g;

const RE_ESCAPE_LT = /</g;

// Attribute value whitespace: detect, collapse runs, trim both ends
const RE_ATTR_WS_CHECK = /[ \n\r\t\f]/;
const RE_ATTR_WS_COLLAPSE = /[ \n\r\t\f]+/g;
const RE_ATTR_WS_TRIM = /^[ \n\r\t\f]+|[ \n\r\t\f]+$/g;

// Inline element sets for whitespace handling

// Non-empty elements that will maintain whitespace around them
const inlineElementsToKeepWhitespaceAround = new Set([
  'a', 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'big', 'button', 'cite', 'code',
  'del', 'dfn', 'em', 'font', 'i', 'img', 'input', 'ins', 'kbd', 'label',
  'mark', 'math', 'meter', 'nobr', 'object', 'output', 'progress', 'q', 'rb', 'rp',
  'rt', 'rtc', 'ruby', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong',
  'sub', 'sup', 'svg', 'textarea', 'time', 'tt', 'u', 'var', 'wbr'
]);

// Non-empty elements that will maintain whitespace within them
const inlineElementsToKeepWhitespaceWithin = new Set([
  'a', 'abbr', 'acronym', 'b', 'big', 'del', 'em', 'font', 'i', 'ins',
  'kbd', 'mark', 'nobr', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub',
  'sup', 'time', 'tt', 'u', 'var'
]);

// Elements that will always maintain whitespace around them
const inlineElementsToKeepWhitespace = new Set(['comment', 'img', 'input', 'wbr']);

// Form control elements (for conditional whitespace collapsing)
const formControlElements = new Set(['input', 'button', 'select', 'textarea', 'output', 'meter', 'progress']);

// Default attribute values

// Default attribute values (could apply to any element)
const generalDefaults = {
  autocorrect: 'on',
  fetchpriority: 'auto',
  loading: 'eager',
  popovertargetaction: 'toggle'
};

// Tag-specific default attribute values
const tagDefaults = {
  area: { shape: 'rect' },
  button: { type: 'submit' },
  form: {
    enctype: 'application/x-www-form-urlencoded',
    method: 'get'
  },
  html: { dir: 'ltr' },
  img: { decoding: 'auto' },
  input: {
    colorspace: 'limited-srgb',
    type: 'text'
  },
  link: { media: 'all' },
  marquee: {
    behavior: 'scroll',
    direction: 'left'
  },
  meta: { media: 'all' },
  source: { media: 'all' },
  style: { media: 'all' },
  textarea: { wrap: 'soft' },
  track: { kind: 'subtitles' }
};

// Script MIME types
// https://mathiasbynens.be/demo/javascript-mime-type
// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/script
const executableScriptsMimetypes = new Set([
  'text/javascript',
  'text/x-javascript',
  'text/ecmascript',
  'text/x-ecmascript',
  'text/jscript',
  'application/javascript',
  'application/x-javascript',
  'application/ecmascript',
  'application/x-ecmascript',
  'module'
]);

const keepScriptsMimetypes = new Set([
  'module'
]);
// Boolean attribute sets

// Attribute names listed as simple boolean attributes
const isSimpleBoolean = new Set([
  'allowfullscreen', 'async', 'autofocus', 'autoplay', 'checked', 'compact', 'controls', 'declare',
  'default', 'defaultchecked', 'defaultmuted', 'defaultselected', 'defer', 'disabled', 'enabled',
  'formnovalidate', 'hidden', 'indeterminate', 'inert', 'ismap', 'itemscope', 'loop', 'multiple',
  'muted', 'nohref', 'noresize', 'noshade', 'novalidate', 'nowrap', 'open', 'pauseonexit', 'readonly',
  'required', 'reversed', 'scoped', 'seamless', 'selected', 'sortable', 'truespeed', 'typemustmatch',
  'visible'
]);

const isBooleanValue = new Set(['true', 'false']);

// Attributes where certain values can be collapsed to just the attribute name;
// maps each attribute name to the set of values that collapse to the bare attribute:
// - `crossorigin=""` and `crossorigin="anonymous"` → `crossorigin` (anonymous is the default)
// - `contenteditable=""` → `contenteditable` (empty string means inherit/true)
const collapsibleValues = new Map([
  ['crossorigin', new Set(['', 'anonymous'])],
  ['contenteditable', new Set([''])]
]);

// `srcset` elements
const srcsetElements = new Set(['img', 'source']);

// JSON script types
const jsonScriptTypes = new Set([
  'application/json',
  'application/ld+json',
  'application/manifest+json',
  'application/vnd.geo+json',
  'application/problem+json',
  'application/merge-patch+json',
  'application/json-patch+json',
  'importmap',
  'speculationrules',
]);

// Tag omission rules and element sets

// Tag omission rules from https://html.spec.whatwg.org/multipage/syntax.html#optional-tags with the following extensions:
// - retain `<body>` if followed by `<noscript>`
// - `<rb>`, `<rt>`, `<rtc>`, `<rp>` follow HTML Ruby Markup Extensions draft (https://www.w3.org/TR/html-ruby-extensions/)
// - retain all tags which are adjacent to non-standard HTML tags
const optionalStartTags = new Set(['html', 'head', 'body', 'colgroup', 'tbody']);
const optionalEndTags = new Set([
  'html', 'head', 'body', 'li', 'dt', 'dd', 'p', 'rb', 'rt', 'rtc', 'rp',
  'optgroup', 'option', 'colgroup', 'caption', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th'
]);
const headerElements = new Set(['meta', 'link', 'script', 'style', 'template', 'noscript']);
const descriptionElements = new Set(['dt', 'dd']);
const pBlockElements = new Set([
  'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'div', 'dl',
  'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
  'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu',
  'nav', 'ol', 'p', 'pre', 'search', 'section', 'table', 'ul'
]);
const pInlineElements = new Set(['a', 'audio', 'del', 'ins', 'map', 'noscript', 'video']);
const rubyEndTagOmission = new Set(['rb', 'rt', 'rtc', 'rp']); // `</rb>`, `</rt>`, `</rp>` can be omitted if followed by `<rb>`, `<rt>`, `<rtc>`, or `<rp>`
const rubyRtcEndTagOmission = new Set(['rb', 'rtc']); // `</rtc>` can be omitted if followed by `<rb>` or `<rtc>` (not `<rt>` or `<rp>`)
const optionElements = new Set(['option', 'optgroup']);
const tableContentElements = new Set(['tbody', 'tfoot']);
const tableSectionElements = new Set(['thead', 'tbody', 'tfoot']);
const cellElements = new Set(['td', 'th']);
const topLevelElements = new Set(['html', 'head', 'body']);
// NOTE(review): the three groupings below are presumably consumed by the tag omission logic; their use is not visible in this part of the file
const compactElements = new Set(['html', 'body']);
const looseElements = new Set(['head', 'colgroup', 'caption']);
const trailingElements = new Set(['dt', 'thead']);

// Tag names treated as HTML elements (see “non-standard HTML tags” in the omission rules above)
const htmlElements = new Set([
  'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio',
  'b', 'base', 'basefont', 'bdi', 'bdo', 'bgsound', 'big', 'blink', 'blockquote', 'body', 'br', 'button',
  'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', 'content',
  'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt',
  'element', 'em', 'embed',
  'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html',
  'i', 'iframe', 'image', 'img', 'input', 'ins', 'isindex',
  'kbd', 'keygen',
  'label', 'legend', 'li', 'link', 'listing',
  'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol',
  'nav', 'nobr', 'noembed', 'noframes', 'noscript',
  'object', 'ol', 'optgroup', 'option', 'output',
  'p', 'param', 'picture', 'plaintext', 'pre', 'progress',
  'q',
  'rb', 'rp', 'rt', 'rtc', 'ruby',
  's', 'samp', 'script', 'search', 'section', 'select', 'selectedcontent', 'shadow', 'small', 'source',
  'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup',
  'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'track', 'tt',
  'u', 'ul',
  'var', 'video',
  'wbr',
  'xmp'
]);

// Empty attribute regex
// Matches exactly: `class`, `id`, `style`, `title`, `lang`, `dir`, and the listed `on…` event handler names
const reEmptyAttribute = /^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/;

// Special content elements
const specialContentElements = new Set(['script', 'style']);

// Imports

// Trim whitespace
/**
 * Remove leading and trailing runs of space, line feed, carriage return, tab,
 * and form feed. Other characters (including no-break space) are never trimmed.
 *
 * Uses a linear scan from both ends rather than an end-anchored regex: a
 * pattern like `[ \n\r\t\f]+$` is retried from every position of a long
 * interior whitespace run and degrades to quadratic time.
 *
 * @param {string} str - Input string
 * @returns {string} Trimmed string; falsy input and input without surrounding whitespace are returned unchanged
 */
const trimWhitespace = str => {
  if (!str) return str;

  const WS_CHARS = ' \n\r\t\f';
  let start = 0;
  let end = str.length;

  while (start < end && WS_CHARS.includes(str[start])) {
    start++;
  }
  while (end > start && WS_CHARS.includes(str[end - 1])) {
    end--;
  }

  // Fast path: Nothing to trim, hand back the original value
  if (start === 0 && end === str.length) {
    return str;
  }
  return str.slice(start, end);
};

// Collapse all whitespace
/**
 * Collapse every run of whitespace (space, line feed, carriage return, tab,
 * form feed, no-break space) in a string.
 *
 * - A run that is exactly one tab is preserved as a tab
 * - A run without no-break space becomes a single space
 * - In a run containing no-break space, the no-break spaces are kept and each
 *   stretch of other whitespace that starts the run or follows no-break space
 *   becomes a single space
 *
 * Relies on the module-level `RE_ALL_WS_NBSP` and `RE_NBSP_LEADING_GROUP` patterns.
 *
 * @param {string} str - Input string
 * @returns {string} String with whitespace runs collapsed; falsy input is returned unchanged
 */
function collapseWhitespaceAll(str) {
  if (!str) return str;

  // Fast path: If there are no common whitespace characters, return early
  if (!/[ \n\r\t\f\xA0]/.test(str)) {
    return str;
  }

  // No-break space is specifically handled inside the replacer function here:
  return str.replace(RE_ALL_WS_NBSP, (spaces) => {
    // Preserve standalone tabs
    if (spaces === '\t') return '\t';

    // Fast path: No no-break space, common case—just collapse to single space
    // This avoids the nested regex for the majority of cases
    if (!spaces.includes('\xA0')) return ' ';

    // For no-break space handling, use the original regex approach
    return spaces.replace(RE_NBSP_LEADING_GROUP, '$1 ');
  });
}

// Collapse whitespace
with options function collapseWhitespace(str, options, trimLeft, trimRight, collapseAll) { let lineBreakBefore = ''; let lineBreakAfter = ''; if (!str) return str; // Fast path: Nothing to do if (!trimLeft && !trimRight && !collapseAll && !options.preserveLineBreaks) { return str; } // Fast path: No whitespace at all if (!/[ \n\r\t\f\xA0]/.test(str)) { return str; } if (options.preserveLineBreaks) { // Find leading/trailing whitespace containing line breaks manually // (avoids polynomial backtracking with end-anchored lazy quantifiers) const WS_CHARS = ' \n\r\t\f'; let leadEnd = 0; while (leadEnd < str.length && WS_CHARS.includes(str[leadEnd])) { leadEnd++; } if (leadEnd > 0) { const leading = str.slice(0, leadEnd); if (/[\n\r]/.test(leading)) { lineBreakBefore = '\n'; str = str.slice(leadEnd);