UNPKG

@humanspeak/svelte-markdown

Version:

Markdown and HTML renderer for Svelte 5 — built for rendering streaming AI agent output from Claude Code, ChatGPT, and agentic workflows. XSS-safe defaults, streaming-aware sanitization, token caching, TypeScript types, and Svelte 5 runes.

619 lines (618 loc) 22.3 kB
import * as htmlparser2 from 'htmlparser2';

/**
 * Matches HTML tags with comprehensive coverage of edge cases.
 * Pattern breakdown:
 * - <\/?          : Matches opening < and optional /
 * - [a-zA-Z]      : Tag must start with letter
 * - [a-zA-Z0-9-]  : Subsequent chars can be letters, numbers, or hyphens
 * - (?:\s+[^>]*)? : Optional attributes
 * - >             : Closing bracket
 *
 * Note that the compact self-closing form without a space (`<br/>`) does
 * not match, because a `/` is only accepted inside the attribute section.
 *
 * @const {RegExp}
 */
const htmlTagRegex = /<\/?([a-zA-Z][a-zA-Z0-9-]{0,})(?:\s+[^>]*)?>/;

/**
 * Regex pattern for self-closing (void) HTML tag names.
 * Anchored and case-insensitive; it has no `g` flag, so `.test()` is stateless.
 *
 * @const {RegExp}
 */
const SELF_CLOSING_TAGS =
    /^(br|hr|img|input|link|meta|area|base|col|embed|keygen|param|source|track|wbr)$/i;

/**
 * Analyzes a string to determine if it contains an HTML tag and its characteristics.
 *
 * The first tag found anywhere in `raw` is the one that is reported, and the
 * opening/closing classification is taken from that same tag, so leading
 * whitespace or text in front of the tag does not affect the result.
 *
 * @param {string} raw - Raw string potentially containing an HTML tag
 * @returns {Object|null} Returns null if no tag found, otherwise returns:
 *   {
 *     tag: string      - The name of the HTML tag
 *     isOpening: bool  - True if opening tag, false if closing
 *   }
 *
 * @example
 * isHtmlOpenTag('<div class="test">') // Returns { tag: 'div', isOpening: true }
 * isHtmlOpenTag('</span>')            // Returns { tag: 'span', isOpening: false }
 * isHtmlOpenTag('  </span>')          // Returns { tag: 'span', isOpening: false }
 * isHtmlOpenTag('plain text')         // Returns null
 */
export const isHtmlOpenTag = (raw) => {
    const match = htmlTagRegex.exec(raw);
    if (!match) return null;
    // Classify from the matched tag text, not from the start of `raw`:
    // an indented closing tag such as `  </div>` must still count as closing.
    return { tag: match[1], isOpening: !match[0].startsWith('</') };
};

/**
 * Formats individual HTML tokens to ensure self-closing tags are properly formatted.
 * This handles cases like <br> -> <br/> without affecting the token structure.
 *
 * Tokens whose tag is not in `SELF_CLOSING_TAGS` (or that contain no tag at
 * all) are returned untouched. Trailing whitespace after the closing `>`
 * (for example the newline of a block-level token) is preserved.
 *
 * @param {Token} token - HTML token to format
 * @returns {Token} Formatted token with proper self-closing syntax
 */
const formatSelfClosingHtmlToken = (token) => {
    // Extract tag name from raw HTML
    const tagMatch = token.raw.match(/<\/?([a-zA-Z][a-zA-Z0-9-]*)/i);
    if (!tagMatch) return token;

    const tagName = tagMatch[1];
    if (!SELF_CLOSING_TAGS.test(tagName)) return token;

    // Self-closing tags get `.tag` and `.attributes` set so downstream
    // code (pairing, dispatch, sanitization) has structured access. If
    // the source already used the `<.../>` form we keep raw as-is;
    // otherwise we normalize the `>` to `/>`. Both checks tolerate
    // trailing whitespace so `<hr>\n` is normalized like `<hr>`.
    const alreadySelfClosed = /\/>\s*$/.test(token.raw);
    const formattedRaw = alreadySelfClosed
        ? token.raw
        : token.raw.replace(/\s*>(\s*)$/, '/>$1');

    return {
        ...token,
        raw: formattedRaw,
        tag: tagName,
        attributes: extractAttributes(token.raw)
    };
};

/**
 * Parses HTML attributes from a tag string into a structured object.
 * Handles both single and double quoted attributes, plus bare boolean
 * attributes. Quoted regions are stripped before the boolean pass so
 * space-separated words inside a value (e.g. `bar` in `title="foo bar
 * baz"`) aren't mistakenly harvested as boolean attributes (issue #297).
 *
 * A quoted value ends only at the same quote character that opened it, so
 * an apostrophe inside a double-quoted value (or the reverse) is part of the
 * value. A value whose closing quote is missing runs to the end of `raw`.
 * Unquoted values (`type=checkbox`) are not extracted.
 *
 * @param raw - Raw HTML tag string containing attributes.
 * @returns Map of attribute names to their values. Quoted values are
 *   trimmed. Boolean attributes are represented as `''`.
 *
 * @example
 * extractAttributes('<div class="foo" id="bar">')
 * // → { class: 'foo', id: 'bar' }
 *
 * extractAttributes('<Tip title="foo bar baz">')
 * // → { title: 'foo bar baz' }  // not { title: …, bar: '' }
 *
 * extractAttributes('<Tip title="it\'s here">')
 * // → { title: "it's here" }
 *
 * extractAttributes('<input type="checkbox" checked disabled>')
 * // → { type: 'checkbox', checked: '', disabled: '' }
 *
 * @internal
 */
export const extractAttributes = (raw) => {
    const attributes = {};

    // First pass: handle regular and unclosed quoted attributes. Each quote
    // style gets its own alternative so the value can contain the other
    // quote character. Group 2 holds a double-quoted value, group 3 a
    // single-quoted one.
    const quotedRegex = /([a-zA-Z][\w-]*)=(?:"([^"]*)(?:"|$)|'([^']*)(?:'|$))/g;
    let match;
    while ((match = quotedRegex.exec(raw)) !== null) {
        const key = match[1];
        const value = match[2] !== undefined ? match[2] : match[3];
        attributes[key] = value.trim();
    }

    // Strip quoted attribute blocks before the boolean pass so word-like
    // tokens inside a value (e.g. `bar` in `title="foo bar baz"`) aren't
    // mistakenly harvested as boolean attributes. The same pattern is
    // reused so both passes always agree on where a value ends.
    const stripped = raw.replace(quotedRegex, ' ');

    // Second pass: handle boolean attributes. The name must be preceded by
    // whitespace (or start of input) and followed by whitespace, `/`, `>`
    // or end of input, so `<input checked/>` is recognized as well.
    const booleanRegex = /(?:^|\s)([a-zA-Z][\w-]*?)(?=[\s/>]|$)/g;
    while ((match = booleanRegex.exec(stripped)) !== null) {
        const key = match[1];
        if (key && !attributes[key]) {
            attributes[key] = '';
        }
    }

    return attributes;
};

/**
 * Serializes an HTML attribute map into a string for tag construction.
 * Escapes double quotes in values to prevent attribute injection.
 *
 * @param {Record<string, string>} attributes - Map of attribute names to values
 * @returns {string} Serialized attributes string with leading spaces
 *
 * @example
 * serializeAttributes({ class: 'foo', id: 'bar' })
 * // Returns ' class="foo" id="bar"'
 *
 * @internal
 */
const serializeAttributes = (attributes) =>
    Object.entries(attributes)
        .map(([key, value]) => ` ${key}="${value.replace(/"/g, '&quot;')}"`)
        .join('');

/**
 * Converts an HTML string into a sequence of tokens using htmlparser2.
 * Handles complex nested structures while maintaining proper order and relationships.
 *
 * Key features:
 * - Preserves original HTML structure without automatic tag closing
 * - Handles self-closing tags with proper XML syntax (e.g., <br/> instead of <br>)
 * - Gracefully handles malformed HTML by preserving the original structure
 * - Maintains attribute information in opening tags
 * - Processes text content between tags
 *
 * @param {string} html - HTML string to be parsed
 * @returns {Token[]} Array of tokens representing the HTML structure
 *
 * @example
 * // Well-formed HTML
 * parseHtmlBlock('<div>Hello <span>world</span></div>')
 * // Returns [
 * //   { type: 'html', raw: '<div>', ... },
 * //   { type: 'text', raw: 'Hello ', ... },
 * //   { type: 'html', raw: '<span>', ... },
 * //   { type: 'text', raw: 'world', ... },
 * //   { type: 'html', raw: '</span>', ... },
 * //   { type: 'html', raw: '</div>', ... }
 * // ]
 *
 * // Self-closing tags
 * parseHtmlBlock('<div>Before<br/>After</div>')
 * // Returns [
 * //   { type: 'html', raw: '<div>', ... },
 * //   { type: 'text', raw: 'Before', ... },
 * //   { type: 'html', raw: '<br/>', ... },
 * //   { type: 'text', raw: 'After', ... },
 * //   { type: 'html', raw: '</div>', ... }
 * // ]
 *
 * // Malformed HTML
 * parseHtmlBlock('<div>Unclosed')
 * // Returns [
 * //   { type: 'html', raw: '<div>', ... },
 * //   { type: 'text', raw: 'Unclosed', ... }
 * // ]
 *
 * @internal
 */
export const parseHtmlBlock = (html) => {
    const tokens = [];
    const openTags = [];
    let currentText = '';

    // Emits the buffered text as a token. A whitespace-only buffer is
    // neither emitted nor cleared, so that whitespace stays in front of
    // whatever text arrives next.
    const flushText = () => {
        if (!currentText.trim()) return;
        tokens.push({ type: 'text', raw: currentText, text: currentText });
        currentText = '';
    };

    const parser = new htmlparser2.Parser(
        {
            onopentag: (name, attributes) => {
                flushText();
                if (SELF_CLOSING_TAGS.test(name)) {
                    tokens.push({
                        type: 'html',
                        raw: `<${name}${serializeAttributes(attributes)}/>`,
                        tag: name,
                        attributes
                    });
                    return;
                }
                openTags.push(name);
                tokens.push({
                    type: 'html',
                    raw: `<${name}${serializeAttributes(attributes)}>`,
                    tag: name,
                    attributes
                });
            },
            ontext: (text) => {
                currentText += text;
            },
            onclosetag: (name) => {
                flushText();
                // Only add closing tag if we found its opening tag
                // and it's not a self-closing tag
                if (!openTags.includes(name) || SELF_CLOSING_TAGS.test(name)) return;
                // The closing token is emitted only when the source literally
                // contains `</name>`; the open-tag bookkeeping is updated
                // either way.
                if (html.includes(`</${name}>`)) {
                    tokens.push({ type: 'html', raw: `</${name}>`, tag: name });
                }
                openTags.splice(openTags.indexOf(name), 1);
            }
        },
        { xmlMode: false, recognizeSelfClosing: true }
    );

    parser.write(html);
    parser.end();
    flushText();

    return tokens;
};

/**
 * Global pattern used by `containsMultipleTags` to walk over every tag-like
 * sequence. It is stateful (`g` flag), so `lastIndex` must be reset before
 * each scan.
 *
 * @const {RegExp}
 */
const TAG_REGEX = /<\/?[a-zA-Z][^>]*>/g;

/**
 * Determines if an HTML string contains multiple distinct tags.
 * Used as a preprocessing step to optimize token processing.
 *
 * @param {string} html - HTML string to analyze
 * @returns {boolean} True if multiple tags are present or if it's a single pair of matching tags
 *
 * @internal
 */
export const containsMultipleTags = (html) => {
    let openCount = 0;
    let closeCount = 0;

    // The regex object is shared across calls; start every scan from 0.
    TAG_REGEX.lastIndex = 0;

    let match;
    while ((match = TAG_REGEX.exec(html)) !== null) {
        if (match[0][1] === '/') {
            closeCount++;
        } else {
            openCount++;
        }
        if (openCount > 1 || closeCount > 1) return true;
        if (openCount >= 1 && closeCount >= 1) return true;
    }

    return false;
};

/**
 * Fast scan used by `expandHtmlToken` to decide whether the htmlparser2
 * expansion path is worth invoking. Returns true when the input contains
 * at least two `<` characters separated by a `>` — i.e. the cheapest
 * possible witness that more than one tag is present.
 *
 * Cheaper than `containsMultipleTags` (two `indexOf` calls vs a global
 * regex sweep) and good enough for the perf gate: false positives just
 * route through htmlparser2 (correct, slightly slower); false negatives
 * cannot occur for any input that contains two tags.
 *
 * @param {string} html - Raw string of one html token
 * @returns {boolean} True when a `<` occurs somewhere after the first `>`
 *
 * @internal
 */
const hasMultipleTags = (html) => {
    const firstClose = html.indexOf('>');
    if (firstClose === -1) return false;
    return html.indexOf('<', firstClose + 1) !== -1;
};

/**
 * Single-pass expansion of one html token's raw string into nested
 * tokens. Combines what `parseHtmlBlock` (flat tokenization) and the
 * subsequent `processHtmlTokens` walk (stack-based nesting) used to do
 * separately into a single htmlparser2 traversal: opening tags push a
 * fresh child array onto the stack, closing tags pop it and attach the
 * collected children to the opening token via `tokens`.
 *
 * Behavior is matched to the legacy two-pass pipeline:
 * - Self-closing tags (`<br>` etc.) are emitted with `<.../>` form.
 * - Auto-closes injected by htmlparser2 at end-of-input (when the
 *   source did not literally contain `</tag>`) keep the children flat
 *   under the opening tag rather than nesting them — this preserves
 *   the legacy "partial result on unclosed tags" output.
 * - Whitespace-only text between tags is dropped.
 *
 * Post-condition (depended on by `IncrementalParser`, see #291): an html
 * token's `.tokens` array is set only when a real (non-implied) closing
 * tag was seen in the source. Unclosed openings leave `.tokens` as
 * `undefined`, which is how downstream streaming code distinguishes
 * `<div>` (still streaming) from `<div></div>` (genuinely empty).
 *
 * @param {string} html - Raw string of one html token
 * @returns {Token[]} Top-level tokens; closed elements carry their children in `.tokens`
 *
 * @internal
 */
const expandHtmlBlockNested = (html) => {
    const root = [];
    // `stack` holds the array that currently receives new tokens; `opens`
    // holds the matching bookkeeping for each non-void element still open.
    const stack = [root];
    const opens = [];
    let currentText = '';

    // Emits buffered text into the current container. Unlike the flat
    // tokenizer, the buffer is always cleared, so whitespace-only runs are
    // dropped entirely.
    const flushText = () => {
        if (currentText.length === 0) return;
        if (currentText.trim()) {
            stack[stack.length - 1].push({ type: 'text', raw: currentText, text: currentText });
        }
        currentText = '';
    };

    const parser = new htmlparser2.Parser(
        {
            onopentag: (name, attributes) => {
                flushText();
                if (SELF_CLOSING_TAGS.test(name)) {
                    stack[stack.length - 1].push({
                        type: 'html',
                        raw: `<${name}${serializeAttributes(attributes)}/>`,
                        tag: name,
                        attributes
                    });
                    return;
                }
                const childTokens = [];
                const opening = {
                    type: 'html',
                    raw: `<${name}${serializeAttributes(attributes)}>`,
                    tag: name,
                    attributes
                };
                stack[stack.length - 1].push(opening);
                stack.push(childTokens);
                opens.push({ tag: name, opening, childTokens });
            },
            ontext: (text) => {
                currentText += text;
            },
            onclosetag: (name, implied) => {
                flushText();
                if (opens.length === 0) return;
                const top = opens[opens.length - 1];
                // Close events that do not match the innermost open element
                // (e.g. void elements, which were never pushed) are ignored.
                if (top.tag !== name) return;
                opens.pop();
                stack.pop();
                if (!implied) {
                    // Real `</tag>` in source — fully resolved nested token.
                    top.opening.tokens = top.childTokens;
                } else {
                    // Auto-closed by htmlparser2 at end-of-input — this
                    // opening tag is unclosed in the source. Leave `.tokens`
                    // undefined so downstream code can tell it apart from a
                    // genuinely empty closed element (`<div></div>`), and
                    // flatten any children under the parent to match the
                    // legacy partial-result behavior.
                    const parent = stack[stack.length - 1];
                    for (const child of top.childTokens) parent.push(child);
                }
            }
        },
        { xmlMode: false, recognizeSelfClosing: true }
    );

    parser.write(html);
    parser.end();
    flushText();

    return root;
};

/**
 * Expands a single html token. Single-tag inputs (the dominant inline
 * shape — opening tag alone, closing tag alone, self-closing) skip
 * htmlparser2 entirely and go through the cheap `formatSelfClosingHtmlToken`
 * path. Anything with two or more tags routes through
 * `expandHtmlBlockNested` for inline nesting.
 *
 * @param {Token} token - html token to expand
 * @returns {Token[]} One or more tokens replacing the input token
 *
 * @internal
 */
const expandHtmlToken = (token) => {
    if (!hasMultipleTags(token.raw)) {
        return [formatSelfClosingHtmlToken(token)];
    }
    return expandHtmlBlockNested(token.raw);
};

/**
 * Pair-matches flat html opens/closes that span across separate marked
 * tokens (e.g. marked emits `<details>` and `</details>` as two
 * top-level html tokens with markdown blocks between them). Tokens that
 * `expandHtmlToken` already nested (recognizable by the populated
 * `tokens` array on an html token) are passed through opaquely — no
 * recursion, no re-walk. This is the key delta from the legacy
 * `processHtmlTokens` walk, which re-traversed every nested descendant
 * for each html token in the result.
 *
 * A closing tag that does not match the most recent open tag is emitted
 * as-is, and that open tag is dropped from the pairing stack.
 *
 * @param {Token[]} tokens - Flat token list (already expanded)
 * @returns {Token[]} Token list with matched open/close pairs folded into nested html tokens
 *
 * @internal
 */
const pairFlatHtmlTokens = (tokens) => {
    const result = [];
    const stack = [];

    for (const token of tokens) {
        if (token.type !== 'html') {
            result.push(token);
            continue;
        }

        // Already-nested html tokens (from expandHtmlToken) are opaque —
        // their internal structure is fully resolved.
        if ('tokens' in token && Array.isArray(token.tokens)) {
            result.push(token);
            continue;
        }

        const tagInfo = isHtmlOpenTag(token.raw);
        if (!tagInfo) {
            result.push(token);
            continue;
        }

        // Self-closing tags (e.g. <img src="x"/>) don't participate in
        // open/close pairing — pushing them onto the stack would block
        // a later `</tag>` from finding its real opening. Void elements are
        // recognized by name as well as by the `/>` suffix, and trailing
        // whitespace is tolerated, so a raw such as `<hr>\n` is also skipped.
        if (SELF_CLOSING_TAGS.test(tagInfo.tag) || /\/>\s*$/.test(token.raw)) {
            result.push(token);
            continue;
        }

        if (tagInfo.isOpening) {
            stack.push({ tag: tagInfo.tag, startIndex: result.length });
            result.push(token);
            continue;
        }

        const lastOpen = stack.pop();
        if (!lastOpen || lastOpen.tag !== tagInfo.tag) {
            result.push(token);
            continue;
        }

        // Everything emitted after the opening token becomes its children;
        // the opening token itself is replaced by the nested token.
        const { startIndex } = lastOpen;
        const innerTokens = result.splice(startIndex + 1, result.length - startIndex - 1);
        const openingToken = result.pop();
        result.push({
            type: 'html',
            raw: openingToken.raw,
            tag: tagInfo.tag,
            tokens: innerTokens,
            attributes: extractAttributes(openingToken.raw)
        });
    }

    return result;
};

/**
 * Primary entry point for HTML token processing. Transforms flat token arrays
 * into properly nested structures while preserving HTML semantics.
 *
 * Key features:
 * - Breaks down complex HTML structures into atomic tokens
 * - Formats self-closing tags with proper syntax (e.g., <br> -> <br/>)
 * - Maintains attribute information
 * - Preserves proper nesting relationships
 * - Handles malformed HTML gracefully
 *
 * Container tokens are updated in place: `tokens` on non-html tokens,
 * `items` on list tokens, and `header` / `rows` on table tokens are
 * reassigned on the token objects that were passed in.
 *
 * @param {Token[]} tokens - Array of tokens to process
 * @returns {Token[]} Processed and properly nested token array
 *
 * @example
 * const tokens = [
 *   { type: 'html', raw: '<div class="wrapper">' },
 *   { type: 'text', raw: 'content' },
 *   { type: 'html', raw: '</div>' }
 * ];
 * shrinkHtmlTokens(tokens);
 * // Returns nested structure with proper token relationships
 *
 * @public
 */
export const shrinkHtmlTokens = (tokens) => {
    // Cells and list items share the same shape: copy the entry and
    // process its child tokens (or default to an empty list).
    const shrinkCell = (cell) => ({
        ...cell,
        tokens: cell.tokens ? shrinkHtmlTokens(cell.tokens) : []
    });

    const expanded = [];

    for (const token of tokens) {
        if (token.type !== 'html' && 'tokens' in token && Array.isArray(token.tokens)) {
            token.tokens = shrinkHtmlTokens(token.tokens);
            expanded.push(token);
        } else if (token.type === 'list') {
            token.items = token.items.map((item, index) => ({
                ...item,
                listItemIndex: index,
                tokens: item.tokens ? shrinkHtmlTokens(item.tokens) : []
            }));
            expanded.push(token);
        } else if (token.type === 'table') {
            if (token.header) {
                token.header = token.header.map(shrinkCell);
            }
            if (token.rows) {
                token.rows = token.rows.map((row) => row.map(shrinkCell));
            }
            expanded.push(token);
        } else if (token.type === 'html') {
            for (const piece of expandHtmlToken(token)) expanded.push(piece);
        } else {
            expanded.push(token);
        }
    }

    return pairFlatHtmlTokens(expanded);
};

/**
 * Core token processing logic that handles the complexities of HTML nesting.
 * Uses a stack-based approach to match opening and closing tags while
 * maintaining proper hierarchical relationships.
 *
 * Implementation details:
 * - Maintains a stack of opening tags
 * - Processes nested tokens recursively (reassigning `token.tokens` in place)
 * - Preserves HTML attributes
 * - Handles malformed HTML gracefully: unclosed or mismatched tags are left
 *   in the output as flat tokens
 *
 * @param {Token[]} tokens - Tokens to be processed
 * @returns {Token[]} Processed tokens with proper nesting structure
 *
 * @internal
 */
export const processHtmlTokens = (tokens) => {
    const result = [];
    // Stack to keep track of opening tags and their positions
    const stack = [];

    for (const token of tokens) {
        // If token contains nested tokens, process them recursively
        if ('tokens' in token && Array.isArray(token.tokens)) {
            token.tokens = processHtmlTokens(token.tokens);
        }

        // Non-HTML tokens are added as-is
        if (token.type !== 'html') {
            result.push(token);
            continue;
        }

        const tagInfo = isHtmlOpenTag(token.raw);
        if (!tagInfo) {
            // If we can't parse the tag, just add it as-is
            result.push(token);
            continue;
        }

        if (tagInfo.isOpening) {
            // For opening tags, push to stack and add to result
            stack.push({ tag: tagInfo.tag, startIndex: result.length });
            result.push(token);
            continue;
        }

        // For closing tags, try to match with last opening tag
        const lastOpening = stack.pop();
        if (!lastOpening || lastOpening.tag !== tagInfo.tag) {
            // If no matching opening tag, add closing tag as-is
            result.push(token);
            continue;
        }

        // Found matching tags - create nested structure
        const { startIndex } = lastOpening;
        // Remove all tokens between opening and closing tags
        const innerTokens = result.splice(startIndex + 1, result.length - startIndex - 1);
        // Remove the opening tag
        const openingToken = result.pop();

        // Create new nested token structure
        result.push({
            type: 'html',
            raw: openingToken.raw,
            tag: tagInfo.tag,
            tokens: processHtmlTokens(innerTokens),
            attributes: extractAttributes(openingToken.raw)
        });
    }

    // Unclosed tags simply stay in `result` as flat tokens: a partial
    // result is better than discarding all work.
    return result;
};