@humanspeak/svelte-markdown
Version:
Markdown and HTML renderer for Svelte 5 — built for rendering streaming AI agent output from Claude Code, ChatGPT, and agentic workflows. XSS-safe defaults, streaming-aware sanitization, token caching, TypeScript types, and Svelte 5 runes.
619 lines (618 loc) • 22.3 kB
JavaScript
import * as htmlparser2 from 'htmlparser2';
/**
 * Matches the first HTML opening or closing tag found anywhere in a string
 * (the pattern is not anchored to the start of the input).
 *
 * Pattern breakdown:
 * - `<\/?`            : opening `<` and an optional `/`
 * - `([a-zA-Z]...)`   : capture group 1, the tag name — a letter followed by
 *                       any number of letters, digits, or hyphens
 * - `(?:\s+[^>]*)?`   : optional attributes, which must start with whitespace
 * - `>`               : closing bracket
 *
 * Because anything between the tag name and `>` has to begin with
 * whitespace, a self-closing tag written without a space before the slash
 * (e.g. `<br/>`) does not match, while `<br />` and `<img src="x"/>` do.
 *
 * @const {RegExp}
 */
const htmlTagRegex = /<\/?([a-zA-Z][a-zA-Z0-9-]{0,})(?:\s+[^>]*)?>/;
/**
 * Whole-string, case-insensitive match for the tag names this module treats
 * as self-closing elements (tested against a bare tag name, not a full tag).
 * @const {RegExp}
 */
const SELF_CLOSING_TAGS = /^(br|hr|img|input|link|meta|area|base|col|embed|keygen|param|source|track|wbr)$/i;
/**
 * Finds the first HTML tag in a string and reports its name and direction.
 *
 * The opening/closing decision is taken from the matched tag itself rather
 * than from the start of `raw`, so whitespace or text ahead of the tag
 * (e.g. an indented `  </details>`) does not turn a closing tag into an
 * opening one.
 *
 * @param {string} raw - Raw string potentially containing an HTML tag
 * @returns {{ tag: string, isOpening: boolean } | null} `null` when no tag is
 * found, otherwise the tag name and whether the first tag is an opening tag
 *
 * @example
 * isHtmlOpenTag('<div class="test">') // Returns { tag: 'div', isOpening: true }
 * isHtmlOpenTag('</span>') // Returns { tag: 'span', isOpening: false }
 * isHtmlOpenTag('  </span>') // Returns { tag: 'span', isOpening: false }
 * isHtmlOpenTag('plain text') // Returns null
 */
export const isHtmlOpenTag = (raw) => {
    const match = htmlTagRegex.exec(raw);
    if (!match) {
        return null;
    }
    // match[0] is the tag text itself; match[1] is the captured tag name.
    const [matchedTag, tag] = match;
    return { tag, isOpening: !matchedTag.startsWith('</') };
};
/**
 * Normalizes a single-tag HTML token whose tag name is one of
 * `SELF_CLOSING_TAGS` so that its `raw` ends in `/>` (e.g. `<br>` -> `<br/>`)
 * and attaches structured `tag` / `attributes` fields.
 *
 * Tokens whose first tag is not in `SELF_CLOSING_TAGS`, or that contain no
 * tag at all, are returned unchanged (same object).
 *
 * Trailing whitespace after the tag (for example a newline following
 * `<hr>`) is dropped from the normalized `raw`; without that, neither the
 * `/>` detection nor the `>` -> `/>` rewrite can see the end of the tag.
 * If the raw does not end in `>` even after trimming, it is left as-is.
 *
 * @param {Token} token - HTML token to format
 * @returns {Token} The original token, or a copy with normalized `raw`,
 * `tag`, and `attributes`
 */
const formatSelfClosingHtmlToken = (token) => {
    // Extract tag name from raw HTML
    const tagMatch = token.raw.match(/<\/?([a-zA-Z][a-zA-Z0-9-]*)/i);
    if (!tagMatch) {
        return token;
    }
    const tagName = tagMatch[1];
    if (!SELF_CLOSING_TAGS.test(tagName)) {
        return token;
    }
    const trimmedRaw = token.raw.trimEnd();
    let formattedRaw = token.raw;
    if (trimmedRaw.endsWith('/>')) {
        // Source already used the `<.../>` form: keep it, minus trailing whitespace.
        formattedRaw = trimmedRaw;
    }
    else if (trimmedRaw.endsWith('>')) {
        formattedRaw = trimmedRaw.replace(/\s*>$/, '/>');
    }
    return {
        ...token,
        raw: formattedRaw,
        tag: tagName,
        attributes: extractAttributes(token.raw)
    };
};
/**
 * Parses HTML attributes from a tag string into a structured object.
 *
 * Two passes are made over the input:
 * 1. Quoted attributes (`name="value"` or `name='value'`). A value ends at
 *    the same kind of quote that opened it, so the other quote character may
 *    appear inside it (`title="it's"`). If the closing quote is missing, the
 *    value runs to the end of the string. Values are trimmed.
 * 2. Bare boolean attributes. Quoted attributes are blanked out first so
 *    space-separated words inside a value (e.g. `bar` in `title="foo bar
 *    baz"`) aren't mistakenly harvested as boolean attributes (issue #297).
 *
 * Unquoted values (`type=checkbox`) are not recognized by either pass.
 *
 * @param raw - Raw HTML tag string containing attributes.
 * @returns Map of attribute names to their values. Boolean attributes
 * are represented as `''`.
 *
 * @example
 * extractAttributes('<div class="foo" id="bar">')
 * // → { class: 'foo', id: 'bar' }
 *
 * extractAttributes('<Tip title="foo bar baz">')
 * // → { title: 'foo bar baz' } // not { title: …, bar: '' }
 *
 * extractAttributes('<input type="checkbox" checked disabled>')
 * // → { type: 'checkbox', checked: '', disabled: '' }
 *
 * @internal
 */
export const extractAttributes = (raw) => {
    const attributes = {};
    // Group 2 holds a double-quoted value, group 3 a single-quoted one.
    const quotedRegex = /([a-zA-Z][\w-]*)=(?:"([^"]*)(?:"|$)|'([^']*)(?:'|$))/g;
    let match;
    while ((match = quotedRegex.exec(raw)) !== null) {
        const [, key, doubleQuoted, singleQuoted] = match;
        const value = doubleQuoted !== undefined ? doubleQuoted : singleQuoted;
        attributes[key] = value.trim();
    }
    // Blank out everything the first pass consumed before looking for
    // bare attribute names.
    const stripped = raw.replace(quotedRegex, ' ');
    const booleanRegex = /(?:^|\s)([a-zA-Z][\w-]*)(?=[\s>]|$)/g;
    while ((match = booleanRegex.exec(stripped)) !== null) {
        const key = match[1];
        // Own-property check: names such as `constructor` exist on the
        // prototype and must not be mistaken for already-parsed attributes.
        if (!Object.prototype.hasOwnProperty.call(attributes, key)) {
            attributes[key] = '';
        }
    }
    return attributes;
};
/**
* Documentation for `parseHtmlBlock` (defined below, after `serializeAttributes`).
*
* Converts an HTML string into a sequence of tokens using htmlparser2.
* Handles complex nested structures while maintaining proper order and relationships.
*
* Key features:
* - Preserves original HTML structure without automatic tag closing
* - Handles self-closing tags with proper XML syntax (e.g., <br/> instead of <br>)
* - Gracefully handles malformed HTML by preserving the original structure
* - Maintains attribute information in opening tags
* - Processes text content between tags
*
* @param {string} html - HTML string to be parsed
* @returns {Token[]} Array of tokens representing the HTML structure
*
* @example
* // Well-formed HTML
* parseHtmlBlock('<div>Hello <span>world</span></div>')
* // Returns [
* // { type: 'html', raw: '<div>', ... },
* // { type: 'text', raw: 'Hello ', ... },
* // { type: 'html', raw: '<span>', ... },
* // { type: 'text', raw: 'world', ... },
* // { type: 'html', raw: '</span>', ... },
* // { type: 'html', raw: '</div>', ... }
* // ]
*
* // Self-closing tags
* parseHtmlBlock('<div>Before<br/>After</div>')
* // Returns [
* // { type: 'html', raw: '<div>', ... },
* // { type: 'text', raw: 'Before', ... },
* // { type: 'html', raw: '<br/>', ... },
* // { type: 'text', raw: 'After', ... },
* // { type: 'html', raw: '</div>', ... }
* // ]
*
* // Malformed HTML
* parseHtmlBlock('<div>Unclosed')
* // Returns [
* // { type: 'html', raw: '<div>', ... },
* // { type: 'text', raw: 'Unclosed', ... }
* // ]
*
* @internal
*/
/**
 * Serializes an HTML attribute map into a string for tag construction.
 * Double quotes inside values are written as `&quot;` so a value cannot
 * terminate its own attribute and inject further attributes.
 *
 * @param {Record<string, string>} attributes - Map of attribute names to values
 * @returns {string} Serialized attributes, each preceded by a single space
 * (empty string for an empty map)
 *
 * @example
 * serializeAttributes({ class: 'foo', id: 'bar' })
 * // Returns ' class="foo" id="bar"'
 *
 * serializeAttributes({ title: 'say "hi"' })
 * // Returns ' title="say &quot;hi&quot;"'
 *
 * @internal
 */
const serializeAttributes = (attributes) => Object.entries(attributes)
    .map(([key, value]) => ` ${key}="${value.replace(/"/g, '&quot;')}"`)
    .join('');
/**
 * Tokenizes an HTML string into a flat list of `html` and `text` tokens by
 * driving an htmlparser2 `Parser`.
 *
 * - Every opening tag becomes an `html` token with `tag` and `attributes`;
 *   names matching `SELF_CLOSING_TAGS` are written as `<name .../>` and never
 *   produce a closing token.
 * - A closing `html` token is emitted only for closes the parser reports as
 *   real (its `implied` flag is false). Elements the parser closes on its
 *   own — unclosed tags at end of input, or tags closed by a later tag —
 *   produce no closing token.
 * - Text is buffered and emitted when the next tag event arrives or at end
 *   of input. Whitespace-only text is not emitted and is not cleared, so it
 *   stays in the buffer and is prepended to the next text chunk.
 *
 * @param {string} html - HTML string to be parsed
 * @returns {Token[]} Flat array of tokens in document order
 *
 * @internal
 */
export const parseHtmlBlock = (html) => {
    const tokens = [];
    const openTags = [];
    let currentText = '';
    // Emits the buffered text as a token; whitespace-only buffers are left
    // untouched (see the function comment).
    const flushText = () => {
        if (!currentText.trim()) {
            return;
        }
        tokens.push({
            type: 'text',
            raw: currentText,
            text: currentText
        });
        currentText = '';
    };
    const parser = new htmlparser2.Parser({
        onopentag: (name, attributes) => {
            flushText();
            const isSelfClosing = SELF_CLOSING_TAGS.test(name);
            if (!isSelfClosing) {
                openTags.push(name);
            }
            tokens.push({
                type: 'html',
                raw: `<${name}${serializeAttributes(attributes)}${isSelfClosing ? '/>' : '>'}`,
                tag: name,
                attributes
            });
        },
        ontext: (text) => {
            currentText += text;
        },
        onclosetag: (name, implied) => {
            flushText();
            if (SELF_CLOSING_TAGS.test(name)) {
                return;
            }
            // Only close tags whose opening tag was recorded.
            const openIndex = openTags.lastIndexOf(name);
            if (openIndex === -1) {
                return;
            }
            openTags.splice(openIndex, 1);
            // `implied` is set by the parser for closes it generated itself;
            // those have no `</name>` in the source, so emit nothing.
            if (!implied) {
                tokens.push({
                    type: 'html',
                    raw: `</${name}>`,
                    tag: name
                });
            }
        }
    }, {
        xmlMode: false,
        recognizeSelfClosing: true
    });
    parser.write(html);
    parser.end();
    flushText();
    return tokens;
};
// Global pattern for any opening or closing tag; `lastIndex` is reset before
// every scan because the regex object is shared between calls.
const TAG_REGEX = /<\/?[a-zA-Z][^>]*>/g;
/**
 * Reports whether an HTML string holds more than one tag.
 *
 * Tags are tallied as opening or closing while scanning; the scan stops as
 * soon as two openings, two closings, or one of each have been seen.
 *
 * @param {string} html - HTML string to analyze
 * @returns {boolean} True once a second tag is found, false for zero or one tag
 *
 * @internal
 */
export const containsMultipleTags = (html) => {
    const seen = { open: 0, close: 0 };
    TAG_REGEX.lastIndex = 0;
    for (let found = TAG_REGEX.exec(html); found !== null; found = TAG_REGEX.exec(html)) {
        // The character right after `<` tells closing tags apart.
        const kind = found[0].charAt(1) === '/' ? 'close' : 'open';
        seen[kind] += 1;
        const repeated = seen.open > 1 || seen.close > 1;
        const paired = seen.open >= 1 && seen.close >= 1;
        if (repeated || paired) {
            return true;
        }
    }
    return false;
};
/**
 * Cheap pre-check used by `expandHtmlToken` to decide whether a raw string
 * may hold more than one tag.
 *
 * Returns true when some `<` occurs after the first `>` in the input. Two
 * complete tags always produce that shape (the first tag's `>` precedes the
 * second tag's `<`), so an input with two tags is never rejected. Inputs
 * that merely look similar (e.g. `a > b < c`) are accepted as well.
 *
 * @param {string} html - Raw HTML string to scan
 * @returns {boolean} True if a `<` follows the first `>`
 *
 * @internal
 */
const hasMultipleTags = (html) => {
    const firstClose = html.indexOf('>');
    // No `>` at all means not even one complete tag.
    return firstClose !== -1 && html.lastIndexOf('<') > firstClose;
};
/**
 * Expands the raw string of one html token into nested tokens in a single
 * htmlparser2 pass.
 *
 * One frame is kept per currently open element. New tokens are appended to
 * the innermost frame's child list, or to the root list when nothing is
 * open. What happens when an element closes depends on the parser's
 * `implied` flag:
 * - real close: the collected children are attached to the opening token
 *   as `tokens`;
 * - implied close (generated by the parser, not present in the source): the
 *   opening token gets no `tokens` property and the children are appended
 *   to the enclosing list, directly after the opening token's siblings.
 *
 * Other rules:
 * - Names matching `SELF_CLOSING_TAGS` are emitted as `<name .../>` and
 *   never open a frame.
 * - Whitespace-only text is discarded.
 * - A close event whose name differs from the innermost open frame is
 *   ignored.
 *
 * @param {string} html - Raw HTML of a single html token
 * @returns {Token[]} Root-level tokens; closed elements carry their children
 * in `tokens`
 *
 * @internal
 */
const expandHtmlBlockNested = (html) => {
    const rootTokens = [];
    const openFrames = [];
    let pendingText = '';
    // The list new tokens currently belong to.
    const activeList = () => {
        const depth = openFrames.length;
        return depth === 0 ? rootTokens : openFrames[depth - 1].children;
    };
    // Always empties the buffer; only non-blank text becomes a token.
    const flushPendingText = () => {
        const buffered = pendingText;
        pendingText = '';
        if (buffered.trim() !== '') {
            activeList().push({
                type: 'text',
                raw: buffered,
                text: buffered
            });
        }
    };
    const handlers = {
        onopentag(name, attributes) {
            flushPendingText();
            const isSelfClosing = SELF_CLOSING_TAGS.test(name);
            const element = {
                type: 'html',
                raw: `<${name}${serializeAttributes(attributes)}${isSelfClosing ? '/>' : '>'}`,
                tag: name,
                attributes
            };
            activeList().push(element);
            if (!isSelfClosing) {
                openFrames.push({ tag: name, element, children: [] });
            }
        },
        ontext(text) {
            pendingText += text;
        },
        onclosetag(name, implied) {
            flushPendingText();
            const frame = openFrames[openFrames.length - 1];
            if (frame === undefined || frame.tag !== name) {
                return;
            }
            openFrames.pop();
            if (implied) {
                // Not closed in the source: hand the children to the
                // enclosing list and leave `tokens` unset on the opening.
                const enclosing = activeList();
                for (const child of frame.children) {
                    enclosing.push(child);
                }
                return;
            }
            frame.element.tokens = frame.children;
        }
    };
    const parser = new htmlparser2.Parser(handlers, {
        xmlMode: false,
        recognizeSelfClosing: true
    });
    parser.write(html);
    parser.end();
    flushPendingText();
    return rootTokens;
};
/**
 * Expands one html token into the tokens that replace it.
 *
 * When `hasMultipleTags` rejects the raw string, htmlparser2 is skipped and
 * the token only goes through `formatSelfClosingHtmlToken`, yielding a
 * one-element array. Otherwise the raw string is handed to
 * `expandHtmlBlockNested`, whose result is returned as-is.
 *
 * @param {Token} token - html token to expand
 * @returns {Token[]} Replacement tokens
 *
 * @internal
 */
const expandHtmlToken = (token) => hasMultipleTags(token.raw)
    ? expandHtmlBlockNested(token.raw)
    : [formatSelfClosingHtmlToken(token)];
/**
 * Pairs flat html opening/closing tokens that arrive as separate entries
 * (e.g. `<details>` and `</details>` with other tokens between them) and
 * folds everything between a matched pair into the opening token's
 * `tokens` array.
 *
 * Tokens that take no part in pairing and are copied through unchanged:
 * - non-html tokens;
 * - html tokens that already carry a `tokens` array (already nested);
 * - html tokens in which `isHtmlOpenTag` finds no tag;
 * - self-closing tags: raw ending in `/>` (trailing whitespace ignored) or
 *   a tag name matching `SELF_CLOSING_TAGS`. Putting these on the stack
 *   would stop a later `</tag>` from finding its real opening.
 *
 * A closing tag is compared only with the most recent unpaired opening. That
 * opening is taken off the stack either way; on a mismatch the closing token
 * is kept as-is and no pair is formed.
 *
 * @param {Token[]} tokens - Flat token list
 * @returns {Token[]} Token list with matched pairs nested
 *
 * @internal
 */
const pairFlatHtmlTokens = (tokens) => {
    const result = [];
    const stack = [];
    for (const token of tokens) {
        if (token.type !== 'html' || Array.isArray(token.tokens)) {
            result.push(token);
            continue;
        }
        const tagInfo = isHtmlOpenTag(token.raw);
        const standsAlone = !tagInfo ||
            token.raw.trimEnd().endsWith('/>') ||
            SELF_CLOSING_TAGS.test(tagInfo.tag);
        if (standsAlone) {
            result.push(token);
            continue;
        }
        if (tagInfo.isOpening) {
            stack.push({ tag: tagInfo.tag, startIndex: result.length });
            result.push(token);
            continue;
        }
        const lastOpen = stack.pop();
        if (!lastOpen || lastOpen.tag !== tagInfo.tag) {
            result.push(token);
            continue;
        }
        // Cut the opening token and everything after it out of the result
        // and re-insert them as one nested token.
        const [openingToken, ...innerTokens] = result.splice(lastOpen.startIndex);
        result.push({
            type: 'html',
            raw: openingToken.raw,
            tag: tagInfo.tag,
            tokens: innerTokens,
            attributes: extractAttributes(openingToken.raw)
        });
    }
    return result;
};
/**
 * Primary entry point for HTML token processing: walks a token list,
 * expands every html token, recurses into container tokens, and finally
 * pairs flat opening/closing html tokens into nested ones.
 *
 * Per token:
 * - non-html token with a `tokens` array: children are shrunk recursively
 *   and written back onto the same token;
 * - html token: replaced by the result of `expandHtmlToken`;
 * - `list` token: `items` is rebuilt as copies carrying `listItemIndex` and
 *   shrunk `tokens` (an item without `tokens` gets `[]`);
 * - `table` token: every header and row cell is copied with shrunk `tokens`
 *   (a cell without `tokens` gets `[]`);
 * - anything else is passed through.
 *
 * Container, list, and table tokens are updated in place.
 *
 * @param {Token[]} tokens - Array of tokens to process
 * @returns {Token[]} Result of `pairFlatHtmlTokens` over the expanded list
 *
 * @example
 * const tokens = [
 *     { type: 'html', raw: '<div class="wrapper">' },
 *     { type: 'text', raw: 'content' },
 *     { type: 'html', raw: '</div>' }
 * ];
 * shrinkHtmlTokens(tokens);
 * // Returns nested structure with proper token relationships
 *
 * @public
 */
export const shrinkHtmlTokens = (tokens) => {
    const shrinkChildren = (node) => (node.tokens ? shrinkHtmlTokens(node.tokens) : []);
    const shrinkCell = (cell) => ({ ...cell, tokens: shrinkChildren(cell) });
    const expanded = [];
    for (const token of tokens) {
        const isHtml = token.type === 'html';
        if (!isHtml && Array.isArray(token.tokens)) {
            token.tokens = shrinkHtmlTokens(token.tokens);
            expanded.push(token);
            continue;
        }
        if (isHtml) {
            for (const piece of expandHtmlToken(token)) {
                expanded.push(piece);
            }
            continue;
        }
        if (token.type === 'list') {
            token.items = token.items.map((item, index) => ({
                ...item,
                listItemIndex: index,
                tokens: shrinkChildren(item)
            }));
        }
        else if (token.type === 'table') {
            if (token.header) {
                token.header = token.header.map((cell) => shrinkCell(cell));
            }
            if (token.rows) {
                token.rows = token.rows.map((row) => row.map((cell) => shrinkCell(cell)));
            }
        }
        expanded.push(token);
    }
    return pairFlatHtmlTokens(expanded);
};
/**
 * Stack-based pairing of html opening and closing tokens with recursive
 * processing of nested token arrays.
 *
 * For every token:
 * - a `tokens` array, if present, is processed recursively and written
 *   back onto the token;
 * - non-html tokens, html tokens that already carry a `tokens` array, html
 *   tokens in which `isHtmlOpenTag` finds no tag, and self-closing tags
 *   (raw ending in `/>`, trailing whitespace ignored, or a tag name matching
 *   `SELF_CLOSING_TAGS`) are copied through without touching the stack —
 *   pushing them would stop a later closing tag from reaching its real
 *   opening, or re-wrap an already-nested element and drop its children;
 * - an opening tag is pushed on the stack;
 * - a closing tag is compared with the most recent unpaired opening (which
 *   is taken off the stack either way). On a match the tokens in between
 *   become the `tokens` of a new nested html token, with attributes
 *   extracted from the opening tag; on a mismatch the closing token is kept
 *   as-is.
 *
 * Openings that are never closed simply stay in the result as flat tokens.
 *
 * @param {Token[]} tokens - Tokens to be processed
 * @returns {Token[]} Processed tokens with matched pairs nested
 *
 * @internal
 */
export const processHtmlTokens = (tokens) => {
    const result = [];
    // Unpaired opening tags and the result index each one sits at
    const stack = [];
    for (const token of tokens) {
        const hasChildren = Array.isArray(token.tokens);
        if (hasChildren) {
            token.tokens = processHtmlTokens(token.tokens);
        }
        if (token.type !== 'html' || hasChildren) {
            result.push(token);
            continue;
        }
        const tagInfo = isHtmlOpenTag(token.raw);
        const standsAlone = !tagInfo ||
            token.raw.trimEnd().endsWith('/>') ||
            SELF_CLOSING_TAGS.test(tagInfo.tag);
        if (standsAlone) {
            result.push(token);
            continue;
        }
        if (tagInfo.isOpening) {
            stack.push({ tag: tagInfo.tag, startIndex: result.length });
            result.push(token);
            continue;
        }
        const lastOpening = stack.pop();
        if (!lastOpening || lastOpening.tag !== tagInfo.tag) {
            // No matching opening tag: keep the closing tag as-is
            result.push(token);
            continue;
        }
        // Cut the opening token and everything after it out of the result
        // and re-insert them as one nested token.
        const [openingToken, ...innerTokens] = result.splice(lastOpening.startIndex);
        result.push({
            type: 'html',
            raw: openingToken.raw,
            tag: tagInfo.tag,
            tokens: processHtmlTokens(innerTokens),
            attributes: extractAttributes(openingToken.raw)
        });
    }
    return result;
};