UNPKG

@humanspeak/svelte-markdown

Version:

A powerful, customizable markdown renderer for Svelte with TypeScript support

395 lines (394 loc) 14.1 kB
import * as htmlparser2 from 'htmlparser2';

/**
 * Matches the first HTML tag in a string; capture group 1 is the tag name.
 * Pattern breakdown:
 * - <\/?             : opening "<" and an optional "/"
 * - [a-zA-Z]         : tag names must start with a letter
 * - [a-zA-Z0-9-]{0,} : followed by letters, digits or hyphens
 * - (?:\s+[^>]*)?    : optional attributes, introduced by whitespace
 * - >                : closing bracket
 *
 * Note that a compact self-closing tag such as `<br/>` (no whitespace before
 * the slash, no attributes) does not match, while `<br />` and
 * `<img src="x"/>` do.
 *
 * @const {RegExp}
 */
const HTML_TAG_PATTERN = /<\/?([a-zA-Z][a-zA-Z0-9-]{0,})(?:\s+[^>]*)?>/;

/**
 * Names of void elements, i.e. tags that never take a closing tag.
 * Anchored and case-insensitive, so it must be tested against a bare tag name.
 *
 * @const {RegExp}
 */
const SELF_CLOSING_TAGS = /^(br|hr|img|input|link|meta|area|base|col|embed|keygen|param|source|track|wbr)$/i;

/**
 * Locates the first HTML tag in `raw` and describes it.
 *
 * @param {string} raw - String that may contain an HTML tag
 * @returns {{ tag: string, isClosing: boolean, isSelfClosing: boolean } | null}
 *   `null` when no tag is found; otherwise the tag name, whether the matched
 *   tag starts with `</`, and whether it ends with `/>`.
 *
 * @internal
 */
const matchHtmlTag = (raw) => {
    const match = HTML_TAG_PATTERN.exec(raw);
    if (!match) return null;
    const [tagText, tag] = match;
    return {
        tag,
        isClosing: tagText.startsWith('</'),
        isSelfClosing: tagText.endsWith('/>')
    };
};

/**
 * Analyzes a string to determine if it contains an HTML tag and its characteristics.
 *
 * The opening/closing decision is taken from the matched tag itself rather than
 * from the start of the string, so leading whitespace or text in front of a
 * closing tag does not make it look like an opening tag.
 *
 * @param {string} raw - Raw string potentially containing an HTML tag
 * @returns {Object|null} Returns null if no tag found, otherwise returns:
 *   {
 *     tag: string      - The name of the first HTML tag found
 *     isOpening: bool  - True if that tag is an opening tag, false if closing
 *   }
 *
 * @example
 * isHtmlOpenTag('<div class="test">') // Returns { tag: 'div', isOpening: true }
 * isHtmlOpenTag('</span>')            // Returns { tag: 'span', isOpening: false }
 * isHtmlOpenTag('  </span>')          // Returns { tag: 'span', isOpening: false }
 * isHtmlOpenTag('plain text')         // Returns null
 */
export const isHtmlOpenTag = (raw) => {
    const info = matchHtmlTag(raw);
    if (!info) return null;
    return { tag: info.tag, isOpening: !info.isClosing };
};

/**
 * Formats an individual HTML token so that void elements use self-closing
 * syntax (e.g. <br> -> <br/>) without affecting the token structure.
 *
 * Tokens are returned unchanged when they contain no tag, when the first tag
 * is not a void element, when it is a closing tag (so `</br>` is never turned
 * into `</br/>`), or when `raw` already ends with `/>`. Otherwise a copy is
 * returned with the rewritten `raw` plus `tag` and `attributes` fields.
 * Whitespace after the tag (such as a trailing newline) is preserved.
 *
 * @param {Token} token - HTML token to format; only `token.raw` is read
 * @returns {Token} The same token, or a formatted copy
 */
const formatSelfClosingHtmlToken = (token) => {
    const tagMatch = /<(\/?)([a-zA-Z][a-zA-Z0-9-]*)/.exec(token.raw);
    if (!tagMatch) return token;

    const [, closingSlash, tagName] = tagMatch;
    if (closingSlash || !SELF_CLOSING_TAGS.test(tagName)) return token;
    if (token.raw.endsWith('/>')) return token;

    // `<br/>\n` is already self-closed; rewriting it would yield `<br//>`.
    const alreadyClosed = token.raw.trimEnd().endsWith('/>');
    const formattedRaw = alreadyClosed
        ? token.raw
        : token.raw.replace(/\s*>(\s*)$/, '/>$1');

    return {
        ...token,
        raw: formattedRaw,
        tag: tagName,
        attributes: extractAttributes(token.raw)
    };
};

/**
 * Parses HTML attributes from a tag string into a structured object.
 *
 * The string is scanned once from left to right:
 * - `name="value"` / `name='value'` pairs; the value ends at the matching
 *   quote, so `title="it's"` yields `it's`. An unclosed quote runs to the end
 *   of the string. Values are trimmed.
 * - bare names (boolean attributes) preceded by whitespace and followed by
 *   whitespace, `/`, `>` or the end of the string; they map to `''` unless the
 *   name was already recorded.
 * - the first `>` outside a quoted value ends the scan, so text that follows
 *   the tag is never mistaken for attributes.
 *
 * Because quoted values are consumed as a unit, words inside them (for example
 * the class names in `class="a b c"`) are not reported as boolean attributes.
 * Unquoted values (`width=100`) are not extracted.
 *
 * @param {string} raw - Raw HTML tag string containing attributes
 * @returns {Record<string, string>} Map of attribute names to their values
 *
 * @example
 * extractAttributes('<div class="foo" id="bar">')
 * // Returns { class: 'foo', id: 'bar' }
 * extractAttributes('<input type="checkbox" checked>')
 * // Returns { type: 'checkbox', checked: '' }
 *
 * @internal
 */
export const extractAttributes = (raw) => {
    const attributes = {};
    // Groups: 1 = quoted attribute name, 2 = double-quoted value,
    // 3 = single-quoted value, 4 = boolean attribute name, 5 = end of tag.
    // Created per call because a /g regex carries `lastIndex` state.
    const attributePattern =
        /([a-zA-Z][\w-]*)=(?:"([^"]*)(?:"|$)|'([^']*)(?:'|$))|(?:^|\s)([a-zA-Z][\w-]*)(?=[\s/>]|$)|(>)/g;

    let match;
    while ((match = attributePattern.exec(raw)) !== null) {
        const [, key, doubleQuoted, singleQuoted, booleanName, tagEnd] = match;
        if (tagEnd !== undefined) break;

        if (key !== undefined) {
            const value = doubleQuoted !== undefined ? doubleQuoted : singleQuoted;
            attributes[key] = value.trim();
        } else if (!Object.prototype.hasOwnProperty.call(attributes, booleanName)) {
            // Own-property check: inherited names such as `constructor`
            // must not hide a real boolean attribute.
            attributes[booleanName] = '';
        }
    }
    return attributes;
};

/**
 * Converts an HTML string into a flat sequence of tokens using htmlparser2.
 *
 * Behaviour, as implemented below:
 * - Every opening tag becomes an `html` token carrying `tag` and `attributes`;
 *   its `raw` is rebuilt from the parsed name and attributes, with `"` inside
 *   values written as `&quot;` so the rebuilt tag stays well-formed.
 * - Void elements are emitted with self-closing syntax (e.g. <br/>) and never
 *   produce a closing token.
 * - A closing token is emitted only when the tag was opened and the literal
 *   text `</name>` occurs in the input, so unclosed tags stay unclosed.
 * - Text between tags becomes `text` tokens. Whitespace-only text is not
 *   emitted on its own; it stays buffered and is prepended to the next text.
 *
 * The parser runs with `xmlMode` and `recognizeSelfClosing` enabled.
 *
 * @param {string} html - HTML string to be parsed
 * @returns {Token[]} Array of tokens representing the HTML structure
 *
 * @example
 * // Well-formed HTML
 * parseHtmlBlock('<div>Hello <span>world</span></div>')
 * // Returns [
 * //   { type: 'html', raw: '<div>', ... },
 * //   { type: 'text', raw: 'Hello ', ... },
 * //   { type: 'html', raw: '<span>', ... },
 * //   { type: 'text', raw: 'world', ... },
 * //   { type: 'html', raw: '</span>', ... },
 * //   { type: 'html', raw: '</div>', ... }
 * // ]
 *
 * // Self-closing tags
 * parseHtmlBlock('<div>Before<br/>After</div>')
 * // Returns [
 * //   { type: 'html', raw: '<div>', ... },
 * //   { type: 'text', raw: 'Before', ... },
 * //   { type: 'html', raw: '<br/>', ... },
 * //   { type: 'text', raw: 'After', ... },
 * //   { type: 'html', raw: '</div>', ... }
 * // ]
 *
 * // Malformed HTML
 * parseHtmlBlock('<div>Unclosed')
 * // Returns [
 * //   { type: 'html', raw: '<div>', ... },
 * //   { type: 'text', raw: 'Unclosed', ... }
 * // ]
 *
 * @internal
 */
export const parseHtmlBlock = (html) => {
    const tokens = [];
    const openTags = [];
    let pendingText = '';

    // Emits the buffered text as a token unless it is whitespace-only.
    const flushText = () => {
        if (!pendingText.trim()) return;
        tokens.push({ type: 'text', raw: pendingText, text: pendingText });
        pendingText = '';
    };

    // Rebuilds ` key="value"` pairs; a raw `"` would end the value early.
    const serializeAttributes = (attributes) =>
        Object.entries(attributes)
            .map(([key, value]) => ` ${key}="${String(value).replace(/"/g, '&quot;')}"`)
            .join('');

    const parser = new htmlparser2.Parser({
        onopentag: (name, attributes) => {
            flushText();
            const isVoid = SELF_CLOSING_TAGS.test(name);
            if (!isVoid) {
                openTags.push(name);
            }
            tokens.push({
                type: 'html',
                raw: `<${name}${serializeAttributes(attributes)}${isVoid ? '/>' : '>'}`,
                tag: name,
                attributes
            });
        },
        ontext: (text) => {
            pendingText += text;
        },
        onclosetag: (name) => {
            flushText();
            // Only tags we saw opening, and never void elements.
            if (!openTags.includes(name) || SELF_CLOSING_TAGS.test(name)) return;
            // The parser also reports closes it synthesised itself; emit a
            // token only when the closing tag is really present in the input.
            if (html.includes(`</${name}>`)) {
                tokens.push({ type: 'html', raw: `</${name}>`, tag: name });
            }
            openTags.splice(openTags.indexOf(name), 1);
        }
    }, {
        xmlMode: true,
        // Prevents automatic tag closing
        recognizeSelfClosing: true
    });

    parser.write(html);
    parser.end();
    flushText();
    return tokens;
};

/**
 * Determines if an HTML string contains multiple distinct tags.
 * Used as a preprocessing step to decide whether a token must be split.
 *
 * @param {string} html - HTML string to analyze
 * @returns {boolean} True if there is more than one opening tag, more than one
 *   closing tag, or exactly one of each (a single pair)
 *
 * @internal
 */
export const containsMultipleTags = (html) => {
    const countMatches = (pattern) => (html.match(pattern) || []).length;
    const openingCount = countMatches(/<[a-zA-Z][^>]*>/g);
    const closingCount = countMatches(/<\/[a-zA-Z][^>]*>/g);

    return (
        openingCount > 1 ||
        closingCount > 1 ||
        (openingCount === 1 && closingCount === 1)
    );
};

/**
 * Primary entry point for HTML token processing. Splits and normalises HTML
 * tokens, then hands the flat list to `processHtmlTokens` for nesting.
 *
 * Per token type:
 * - `list`: each item gets a `listItemIndex` and recursively shrunk `tokens`
 *   (the `items` array on the token is replaced in place).
 * - `table`: header cells and row cells get recursively shrunk `tokens`
 *   (the `header` / `rows` arrays on the token are replaced in place).
 * - `html` containing several tags: replaced by the tokens from `parseHtmlBlock`.
 * - any other `html`: void elements are formatted as self-closing.
 * - everything else is passed through untouched.
 *
 * @param {Token[]} tokens - Array of tokens to process (presumably produced by
 *   the markdown lexer; only the fields named above are read)
 * @returns {Token[]} Processed and, where tags balance, nested token array
 *
 * @example
 * const tokens = [
 *   { type: 'html', raw: '<div class="wrapper">' },
 *   { type: 'text', raw: 'content' },
 *   { type: 'html', raw: '</div>' }
 * ];
 * shrinkHtmlTokens(tokens);
 * // Returns one html token for the div whose `tokens` holds the text token
 *
 * @public
 */
export const shrinkHtmlTokens = (tokens) => {
    // Copies a list item / table cell with its child tokens shrunk.
    const shrinkChildren = (node, extra) => ({
        ...node,
        ...extra,
        tokens: node.tokens ? shrinkHtmlTokens(node.tokens) : []
    });

    const result = [];
    for (const token of tokens) {
        if (token.type === 'list') {
            token.items = token.items.map((item, index) =>
                shrinkChildren(item, { listItemIndex: index }));
            result.push(token);
        } else if (token.type === 'table') {
            if (token.header) {
                token.header = token.header.map((cell) => shrinkChildren(cell));
            }
            if (token.rows) {
                token.rows = token.rows.map((row) =>
                    row.map((cell) => shrinkChildren(cell)));
            }
            result.push(token);
        } else if (token.type === 'html' && containsMultipleTags(token.raw)) {
            // Split HTML with multiple tags into separate tokens
            result.push(...parseHtmlBlock(token.raw));
        } else if (token.type === 'html') {
            // Format void elements properly (e.g., <br> -> <br/>)
            result.push(formatSelfClosingHtmlToken(token));
        } else {
            result.push(token);
        }
    }

    return processHtmlTokens(result);
};

/**
 * Core token processing logic that turns a flat token list into a nested one.
 * Uses a stack of pending opening tags; when a closing tag matches the most
 * recent opening tag, everything between them is folded into a single `html`
 * token with `tag`, `attributes` and child `tokens`.
 *
 * Implementation details:
 * - Tokens that already have a `tokens` array are processed recursively first.
 * - HTML tokens that already carry `tokens` (previously assembled elements),
 *   void elements and self-closed tags are leaves: they are never pushed on
 *   the stack, because no closing tag will ever arrive for them.
 * - A closing tag always consumes the most recent pending opening tag; when
 *   the names differ the closing token is kept as-is.
 * - If any opening tag is still pending at the end, the ORIGINAL `tokens`
 *   array is returned (children of nested tokens have still been processed).
 *
 * @param {Token[]} tokens - Tokens to be processed
 * @returns {Token[]} Processed tokens with proper nesting structure
 *
 * @internal
 */
export const processHtmlTokens = (tokens) => {
    const result = [];
    // Pending opening tags together with their index in `result`
    const stack = [];

    for (const token of tokens) {
        const hasChildren = 'tokens' in token && Array.isArray(token.tokens);
        if (hasChildren) {
            token.tokens = processHtmlTokens(token.tokens);
        }

        // Non-HTML tokens and already-assembled elements are added as-is
        if (token.type !== 'html' || hasChildren) {
            result.push(token);
            continue;
        }

        const tagInfo = matchHtmlTag(token.raw);
        if (!tagInfo) {
            // Not recognisable as a tag (this includes compact `<br/>`)
            result.push(token);
            continue;
        }

        if (!tagInfo.isClosing) {
            const isLeaf = tagInfo.isSelfClosing || SELF_CLOSING_TAGS.test(tagInfo.tag);
            if (!isLeaf) {
                stack.push({ tag: tagInfo.tag, startIndex: result.length });
            }
            result.push(token);
            continue;
        }

        // Closing tag: try to match with the last opening tag
        const lastOpening = stack.pop();
        if (!lastOpening || lastOpening.tag !== tagInfo.tag) {
            result.push(token);
            continue;
        }

        // Everything after the opening tag becomes its children. These tokens
        // have already been through this loop, so they need no second pass.
        const innerTokens = result.splice(lastOpening.startIndex + 1);
        const openingToken = result.pop();
        result.push({
            type: 'html',
            raw: openingToken.raw,
            tag: tagInfo.tag,
            tokens: innerTokens,
            attributes: extractAttributes(openingToken.raw)
        });
    }

    // If we have unclosed tags, return original tokens
    return stack.length > 0 ? tokens : result;
};