UNPKG

quikdown

Version:

Small, safe, bidirectional Markdown parser and editor with streaming, undo/redo, rich fences, MCP tools, and a standalone airgapped build. Zero runtime deps. Browser and Node.js.

deftio.github.io/quikdown/pages/

deftio/quikdown

1,069 lines (956 loc) • 76.1 kB

JavaScript

/** * quikdown - Lightweight Markdown Parser * @version 1.2.21 * @license BSD-2-Clause * @copyright DeftIO 2025 */ (function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : typeof define === 'function' && define.amd ? define(factory) : (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.quikdown = factory()); })(this, (function () { 'use strict'; /** * quikdown_classify — Shared line-classification utilities * ═════════════════════════════════════════════════════════ * * Pure functions for classifying markdown lines. Used by both the main * parser (quikdown.js) and the editor (quikdown_edit.js) so the logic * lives in one place. * * All functions operate on a **trimmed** line (caller must trim). * None use regexes with nested quantifiers — every check is either a * simple regex or a linear scan, so there is zero ReDoS risk. */ /** * Full CommonMark HR check: three or more identical characters from * {-, *, _} with optional interspersed whitespace. * * Examples that return true: ---, ***, ___, ----, - - -, * * *, _ _ _ * Examples that return false: --, - text, ---text, mixed -_*, empty * * Algorithm (O(n), single pass, no backtracking): * 1. Strip all whitespace * 2. Verify length >= 3 * 3. First char must be -, *, or _ * 4. 
/**
 * Full CommonMark HR check: three or more identical characters from
 * {-, *, _} with optional interspersed whitespace.
 *
 * Examples that return true:  ---, ***, ___, ----, - - -, * * *, _ _ _
 * Examples that return false: --, - text, ---text, mixed -_*, empty
 *
 * Algorithm (O(n), single pass, no backtracking):
 *   1. Strip all spaces and tabs
 *   2. Verify length >= 3
 *   3. First char must be -, *, or _
 *   4. Every remaining char must equal the first
 *
 * @param {string} trimmed  The line, already trimmed
 * @returns {boolean} True when the line is a horizontal rule
 */
function isHRLine(trimmed) {
  // Anything that is not a string (or is too short) cannot be a rule.
  if (typeof trimmed !== 'string' || trimmed.length < 3) return false;

  // Strip whitespace via linear scan (deliberately regex-free).
  let stripped = '';
  for (const ch of trimmed) {
    if (ch !== ' ' && ch !== '\t') stripped += ch;
  }
  if (stripped.length < 3) return false;

  const marker = stripped[0];
  if (marker !== '-' && marker !== '*' && marker !== '_') return false;

  for (let i = 1; i < stripped.length; i++) {
    if (stripped[i] !== marker) return false;
  }
  return true;
}

/**
 * quikdown — A compact, scanner-based markdown parser
 * ════════════════════════════════════════════════════
 *
 * Architecture overview (v1.2.8 — lexer rewrite)
 * ───────────────────────────────────────────────
 * Prior to v1.2.8, quikdown used a multi-pass regex pipeline: each block
 * type (headings, blockquotes, HR, lists, tables) and each inline format
 * (bold, italic, links, …) was handled by its own global regex applied
 * sequentially to the full document string. That worked but made the code
 * hard to extend and debug — a new construct meant adding another regex
 * pass, and ordering bugs between passes were subtle.
 *
 * Starting in v1.2.8 the parser uses a **line-scanning** approach for
 * block detection and a **per-block inline pass** for formatting:
 *
 *   Phase 1 — Code + Escape Extraction
 *     1a. Fenced code blocks (``` / ~~~)  → §CB§ placeholders
 *     1b. Escaped backticks (\`)          → §BE§ placeholders
 *     1c. Inline code spans (`…`)         → §IC§ placeholders
 *         Cannot cross newlines (scoped to single line).
 *     1d. Remaining backslash escapes (\* \_ etc.) → §BE§
 *         Only ASCII punctuation is escapable; \a stays.
 *
 *   Phase 2 — HTML Escaping
 *     Escape &, <, >, ", ' in the remaining text to prevent XSS.
 *     (Skipped when allow_unsafe_html is true.)
 *
 *   Phase 3 — Block Scanning
 *     Walk the text **line by line**. At each line, the scanner checks
 *     (in order):
 *       • table rows (|)
 *       • headings (#)
 *       • HR (---)
 *       • blockquotes (>)
 *       • list items (-, *, +, 1.)
 *       • code-block placeholder (§CB…§)
 *       • paragraph text (everything else)
 *
 *     Block text is run through the **inline formatter** which handles
 *     bold, italic, strikethrough, links, images, and autolinks.
 *
 *     Paragraphs are wrapped in <p> tags. Lazy linefeeds
 *     (single \n → <br>) are handled here too.
 *
 *   Phase 4 — Code Restoration
 *     Replace §CB§ / §IC§ placeholders with rendered <pre> / <code> HTML,
 *     applying the fence_plugin if present.
 *
 * Why this design?
 *   • Single pass over lines for block identification — no re-scanning.
 *   • Each block type is a clearly separated branch, easy to add new ones.
 *   • Inline formatting is confined to block text — can't accidentally
 *     match across block boundaries or inside HTML tags.
 *   • Code extraction still uses a simple regex (it's one pattern, not a
 *     chain) because the §-placeholder approach is proven and simple.
 *
 * @param {string} markdown  The markdown source text
 * @param {Object} options   Configuration (see below)
 * @returns {string} Rendered HTML
 */

// ────────────────────────────────────────────────────────────────────
// Constants
// ────────────────────────────────────────────────────────────────────

/** Build-time version stamp (injected by tools/updateVersion) */
const quikdownVersion = '1.2.21';

/** CSS class prefix used for all generated elements */
const CLASS_PREFIX = 'quikdown-';

/** Placeholder sigils — chosen to be extremely unlikely in real text */
const PLACEHOLDER_CB = '§CB'; // fenced code blocks
const PLACEHOLDER_IC = '§IC'; // inline code spans
const PLACEHOLDER_HT = '§HT'; // safe HTML tags (limited mode)
const PLACEHOLDER_BE = '§BE'; // backslash escapes

/** Attributes whose values need URL sanitization */
const URL_ATTRIBUTES = { href: 1, src: 1, action: 1, formaction: 1 };

/**
 * HTML entity escape map: each HTML-special character maps to its entity.
 * The values MUST be entities — an identity map would disable escaping.
 */
const ESC_MAP = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;'
};

// ────────────────────────────────────────────────────────────────────
// Style definitions
// ────────────────────────────────────────────────────────────────────

/**
 * Inline styles for every element quikdown can emit.
 * When `inline_styles: true` these are injected as style="…" attributes.
 * When `inline_styles: false` (default) we use class="quikdown-<tag>"
 * and these same values are emitted by `quikdown.emitStyles()`.
 */
const QUIKDOWN_STYLES = {
  h1: 'font-size:2em;margin:.67em 0;text-align:left',
  h2: 'font-size:1.5em;margin:.83em 0',
  h3: 'font-size:1.25em;margin:1em 0',
  h4: 'font-size:1em;margin:1.33em 0',
  h5: 'font-size:.875em;margin:1.67em 0',
  h6: 'font-size:.85em;margin:2em 0',
  pre: 'background:#f4f4f4;padding:10px;border-radius:4px;overflow-x:auto;margin:1em 0',
  code: 'background:#f0f0f0;padding:2px 4px;border-radius:3px;font-family:monospace',
  blockquote: 'border-left:4px solid #ddd;margin-left:0;padding-left:1em',
  table: 'border-collapse:collapse;width:100%;margin:1em 0',
  th: 'border:1px solid #ddd;padding:8px;background-color:#f2f2f2;font-weight:bold;text-align:left',
  td: 'border:1px solid #ddd;padding:8px;text-align:left',
  hr: 'border:none;border-top:1px solid #ddd;margin:1em 0',
  img: 'max-width:100%;height:auto',
  a: 'color:#06c;text-decoration:underline',
  strong: 'font-weight:bold',
  em: 'font-style:italic',
  del: 'text-decoration:line-through',
  ul: 'margin:.5em 0;padding-left:2em',
  ol: 'margin:.5em 0;padding-left:2em',
  li: 'margin:.25em 0',
  'task-item': 'list-style:none',
  'task-checkbox': 'margin-right:.5em',
  'alert': 'padding:1em;margin:1em 0;border-left:4px solid #0969da;border-radius:4px;background:#ddf4ff',
  'alert-title': 'font-weight:600;margin:0 0 .4em',
  'alert-note': 'border-left-color:#0969da;background:#ddf4ff',
  'alert-tip': 'border-left-color:#1a7f37;background:#dafbe1',
  'alert-important': 'border-left-color:#8250df;background:#fbefff',
  'alert-warning': 'border-left-color:#9a6700;background:#fff8c5',
  'alert-caution': 'border-left-color:#cf222e;background:#ffebe9',
  sup: 'font-size:.75em;vertical-align:super;line-height:0',
  'footnotes': 'margin-top:2em;font-size:.9em',
  'footnote-backref': 'text-decoration:none;margin-left:.25em'
};

// ────────────────────────────────────────────────────────────────────
// Attribute factory
// ────────────────────────────────────────────────────────────────────
/**
 * Creates a `getAttr(tag, additionalStyle?)` helper that returns
 * either a class="…" or style="…" attribute string depending on mode.
 *
 * The returned string always starts with a single space (or is empty) so
 * it can be appended directly after a tag name.
 *
 * @param {boolean} inline_styles  True → emit style="…"; false → class="…"
 * @param {Object}  styles         The QUIKDOWN_STYLES map
 * @returns {Function} getAttr(tag, additionalStyle = '') → attribute string
 */
function createGetAttr(inline_styles, styles) {
  return function (tag, additionalStyle = '') {
    if (!inline_styles) {
      const classAttr = ` class="${CLASS_PREFIX}${tag}"`;
      return additionalStyle ? `${classAttr} style="${additionalStyle}"` : classAttr;
    }

    let style = styles[tag];
    if (!style && !additionalStyle) return '';

    // When adding alignment that conflicts with the tag's default,
    // strip the default text-align first.
    if (additionalStyle && style &&
        additionalStyle.includes('text-align') && style.includes('text-align')) {
      style = style.replace(/text-align:[^;]+;?/, '').trim();
    }

    if (!additionalStyle) return ` style="${style}"`;
    /* istanbul ignore next - defensive: additionalStyle without style doesn't occur with current tags */
    if (!style) return ` style="${additionalStyle}"`;

    // Always separate the base declarations from the extra ones with ';'
    // so the two never fuse into one invalid declaration.
    const separator = style.endsWith(';') ? '' : ';';
    return ` style="${style}${separator}${additionalStyle}"`;
  };
}

// ────────────────────────────────────────────────────────────────────
// Link destination + heading slug + blockquote helpers
// ────────────────────────────────────────────────────────────────────

/**
 * Parse a markdown link/image destination: url with optional "title" or 'title'.
 * Supports angle-bracket form: <url>
 *
 * The destination may already be HTML-escaped, so quotes and angle brackets
 * are recognised both literally and as entities (&quot; &#39; &lt; &gt;).
 *
 * @param {string|null|undefined} raw  Text found between the parentheses
 * @returns {{ url: string, title: string|null }}
 */
function parseLinkDestination(raw) {
  if (raw === undefined || raw === null || raw === '') return { url: '', title: null };

  const dblQuote = raw.match(/^(.*)\s+(?:"([^"]*)"|&quot;([^&]*?)&quot;)\s*$/);
  if (dblQuote) {
    return { url: dblQuote[1].replace(/\s+$/, ''), title: dblQuote[2] ?? dblQuote[3] };
  }

  const sglQuote = raw.match(/^(.*)\s+(?:'([^']*)'|&#39;([^&]*?)&#39;)\s*$/);
  if (sglQuote) {
    return { url: sglQuote[1].replace(/\s+$/, ''), title: sglQuote[2] ?? sglQuote[3] };
  }

  // Escaped angle brackets are 4 characters wide (&lt; / &gt;) …
  if (raw.startsWith('&lt;') && raw.endsWith('&gt;')) {
    return { url: raw.slice(4, -4), title: null };
  }
  // … literal ones are 1 character wide.
  if (raw.length >= 2 && raw.startsWith('<') && raw.endsWith('>')) {
    return { url: raw.slice(1, -1), title: null };
  }
  return { url: raw, title: null };
}

/**
 * Build a URL-safe slug from heading text (inline markdown stripped).
 * Falls back to 'section' when nothing usable remains.
 *
 * @param {string} text  Heading text
 * @returns {string}
 */
function headingSlug(text) {
  return text
    .replace(/[*_`~]/g, '')
    .trim()
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '') || 'section';
}

/**
 * Return a unique slug, suffixing -1, -2, … on duplicates.
 *
 * @param {string} base                 Candidate slug
 * @param {Map<string, number>} counts  Usage counter, updated in place
 * @returns {string}
 */
function uniqueSlug(base, counts) {
  const seen = counts.get(base) || 0;
  counts.set(base, seen + 1);
  return seen === 0 ? base : `${base}-${seen}`;
}

/**
 * Strip trailing punctuation from an autolinked URL.
 * Handles balanced parentheses (e.g. Wikipedia URLs).
 *
 * @param {string} url  The matched URL text
 * @returns {{ url: string, trailing: string }}
 */
function stripTrailingPunctuation(url) {
  let trailing = '';
  const punct = /[.,;:!?)]/;
  while (url.length > 0 && punct.test(url[url.length - 1])) {
    const ch = url[url.length - 1];
    if (ch === ')') {
      const opens = (url.match(/\(/g) || []).length;
      const closes = (url.match(/\)/g) || []).length;
      if (opens >= closes) break; // balanced — ) is part of URL
    }
    trailing = ch + trailing;
    url = url.slice(0, -1);
  }
  return { url, trailing };
}

/**
 * Count leading blockquote depth on a line and return the remaining text.
 *
 * The line is normally HTML-escaped, so each marker is the 4-character
 * entity `&gt;`; a literal `>` (1 character) is accepted as well. One
 * optional space after each marker is consumed.
 *
 * @param {string} line
 * @returns {{ depth: number, content: string }}
 */
function parseBlockquoteLine(line) {
  let depth = 0;
  let pos = 0;
  while (pos < line.length) {
    if (line.startsWith('&gt;', pos)) {
      pos += 4;
    } else if (line[pos] === '>') {
      pos += 1;
    } else {
      break;
    }
    depth++;
    if (line[pos] === ' ') pos++;
  }
  return { depth, content: line.slice(pos) };
}
/**
 * Check if a line breaks lazy blockquote continuation.
 *
 * A lazy continuation line is plain paragraph text following a blockquote
 * line; any line that starts a new block (or is blank) ends the quote.
 *
 * @param {string} line  HTML-escaped line text
 * @returns {boolean} True when the line cannot continue the blockquote
 */
function isLazyContinuationBreaker(line) {
  const trimmed = line.trim();
  if (trimmed === '') return true;                       // blank line
  if (/^#{1,6}\s/.test(trimmed)) return true;            // heading
  if (isHRLine(trimmed)) return true;                    // HR
  // The text is HTML-escaped, so a blockquote marker arrives as "&gt;";
  // a literal ">" is accepted too.
  /* istanbul ignore next -- defensive: > lines are caught by parseBlockquoteLine first */
  if (/^(?:&gt;|>)/.test(trimmed)) return true;          // new blockquote
  if (/^[-*+]\s/.test(trimmed)) return true;             // unordered list
  if (/^\d+\.\s/.test(trimmed)) return true;             // ordered list
  if (trimmed.startsWith('|')) return true;              // table row
  if (trimmed.startsWith(PLACEHOLDER_CB)) return true;   // code block placeholder
  return false;
}

/**
 * Base inline formatting patterns shared between the main pass and
 * footnotes rendering. Defined once at module level to avoid
 * recreating regex objects on every call.
 *
 * Each entry is [pattern, tagName]; capture group 1 is the inner text.
 */
const BASE_INLINE_PATTERNS = [
  [/\*\*(.+?)\*\*/g, 'strong'],
  [/__(.+?)__/g, 'strong'],
  [/(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)/g, 'em'],
  [/(?<![A-Za-z0-9_])_(?![_\s])(.+?)(?<![\s_])_(?![A-Za-z0-9_])/g, 'em'],
  [/~~(.+?)~~/g, 'del'],
  [/`([^`\n]+)`/g, 'code']
];

/** GFM alert type labels */
const ALERT_LABELS = {
  NOTE: 'Note',
  TIP: 'Tip',
  IMPORTANT: 'Important',
  WARNING: 'Warning',
  CAUTION: 'Caution'
};

/**
 * Render nested blockquotes from a run of parsed lines.
 *
 * When the first line (at depth 1) is a GFM alert marker such as `[!NOTE]`,
 * the outermost container is rendered as an alert <div> with a title <p>
 * instead of a <blockquote>.
 *
 * @param {Array<{depth: number, content: string}>} items  Parsed blockquote lines
 * @param {Function} getAttr  Attribute helper from createGetAttr
 * @param {Function} dataQd   Bidirectional marker helper (marker → attribute string)
 * @returns {string} HTML with trailing whitespace removed
 */
function renderNestedBlockquotes(items, getAttr, dataQd) {
  // ── GFM alert detection ──
  // Check if the first item's content matches [!TYPE]
  /* istanbul ignore next -- depth is always 1 for outermost blockquote */
  const alertMatch = items.length > 0 && items[0].depth === 1
    ? items[0].content.trim().match(/^\[!(NOTE|TIP|IMPORTANT|WARNING|CAUTION)\]\s*$/i)
    : null;

  let html = '';
  const stack = []; // open containers, outermost first: 'alert' | 'blockquote'
  const useInlineStyles = getAttr('blockquote').includes('style=');

  for (let idx = 0; idx < items.length; idx++) {
    const { depth, content } = items[idx];
    /* istanbul ignore next -- depth is always >= 1 from parseBlockquoteLine gate */
    if (depth <= 0) continue;

    // Skip the [!TYPE] marker line — we'll render it as a title
    if (alertMatch && idx === 0) {
      const alertType = alertMatch[1].toUpperCase();
      const typeLower = alertType.toLowerCase();
      const label = ALERT_LABELS[alertType];
      if (useInlineStyles) {
        const baseStyle = QUIKDOWN_STYLES['alert'];
        const typeStyle = QUIKDOWN_STYLES['alert-' + typeLower];
        /* istanbul ignore next -- typeStyle is always defined for valid alert types */
        const merged = typeStyle ? `${baseStyle};${typeStyle}` : baseStyle;
        html += `<div style="${merged}"${dataQd('>')}>`;
        html += `<p style="${QUIKDOWN_STYLES['alert-title']}">${label}</p>`;
      } else {
        html += `<div class="${CLASS_PREFIX}alert ${CLASS_PREFIX}alert-${typeLower}"${dataQd('>')}>`;
        html += `<p class="${CLASS_PREFIX}alert-title">${label}</p>`;
      }
      stack.push('alert');
      continue;
    }

    // Close containers that are deeper than this line …
    while (stack.length > depth) {
      const tag = stack.pop();
      /* istanbul ignore next -- alert closing uses </div>, blockquote uses </blockquote> */
      html += tag === 'alert' ? '</div>' : '</blockquote>';
    }
    // … and open new ones until we reach its depth.
    while (stack.length < depth) {
      /* istanbul ignore next -- defensive: alert div already opened at depth 0 */
      if (stack.length === 0 && alertMatch) {
        stack.push('alert');
      } else {
        html += `<blockquote${getAttr('blockquote')}${dataQd('>')}>`;
        stack.push('blockquote');
      }
    }

    html += content;
    if (idx < items.length - 1) html += '\n';
  }

  while (stack.length > 0) {
    const tag = stack.pop();
    html += tag === 'alert' ? '</div>' : '</blockquote>';
  }
  return html.trimEnd();
}
/**
 * True when a line is part of a 4-space (or tab) indented code block.
 *
 * A tab counts as four spaces. Indented list items, blockquotes and
 * headings are NOT code: they belong to their own block types. The text
 * is HTML-escaped at this point, so a blockquote marker appears as "&gt;"
 * (a literal ">" is accepted too).
 *
 * @param {string} line
 * @returns {boolean}
 */
function isIndentedCodeLine(line) {
  if (line.length === 0) return false;
  const m = line.match(/^([ \t]+)(.*)$/);
  if (!m) return false;

  const spaceEquiv = m[1].replace(/\t/g, '    ').length;
  if (spaceEquiv < 4) return false;

  const content = m[2];
  if (/^[-*+]\s/.test(content) ||
      /^\d+\.\s/.test(content) ||
      /^(?:&gt;|>)/.test(content) ||
      /^#{1,6}\s/.test(content)) {
    return false;
  }
  return true;
}

/**
 * Strip one indent level (4 spaces or one tab) from a code line.
 * Mixed leading whitespace that matches neither form is removed entirely.
 *
 * @param {string} line  A line accepted by isIndentedCodeLine
 * @returns {string}
 */
function stripCodeIndent(line) {
  if (line.startsWith('    ')) return line.slice(4);
  if (line[0] === '\t') return line.slice(1);
  const m = line.match(/^[ \t]+(.*)$/);
  /* istanbul ignore next -- isIndentedCodeLine guarantees leading whitespace */
  return m ? m[1] : line;
}

/**
 * processIndentedCodeBlocks — line walker for 4-space / tab indented code.
 *
 * Consecutive indented lines become one <pre><code> block. A blank line
 * stays inside the block only when the line after it is indented code.
 * The text is emitted as-is (no escaping is applied here).
 *
 * @param {string}   text     Document text
 * @param {Function} getAttr  Attribute helper from createGetAttr
 * @param {Function} dataQd   Bidirectional marker helper
 * @returns {string} Text with indented code replaced by <pre><code> HTML
 */
function processIndentedCodeBlocks(text, getAttr, dataQd) {
  const lines = text.split('\n');
  const result = [];
  let i = 0;

  while (i < lines.length) {
    if (!isIndentedCodeLine(lines[i])) {
      result.push(lines[i]);
      i++;
      continue;
    }

    const codeLines = [];
    while (i < lines.length) {
      const line = lines[i];
      if (line === '') {
        // Keep an interior blank line; a trailing one ends the block.
        if (i + 1 < lines.length && isIndentedCodeLine(lines[i + 1])) {
          codeLines.push('');
          i++;
          continue;
        }
        break;
      }
      if (!isIndentedCodeLine(line)) break;
      codeLines.push(stripCodeIndent(line));
      i++;
    }

    const codeBody = codeLines.join('\n');
    // NOTE(review): the marker literal below may have lost whitespace during
    // extraction — confirm against the upstream source.
    result.push(`<pre${getAttr('pre')}${dataQd(' ')}><code${getAttr('code')}>${codeBody}</code></pre>`);
  }
  return result.join('\n');
}
*/ function parseTableCells(line) { return line.trim().replace(/^\|/, '').replace(/\|$/, '').split('|'); } // ════════════════════════════════════════════════════════════════════ // Main parser function // ════════════════════════════════════════════════════════════════════ function quikdown(markdown, options = {}) { // ── Guard: only process non-empty strings ── if (!markdown || typeof markdown !== 'string') { return ''; } // ── Unpack options ── const { fence_plugin, inline_styles = false, bidirectional = false, lazy_linefeeds = false, allow_unsafe_html = false, heading_ids = false, reference_links = false, footnotes = false } = options; const styles = QUIKDOWN_STYLES; const getAttr = createGetAttr(inline_styles, styles); const headingSlugCounts = new Map(); // ── Helpers (closed over options) ── /** Escape the five HTML-special characters. */ function escapeHtml(text) { return text.replace(/[&<>"']/g, m => ESC_MAP[m]); } /** * Bidirectional marker helper. * When bidirectional mode is on, returns ` data-qd="…"`. * The non-bidirectional branch is a trivial no-op arrow; it is * exercised in the core bundle but never in quikdown_bd. */ /* istanbul ignore next - trivial no-op fallback */ const dataQd = bidirectional ? (marker) => ` data-qd="${escapeHtml(marker)}"` : () => ''; /** * Sanitize a URL to block javascript:, vbscript:, and non-image data: URIs. * Returns '#' for blocked URLs. 
*/ function sanitizeUrl(url, allowUnsafe = false) { /* istanbul ignore next - defensive programming, regex ensures url is never empty */ if (!url) return ''; if (allowUnsafe) return url; const trimmedUrl = url.trim(); const lowerUrl = trimmedUrl.toLowerCase(); const dangerousProtocols = ['javascript:', 'vbscript:', 'data:']; for (const protocol of dangerousProtocols) { if (lowerUrl.startsWith(protocol)) { if (protocol === 'data:' && lowerUrl.startsWith('data:image/')) { return trimmedUrl; } return '#'; } } return trimmedUrl; } /** * Sanitize attributes on an HTML tag string for limited mode. * Strips on* event handlers (case-insensitive) and runs sanitizeUrl() * on href/src/action/formaction values. */ function sanitizeHtmlTagAttrs(tagStr) { // Self-closing or void tag without attributes — pass through if (!/\s/.test(tagStr.replace(/<\/?[a-zA-Z][a-zA-Z0-9]*/, '').replace(/\/?>$/, ''))) { return tagStr; } // Parse: <tagname ...attrs... > or <tagname ...attrs... /> const m = tagStr.match(/^(<\/?[a-zA-Z][a-zA-Z0-9]*)([\s\S]*?)(\/?>)$/); /* istanbul ignore next - defensive: Phase 1.5 regex guarantees valid tag shape */ if (!m) return tagStr; const [, open, attrStr, close] = m; // Match individual attributes: name="value", name='value', name=value, or bare name // eslint-disable-next-line security/detect-unsafe-regex -- linear: no nested quantifiers const attrRe = /([a-zA-Z_][\w\-.:]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g; const attrs = []; let am; while ((am = attrRe.exec(attrStr)) !== null) { const name = am[1]; const value = am[2] !== undefined ? am[2] : am[3] !== undefined ? am[3] : am[4]; // Strip event handlers (on*) if (/^on/i.test(name)) continue; if (value === undefined) { // Boolean attribute (e.g. disabled, checked) attrs.push(name); } else { let sanitized = value; if (name.toLowerCase() in URL_ATTRIBUTES) { sanitized = sanitizeUrl(value); } attrs.push(`${name}="${sanitized}"`); } } return open + (attrs.length ? 
' ' + attrs.join(' ') : '') + close; } // ──────────────────────────────────────────────────────────────── // Phase 1 — Code Extraction // ──────────────────────────────────────────────────────────────── // Why extract code first? Fenced blocks and inline code spans can // contain markdown-like characters (*, _, #, |, etc.) that must NOT // be interpreted as formatting. By pulling them out and replacing // with unique placeholders, the rest of the pipeline never sees them. let html = markdown; const codeBlocks = []; // Array of {lang, code, custom, fence, hasReverse} const inlineCodes = []; // Array of escaped-HTML strings // ── Fenced code blocks ── // Matches paired fences: ``` with ``` and ~~~ with ~~~. // The fence must start at column 0 of a line (^ with /m flag). // Group 1 = fence marker, Group 2 = language hint, Group 3 = code body. html = html.replace(/^(```|~~~)([^\n]*)\n([\s\S]*?)^\1$/gm, (match, fence, lang, code) => { const placeholder = `${PLACEHOLDER_CB}${codeBlocks.length}§`; const langTrimmed = lang ? lang.trim() : ''; if (fence_plugin && fence_plugin.render && typeof fence_plugin.render === 'function') { // Custom plugin — store raw code (un-escaped) so the plugin // receives the original source. codeBlocks.push({ lang: langTrimmed, code: code.trimEnd(), custom: true, fence: fence, hasReverse: !!fence_plugin.reverse }); } else { // Default — pre-escape the code for safe HTML output. codeBlocks.push({ lang: langTrimmed, code: escapeHtml(code.trimEnd()), custom: false, fence: fence }); } return placeholder; }); // ── Escaped backticks ── // Extract \` before inline code extraction so an escaped backtick // does not participate in code span pairing. const backslashEscapes = []; html = html.replace(/\\`/g, () => { const placeholder = `${PLACEHOLDER_BE}${backslashEscapes.length}§`; backslashEscapes.push('`'); return placeholder; }); // ── Inline code spans ── // Matches a single backtick pair: `content`. 
// Content is captured and HTML-escaped immediately. html = html.replace(/`([^`\n]+)`/g, (match, code) => { const placeholder = `${PLACEHOLDER_IC}${inlineCodes.length}§`; inlineCodes.push(escapeHtml(code)); return placeholder; }); // ──────────────────────────────────────────────────────────────── // Phase 1.25 — Backslash Escape Extraction // ──────────────────────────────────────────────────────────────── // Extract remaining backslash-escaped ASCII punctuation so those // characters are not interpreted as markdown formatting. Runs // after code extraction (so \* inside code blocks and inline code // is already protected) and before HTML escaping. html = html.replace(/\\([\\*_{}[\]()#+\-.!~|<>])/g, (match, char) => { const placeholder = `${PLACEHOLDER_BE}${backslashEscapes.length}§`; backslashEscapes.push(escapeHtml(char)); return placeholder; }); // ──────────────────────────────────────────────────────────────── // Phase 1.75 — Reference Link & Footnote Definition Collection // ──────────────────────────────────────────────────────────────── // Scan lines BEFORE HTML escaping to collect definitions. // [id]: url "title" — reference link definition // [^id]: text — footnote definition (with indented continuation) // Characters [, ], :, ^ are NOT in the HTML escape map so they // survive all phases unchanged. Definition lines are stripped. 
const refDefs = new Map(); // id (lowercase) → { url, title } const fnDefs = new Map(); // id → text const fnOrder = []; // ordered list of footnote ids as referenced if (reference_links || footnotes) { const lines = html.split('\n'); const kept = []; let i = 0; while (i < lines.length) { const line = lines[i]; // Skip lines inside code block placeholders if (line.includes(PLACEHOLDER_CB)) { kept.push(line); i++; continue; } // Footnote definition: [^id]: text if (footnotes) { const fnMatch = line.match(/^\[\^([^\]]+)\]:\s+([\s\S]*)$/); if (fnMatch) { const fnId = fnMatch[1]; let fnText = fnMatch[2]; // Collect indented continuation lines while (i + 1 < lines.length) { const next = lines[i + 1]; if (/^[ \t]+\S/.test(next) && !next.includes(PLACEHOLDER_CB)) { fnText += ' ' + next.trim(); i++; } else { break; } } // First definition wins const key = fnId; if (!fnDefs.has(key)) { fnDefs.set(key, fnText); } i++; continue; } } // Reference link definition: [id]: url "title" if (reference_links) { // eslint-disable-next-line security/detect-unsafe-regex -- linear: no nested quantifiers on same path const refMatch = line.match(/^\[([^\]]+)\]:\s+<?([^\s>]+)>?(?:\s+(?:"([^"]*)"|'([^']*)'|\(([^)]*)\)))?\s*$/); if (refMatch) { const refId = refMatch[1].toLowerCase(); const url = refMatch[2]; const title = refMatch[3] !== undefined ? refMatch[3] : refMatch[4] !== undefined ? refMatch[4] : refMatch[5] !== undefined ? refMatch[5] : null; // First definition wins if (!refDefs.has(refId)) { refDefs.set(refId, { url, title }); } i++; continue; } } kept.push(line); i++; } html = kept.join('\n'); } // ──────────────────────────────────────────────────────────────── // Phase 1.5 — Safe HTML Extraction (whitelist mode) // ──────────────────────────────────────────────────────────────── // When allow_unsafe_html is an object or array, extract whitelisted // HTML tags, sanitize their attributes, and replace with placeholders. 
// Non-whitelisted tags stay in text so Phase 2 will escape them. const safeTags = []; // Normalize: array → object for O(1) lookup; object used as-is const htmlAllow = Array.isArray(allow_unsafe_html) ? Object.fromEntries(allow_unsafe_html.map(t => [t, 1])) : (allow_unsafe_html && typeof allow_unsafe_html === 'object') ? allow_unsafe_html : null; if (htmlAllow) { // Pass through HTML comments — browsers render them as nothing html = html.replace(//g, (match) => { const idx = safeTags.length; safeTags.push(match); return `${PLACEHOLDER_HT}${idx}§`; }); html = html.replace(/<\/?([a-zA-Z][a-zA-Z0-9]*)\b[^>]*\/?>/g, (match, tagName) => { if (tagName.toLowerCase() in htmlAllow) { const sanitized = sanitizeHtmlTagAttrs(match); const idx = safeTags.length; safeTags.push(sanitized); return `${PLACEHOLDER_HT}${idx}§`; } // Not whitelisted — leave in text for Phase 2 to escape return match; }); } // ──────────────────────────────────────────────────────────────── // Phase 2 — HTML Escaping // ──────────────────────────────────────────────────────────────── // All remaining text (everything except code placeholders) is escaped // to prevent XSS. The `allow_unsafe_html` option skips this for // trusted pipelines that intentionally embed raw HTML. // For whitelist mode, escaping still runs (only `true` bypasses it). if (allow_unsafe_html !== true) { html = escapeHtml(html); } // Restore safe HTML tag placeholders after escaping if (htmlAllow) { safeTags.forEach((tag, i) => { html = html.replace(`${PLACEHOLDER_HT}${i}§`, tag); }); } // ──────────────────────────────────────────────────────────────── // Phase 3 — Block Scanning + Inline Formatting + Paragraphs // ──────────────────────────────────────────────────────────────── // This is the heart of the lexer rewrite. Instead of applying // 10+ global regex passes, we: // 0. Process indented code blocks (4-space / tab, before other blocks) // 1. Process tables (line walker — tables need multi-line lookahead) // 2. 
Scan remaining lines for headings, HR, blockquotes // 3. Process lists (line walker — lists need indent tracking) // 4. Apply inline formatting to all text content // 5. Wrap remaining text in <p> tags // // Steps 0, 1 and 3 are line-walkers that process the full text in a // single pass each. Step 2 replaces global regex with a per-line // scanner. Steps 4-5 are applied to the result. // // Total: 4 structured passes instead of 10+ regex passes. // ── Step 0: Indented code blocks ── html = processIndentedCodeBlocks(html, getAttr, dataQd); // ── Step 1: Tables ── // Tables need multi-line lookahead (header → separator → body rows) // so they're handled by a dedicated line-walker first. html = processTable(html, getAttr, bidirectional); // ── Step 2: Headings, HR, Blockquotes ── // These are simple line-level constructs. We scan each line once // and replace matching lines with their HTML representation. html = scanLineBlocks(html, getAttr, dataQd, heading_ids, headingSlugCounts, escapeHtml); // ── Step 3: Lists ── // Lists need indent-level tracking across lines, so they get their // own line-walker. html = processLists(html, getAttr, inline_styles, bidirectional); // ── Step 4: Inline formatting ── // Apply bold, italic, strikethrough, images, links, and autolinks // to all text content. This runs on the output of steps 1-3, so // it sees text inside headings, blockquotes, table cells, list // items, and paragraph text. // Images (must come before links — ![alt](src) vs [text](url)) html = html.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match, alt, dest) => { const { url, title } = parseLinkDestination(dest); const sanitizedSrc = sanitizeUrl(url, options.allow_unsafe_urls); const titleAttr = title ? ` title="${escapeHtml(title)}"` : ''; /* istanbul ignore next - bd-only branch */ const altAttr = bidirectional && alt ? ` data-qd-alt="${escapeHtml(alt)}"` : ''; /* istanbul ignore next - bd-only branch */ const srcAttr = bidirectional ? 
` data-qd-src="${escapeHtml(url)}"` : ''; /* istanbul ignore next - bd-only branch */ const titleQd = bidirectional && title ? ` data-qd-title="${escapeHtml(title)}"` : ''; return `<img${getAttr('img')} src="${sanitizedSrc}" alt="${alt}"${titleAttr}${altAttr}${srcAttr}${titleQd}${dataQd('!')}>`; }); // Links html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, text, dest) => { const { url, title } = parseLinkDestination(dest); const sanitizedHref = sanitizeUrl(url, options.allow_unsafe_urls); const isExternal = /^https?:\/\//i.test(sanitizedHref); const rel = isExternal ? ' rel="noopener noreferrer"' : ''; const titleAttr = title ? ` title="${escapeHtml(title)}"` : ''; /* istanbul ignore next - bd-only branch */ const textAttr = bidirectional ? ` data-qd-text="${escapeHtml(text)}"` : ''; /* istanbul ignore next - bd-only branch */ const titleQd = bidirectional && title ? ` data-qd-title="${escapeHtml(title)}"` : ''; return `<a${getAttr('a')} href="${sanitizedHref}"${rel}${titleAttr}${textAttr}${titleQd}${dataQd('[')}>${text}</a>`; }); // Autolinks — bare https?:// URLs become clickable <a> tags html = html.replace(/(^|\s)(https?:\/\/[^\s<]+)/g, (match, prefix, rawUrl) => { const { url, trailing } = stripTrailingPunctuation(rawUrl); const sanitizedUrl = sanitizeUrl(url, options.allow_unsafe_urls); return `${prefix}<a${getAttr('a')} href="${sanitizedUrl}" rel="noopener noreferrer">${url}</a>${trailing}`; }); // ── Phase 3.5: Reference Link & Footnote Resolution ── // Resolve [text][id], [text][], [id] patterns to <a> tags using // collected definitions. Footnote markers [^id] become <sup> links. if (reference_links && refDefs.size > 0) { /** Build an <a> tag from a resolved reference definition. */ function buildRefAnchor(def, id, displayText) { const sanitizedHref = sanitizeUrl(def.url, options.allow_unsafe_urls); const isExternal = /^https?:\/\//i.test(sanitizedHref); const rel = isExternal ? ' rel="noopener noreferrer"' : ''; const titleAttr = def.title ? 
` title="${escapeHtml(def.title)}"` : ''; /* istanbul ignore next - bd-only branch */ const refAttr = bidirectional ? ` data-qd-ref="${escapeHtml(id)}"` : ''; return `<a${getAttr('a')} href="${sanitizedHref}"${rel}${titleAttr}${refAttr}${dataQd('[ref')}>${displayText}</a>`; } // Full reference: [text][id] and collapsed: [text][] html = html.replace(/\[([^\]]+)\]\[([^\]]*)\]/g, (match, text, id) => { const def = refDefs.get((id === '' ? text : id).toLowerCase()); return def ? buildRefAnchor(def, id, text) : match; }); // Shortcut reference: [id] (not followed by ( or [, not containing ^) html = html.replace(/(?<!\])\[([^\]^[\n]+)\](?!\(|\[)/g, (match, id) => { const def = refDefs.get(id.toLowerCase()); return def ? buildRefAnchor(def, id, id) : match; }); } if (footnotes && fnDefs.size > 0) { // Footnote markers: [^id] html = html.replace(/\[\^([^\]]+)\]/g, (match, id) => { if (!fnDefs.has(id)) return match; // unresolved — leave as text // Track ordering — each unique id gets a sequential number let fnNum = fnOrder.indexOf(id); if (fnNum === -1) { fnOrder.push(id); fnNum = fnOrder.length; } else { fnNum = fnNum + 1; } /* istanbul ignore next - bd-only branch */ const fnAttr = bidirectional ? ` data-qd-fn="${escapeHtml(id)}"` : ''; return `<sup${getAttr('sup')}${fnAttr}${dataQd('[^')}><a href="#fn-${escapeHtml(id)}" id="fnref-${escapeHtml(id)}">${fnNum}</a></sup>`; }); } // Protect rendered tags so emphasis regexes don't see attribute // values — fixes #3 (underscores in URLs interpreted as emphasis). 
// Each rendered tag is swapped for a %%T<n>%% placeholder, where <n> is the
// tag's index in savedTags.
const savedTags = [];
html = html.replace(/<[^>]+>/g, m => {
    savedTags.push(m);
    return `%%T${savedTags.length - 1}%%`;
});

// Bold, italic, strikethrough (reuses BASE_INLINE_PATTERNS; code spans
// are already extracted in Phase 1 so only the leading emphasis patterns
// apply here). emphasisMarkers[pi] is the markdown marker handed to dataQd()
// for BASE_INLINE_PATTERNS[pi]; the loop bound follows the marker list so the
// two cannot drift apart.
const emphasisMarkers = ['**', '__', '*', '_', '~~'];
for (let pi = 0; pi < emphasisMarkers.length; pi++) {
    const [pattern, tag] = BASE_INLINE_PATTERNS[pi];
    html = html.replace(pattern, `<${tag}${getAttr(tag)}${dataQd(emphasisMarkers[pi])}>$1</${tag}>`);
}

// Restore protected tags. A %%T<n>%% sequence whose index was never issued
// (for example one typed literally in the document) is left untouched rather
// than being replaced by the text "undefined".
html = html.replace(/%%T(\d+)%%/g, (placeholder, i) => {
    const index = Number(i);
    return index < savedTags.length ? savedTags[index] : placeholder;
});

// ── Step 5: Line breaks + paragraph wrapping ──

// Backslash at end of line → hard line break (CommonMark)
html = html.replace(/\\\n/g, `<br${getAttr('br')}>`);

if (lazy_linefeeds) {
    // Lazy linefeeds mode: every single \n becomes <br> EXCEPT:
    //   • Double newlines → paragraph break
    //   • Newlines adjacent to block elements (h, blockquote, pre, hr, table, list)
    //
    // Strategy: protect block-adjacent newlines with §N§, convert
    // the rest, then restore.
    const blocks = [];
    let bi = 0;

    // Protect tables and lists from <br> injection
    html = html.replace(/<(table|[uo]l)[^>]*>[\s\S]*?<\/\1>/g, m => {
        blocks[bi] = m;
        return `§B${bi++}§`;
    });

    html = html.replace(/\n\n+/g, '§P§')
        // After block-level closing tags
        .replace(/(<\/(?:h[1-6]|blockquote|pre)>)\n/g, '$1§N§')
        .replace(/(<(?:h[1-6]|blockquote|pre|hr)[^>]*>)\n/g, '$1§N§')
        // Before block-level opening tags
        .replace(/\n(<(?:h[1-6]|blockquote|pre|hr)[^>]*>)/g, '§N§$1')
        .replace(/\n(§B\d+§)/g, '§N§$1')
        .replace(/(§B\d+§)\n/g, '$1§N§')
        // Convert surviving newlines to <br>
        .replace(/\n/g, `<br${getAttr('br')}>`)
        // Restore
        .replace(/§N§/g, '\n')
        .replace(/§P§/g, '</p><p>');

    // Restore protected blocks. The replacement is supplied as a function so
    // that "$" sequences inside a table or list ("$$", "$&", "$1", …) are
    // inserted literally instead of being interpreted as replacement patterns.
    blocks.forEach((b, i) => {
        html = html.replace(`§B${i}§`, () => b);
    });

    html = '<p>' + html + '</p>';
} else {
    // Standard mode: two trailing spaces → <br>, double newline → new paragraph
    html = html.replace(/ {2}$/gm, `<br${getAttr('br')}>`);

    // The closing-tag check below is anchored at the end of the preceding
    // text, so only the last few characters can matter. '</blockquote>' is the
    // longest tag the pattern can match; looking at just that many characters
    // avoids re-scanning the whole document prefix for every paragraph break.
    const closeTagWindow = '</blockquote>'.length;
    html = html.replace(/\n\n+/g, (match, offset) => {
        const before = html.substring(Math.max(0, offset - closeTagWindow), offset);
        if (/<\/(h[1-6]|blockquote|ul|ol|table|pre|hr)>$/.test(before)) {
            // Directly after a block element: only open a paragraph.
            return '<p>';
        }
        return '</p><p>';
    });

    html = '<p>' + html + '</p>';
}

// ── Step 6: Cleanup ──
// Remove <p> wrappers that accidentally enclose block elements.
// This is simpler than trying to prevent them during wrapping.
const cleanupPatterns = [ [/<p><\/p>/g, ''], [/<p>(<h[1-6][^>]*>)/g, '$1'], [/(<\/h[1-6]>)<\/p>/g, '$1'], [/<p>(<blockquote[^>]*>)/g, '$1'], [/(<\/blockquote>)<\/p>/g, '$1'], [/<p>(<ul[^>]*>|<ol[^>]*>)/g, '$1'], [/(<\/ul>|<\/ol>)<\/p>/g, '$1'], [/<p>(<hr[^>]*>)<\/p>/g, '$1'], [/<p>(<table[^>]*>)/g, '$1'], [/(<\/table>)<\/p>/g, '$1'], [/<p>(<pre[^>]*>)/g, '$1'], [/(<\/pre>)<\/p>/g, '$1'], [/<p>(<div[^>]*>)/g, '$1'], [/(<\/div>)<\/p>/g, '$1'], [/<p>(<section[^>]*>)/g, '$1'], [/(<\/section>)<\/p>/g, '$1'], [new RegExp(`<p>(${PLACEHOLDER_CB}\\d+§)</p>`, 'g'), '$1'] ]; cleanupPatterns.forEach(([pattern, replacement]) => { html = html.replace(pattern, replacement); }); // When a block element is followed by a newline and then text, open a <p>. html = html.replace(/(<\/(?:h[1-6]|blockquote|div|section|ul|ol|table|pre|hr)>)\n([^<])/g, '$1\n<p>$2'); // ── Footnotes section ── // Only rendered if footnotes were actually referenced in the document. if (footnotes && fnOrder.length > 0) { /* istanbul ignore next - bd-only branch */ const sectionAttr = bidirectional ? ` data-qd="[^section"` : ''; let fnSection = `<section${getAttr('footnotes')}${sectionAttr}>`; fnSection += `<hr${getAttr('hr')}>`; fnSection += `<ol${getAttr('ol')}>`; for (const id of fnOrder) { const rawText = fnDefs.get(id) || ''; // Escape HTML in footnote text, then apply inline formatting let fnHtml = allow_unsafe_html === true ? rawText : escapeHtml(rawText); for (const [pattern, tag] of BASE_INLINE_PATTERNS) { fnHtml = fnHtml.replace(pattern, `<${tag}${getAttr(tag)}>$1</${tag}>`); } /* istanbul ignore next - bd-only branch */ const liAttr = bidirectional ? 
` data-qd-fn-id="${escapeHtml(id)}"` : ''; fnSection += `<li${getAttr('li')} id="fn-${escapeHtml(id)}"${liAttr}>${fnHtml} <a href="#fnref-${escapeHtml(id)}"${getAttr('footnote-backref')}>↩</a></li>`; } fnSection += '</ol></section>'; html += fnSection; } // ──────────────────────────────────────────────────────────────── // Phase 4 — Code Restoration // ──────────────────────────────────────────────────────────────── // Replace placeholders with rendered HTML. For fenced blocks this // means wrapping in <pre><code>…</code></pre> (or calling the // fence_plugin). For inline code it means <code>…</code>. codeBlocks.forEach((block, i) => { let replacement; if (block.custom && fence_plugin && fence_plugin.render) { // Delegate to the user-provided fence plugin. replacement = fence_plugin.render(block.code, block.lang); if (replacement === undefined) { // Plugin declined — fall back to default rendering. const langClass = !inline_styles && block.lang ? ` class="language-${escapeHtml(block.lang)}"` : ''; const codeAttr = inline_styles ? getAttr('code') : langClass; /* istanbul ignore next - bd-only branch */ const langAttr = bidirectional && block.lang ? ` data-qd-lang="${escapeHtml(block.lang)}"` : ''; /* istanbul ignore next - bd-only branch */ const fenceAttr = bidirectional ? ` data-qd-fence="${escapeHtml(block.fence)}"` : ''; replacement = `<pre${getAttr('pre')}${fenceAttr}${langAttr}><code${codeAttr}>${escapeHtml(block.code)}</code></pre>`; } else /* istanbul ignore next - bd-only branch */ if (bidirectional) { // Plugin returned HTML — inject data attributes for roundtrip. replacement = replacement.replace(/^<(\w+)/, `<$1 data-qd-fence="${escapeHtml(block.fence)}" data-qd-lang="${